install adapter for 0.6.5

Signed-off-by: Jefferson Fialho <[email protected]>
opendatahub-io · Dec 19, 2024 · 3f74522 · 3f74522
1 parent e362236
commit 3f74522
Show file tree

Hide file tree

Showing 2 changed files with 122 additions and 160 deletions.
diff --git a/Dockerfile.rocm.ubi b/Dockerfile.rocm.ubi
@@ -1,169 +1,131 @@
-## Global Args ##################################################################
-ARG BASE_UBI_IMAGE_TAG=9.4
+## Global Args #################################################################
+ARG BASE_UBI_IMAGE_TAG=9.5-1733767867
 ARG PYTHON_VERSION=3.12
-# Default ROCm ARCHes to build vLLM for.
-ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
-ARG MAX_JOBS=12
 
-FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
 
+## Base Layer ##################################################################
+FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base
 ARG PYTHON_VERSION
+ENV PYTHON_VERSION=${PYTHON_VERSION}
+RUN microdnf -y update && microdnf install -y \
+    python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel \
+    && microdnf clean all
 
-ENV VIRTUAL_ENV=/opt/vllm
-ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+WORKDIR /workspace
 
-RUN --mount=type=cache,target=/root/.cache/pip \
- microdnf -y update && \
- microdnf install -y --setopt=install_weak_deps=0 --nodocs \
-    python${PYTHON_VERSION}-devel \
-    python${PYTHON_VERSION}-pip \
-    python${PYTHON_VERSION}-wheel && \
-    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && \
-    pip install -U pip wheel setuptools uv && \
- microdnf clean all
-
-
-FROM base AS rocm_base
-ARG ROCM_VERSION=6.2.3
-ARG PYTHON_VERSION
-ARG BASE_UBI_IMAGE_TAG
-
-RUN printf "[amdgpu]\n\
-name=amdgpu\n\
-baseurl=https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/${BASE_UBI_IMAGE_TAG}/main/x86_64/\n\
-enabled=1\n\
-priority=50\n\
-gpgcheck=1\n\
-gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key\n\
-[ROCm-${ROCM_VERSION}]\n\
-name=ROCm${ROCM_VERSION}\n\
-baseurl=https://repo.radeon.com/rocm/rhel9/${ROCM_VERSION}/main\n\
-enabled=1\n\
-priority=50\n\
-gpgcheck=1\n\
-gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key" > /etc/yum.repos.d/amdgpu.repo
+ENV LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8
 
+# Some utils for dev purposes - tar required for kubectl cp
+RUN microdnf install -y \
+        which procps findutils tar vim git\
+    && microdnf clean all
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/uv \
-    export version="$(awk -F. '{print $1"."$2}' <<< $ROCM_VERSION)" && \
-    uv pip install --pre \
-        --index-url "https://download.pytorch.org/whl/nightly/rocm${version}" \
-        torch==2.6.0.dev20241107+rocm${version}\
-        torchvision==0.20.0.dev20241107+rocm${version} && \
-    # Install libdrm-amdgpu to avoid errors when retrieving device information (amdgpu.ids: No such file or directory)
-    microdnf install -y libdrm-amdgpu && \
-    microdnf clean all
 
+## Python Installer ############################################################
+FROM base AS python-install
+ARG PYTHON_VERSION
 
-ENV LD_LIBRARY_PATH="$VIRTUAL_ENV/lib/python${PYTHON_VERSION}/site-packages/numpy.libs:$LD_LIBRARY_PATH"
-ENV LD_LIBRARY_PATH="$VIRTUAL_ENV/lib/python${PYTHON_VERSION}/site-packages/pillow.libs:$LD_LIBRARY_PATH"
-ENV LD_LIBRARY_PATH="$VIRTUAL_ENV/lib/python${PYTHON_VERSION}/site-packages/triton/backends/amd/lib:$LD_LIBRARY_PATH"
-ENV LD_LIBRARY_PATH="$VIRTUAL_ENV/lib/python${PYTHON_VERSION}/site-packages/torch/lib:$LD_LIBRARY_PATH"
+ENV VIRTUAL_ENV=/opt/vllm
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+ENV PYTHON_VERSION=${PYTHON_VERSION}
+RUN microdnf install -y \
+    python${PYTHON_VERSION}-devel  && \
+    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel uv && microdnf clean all
 
-RUN echo $LD_LIBRARY_PATH | tr : \\n >> /etc/ld.so.conf.d/torch-venv.conf && \
-    ldconfig
 
-FROM rocm_base as rocm_devel
+## CUDA Base ###################################################################
+FROM python-install AS cuda-base
 
-ENV CCACHE_DIR=/root/.cache/ccache
+RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
+        https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
 
-RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
-    rpm -ql epel-release && \
-    microdnf -y update && \
-    microdnf -y install \
-        ccache \
-        git \
-        # packages required to build vllm
-        amd-smi \
-        hipblas-devel \
-        hipblaslt-devel \
-        hipcc \
-        hipcub-devel \
-        hipfft-devel \
-        hiprand-devel \
-        hipsolver-devel \
-        hipsparse-devel \
-        hsa-rocr-devel \
-        miopen-hip-devel \
-        rccl-devel \
-        rocblas-devel \
-        rocm-device-libs \
-        rocprim-devel \
-        rocrand-devel \
-        rocthrust-devel \
-        # end packages required to build vllm
-        wget \
-        which && \
+RUN microdnf install -y \
+        cuda-nvcc-12-4 cuda-nvtx-12-4 cuda-libraries-devel-12-4 && \
     microdnf clean all
 
-WORKDIR /workspace
-
-ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
-ENV PATH=$PATH:/opt/rocm/bin
-ENV CPLUS_INCLUDE_PATH=$VIRTUAL_ENV/lib/python${PYTHON_VERSION}/site-packages/torch/include:/opt/rocm/include
+# CUDA_HOME gets its own ENV: references inside one ENV instruction expand to the pre-instruction value
+ENV CUDA_HOME="/usr/local/cuda"
+ENV PATH="${CUDA_HOME}/bin:${PATH}" LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
 
+## Python cuda base #################################################################
+FROM cuda-base AS python-cuda-base
 
-FROM rocm_devel AS build_amdsmi
-
-# Build AMD SMI wheel
-RUN cd /opt/rocm/share/amd_smi && \
-    python3 -m pip wheel . --wheel-dir=/install
+ENV VIRTUAL_ENV=/opt/vllm
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 
-##################################################################################################
+# install cuda and common dependencies
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
+    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
+    uv pip install \
+        -r requirements-cuda.txt
 
-FROM rocm_devel AS build_flashattention
 
-ARG FA_GFX_ARCHS="gfx90a;gfx942"
+## Development #################################################################
+FROM python-cuda-base AS dev
 
-# the FA_BRANCH commit belongs to the ROCm/flash-attention fork, `main_perf` branch
-ARG FA_BRANCH="3cea2fb"
-ARG MAX_JOBS
-ENV MAX_JOBS=${MAX_JOBS}
+# install build and runtime dependencies
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
+    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
+    --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
+    --mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
+    --mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
+    uv pip install \
+        -r requirements-cuda.txt \
+        -r requirements-dev.txt
 
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=cache,target=/workspace/build \
-    mkdir -p /libs && \
-    cd /libs && \
-    git clone https://github.com/ROCm/flash-attention.git && \
-    cd flash-attention && \
-    git checkout ${FA_BRANCH} && \
-    git submodule update --init && \
-    uv pip install cmake ninja packaging && \
-    env \
-        GPU_ARCHS="${FA_GFX_ARCHS}" \
-        python3 setup.py bdist_wheel --dist-dir=/install
+## Builder #####################################################################
+FROM dev AS build
 
-##################################################################################################
+# install build dependencies
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
+    uv pip install -r requirements-build.txt
 
-FROM rocm_devel AS build_vllm
-ARG PYTORCH_ROCM_ARCH
-ARG PYTHON_VERSION
-ARG MAX_JOBS
-ENV MAX_JOBS=${MAX_JOBS}
-ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
+# install compiler cache to speed up compilation leveraging local or remote caching
+# git is required for the cutlass kernels
+RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y git ccache && microdnf clean all
 
 COPY . .
 
-ENV VLLM_TARGET_DEVICE="rocm"
-ENV MAX_JOBS=${MAX_JOBS}
-# Make sure punica kernels are built (for LoRA)
+ARG TORCH_CUDA_ARCH_LIST
+ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
+ARG vllm_fa_cmake_gpu_arches
+ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
+
+# max jobs used by Ninja to build extensions
+ARG max_jobs=2
+ENV MAX_JOBS=${max_jobs}
+# number of threads used by nvcc
+ARG nvcc_threads=8
+ENV NVCC_THREADS=$nvcc_threads
+# make sure punica kernels are built (for LoRA)
 ENV VLLM_INSTALL_PUNICA_KERNELS=1
 
+# Make sure the cuda environment is in the PATH
+ENV PATH=/usr/local/cuda/bin:$PATH
+
+ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/pip \
     --mount=type=cache,target=/root/.cache/uv \
-    uv pip install -v -U \
-        ninja setuptools-scm>=8 "cmake>=3.26" packaging && \
+    --mount=type=bind,src=.git,target=/workspace/.git \
     env CFLAGS="-march=haswell" \
         CXXFLAGS="$CFLAGS $CXXFLAGS" \
         CMAKE_BUILD_TYPE=Release \
-    python3 setup.py bdist_wheel --dist-dir=dist
+        python3 setup.py bdist_wheel --dist-dir=dist
 
 #################### libsodium Build IMAGE ####################
-FROM rocm_base as libsodium-builder
+FROM base AS libsodium-builder
 
-RUN microdnf install -y gcc gzip tar \
+RUN microdnf install -y gcc gzip \
     && microdnf clean all
 
 WORKDIR /usr/src/libsodium
@@ -174,44 +136,43 @@ RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM
     && rm -f libsodium*.tar.gz \
     && mv libsodium*/* ./
 
-RUN CFLAGS="-O3 -Wall -Werror=format-security -Wno-unused-function -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection" \
-    ./configure \
-        --prefix="/usr/" \
-        --libdir=/usr/lib64 && \
-    make -j $(nproc) && \
-    make check
+RUN CFLAGS="-O3 -Wall -Werror=format-security -Wno-unused-function -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection"\
+    ./configure --prefix="/usr/" && make -j "${MAX_JOBS:-$(nproc)}" && make check
 
-##################################################################################################
-
-FROM rocm_base AS vllm-openai
-ARG MAX_JOBS
+## Release #####################################################################
+FROM python-install AS vllm-openai
+ARG PYTHON_VERSION
 
 WORKDIR /workspace
 
 ENV VIRTUAL_ENV=/opt/vllm
 ENV PATH=$VIRTUAL_ENV/bin:$PATH
 
-# Required for triton
-RUN microdnf install -y --setopt=install_weak_deps=0 --nodocs gcc rsync && \
-    microdnf clean all
+# force using the python venv's cuda runtime libraries
+ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_nvrtc/lib:${LD_LIBRARY_PATH}"
+ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_runtime/lib:${LD_LIBRARY_PATH}"
+ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvtx/lib:${LD_LIBRARY_PATH}"
+
+# Triton needs a CC compiler
+RUN microdnf install -y gcc \
+    rsync \
+    && microdnf clean all
+
+# install vllm wheel first, so that torch etc will be installed
+RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
+    --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
+    uv pip install "$(echo dist/*.whl)[tensorizer]" --verbose
 
 # Install libsodium for Tensorizer encryption
 RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
     cd /usr/src/libsodium \
     && make install
 
-RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install/amdsmi/ \
-    --mount=type=bind,from=build_flashattention,src=/install,target=/install/flashattention \
-    --mount=type=bind,from=build_vllm,src=/workspace/dist,target=/install/vllm/ \
-    --mount=type=cache,target=/root/.cache/pip \
+RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=cache,target=/root/.cache/uv \
-    export version="$(awk -F. '{print $1"."$2}' <<< $ROCM_VERSION)" && \
     uv pip install \
-        --index-strategy=unsafe-best-match \
-        --extra-index-url "https://download.pytorch.org/whl/nightly/rocm${version}" \
-        /install/amdsmi/*.whl\
-        /install/flashattention/*.whl\
-        /install/vllm/*.whl
+        "https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu124torch2.4-cp312-cp312-linux_x86_64.whl"
 
 ENV HF_HUB_OFFLINE=1 \
     HOME=/home/vllm \
@@ -222,20 +183,15 @@ ENV HF_HUB_OFFLINE=1 \
     VLLM_USAGE_SOURCE=production-docker-image \
     VLLM_WORKER_MULTIPROC_METHOD=fork \
     VLLM_NO_USAGE_STATS=1 \
-    # Silences the HF Tokenizers warning
-    TOKENIZERS_PARALLELISM=false  \
-    RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 \
-    VLLM_USE_TRITON_FLASH_ATTN=0 \
-    HIP_FORCE_DEV_KERNARG=1 \
     OUTLINES_CACHE_DIR=/tmp/outlines \
     NUMBA_CACHE_DIR=/tmp/numba \
     TRITON_CACHE_DIR=/tmp/triton
 
 # setup non-root user for OpenShift
 RUN umask 002 && \
     useradd --uid 2000 --gid 0 vllm && \
-    mkdir -p /licenses /home/vllm && \
-    chmod g+rwx /home/vllm
+    mkdir -p /home/vllm && \
+    chmod g+rwx /home/vllm /usr/src /workspace
 
 COPY LICENSE /licenses/vllm.md
 COPY examples/*.jinja /app/data/template/
@@ -252,8 +208,11 @@ USER root
 
 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,from=build_vllm,src=/workspace/dist,target=/install/vllm/ \
-    HOME=/root uv pip install /install/vllm/*.whl vllm-tgis-adapter==0.5.3
+    --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
+    HOME=/root uv pip install "$(echo /workspace/dist/*.whl)[tensorizer]"
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    HOME=/root pip install git+https://github.com/opendatahub-io/vllm-tgis-adapter@vllm0p6p5
 
 ENV GRPC_PORT=8033 \
     PORT=8000 \

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
@@ -209,7 +209,10 @@ USER root
 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
-    HOME=/root uv pip install "$(echo /workspace/dist/*.whl)[tensorizer]" vllm-tgis-adapter==0.5.3
+    HOME=/root uv pip install "$(echo /workspace/dist/*.whl)[tensorizer]"
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    HOME=/root pip install git+https://github.com/opendatahub-io/vllm-tgis-adapter@vllm0p6p5
 
 ENV GRPC_PORT=8033 \
     PORT=8000 \