From b52f748d78a0db376d072d592ac6cb336bdc087d Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Mon, 14 Oct 2024 14:59:57 -0300 Subject: [PATCH] two stage build Signed-off-by: Max de Bayser --- Dockerfile.ubi | 207 ++----------------------------------------------- 1 file changed, 6 insertions(+), 201 deletions(-) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index 9accc9fb7dccd..5192b9174c9e2 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -1,210 +1,15 @@ -## Global Args ################################################################# -ARG BASE_UBI_IMAGE_TAG=9.4 -ARG PYTHON_VERSION=3.12 - -ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" -ARG vllm_fa_cmake_gpu_arches='80-real;90-real' - -## Base Layer ################################################################## -FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base -ARG PYTHON_VERSION -ENV PYTHON_VERSION=${PYTHON_VERSION} -RUN microdnf -y update && microdnf install -y \ - python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel \ - && microdnf clean all - -WORKDIR /workspace - -ENV LANG=C.UTF-8 \ - LC_ALL=C.UTF-8 - -# Some utils for dev purposes - tar required for kubectl cp -RUN microdnf install -y \ - which procps findutils tar vim git\ - && microdnf clean all - - -## Python Installer ############################################################ -FROM base as python-install -ARG PYTHON_VERSION - -ENV VIRTUAL_ENV=/opt/vllm -ENV PATH="$VIRTUAL_ENV/bin:$PATH" -ENV PYTHON_VERSION=${PYTHON_VERSION} -RUN microdnf install -y \ - python${PYTHON_VERSION}-devel && \ - python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel uv && microdnf clean all - - -## CUDA Base ################################################################### -FROM python-install as cuda-base - -RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \ - https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo - -RUN microdnf install -y \ - cuda-nvcc-12-4 cuda-nvtx-12-4 cuda-libraries-devel-12-4 && \ - microdnf clean all - -ENV CUDA_HOME="/usr/local/cuda" \ - PATH="${CUDA_HOME}/bin:${PATH}" \ - LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}" - -## Python cuda base ################################################################# -FROM cuda-base AS python-cuda-base - -ENV VIRTUAL_ENV=/opt/vllm -ENV PATH="$VIRTUAL_ENV/bin:$PATH" - -# install cuda and common dependencies -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=cache,target=/root/.cache/uv \ - --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \ - --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \ - uv pip install \ - -r requirements-cuda.txt - - -## Development ################################################################# -FROM python-cuda-base AS dev - -# install build and runtime dependencies -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=cache,target=/root/.cache/uv \ - --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \ - --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \ - --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \ - --mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \ - --mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \ - uv pip install \ - -r requirements-cuda.txt \ - -r requirements-dev.txt - -## Builder ##################################################################### -FROM dev AS build - -# install build dependencies -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=cache,target=/root/.cache/uv \ - --mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \ - uv pip install -r requirements-build.txt - -# install compiler cache to speed up compilation leveraging local or remote caching -# git is required for the cutlass kernels -RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y git ccache && microdnf clean all - -COPY . . - -ARG TORCH_CUDA_ARCH_LIST -ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST -ARG vllm_fa_cmake_gpu_arches -ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches} - -# max jobs used by Ninja to build extensions -ARG max_jobs=2 -ENV MAX_JOBS=${max_jobs} -# number of threads used by nvcc -ARG nvcc_threads=8 -ENV NVCC_THREADS=$nvcc_threads -# make sure punica kernels are built (for LoRA) -ENV VLLM_INSTALL_PUNICA_KERNELS=1 - -# Make sure the cuda environment is in the PATH -ENV PATH=/usr/local/cuda/bin:$PATH - -ENV CCACHE_DIR=/root/.cache/ccache -RUN --mount=type=cache,target=/root/.cache/ccache \ - --mount=type=cache,target=/root/.cache/pip \ - --mount=type=cache,target=/root/.cache/uv \ - --mount=type=bind,src=.git,target=/workspace/.git \ - env CFLAGS="-march=haswell" \ - CXXFLAGS="$CFLAGS $CXXFLAGS" \ - CMAKE_BUILD_TYPE=Release \ - python3 setup.py bdist_wheel --dist-dir=dist - -#################### libsodium Build IMAGE #################### -FROM base as libsodium-builder - -RUN microdnf install -y gcc gzip \ - && microdnf clean all - -WORKDIR /usr/src/libsodium - -ARG LIBSODIUM_VERSION=1.0.20 -RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \ - && tar -xzvf libsodium*.tar.gz \ - && rm -f libsodium*.tar.gz \ - && mv libsodium*/* ./ - -RUN CFLAGS="-O3 -Wall -Werror=format-security -Wno-unused-function -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection"\ - ./configure --prefix="/usr/" && make -j $MAX_JOBS && make check - -## Release ##################################################################### -FROM python-install AS vllm-openai -ARG PYTHON_VERSION - -WORKDIR /workspace - -ENV VIRTUAL_ENV=/opt/vllm -ENV PATH=$VIRTUAL_ENV/bin/:$PATH - -# force using the python venv's cuda runtime libraries -ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_nvrtc/lib:${LD_LIBRARY_PATH}" -ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_runtime/lib:${LD_LIBRARY_PATH}" -ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvtx/lib:${LD_LIBRARY_PATH}" - -# Triton needs a CC compiler -RUN microdnf install -y gcc \ - && microdnf clean all - -# install vllm wheel first, so that torch etc will be installed -RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \ - --mount=type=cache,target=/root/.cache/pip \ - --mount=type=cache,target=/root/.cache/uv \ - uv pip install $(echo dist/*.whl)'[tensorizer]' --verbose - -# Install libsodium for Tensorizer encryption -RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \ - cd /usr/src/libsodium \ - && make install - -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=cache,target=/root/.cache/uv \ - uv pip install \ - "https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu124torch2.4-cp312-cp312-linux_x86_64.whl" - -ENV HF_HUB_OFFLINE=1 \ - HOME=/home/vllm \ - # Allow requested max length to exceed what is extracted from the - # config.json - # see: https://github.com/vllm-project/vllm/pull/7080 - VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \ - VLLM_USAGE_SOURCE=production-docker-image \ - VLLM_WORKER_MULTIPROC_METHOD=fork \ - VLLM_NO_USAGE_STATS=1 - -# setup non-root user for OpenShift -RUN umask 002 \ - && useradd --uid 2000 --gid 0 vllm \ - && chmod g+rwx $HOME /usr/src /workspace - -COPY LICENSE /licenses/vllm.md - -# Copy only .jinja files from example directory to template directory -COPY examples/*.jinja /app/data/template/ - -USER 2000 -ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] - - -FROM vllm-openai as vllm-grpc-adapter +# Start from released image +FROM quay.io/opendatahub/vllm:cuda-pr-198 as vllm-grpc-adapter USER root +# Copy source code changes into the installed location to overwrite the installed python code +COPY vllm /opt/vllm/lib64/python3.12/site-packages/vllm + # RUN --mount=type=cache,target=/root/.cache/pip \ # pip install vllm-tgis-adapter==0.5.1 RUN --mount=type=cache,target=/root/.cache/pip \ - pip install vllm-tgis-adapter==0.5.2 + pip install git+https://github.com/opendatahub-io/vllm-tgis-adapter@main ENV GRPC_PORT=8033 \ PORT=8000 \