diff --git a/.github/workflows/sync-with-upstream.yml b/.github/workflows/sync-with-upstream.yml
new file mode 100644
index 000000000000..53751552f4d2
--- /dev/null
+++ b/.github/workflows/sync-with-upstream.yml
@@ -0,0 +1,84 @@
+name: "Sync with upstream"
+
+on:
+  schedule:
+    - cron: 20 4 * * *
+
+  workflow_dispatch:
+
+
+env:
+  # repo to fetch changes from
+  UPSTREAM_REPO: vllm-project/vllm
+  # branch to sync
+  BRANCH: main
+
+jobs:
+  upstream-sync:
+    name: Sync with upstream
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+      contents: write
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Fetch upstream repo
+        run: |
+          git remote add upstream https://github.com/${UPSTREAM_REPO}
+          git fetch upstream
+
+      - name: Check diff
+        id: diff
+        shell: bash
+        run: |
+          echo 'diff<<EOF' >> $GITHUB_OUTPUT
+          git diff --stat upstream/${BRANCH} | tee -a >(cat >> $GITHUB_OUTPUT)
+          echo 'EOF' >> $GITHUB_OUTPUT
+
+      - name: Create PR
+        if: ${{ steps.diff.outputs.diff != '' }}
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          set -xeu
+
+          git_hash="$(git rev-parse upstream/${BRANCH})"
+          echo "git_hash=$git_hash" >> $GITHUB_OUTPUT
+          git_describe="$(git describe --tags upstream/${BRANCH})"
+          echo "git_describe=$git_describe" >> $GITHUB_OUTPUT
+
+          # echo 'commits<<EOF' >> $GITHUB_OUTPUT
+          # git log --oneline ..upstream/${BRANCH} >> $GITHUB_OUTPUT
+          # echo 'EOF' >> $GITHUB_OUTPUT
+
+          upstream_url="https://github.com/${UPSTREAM_REPO}"
+          upstream_branch="$upstream_url/tree/${BRANCH}"
+
+          title="Sync with upstream@${git_describe}"
+          body="Merge [${UPSTREAM_REPO}]($upstream_url):[${BRANCH}]($upstream_branch)@[${git_describe}](${upstream_url}/commit/$git_hash) into $BRANCH"
+
+          gh repo set-default $GITHUB_REPOSITORY
+          pr_number=$(gh pr list -S "Sync with upstream@" --json number --jq '.[0].number')
+
+          if [[ -z $pr_number ]]; then
+            echo "Creating PR"
+            gh pr create \
+              --head $(echo $UPSTREAM_REPO | sed 's|/|:|g'):${BRANCH} \
+              --base ${BRANCH} \
+              --label code-sync \
+              --title "$title" \
+              --body "$body" \
+              --draft \
+              --no-maintainer-edit
+            exit 0
+          fi
+
+          echo "Updating PR #${pr_number}"
+          gh pr edit \
+            $pr_number \
+            --body "$body" \
+            --title "$title"
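Note (not part of the diff): the "Check diff" step above writes a multiline value into `$GITHUB_OUTPUT`, which only accepts plain `key=value` pairs unless a heredoc-style delimiter is used. A minimal sketch of that pattern, with an illustrative branch name:

```bash
# GitHub Actions reads everything between "diff<<EOF" and the matching "EOF"
# line as the value of steps.<step-id>.outputs.diff.
{
  echo 'diff<<EOF'
  git diff --stat upstream/main
  echo 'EOF'
} >> "$GITHUB_OUTPUT"
```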
diff --git a/Dockerfile.ubi b/Dockerfile.ubi
new file mode 100644
index 000000000000..822363161be2
--- /dev/null
+++ b/Dockerfile.ubi
@@ -0,0 +1,196 @@
+## Global Args #################################################################
+ARG BASE_UBI_IMAGE_TAG=9.4
+ARG PYTHON_VERSION=3.11
+
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+
+## Base Layer ##################################################################
+FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base
+ARG PYTHON_VERSION
+
+RUN microdnf install -y \
+    python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel \
+    && microdnf clean all
+
+WORKDIR /workspace
+
+ENV LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8
+
+# Some utils for dev purposes - tar required for kubectl cp
+RUN microdnf install -y \
+    which procps findutils tar vim git \
+    && microdnf clean all
+
+
+## Python Installer ############################################################
+FROM base as python-install
+
+ARG PYTHON_VERSION
+
+ENV VIRTUAL_ENV=/opt/vllm
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+RUN microdnf install -y \
+    python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel && \
+    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel && microdnf clean all
+
+
+## CUDA Base ###################################################################
+FROM python-install as cuda-base
+
+RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
+    https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
+
+RUN microdnf install -y \
+    cuda-nvcc-12-4 cuda-nvtx-12-4 cuda-libraries-devel-12-4 && \
+    microdnf clean all
+
+ENV CUDA_HOME="/usr/local/cuda" \
+    PATH="${CUDA_HOME}/bin:${PATH}" \
+    LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
+
+## Python cuda base #################################################################
+FROM cuda-base AS python-cuda-base
+
+ENV VIRTUAL_ENV=/opt/vllm
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+# install cuda and common dependencies
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
+    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
+    pip install \
+        -r requirements-cuda.txt
+
+## Development #################################################################
+FROM python-cuda-base AS dev
+
+# install build and runtime dependencies
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
+    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
+    --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
+    --mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
+    --mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
+    pip3 install \
+        -r requirements-cuda.txt \
+        -r requirements-dev.txt
+
+## Builder #####################################################################
+FROM dev AS build
+
+# install build dependencies
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
+    pip install -r requirements-build.txt
+
+# install compiler cache to speed up compilation leveraging local or remote caching
+# git is required for the cutlass kernels
+RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y git ccache && microdnf clean all
+
+# copy input files
+COPY csrc csrc
+COPY setup.py setup.py
+COPY cmake cmake
+COPY CMakeLists.txt CMakeLists.txt
+COPY requirements-common.txt requirements-common.txt
+COPY requirements-cuda.txt requirements-cuda.txt
+COPY pyproject.toml pyproject.toml
+
+ARG TORCH_CUDA_ARCH_LIST
+ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
+
+# max jobs used by Ninja to build extensions
+ARG max_jobs=2
+ENV MAX_JOBS=${max_jobs}
+# number of threads used by nvcc
+ARG nvcc_threads=8
+ENV NVCC_THREADS=$nvcc_threads
+# make sure punica kernels are built (for LoRA)
+ENV VLLM_INSTALL_PUNICA_KERNELS=1
+
+# Make sure the cuda environment is in the PATH
+ENV PATH=/usr/local/cuda/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+
+# Copy the entire directory before building wheel
+COPY vllm vllm
+
+ENV CCACHE_DIR=/root/.cache/ccache
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=cache,target=/root/.cache/pip \
+    env CFLAGS="-march=haswell" \
+        CXXFLAGS="$CFLAGS $CXXFLAGS" \
+        CMAKE_BUILD_TYPE=Release \
+        python3 setup.py bdist_wheel --dist-dir=dist
+
+#################### libsodium Build IMAGE ####################
+FROM base as libsodium-builder
+
+RUN microdnf install -y gcc gzip \
+    && microdnf clean all
+
+WORKDIR /usr/src/libsodium
+
+ARG LIBSODIUM_VERSION=1.0.20
+RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \
+    && tar -xzvf libsodium*.tar.gz \
+    && rm -f libsodium*.tar.gz \
+    && mv libsodium*/* ./
+
+RUN ./configure --prefix="/usr/" && make && make check
+
+## Release #####################################################################
+FROM python-install AS vllm-openai
+
+WORKDIR /workspace
+
+ENV VIRTUAL_ENV=/opt/vllm
+ENV PATH=$VIRTUAL_ENV/bin/:$PATH
+
+# Triton needs a CC compiler
+RUN microdnf install -y gcc \
+    && microdnf clean all
+
+# Custom cache manager (fix for https://issues.redhat.com/browse/RHOAIENG-8043)
+COPY extras/custom_cache_manager.py /opt/vllm/lib/python3.11/site-packages/custom_cache_manager.py
+
+# install vllm wheel first, so that torch etc will be installed
+RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
+    --mount=type=cache,target=/root/.cache/pip \
+    pip install $(echo dist/*.whl)'[tensorizer]' --verbose
+
+# Install libsodium for Tensorizer encryption
+RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
+    cd /usr/src/libsodium \
+    && make install
+
+ENV HF_HUB_OFFLINE=1 \
+    PORT=8000 \
+    HOME=/home/vllm \
+    VLLM_USAGE_SOURCE=production-docker-image \
+    VLLM_WORKER_MULTIPROC_METHOD=fork \
+    TRITON_CACHE_MANAGER="custom_cache_manager:CustomCacheManager"
+
+# setup non-root user for OpenShift
+RUN umask 002 \
+    && useradd --uid 2000 --gid 0 vllm \
+    && chmod g+rwx $HOME /usr/src /workspace
+
+COPY LICENSE /licenses/vllm.md
+
+USER 2000
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server", "--distributed-executor-backend=mp"]
+
+
+FROM vllm-openai as vllm-grpc-adapter
+
+USER root
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install vllm-tgis-adapter==0.1.3
+
+ENV GRPC_PORT=8033
+USER 2000
+ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter", "--distributed-executor-backend=mp"]
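Note (not part of the diff): a hypothetical local build-and-run invocation for this Dockerfile, useful as a sanity check. The image tag and model are placeholders, and the run assumes a CUDA-capable host with the NVIDIA container toolkit; `HF_HUB_OFFLINE` is overridden because the image pins it to 1.

```bash
# Build the OpenAI-compatible target; arguments after the image name are
# appended to the api_server ENTRYPOINT defined in the vllm-openai stage.
docker build -f Dockerfile.ubi --target vllm-openai -t vllm-ubi:dev .
docker run --rm --gpus all -p 8000:8000 \
    -e HF_HUB_OFFLINE=0 \
    vllm-ubi:dev --model facebook/opt-125m
```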
diff --git a/OWNERS b/OWNERS
new file mode 100644
index 000000000000..dc965385e186
--- /dev/null
+++ b/OWNERS
@@ -0,0 +1,18 @@
+approvers:
+  - dtrifiro
+  - heyselbi
+  - rpancham
+  - RH-steve-grubb
+  - terrytangyuan
+  - vaibhavjainwiz
+  - Xaenalt
+  - z103cb
+reviewers:
+  - dtrifiro
+  - heyselbi
+  - rpancham
+  - RH-steve-grubb
+  - terrytangyuan
+  - vaibhavjainwiz
+  - Xaenalt
+  - z103cb
diff --git a/extras/custom_cache_manager.py b/extras/custom_cache_manager.py
new file mode 100644
index 000000000000..c83ed5b6e865
--- /dev/null
+++ b/extras/custom_cache_manager.py
@@ -0,0 +1,32 @@
+import os
+
+from triton.runtime.cache import (FileCacheManager, default_cache_dir,
+                                  default_dump_dir, default_override_dir)
+
+
+class CustomCacheManager(FileCacheManager):
+
+    def __init__(self, key, override=False, dump=False):
+        self.key = key
+        self.lock_path = None
+        if dump:
+            self.cache_dir = default_dump_dir()
+            self.cache_dir = os.path.join(self.cache_dir, self.key)
+            self.lock_path = os.path.join(self.cache_dir, "lock")
+            os.makedirs(self.cache_dir, exist_ok=True)
+        elif override:
+            self.cache_dir = default_override_dir()
+            self.cache_dir = os.path.join(self.cache_dir, self.key)
+        else:
+            # create cache directory if it doesn't exist
+            self.cache_dir = os.getenv("TRITON_CACHE_DIR",
+                                       "").strip() or default_cache_dir()
+            if self.cache_dir:
+                self.cache_dir = f"{self.cache_dir}_{os.getpid()}"
+                self.cache_dir = os.path.join(self.cache_dir, self.key)
+                self.lock_path = os.path.join(self.cache_dir, "lock")
+                os.makedirs(self.cache_dir, exist_ok=True)
+            else:
+                raise RuntimeError("Could not create or locate cache dir")
+
+        print(f"Triton cache dir: {self.cache_dir=}")
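Note (not part of the diff): Triton resolves `TRITON_CACHE_MANAGER` as a `module:Class` pair, which is why the Dockerfile above only needs to drop this file onto the import path. A rough usage sketch, assuming triton is installed and this is run from the repo root; `"dummy-key"` is a made-up cache key for illustration:

```bash
PYTHONPATH=extras TRITON_CACHE_MANAGER="custom_cache_manager:CustomCacheManager" \
python -c '
from custom_cache_manager import CustomCacheManager

# "dummy-key" stands in for a real kernel hash; the resulting cache_dir ends
# in "..._<pid>/dummy-key", so forked workers no longer race on one directory.
manager = CustomCacheManager("dummy-key")
'
```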
diff --git a/extras/smoke-test.sh b/extras/smoke-test.sh
new file mode 100644
index 000000000000..f03edea4f619
--- /dev/null
+++ b/extras/smoke-test.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+set -uxo pipefail
+
+# we will need to download test models off HF hub
+unset HF_HUB_OFFLINE
+
+export HTTP_PORT=8080
+export GRPC_PORT=8033
+
+
+function wait_for(){
+  trap "" ERR  # we don't care about errors in this function
+
+  name=$1
+  shift
+  command=$@
+
+  max_retries=10
+  until $command ; do
+    echo "Waiting for $name to be up (retries_left=$max_retries)..."
+    sleep 30
+    max_retries=$((max_retries-1))
+    if [[ $max_retries -le 0 ]]; then
+      echo "Timed out waiting for $name server" >&2
+      exit 1
+    fi
+  done
+}
+
+# stop the server on any errors
+trap 'kill -9 $server_pid && exit 1' ERR
+
+# spin up the OpenAPI server in the background
+python -m vllm.entrypoints.openai.api_server --port $HTTP_PORT &
+server_pid=$!
+server_url="http://localhost:$HTTP_PORT"
+
+wait_for "http server" curl --verbose --connect-timeout 1 --fail-with-body --no-progress-meter "${server_url}/health"
+
+curl -v --no-progress-meter --fail-with-body \
+  "${server_url}/v1/models" | python -m json.tool
+
+curl -v --no-progress-meter --fail-with-body \
+  --header "Content-Type: application/json" \
+  --data '{
+    "prompt": "A red fedora symbolizes ",
+    "model": "facebook/opt-125m"
+}' \
+  "${server_url}/v1/completions" | python -m json.tool
+
+echo "OpenAI API success" && kill -9 $server_pid
+
+
+# spin up the grpc server in the background
+python -m vllm_tgis_adapter --grpc-port $GRPC_PORT &
+server_pid=$!
+server_url="localhost:$GRPC_PORT"
+
+# get grpcurl
+curl --no-progress-meter --location --output /tmp/grpcurl.tar.gz \
+  https://github.com/fullstorydev/grpcurl/releases/download/v1.9.1/grpcurl_1.9.1_linux_x86_64.tar.gz
+tar -xf /tmp/grpcurl.tar.gz --directory /tmp
+
+wait_for "grpc_server" grpc_healthcheck  # healthcheck is part of vllm_tgis_adapter
+
+/tmp/grpcurl -v \
+  -plaintext \
+  -use-reflection \
+  -d '{ "requests": [{"text": "A red fedora symbolizes "}]}' \
+  "$server_url" \
+  fmaas.GenerationService/Generate
+
+echo "GRPC API success" && kill -9 $server_pid
diff --git a/extras/unit-tests.sh b/extras/unit-tests.sh
new file mode 100644
index 000000000000..08b2388b646e
--- /dev/null
+++ b/extras/unit-tests.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# partially copied from .buildkite/test-pipeline.yml
+set -e
+
+cd tests || exit 1
+
+# we will need to download test models off HF hub
+unset HF_HUB_OFFLINE
+
+# basic correctness
+pytest -v -s test_regression.py
+pytest -v -s async_engine
+VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
+VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
+VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
+VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
+VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
+
+# core
+pytest -v -s core
+
+# note: distributed tests are disabled
+
+# engine tests
+pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
+# entrypoint
+pytest -v -s entrypoints -m openai
+
+# inputs (note: multimodal tests are skipped)
+pytest -v -s test_inputs.py
+
+# models
+pytest -v -s models -m "not vlm"
+
+# misc
+pytest -v -s prefix_caching
+pytest -v -s samplers
+pytest -v -s test_logits_processor.py
+pytest -v -s worker
+VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s spec_decode
+# pytest -v -s tensorizer_loader  # disabled: requires libsodium
+pytest -v -s metrics
+pytest -v -s quantization
diff --git a/pyproject.toml b/pyproject.toml
index 790e01362028..07bd42384889 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "ninja",
     "packaging",
     "setuptools >= 49.4.0",
-    "torch == 2.3.0",
+    "torch >=2.3.0, <2.4.0",
     "wheel",
 ]
 build-backend = "setuptools.build_meta"
diff --git a/requirements-build.txt b/requirements-build.txt
index 1a07a94e82e0..6f394e009301 100644
--- a/requirements-build.txt
+++ b/requirements-build.txt
@@ -1,7 +1,7 @@
-# Should be mirrored in pyproject.toml
-cmake>=3.21
-ninja
-packaging
-setuptools>=49.4.0
-torch==2.3.0
-wheel
+# Should be mirrored in pyproject.toml
+cmake>=3.21
+ninja
+packaging
+setuptools>=49.4.0
+torch >=2.3.0, <2.4.0
+wheel
diff --git a/requirements-cpu.txt b/requirements-cpu.txt
index 754070df21c0..5953b41dad4d 100644
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -2,6 +2,6 @@
 -r requirements-common.txt
 
 # Dependencies for x86_64 CPUs
-torch == 2.3.1+cpu; platform_machine != "ppc64le"
+torch >= 2.3.1+cpu, <2.4.0+cpu; platform_machine != "ppc64le"
 torchvision == 0.18.1+cpu; platform_machine != "ppc64le"  # required for the image processor of phi3v, this must be updated alongside torch
 triton >= 2.2.0  # FIXME(woosuk): This is a hack to avoid import error.
diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index 10596ed85d60..652eec47deb1 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -4,7 +4,7 @@
 # Dependencies for NVIDIA GPUs
 ray >= 2.9
 nvidia-ml-py # for pynvml package
-torch == 2.3.0
+torch >= 2.3.0, <2.4.0
 # These must be updated alongside torch
 torchvision == 0.18.0  # Required for phi3v processor, also see https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 xformers == 0.0.26.post1  # Requires PyTorch 2.3.0
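Note (not part of the diff): the pin changes above relax the exact `torch == 2.3.0` requirement to a `>=2.3.0, <2.4.0` window. A quick sanity check, assuming `torch` and `packaging` are installed in the environment being verified:

```bash
python - <<'EOF'
import torch
from packaging.version import Version

# strip local build tags such as "+cpu" or "+cu124" before comparing
v = Version(torch.__version__.split("+")[0])
assert Version("2.3.0") <= v < Version("2.4.0"), f"unexpected torch {v}"
print(f"torch {v} satisfies >=2.3.0,<2.4.0")
EOF
```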