diff --git a/.github/workflows/sync-with-upstream.yml b/.github/workflows/sync-with-upstream.yml
new file mode 100644
index 000000000000..53751552f4d2
--- /dev/null
+++ b/.github/workflows/sync-with-upstream.yml
@@ -0,0 +1,84 @@
+name: "Sync with upstream"
+
+on:
+  schedule:
+    - cron: 20 4 * * *
+
+  workflow_dispatch:
+
+
+env:
+  # repo to fetch changes from
+  UPSTREAM_REPO: vllm-project/vllm
+  # branch to sync
+  BRANCH: main
+
+jobs:
+  upstream-sync:
+    name: Sync with upstream
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+      contents: write
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Fetch upstream repo
+        run: |
+          git remote add upstream https://github.com/${UPSTREAM_REPO}
+          git fetch upstream
+
+      - name: Check diff
+        id: diff
+        shell: bash
+        run: |
+          echo 'diff<<EOF' >> $GITHUB_OUTPUT
+          git diff --stat upstream/${BRANCH} | tee -a >(cat >> $GITHUB_OUTPUT)
+          echo 'EOF' >> $GITHUB_OUTPUT
+
+      - name: Create PR
+        if: ${{ steps.diff.outputs.diff != '' }}
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          set -xeu
+
+          git_hash="$(git rev-parse upstream/${BRANCH})"
+          echo "git_hash=$git_hash" >> $GITHUB_OUTPUT
+          git_describe="$(git describe --tags upstream/${BRANCH})"
+          echo "git_describe=$git_describe" >> $GITHUB_OUTPUT
+
+          # echo 'commits<<EOF' >> $GITHUB_OUTPUT
+          # git log --oneline ..upstream/${BRANCH} >> $GITHUB_OUTPUT
+          # echo 'EOF' >> $GITHUB_OUTPUT
+
+          upstream_url="https://github.com/${UPSTREAM_REPO}"
+          upstream_branch="$upstream_url/tree/${BRANCH}"
+
+          title="Sync with upstream@${git_describe}"
+          body="Merge [${UPSTREAM_REPO}]($upstream_url):[${BRANCH}]($upstream_branch)@[${git_describe}](${upstream_url}/commit/$git_hash) into $BRANCH"
+
+          gh repo set-default $GITHUB_REPOSITORY
+          pr_number=$(gh pr list -S "Sync with upstream@" --json number --jq '.[0].number')
+
+          if [[ -z $pr_number ]]; then
+            echo "Creating PR"
+            gh pr create \
+              --head $(echo $UPSTREAM_REPO | sed 's|/|:|g'):${BRANCH} \
+              --base ${BRANCH} \
+              --label code-sync \
+              --title "$title" \
+              --body "$body" \
+              --draft \
+              --no-maintainer-edit
+            exit 0
+          fi
+
+          echo "Updating PR #${pr_number}"
+          gh pr edit \
+            $pr_number \
+            --body "$body" \
+            --title "$title"
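Note (not part of the diff): the "Check diff" step above writes a multiline value into `$GITHUB_OUTPUT`, which only accepts plain `key=value` pairs unless a heredoc-style delimiter is used. A minimal sketch of that pattern, with an illustrative branch name:

```bash
# GitHub Actions reads everything between "diff<<EOF" and the matching "EOF"
# line as the value of steps.<step-id>.outputs.diff.
{
  echo 'diff<<EOF'
  git diff --stat upstream/main
  echo 'EOF'
} >> "$GITHUB_OUTPUT"
```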
diff --git a/Dockerfile.ubi b/Dockerfile.ubi
new file mode 100644
index 000000000000..822363161be2
--- /dev/null
+++ b/Dockerfile.ubi
@@ -0,0 +1,196 @@
+## Global Args #################################################################
+ARG BASE_UBI_IMAGE_TAG=9.4
+ARG PYTHON_VERSION=3.11
+
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+
+## Base Layer ##################################################################
+FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base
+ARG PYTHON_VERSION
+
+RUN microdnf install -y \
+    python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel \
+    && microdnf clean all
+
+WORKDIR /workspace
+
+ENV LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8
+
+# Some utils for dev purposes - tar required for kubectl cp
+RUN microdnf install -y \
+    which procps findutils tar vim git \
+    && microdnf clean all
+
+
+## Python Installer ############################################################
+FROM base as python-install
+
+ARG PYTHON_VERSION
+
+ENV VIRTUAL_ENV=/opt/vllm
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+RUN microdnf install -y \
+    python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel && \
+    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel && microdnf clean all
+
+
+## CUDA Base ###################################################################
+FROM python-install as cuda-base
+
+RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
+    https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
+
+RUN microdnf install -y \
+    cuda-nvcc-12-4 cuda-nvtx-12-4 cuda-libraries-devel-12-4 && \
+    microdnf clean all
+
+ENV CUDA_HOME="/usr/local/cuda" \
+    PATH="${CUDA_HOME}/bin:${PATH}" \
+    LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
+
+## Python cuda base #################################################################
+FROM cuda-base AS python-cuda-base
+
+ENV VIRTUAL_ENV=/opt/vllm
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+# install cuda and common dependencies
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
+    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
+    pip install \
+        -r requirements-cuda.txt
+
+## Development #################################################################
+FROM python-cuda-base AS dev
+
+# install build and runtime dependencies
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
+    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
+    --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
+    --mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
+    --mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
+    pip3 install \
+        -r requirements-cuda.txt \
+        -r requirements-dev.txt
+
+## Builder #####################################################################
+FROM dev AS build
+
+# install build dependencies
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
+    pip install -r requirements-build.txt
+
+# install compiler cache to speed up compilation leveraging local or remote caching
+# git is required for the cutlass kernels
+RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y git ccache && microdnf clean all
+
+# copy input files
+COPY csrc csrc
+COPY setup.py setup.py
+COPY cmake cmake
+COPY CMakeLists.txt CMakeLists.txt
+COPY requirements-common.txt requirements-common.txt
+COPY requirements-cuda.txt requirements-cuda.txt
+COPY pyproject.toml pyproject.toml
+
+ARG TORCH_CUDA_ARCH_LIST
+ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
+
+# max jobs used by Ninja to build extensions
+ARG max_jobs=2
+ENV MAX_JOBS=${max_jobs}
+# number of threads used by nvcc
+ARG nvcc_threads=8
+ENV NVCC_THREADS=$nvcc_threads
+# make sure punica kernels are built (for LoRA)
+ENV VLLM_INSTALL_PUNICA_KERNELS=1
+
+# Make sure the cuda environment is in the PATH
+ENV PATH=/usr/local/cuda/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+
+# Copy the entire directory before building wheel
+COPY vllm vllm
+
+ENV CCACHE_DIR=/root/.cache/ccache
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=cache,target=/root/.cache/pip \
+    env CFLAGS="-march=haswell" \
+        CXXFLAGS="$CFLAGS $CXXFLAGS" \
+        CMAKE_BUILD_TYPE=Release \
+        python3 setup.py bdist_wheel --dist-dir=dist
+
+#################### libsodium Build IMAGE ####################
+FROM base as libsodium-builder
+
+RUN microdnf install -y gcc gzip \
+    && microdnf clean all
+
+WORKDIR /usr/src/libsodium
+
+ARG LIBSODIUM_VERSION=1.0.20
+RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \
+    && tar -xzvf libsodium*.tar.gz \
+    && rm -f libsodium*.tar.gz \
+    && mv libsodium*/* ./
+
+RUN ./configure --prefix="/usr/" && make && make check
+
+## Release #####################################################################
+FROM python-install AS vllm-openai
+
+WORKDIR /workspace
+
+ENV VIRTUAL_ENV=/opt/vllm
+ENV PATH=$VIRTUAL_ENV/bin/:$PATH
+
+# Triton needs a CC compiler
+RUN microdnf install -y gcc \
+    && microdnf clean all
+
+# Custom cache manager (fix for https://issues.redhat.com/browse/RHOAIENG-8043)
+COPY extras/custom_cache_manager.py /opt/vllm/lib/python3.11/site-packages/custom_cache_manager.py
+
+# install vllm wheel first, so that torch etc will be installed
+RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
+    --mount=type=cache,target=/root/.cache/pip \
+    pip install $(echo dist/*.whl)'[tensorizer]' --verbose
+
+# Install libsodium for Tensorizer encryption
+RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
+    cd /usr/src/libsodium \
+    && make install
+
+ENV HF_HUB_OFFLINE=1 \
+    PORT=8000 \
+    HOME=/home/vllm \
+    VLLM_USAGE_SOURCE=production-docker-image \
+    VLLM_WORKER_MULTIPROC_METHOD=fork \
+    TRITON_CACHE_MANAGER="custom_cache_manager:CustomCacheManager"
+
+# setup non-root user for OpenShift
+RUN umask 002 \
+    && useradd --uid 2000 --gid 0 vllm \
+    && chmod g+rwx $HOME /usr/src /workspace
+
+COPY LICENSE /licenses/vllm.md
+
+USER 2000
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server", "--distributed-executor-backend=mp"]
+
+
+FROM vllm-openai as vllm-grpc-adapter
+
+USER root
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install vllm-tgis-adapter==0.1.3
+
+ENV GRPC_PORT=8033
+USER 2000
+ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter", "--distributed-executor-backend=mp"]
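Note (not part of the diff): a hypothetical local build-and-run invocation for this Dockerfile, useful as a sanity check. The image tag and model are placeholders, and the run assumes a CUDA-capable host with the NVIDIA container toolkit; `HF_HUB_OFFLINE` is overridden because the image pins it to 1.

```bash
# Build the OpenAI-compatible target; arguments after the image name are
# appended to the api_server ENTRYPOINT defined in the vllm-openai stage.
docker build -f Dockerfile.ubi --target vllm-openai -t vllm-ubi:dev .
docker run --rm --gpus all -p 8000:8000 \
    -e HF_HUB_OFFLINE=0 \
    vllm-ubi:dev --model facebook/opt-125m
```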
diff --git a/OWNERS b/OWNERS
new file mode 100644
index 000000000000..dc965385e186
--- /dev/null
+++ b/OWNERS
@@ -0,0 +1,18 @@
+approvers:
+  - dtrifiro
+  - heyselbi
+  - rpancham
+  - RH-steve-grubb
+  - terrytangyuan
+  - vaibhavjainwiz
+  - Xaenalt
+  - z103cb
+reviewers:
+  - dtrifiro
+  - heyselbi
+  - rpancham
+  - RH-steve-grubb
+  - terrytangyuan
+  - vaibhavjainwiz
+  - Xaenalt
+  - z103cb
diff --git a/extras/custom_cache_manager.py b/extras/custom_cache_manager.py
new file mode 100644
index 000000000000..c83ed5b6e865
--- /dev/null
+++ b/extras/custom_cache_manager.py
@@ -0,0 +1,32 @@
+import os
+
+from triton.runtime.cache import (FileCacheManager, default_cache_dir,
+                                  default_dump_dir, default_override_dir)
+
+
+class CustomCacheManager(FileCacheManager):
+
+    def __init__(self, key, override=False, dump=False):
+        self.key = key
+        self.lock_path = None
+        if dump:
+            self.cache_dir = default_dump_dir()
+            self.cache_dir = os.path.join(self.cache_dir, self.key)
+            self.lock_path = os.path.join(self.cache_dir, "lock")
+            os.makedirs(self.cache_dir, exist_ok=True)
+        elif override:
+            self.cache_dir = default_override_dir()
+            self.cache_dir = os.path.join(self.cache_dir, self.key)
+        else:
+            # create cache directory if it doesn't exist
+            self.cache_dir = os.getenv("TRITON_CACHE_DIR",
+                                       "").strip() or default_cache_dir()
+            if self.cache_dir:
+                self.cache_dir = f"{self.cache_dir}_{os.getpid()}"
+                self.cache_dir = os.path.join(self.cache_dir, self.key)
+                self.lock_path = os.path.join(self.cache_dir, "lock")
+                os.makedirs(self.cache_dir, exist_ok=True)
+            else:
+                raise RuntimeError("Could not create or locate cache dir")
+
+        print(f"Triton cache dir: {self.cache_dir=}")
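Note (not part of the diff): Triton resolves `TRITON_CACHE_MANAGER` as a `module:Class` pair, which is why the Dockerfile above only needs to drop this file onto the import path. A rough usage sketch, assuming triton is installed and this is run from the repo root; `"dummy-key"` is a made-up cache key for illustration:

```bash
PYTHONPATH=extras TRITON_CACHE_MANAGER="custom_cache_manager:CustomCacheManager" \
python -c '
from custom_cache_manager import CustomCacheManager

# "dummy-key" stands in for a real kernel hash; the resulting cache_dir ends
# in "..._<pid>/dummy-key", so forked workers no longer race on one directory.
manager = CustomCacheManager("dummy-key")
'
```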
diff --git a/extras/smoke-test.sh b/extras/smoke-test.sh
new file mode 100644
index 000000000000..f03edea4f619
--- /dev/null
+++ b/extras/smoke-test.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+set -uxo pipefail
+
+# we will need to download test models off HF hub
+unset HF_HUB_OFFLINE
+
+export HTTP_PORT=8080
+export GRPC_PORT=8033
+
+
+function wait_for(){
+  trap "" ERR  # we don't care about errors in this function
+
+  name=$1
+  shift
+  command=$@
+
+  max_retries=10
+  until $command ; do
+    echo "Waiting for $name to be up (retries_left=$max_retries)..."
+    sleep 30
+    max_retries=$((max_retries-1))
+    if [[ $max_retries -le 0 ]]; then
+      echo "Timed out waiting for $name server" >&2
+      exit 1
+    fi
+  done
+}
+
+# stop the server on any errors
+trap 'kill -9 $server_pid && exit 1' ERR
+
+# spin up the OpenAPI server in the background
+python -m vllm.entrypoints.openai.api_server --port $HTTP_PORT &
+server_pid=$!
+server_url="http://localhost:$HTTP_PORT"
+
+wait_for "http server" curl --verbose --connect-timeout 1 --fail-with-body --no-progress-meter "${server_url}/health"
+
+curl -v --no-progress-meter --fail-with-body \
+  "${server_url}/v1/models" | python -m json.tool
+
+curl -v --no-progress-meter --fail-with-body \
+  --header "Content-Type: application/json" \
+  --data '{
+    "prompt": "A red fedora symbolizes ",
+    "model": "facebook/opt-125m"
+}' \
+  "${server_url}/v1/completions" | python -m json.tool
+
+echo "OpenAI API success" && kill -9 $server_pid
+
+
+# spin up the grpc server in the background
+python -m vllm_tgis_adapter --grpc-port $GRPC_PORT &
+server_pid=$!
+server_url="localhost:$GRPC_PORT"
+
+# get grpcurl
+curl --no-progress-meter --location --output /tmp/grpcurl.tar.gz \
+  https://github.com/fullstorydev/grpcurl/releases/download/v1.9.1/grpcurl_1.9.1_linux_x86_64.tar.gz
+tar -xf /tmp/grpcurl.tar.gz --directory /tmp
+
+wait_for "grpc_server" grpc_healthcheck  # healthcheck is part of vllm_tgis_adapter
+
+/tmp/grpcurl -v \
+  -plaintext \
+  -use-reflection \
+  -d '{ "requests": [{"text": "A red fedora symbolizes "}]}' \
+  "$server_url" \
+  fmaas.GenerationService/Generate
+
+echo "GRPC API success" && kill -9 $server_pid
diff --git a/extras/unit-tests.sh b/extras/unit-tests.sh
new file mode 100644
index 000000000000..08b2388b646e
--- /dev/null
+++ b/extras/unit-tests.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# partially copied from .buildkite/test-pipeline.yml
+set -e
+
+cd tests || exit 1
+
+# we will need to download test models off HF hub
+unset HF_HUB_OFFLINE
+
+# basic correctness
+pytest -v -s test_regression.py
+pytest -v -s async_engine
+VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
+VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
+VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
+VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
+VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
+
+# core
+pytest -v -s core
+
+# note: distributed tests are disabled
+
+# engine tests
+pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
+# entrypoint
+pytest -v -s entrypoints -m openai
+
+# inputs (note: multimodal tests are skipped)
+pytest -v -s test_inputs.py
+
+# models
+pytest -v -s models -m "not vlm"
+
+# misc
+pytest -v -s prefix_caching
+pytest -v -s samplers
+pytest -v -s test_logits_processor.py
+pytest -v -s worker
+VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s spec_decode
+# pytest -v -s tensorizer_loader  # disabled: requires libsodium
+pytest -v -s metrics
+pytest -v -s quantization
diff --git a/pyproject.toml b/pyproject.toml
index 790e01362028..07bd42384889 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "ninja",
     "packaging",
     "setuptools >= 49.4.0",
-    "torch == 2.3.0",
+    "torch >=2.3.0, <2.4.0",
     "wheel",
 ]
 build-backend = "setuptools.build_meta"
diff --git a/requirements-build.txt b/requirements-build.txt
index 1a07a94e82e0..6f394e009301 100644
--- a/requirements-build.txt
+++ b/requirements-build.txt
@@ -1,7 +1,7 @@
-# Should be mirrored in pyproject.toml
-cmake>=3.21
-ninja
-packaging
-setuptools>=49.4.0
-torch==2.3.0
-wheel
+# Should be mirrored in pyproject.toml
+cmake>=3.21
+ninja
+packaging
+setuptools>=49.4.0
+torch >=2.3.0, <2.4.0
+wheel
diff --git a/requirements-cpu.txt b/requirements-cpu.txt
index 754070df21c0..5953b41dad4d 100644
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -2,6 +2,6 @@
 -r requirements-common.txt
 
 # Dependencies for x86_64 CPUs
-torch == 2.3.1+cpu; platform_machine != "ppc64le"
+torch >= 2.3.1+cpu, <2.4.0+cpu; platform_machine != "ppc64le"
 torchvision == 0.18.1+cpu; platform_machine != "ppc64le"  # required for the image processor of phi3v, this must be updated alongside torch
 triton >= 2.2.0  # FIXME(woosuk): This is a hack to avoid import error.
diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index 10596ed85d60..652eec47deb1 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -4,7 +4,7 @@
 # Dependencies for NVIDIA GPUs
 ray >= 2.9
 nvidia-ml-py # for pynvml package
-torch == 2.3.0
+torch >= 2.3.0, <2.4.0
 # These must be updated alongside torch
 torchvision == 0.18.0  # Required for phi3v processor, also see https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 xformers == 0.0.26.post1  # Requires PyTorch 2.3.0
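Note (not part of the diff): the pin changes above relax the exact `torch == 2.3.0` requirement to a `>=2.3.0, <2.4.0` window. A quick sanity check, assuming `torch` and `packaging` are installed in the environment being verified:

```bash
python - <<'EOF'
import torch
from packaging.version import Version

# strip local build tags such as "+cpu" or "+cu124" before comparing
v = Version(torch.__version__.split("+")[0])
assert Version("2.3.0") <= v < Version("2.4.0"), f"unexpected torch {v}"
print(f"torch {v} satisfies >=2.3.0,<2.4.0")
EOF
```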