From 5952d811398d3a22f30d72d2d2943787a78f66ea Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Tue, 5 Nov 2024 10:50:57 -0500 Subject: [PATCH 01/24] [Frontend] Fix tcp port reservation for api server (#10012) Signed-off-by: Russell Bryant --- vllm/entrypoints/openai/api_server.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index bef36ffdbfcd3..917b347ff1161 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -569,7 +569,8 @@ async def run_server(args, **uvicorn_kwargs) -> None: # This avoids race conditions with ray. # see https://github.com/vllm-project/vllm/issues/8204 sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.bind(("", args.port)) + sock.bind((args.host or "", args.port)) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) def signal_handler(*_) -> None: # Interrupt server on sigterm while initializing @@ -593,13 +594,14 @@ def signal_handler(*_) -> None: ssl_certfile=args.ssl_certfile, ssl_ca_certs=args.ssl_ca_certs, ssl_cert_reqs=args.ssl_cert_reqs, - fd=sock.fileno(), **uvicorn_kwargs, ) # NB: Await server shutdown only after the backend context is exited await shutdown_task + sock.close() + if __name__ == "__main__": # NOTE(simon): From cd34029e91ad2d38a58d190331a65f9096c0b157 Mon Sep 17 00:00:00 2001 From: Richard Liu <39319471+richardsliu@users.noreply.github.com> Date: Tue, 5 Nov 2024 08:48:44 -0800 Subject: [PATCH 02/24] Refactor TPU requirements file and pin build dependencies (#10010) Signed-off-by: Richard Liu --- Dockerfile.tpu | 7 --- .../getting_started/tpu-installation.rst | 57 ++----------------- requirements-tpu.txt | 20 ++++++- 3 files changed, 23 insertions(+), 61 deletions(-) diff --git a/Dockerfile.tpu b/Dockerfile.tpu index b43442e4c0af1..0a507b6ecdf60 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -9,12 +9,6 @@ RUN apt-get update && apt-get install -y \ git \ ffmpeg libsm6 libxext6 libgl1 -# Install the TPU and Pallas dependencies. -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html - # Build vLLM. COPY . . ARG GIT_REPO_CHECK=0 @@ -25,7 +19,6 @@ ENV VLLM_TARGET_DEVICE="tpu" RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=bind,source=.git,target=.git \ python3 -m pip install \ - 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ -r requirements-tpu.txt RUN python3 setup.py develop diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst index f0c812b941c1f..75ab2b6ba02dc 100644 --- a/docs/source/getting_started/tpu-installation.rst +++ b/docs/source/getting_started/tpu-installation.rst @@ -119,27 +119,19 @@ Uninstall the existing `torch` and `torch_xla` packages: pip uninstall torch torch-xla -y -Install `torch` and `torch_xla` +Install build dependencies: .. 
code-block:: bash - pip install --pre torch==2.6.0.dev20241028+cpu torchvision==0.20.0.dev20241028+cpu --index-url https://download.pytorch.org/whl/nightly/cpu - pip install 'torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev-cp310-cp310-linux_x86_64.whl' -f https://storage.googleapis.com/libtpu-releases/index.html + pip install -r requirements-tpu.txt + sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev -Install JAX and Pallas: +Run the setup script: .. code-block:: bash - pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html - pip install jaxlib==0.4.32.dev20240829 jax==0.4.32.dev20240829 -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html - -Install other build dependencies: + VLLM_TARGET_DEVICE="tpu" python setup.py develop -.. code-block:: bash - - pip install -r requirements-tpu.txt - VLLM_TARGET_DEVICE="tpu" python setup.py develop - sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev Provision Cloud TPUs with GKE ----------------------------- @@ -168,45 +160,6 @@ Run the Docker image with the following command: $ # Make sure to add `--privileged --net host --shm-size=16G`. $ docker run --privileged --net host --shm-size=16G -it vllm-tpu - -.. _build_from_source_tpu: - -Build from source ------------------ - -You can also build and install the TPU backend from source. - -First, install the dependencies: - -.. code-block:: console - - $ # (Recommended) Create a new conda environment. - $ conda create -n myenv python=3.10 -y - $ conda activate myenv - - $ # Clean up the existing torch and torch-xla packages. - $ pip uninstall torch torch-xla -y - - $ # Install PyTorch and PyTorch XLA. - $ export DATE="20241017" - $ export TORCH_VERSION="2.6.0" - $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl - $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl - - $ # Install JAX and Pallas. - $ pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html - $ pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html - - $ # Install other build dependencies. - $ pip install -r requirements-tpu.txt - - -Next, build vLLM from source. This will only take a few seconds: - -.. code-block:: console - - $ VLLM_TARGET_DEVICE="tpu" python setup.py develop - .. note:: Since TPU relies on XLA which requires static shapes, vLLM bucketizes the possible input shapes and compiles an XLA graph for each different shape. diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 4c606cf0a9105..f9a0770804e55 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -2,6 +2,22 @@ -r requirements-common.txt # Dependencies for TPU -# Currently, the TPU backend uses a nightly version of PyTorch XLA. -# You can install the dependencies in Dockerfile.tpu. 
+cmake>=3.26 +ninja +packaging +setuptools-scm>=8 +wheel +jinja2 ray[default] + +# Install torch_xla +--pre +--extra-index-url https://download.pytorch.org/whl/nightly/cpu +--find-links https://storage.googleapis.com/libtpu-releases/index.html +--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html +--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html +torch==2.6.0.dev20241028+cpu +torchvision==0.20.0.dev20241028+cpu +torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241028-cp310-cp310-linux_x86_64.whl +jaxlib==0.4.32.dev20240829 +jax==0.4.32.dev20240829 From 09d3550372db10f8c75fddd437325a863265fd82 Mon Sep 17 00:00:00 2001 From: "Chenghao (Alan) Yang" Date: Tue, 5 Nov 2024 11:50:50 -0600 Subject: [PATCH 03/24] [Misc] Add logging for CUDA memory (#10027) Signed-off-by: Chenghao Yang Signed-off-by: youkaichao Co-authored-by: Chenghao Yang Co-authored-by: youkaichao --- vllm/worker/model_runner.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 328dab598f8ef..2447eecf7957d 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -48,9 +48,10 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import IntermediateTensors, SequenceGroupMetadata from vllm.transformers_utils.config import uses_mrope -from vllm.utils import (DeviceMemoryProfiler, PyObjectCache, async_tensor_h2d, - flatten_2d_lists, is_pin_memory_available, - supports_dynamo, weak_ref_tensor) +from vllm.utils import (DeviceMemoryProfiler, GiB_bytes, PyObjectCache, + async_tensor_h2d, flatten_2d_lists, + is_pin_memory_available, supports_dynamo, + weak_ref_tensor) from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase, _add_attn_metadata_broadcastable_dict, @@ -1383,16 +1384,16 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: per sequence in the batch. """ assert not self.model_config.enforce_eager - logger.info("Capturing the model for CUDA graphs. This may lead to " + logger.info("Capturing cudagraphs for decoding. This may lead to " "unexpected consequences if the model is not static. To " "run the model in eager mode, set 'enforce_eager=True' or " "use '--enforce-eager' in the CLI.") - logger.info("CUDA graphs can take additional 1~3 GiB memory per GPU. " - "If you are running out of memory, consider decreasing " - "`gpu_memory_utilization` or enforcing eager mode. " - "You can also reduce the `max_num_seqs` as needed " - "to decrease memory usage.") + logger.info("If out-of-memory error occurs during cudagraph capture," + " consider decreasing `gpu_memory_utilization` or " + "switching to eager mode. You can also reduce the " + "`max_num_seqs` as needed to decrease memory usage.") start_time = time.perf_counter() + start_free_gpu_memory = torch.cuda.mem_get_info()[0] # Prepare dummy inputs. These will be reused for all batch sizes. max_batch_size = self.max_batchsize_to_capture @@ -1497,9 +1498,12 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: graph_runner) end_time = time.perf_counter() + end_free_gpu_memory = torch.cuda.mem_get_info()[0] elapsed_time = end_time - start_time + cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory # This usually takes < 10 seconds. 
- logger.info("Graph capturing finished in %.0f secs.", elapsed_time) + logger.info("Graph capturing finished in %.0f secs, took %.2f GiB", + elapsed_time, cuda_graph_size / GiB_bytes) def _update_inputs_to_capture_for_enc_dec_model(self, capture_inputs: Dict[str, From 731aec5be713a89dccf1d7106290da17621af816 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Tue, 5 Nov 2024 13:30:42 -0500 Subject: [PATCH 04/24] [CI/Build] Limit github CI jobs based on files changed (#9928) Signed-off-by: Russell Bryant --- .github/workflows/actionlint.yml | 2 ++ .github/workflows/clang-format.yml | 12 ++++++++++++ .github/workflows/mypy.yaml | 10 ++++++++++ .github/workflows/ruff.yml | 17 +++++++++++++---- .github/workflows/yapf.yml | 9 ++++++++- 5 files changed, 45 insertions(+), 5 deletions(-) diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index b80749aaa8fec..5eddf6b7c649b 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -6,12 +6,14 @@ on: paths: - '.github/workflows/*.ya?ml' - '.github/workflows/actionlint.*' + - '.github/workflows/matchers/actionlint.json' pull_request: branches: - "main" paths: - '.github/workflows/*.ya?ml' - '.github/workflows/actionlint.*' + - '.github/workflows/matchers/actionlint.json' env: LC_ALL: en_US.UTF-8 diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml index 68d60d7365ed1..167c115d8956f 100644 --- a/.github/workflows/clang-format.yml +++ b/.github/workflows/clang-format.yml @@ -6,9 +6,21 @@ on: push: branches: - main + paths: + - '**/*.h' + - '**/*.cpp' + - '**/*.cu' + - '**/*.cuh' + - '.github/workflows/clang-format.yml' pull_request: branches: - main + paths: + - '**/*.h' + - '**/*.cpp' + - '**/*.cu' + - '**/*.cuh' + - '.github/workflows/clang-format.yml' jobs: clang-format: diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 5f1e5f8eeaf7d..18b354948f0cc 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -6,9 +6,19 @@ on: push: branches: - main + paths: + - '**/*.py' + - '.github/workflows/mypy.yaml' + - 'tools/mypy.sh' + - 'pyproject.toml' pull_request: branches: - main + paths: + - '**/*.py' + - '.github/workflows/mypy.yaml' + - 'tools/mypy.sh' + - 'pyproject.toml' jobs: mypy: diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index 9cc8a9e914474..197f918765e7d 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -6,16 +6,28 @@ on: push: branches: - main + paths: + - "**/*.py" + - pyproject.toml + - requirements-lint.txt + - .github/workflows/matchers/ruff.json + - .github/workflows/ruff.yml pull_request: branches: - main + paths: + - "**/*.py" + - pyproject.toml + - requirements-lint.txt + - .github/workflows/matchers/ruff.json + - .github/workflows/ruff.yml jobs: ruff: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.12"] steps: - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Set up Python ${{ matrix.python-version }} @@ -30,9 +42,6 @@ jobs: run: | echo "::add-matcher::.github/workflows/matchers/ruff.json" ruff check --output-format github . - - name: Spelling check with codespell - run: | - codespell --toml pyproject.toml - name: Run isort run: | isort . 
--check-only diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml index 9f06b35c19e32..35579302c5c14 100644 --- a/.github/workflows/yapf.yml +++ b/.github/workflows/yapf.yml @@ -6,15 +6,22 @@ on: push: branches: - main + paths: + - "**/*.py" + - .github/workflows/yapf.yml pull_request: branches: - main + paths: + - "**/*.py" + - .github/workflows/yapf.yml + jobs: yapf: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.12"] steps: - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Set up Python ${{ matrix.python-version }} From a53046b16fd11436eb2b15421079b7c5b353f955 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 5 Nov 2024 13:42:20 -0500 Subject: [PATCH 05/24] [Model] Support quantization of PixtralHFTransformer for PixtralHF (#9921) Signed-off-by: mgoin --- vllm/model_executor/layers/activation.py | 30 +++++++ vllm/model_executor/models/pixtral.py | 100 ++++++++++++++--------- 2 files changed, 90 insertions(+), 40 deletions(-) diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 658a3700f33d6..e347ca80ff765 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -299,3 +299,33 @@ def get_act_fn( return ScaledActivation(act_fn, intermediate_size, input_is_parallel, params_dtype) return act_fn + + +_ACTIVATION_AND_MUL_REGISTRY = LazyDict({ + "gelu": lambda: GeluAndMul(), + "silu": lambda: SiluAndMul(), +}) + + +def get_act_and_mul_fn( + act_fn_name: str, + quant_config: Optional[QuantizationConfig] = None, + intermediate_size: Optional[int] = None, + input_is_parallel: bool = True, + params_dtype: Optional[torch.dtype] = None, +) -> nn.Module: + """Get an activation-and-mul (i.e. 
SiluAndMul) function by name.""" + act_fn_name = act_fn_name.lower() + if act_fn_name not in _ACTIVATION_AND_MUL_REGISTRY: + raise ValueError( + f"Activation function {act_fn_name!r} is not supported.") + + act_fn = _ACTIVATION_AND_MUL_REGISTRY[act_fn_name] + if (quant_config is not None + and act_fn_name in quant_config.get_scaled_act_names()): + if intermediate_size is None: + raise ValueError("intermediate_size must be specified for scaled " + "activation functions.") + return ScaledActivation(act_fn, intermediate_size, input_is_parallel, + params_dtype) + return act_fn diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 051454c49bff8..ee9f150b17cfc 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -19,8 +19,11 @@ from vllm.config import CacheConfig, ModelConfig, MultiModalConfig from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) -from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.activation import get_act_and_mul_fn from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -798,20 +801,24 @@ def __init__( super().__init__() assert config.intermediate_size is not None - # TODO: Use quant_config and prefix after optimizing this - self.gate_proj = nn.Linear(config.hidden_size, - config.intermediate_size, - bias=False) - self.up_proj = nn.Linear(config.hidden_size, - config.intermediate_size, - bias=False) - self.down_proj = nn.Linear(config.intermediate_size, - config.hidden_size, - bias=False) - self.act = get_act_fn(config.hidden_act) + self.gate_up_proj = MergedColumnParallelLinear( + input_size=config.hidden_size, + output_sizes=[config.intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") + self.down_proj = RowParallelLinear(input_size=config.intermediate_size, + output_size=config.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.down_proj") + self.act_and_mul = get_act_and_mul_fn(config.hidden_act) def forward(self, x: torch.Tensor) -> torch.Tensor: - return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x)) + gate_up, _ = self.gate_up_proj(x) + x = self.act_and_mul(gate_up) + x, _ = self.down_proj(x) + return x class PixtralHFAttention(nn.Module): @@ -830,21 +837,21 @@ def __init__( self.n_heads = config.num_attention_heads self.head_dim = config.hidden_size // config.num_attention_heads - self.scale = self.head_dim**-0.5 - - # TODO: Use quant_config and prefix after optimizing this - self.q_proj = nn.Linear(config.hidden_size, - config.hidden_size, - bias=False) - self.k_proj = nn.Linear(config.hidden_size, - config.hidden_size, - bias=False) - self.v_proj = nn.Linear(config.hidden_size, - config.hidden_size, - bias=False) - self.o_proj = nn.Linear(config.hidden_size, - config.hidden_size, - bias=False) + self.qkv_proj = QKVParallelLinear( + hidden_size=config.hidden_size, + head_size=self.head_dim, + total_num_heads=self.n_heads, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + self.o_proj = RowParallelLinear( + input_size=config.hidden_size, + 
output_size=config.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) def forward( self, @@ -854,13 +861,13 @@ def forward( ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: batch, patches, _ = hidden_states.size() - q = self.q_proj(hidden_states) - k = self.k_proj(hidden_states) - v = self.v_proj(hidden_states) + qkv_states, _ = self.qkv_proj(hidden_states) + q, k, v = qkv_states.chunk(3, dim=-1) # Transpose q and k to apply HF's Rotary Position Embedding q = q.view(batch, patches, self.n_heads, self.head_dim).transpose(1, 2) k = k.view(batch, patches, self.n_heads, self.head_dim).transpose(1, 2) + v = v.view(batch, patches, self.n_heads, self.head_dim) cos, sin = position_embeddings q, k = apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=0) @@ -868,22 +875,21 @@ def forward( # Transpose q and k back for attention q = q.transpose(1, 2).contiguous() k = k.transpose(1, 2).contiguous() - v = v.reshape(batch, patches, self.n_heads, self.head_dim) out = xops.memory_efficient_attention(q, k, v, attn_bias=attention_mask) else: - v = v.reshape(batch, patches, self.n_heads, - self.head_dim).transpose(1, 2) + v = v.transpose(1, 2) out = nn.functional.scaled_dot_product_attention( q, k, v, attn_mask=attention_mask) out = out.transpose(1, 2) - out = out.reshape(batch, patches, self.n_heads * self.head_dim) + out = out.view(batch, patches, self.n_heads * self.head_dim) + attn_output, _ = self.o_proj(out) - return self.o_proj(out) + return attn_output, None class PixtralHFTransformerBlock(nn.Module): @@ -912,9 +918,9 @@ def forward( attention_mask: torch.Tensor, position_embeddings: torch.Tensor, ) -> torch.Tensor: - r = self.attention.forward(self.attention_norm(hidden_states), - attention_mask=attention_mask, - position_embeddings=position_embeddings) + r, _ = self.attention.forward(self.attention_norm(hidden_states), + attention_mask=attention_mask, + position_embeddings=position_embeddings) h = hidden_states + r r = self.feed_forward.forward(self.ffn_norm(h)) out = h + r @@ -1053,10 +1059,24 @@ def forward( # (TODO) Add prefix argument for filtering out weights to be loaded # ref: https://github.com/vllm-project/vllm/pull/7186#discussion_r1734163986 def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - stacked_params_mapping = [] + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", ".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + (".gate_up_proj", ".gate_proj", 0), + (".gate_up_proj", ".up_proj", 1), + ] params_dict = dict(self.named_parameters()) + layer_count = len(self.transformer.layers) for name, loaded_weight in weights: + # omit layers when num_hidden_layers_override is set + if name.startswith("transformer.layers"): + layer_idx = int(name.split(".")[2]) + if layer_idx >= layer_count: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue From d2e80332a7cedcfd23ec705b109c5fa3ad94fcc0 Mon Sep 17 00:00:00 2001 From: lkchen Date: Tue, 5 Nov 2024 11:30:02 -0800 Subject: [PATCH 06/24] [Feature] Update benchmark_throughput.py to support image input (#9851) Signed-off-by: Linkun Chen Co-authored-by: Linkun Chen --- benchmarks/README.md | 11 ++++ benchmarks/benchmark_throughput.py | 82 +++++++++++++++++++++++------- 2 files changed, 75 insertions(+), 18 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 192d6c4022c83..2aa4a285021f1 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md 
@@ -6,3 +6,14 @@ You can download the dataset by running: ```bash wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json ``` + +## Downloading the ShareGPT4V dataset + +The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts +will ignore a datapoint if the referred image is missing. +```bash +wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json +mkdir coco -p +wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip +unzip coco/train2017.zip -d coco/ +``` diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 262b8652e49ff..159cf055737ce 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -8,6 +8,7 @@ import torch import uvloop +from PIL import Image from tqdm import tqdm from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) @@ -38,12 +39,33 @@ class SampleRequest: multi_modal_data: Optional[MultiModalDataDict] = None -def sample_requests( - dataset_path: str, - num_requests: int, - tokenizer: PreTrainedTokenizerBase, - fixed_output_len: Optional[int], -) -> List[SampleRequest]: +def _get_prompt_for_image_model(question: str, *, model: str) -> str: + """Prepend and append special tokens around the question to form a prompt. + + Args: + question: The input question text to wrap with special tokens + model: The name of the model being used, to determine which special + tokens to add + + Returns: + The formatted prompt string with appropriate special tokens for the + model + + Raises: + ValueError: If an unsupported model name is provided + """ + model = model.lower() + if "pixtral" in model: + return f"[INST]{question}\n[IMG][/INST]" + raise ValueError(f"Unsupported model {model}") + + +def sample_requests(tokenizer: PreTrainedTokenizerBase, + args: argparse.Namespace) -> List[SampleRequest]: + dataset_path: str = args.dataset + num_requests: int = args.num_prompts + fixed_output_len: Optional[int] = args.output_len + model: str = args.model if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") @@ -52,23 +74,36 @@ def sample_requests( dataset = json.load(f) # Filter out the conversations with less than 2 turns. dataset = [data for data in dataset if len(data["conversations"]) >= 2] - # Only keep the first two turns of each conversation. - dataset = [(data["conversations"][0]["value"], - data["conversations"][1]["value"]) for data in dataset] - # Shuffle the dataset. random.shuffle(dataset) # Filter out sequences that are too long or too short filtered_dataset: List[SampleRequest] = [] - for i in range(len(dataset)): + for data in dataset: if len(filtered_dataset) == num_requests: break + # Only keep the first two turns of each conversation. + prompt = data["conversations"][0]["value"] + completion = data["conversations"][1]["value"] + + multi_modal_data: Optional[MultiModalDataDict] = None + if "image" in data: + multi_modal_data = multi_modal_data or {} + image_path = data["image"] + # TODO(vllm-project/vllm/issues/9778): Support multiple images. 
+ assert isinstance(image_path, + str), "Only support single image input" + try: + multi_modal_data["image"] = Image.open(image_path).convert( + "RGB") + except FileNotFoundError: + # Ignore datapoint where asset is missing + continue + prompt = _get_prompt_for_image_model(question=prompt, model=model) + # Tokenize the prompts and completions. - prompt = dataset[i][0] prompt_token_ids = tokenizer(prompt).input_ids - completion = dataset[i][1] completion_token_ids = tokenizer(completion).input_ids prompt_len = len(prompt_token_ids) output_len = len(completion_token_ids @@ -82,7 +117,8 @@ def sample_requests( filtered_dataset.append( SampleRequest(prompt=prompt, prompt_len=prompt_len, - expected_output_len=output_len)) + expected_output_len=output_len, + multi_modal_data=multi_modal_data)) return filtered_dataset @@ -99,7 +135,9 @@ def run_vllm( prompts: List[TextPrompt] = [] sampling_params: List[SamplingParams] = [] for request in requests: - prompts.append(TextPrompt(prompt=request.prompt)) + prompts.append( + TextPrompt(prompt=request.prompt, + multi_modal_data=request.multi_modal_data)) sampling_params.append( SamplingParams( n=n, @@ -148,7 +186,9 @@ async def run_vllm_async( prompts: List[TextPrompt] = [] sampling_params: List[SamplingParams] = [] for request in requests: - prompts.append(TextPrompt(prompt=request.prompt)) + prompts.append( + TextPrompt(prompt=request.prompt, + multi_modal_data=request.multi_modal_data)) sampling_params.append( SamplingParams( n=n, @@ -272,9 +312,10 @@ def main(args: argparse.Namespace): for _ in range(args.num_prompts) ] else: - requests = sample_requests(args.dataset, args.num_prompts, tokenizer, - args.output_len) + requests = sample_requests(tokenizer, args) + is_multi_modal = any(request.multi_modal_data is not None + for request in requests) if args.backend == "vllm": if args.async_engine: elapsed_time = uvloop.run( @@ -300,6 +341,11 @@ def main(args: argparse.Namespace): for request in requests) total_output_tokens = sum(request.expected_output_len for request in requests) + if is_multi_modal: + print("\033[91mWARNING\033[0m: Multi-modal request detected. The " + "following metrics are not accurate because image tokens are not" + " counted. See vllm-project/vllm/issues/9778 for details.") + # TODO(vllm-project/vllm/issues/9778): Count molti-modal token length. print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " f"{total_output_tokens / elapsed_time:.2f} output tokens/s") From b9c64c0ca79ccdea608f337fbb7e4b0c75fe3aac Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 6 Nov 2024 03:40:08 +0800 Subject: [PATCH 07/24] [Misc] Modify BNB parameter name (#9997) Signed-off-by: Jee Jee Li --- .../layers/quantization/bitsandbytes.py | 9 +++++---- vllm/model_executor/layers/resampler.py | 2 +- vllm/model_executor/model_loader/loader.py | 14 +++++--------- 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 718967a065192..78965d7b9495c 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -203,8 +203,9 @@ def create_qweight_for_4bit(): qweight = create_qweight_for_8bit() else: qweight = create_qweight_for_4bit() - - layer.register_parameter("qweight", qweight) + # Enable parameters to have the same name as in the BNB + # checkpoint format. 
+ layer.register_parameter("weight", qweight) set_weight_attrs(qweight, extra_weight_attrs) def apply(self, @@ -234,7 +235,7 @@ def _apply_8bit_weight( reshape_after_matmul = True bf_x = x.to(torch.bfloat16) - qweight = layer.qweight + qweight = layer.weight offsets = qweight.bnb_shard_offsets quant_states = qweight.bnb_quant_state matmul_states = qweight.matmul_state @@ -313,7 +314,7 @@ def _apply_4bit_weight( reshape_after_matmul = True bf_x = x.to(torch.bfloat16) - qweight = layer.qweight + qweight = layer.weight quant_states = qweight.bnb_quant_state offsets = qweight.bnb_shard_offsets diff --git a/vllm/model_executor/layers/resampler.py b/vllm/model_executor/layers/resampler.py index bce91f1d7fd5e..bca44d2bf2e28 100644 --- a/vllm/model_executor/layers/resampler.py +++ b/vllm/model_executor/layers/resampler.py @@ -177,7 +177,7 @@ def __init__(self, embed_dim, bias=False, quant_config=quant_config, - prefix=prefix) + prefix=f"{prefix}.kv_proj") else: # Maintain the same return value with ReplicatedLinear.forward self.kv_proj = lambda *args, **kwargs: ( # type: ignore # noqa diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index c3e0290f270ae..1f8d531198324 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -892,7 +892,7 @@ def _quantized_8bit_generator(self, hf_weights_files, use_safetensors, if not weight_name.lower().endswith(".scb"): continue - weight_key = weight_name.lower().replace(".scb", ".qweight") + weight_key = weight_name.lower().replace(".scb", ".weight") quant_state_dict[weight_key] = weight_tensor for weight_name, weight_tensor in self._hf_weight_iter( @@ -901,11 +901,9 @@ def _quantized_8bit_generator(self, hf_weights_files, use_safetensors, if self._is_8bit_weight_name(weight_name): continue - qweight_name = weight_name.replace(".weight", ".qweight") - - if qweight_name in quant_state_dict: + if weight_name in quant_state_dict: set_weight_attrs(weight_tensor, {"load_in_8bit": True}) - yield qweight_name, weight_tensor + yield weight_name, weight_tensor else: yield weight_name, weight_tensor @@ -950,9 +948,8 @@ def _parse_quant_state(param_name: str, (f"{weight_name}.quant_state.bitsandbytes__fp4" \ in temp_state_dict): quant_state = _parse_quant_state(weight_name, temp_state_dict) - weight_name = weight_name.replace(".weight", ".qweight") quant_state_dict[weight_name] = quant_state - yield weight_name.replace(".weight", ".qweight"), weight_tensor + yield weight_name, weight_tensor else: yield weight_name, weight_tensor @@ -967,7 +964,6 @@ def _unquantized_generator(self, hf_weights_files, use_safetensors, if any(target_module in weight_name for target_module in self.target_modules) and weight_name.endswith(".weight"): - weight_name = weight_name.replace(".weight", ".qweight") # Without sharding if any( weight_name.startswith(module) @@ -1093,7 +1089,7 @@ def _load_weights(self, model_config: ModelConfig, # Some models, such as MiniCPM V2.5/2.6, contain both # module names 'kv_proj' and 'qkv_proj'. 
To prevent 'kv_proj' # from being incorrectly identified as being present in - # 'vpm.encoder.layers.0.self_attn.qkv_proj.qweight + # 'vpm.encoder.layers.0.self_attn.qkv_proj.weight if shard_pos > 0 and quant_param_name[shard_pos - 1] == ".": shard_index = index quant_param_name = quant_param_name.replace( From 02462465ea1c45163fde632fb94e0e4939ee8a59 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 5 Nov 2024 16:02:23 -0500 Subject: [PATCH 08/24] [CI] Prune tests/models/decoder_only/language/* tests (#9940) Signed-off-by: mgoin --- .buildkite/test-pipeline.yaml | 3 +- .../decoder_only/language/test_big_models.py | 93 ------------------- .../models/decoder_only/language/test_fp8.py | 10 +- .../decoder_only/language/test_gptq_marlin.py | 13 --- .../language/test_gptq_marlin_24.py | 12 +-- .../decoder_only/language/test_marlin.py | 69 -------------- .../decoder_only/language/test_mistral.py | 37 ++++---- .../decoder_only/language/test_models.py | 69 +++++++------- .../models/decoder_only/language/test_qwen.py | 34 ------- 9 files changed, 70 insertions(+), 270 deletions(-) delete mode 100644 tests/models/decoder_only/language/test_big_models.py delete mode 100644 tests/models/decoder_only/language/test_marlin.py delete mode 100644 tests/models/decoder_only/language/test_qwen.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 9444dc43ea97e..1eb749f64d36b 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -321,7 +321,6 @@ steps: - tests/models/decoder_only/language commands: - pytest -v -s models/decoder_only/language/test_models.py - - pytest -v -s models/decoder_only/language/test_big_models.py - label: Decoder-only Language Models Test (Extended) # 1h20min nightly: true @@ -329,7 +328,7 @@ steps: - vllm/ - tests/models/decoder_only/language commands: - - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py --ignore=models/decoder_only/language/test_big_models.py + - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py - label: Decoder-only Multi-Modal Models Test (Standard) #mirror_hardwares: [amd] diff --git a/tests/models/decoder_only/language/test_big_models.py b/tests/models/decoder_only/language/test_big_models.py deleted file mode 100644 index fcfc159e4f5a0..0000000000000 --- a/tests/models/decoder_only/language/test_big_models.py +++ /dev/null @@ -1,93 +0,0 @@ -"""Compare the outputs of HF and vLLM when using greedy sampling. - -This tests bigger models and use half precision. - -Run `pytest tests/models/test_big_models.py`. 
-""" -import pytest - -from vllm.platforms import current_platform - -from ...utils import check_logprobs_close, check_outputs_equal - -MODELS = [ - "meta-llama/Llama-2-7b-hf", - # "mistralai/Mistral-7B-v0.1", # Tested by test_mistral.py - # "Deci/DeciLM-7b", # Broken - # "tiiuae/falcon-7b", # Broken - "EleutherAI/gpt-j-6b", - # "mosaicml/mpt-7b", # Broken - # "Qwen/Qwen1.5-0.5B" # Broken, -] - -if not current_platform.is_cpu(): - MODELS += [ - # fused_moe which not supported on CPU - "openbmb/MiniCPM3-4B", - # Head size isn't supported on CPU - "h2oai/h2o-danube3-4b-base", - ] - -# TODO: remove this after CPU float16 support ready -target_dtype = "float" if current_platform.is_cpu() else "half" - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [32]) -def test_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - - if model == "openbmb/MiniCPM3-4B": - # the output becomes slightly different when upgrading to - # pytorch 2.5 . Changing to logprobs checks instead of exact - # output checks. - NUM_LOG_PROBS = 8 - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy_logprobs_limit( - example_prompts, max_tokens, NUM_LOG_PROBS) - - with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: - vllm_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) - - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - else: - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, - max_tokens) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", [target_dtype]) -def test_model_print( - vllm_runner, - model: str, - dtype: str, -) -> None: - with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) diff --git a/tests/models/decoder_only/language/test_fp8.py b/tests/models/decoder_only/language/test_fp8.py index 5a947ce62c785..f874bf6c73142 100644 --- a/tests/models/decoder_only/language/test_fp8.py +++ b/tests/models/decoder_only/language/test_fp8.py @@ -21,11 +21,11 @@ "kv_cache_dtype,base_model,test_model,scale_path", [ # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors. - ("fp8_e4m3", "meta-llama/Meta-Llama-3-8B-Instruct", - "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", None), + ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct", + "nm-testing/Llama-3.2-1B-Instruct-FP8-KV", None), # Test FP16 checkpoint w. fp8_e5m2 kv-cache. - ("fp8_e5m2", "meta-llama/Meta-Llama-3-8B-Instruct", - "meta-llama/Meta-Llama-3-8B-Instruct", None), + ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct", + "meta-llama/Llama-3.2-1B-Instruct", None), # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json. 
("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf", "meta-llama/Llama-2-7b-chat-hf", @@ -33,7 +33,7 @@ ]) # Due to low-precision numerical divergence, we only test logprob of 4 tokens @pytest.mark.parametrize("max_tokens", [4]) -@pytest.mark.parametrize("enforce_eager", [False, True]) +@pytest.mark.parametrize("enforce_eager", [True]) @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"]) # NOTE: Increasing this in this suite will fail CI because we currently cannot # reset distributed env properly. Use a value > 1 just when you test. diff --git a/tests/models/decoder_only/language/test_gptq_marlin.py b/tests/models/decoder_only/language/test_gptq_marlin.py index 2155e83dbe915..a896f145c11f1 100644 --- a/tests/models/decoder_only/language/test_gptq_marlin.py +++ b/tests/models/decoder_only/language/test_gptq_marlin.py @@ -22,24 +22,11 @@ MAX_MODEL_LEN = 1024 MODELS = [ - # act_order==False, group_size=channelwise - ("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"), - # act_order==False, group_size=128 - ("TheBloke/Llama-2-7B-GPTQ", "main"), - # act_order==True, group_size=128 ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"), - # act_order==True, group_size=64 - ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-64g-actorder_True"), - # act_order==True, group_size=32 - ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-32g-actorder_True"), # 8-bit, act_order==True, group_size=channelwise ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"), - # 8-bit, act_order==True, group_size=128 - ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-128g-actorder_True"), - # 8-bit, act_order==True, group_size=32 - ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-32g-actorder_True"), # 4-bit, act_order==True, group_size=128 ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main") diff --git a/tests/models/decoder_only/language/test_gptq_marlin_24.py b/tests/models/decoder_only/language/test_gptq_marlin_24.py index d65be05f141b4..aa63f9f36a3a8 100644 --- a/tests/models/decoder_only/language/test_gptq_marlin_24.py +++ b/tests/models/decoder_only/language/test_gptq_marlin_24.py @@ -25,16 +25,16 @@ class ModelPair: # 4-bit, group_size == 128 ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128", model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128"), - # 4-bit, group_size == channelwise - ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise", - model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"), + # # 4-bit, group_size == channelwise + # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise", + # model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"), # 8-bit, group_size == 128 ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128", model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128"), - # 8-bit, group_size == channelwise - ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise", - model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"), + # # 8-bit, group_size == channelwise + # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise", + # model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"), ] diff --git a/tests/models/decoder_only/language/test_marlin.py b/tests/models/decoder_only/language/test_marlin.py deleted file mode 100644 index c802346dee8af..0000000000000 --- a/tests/models/decoder_only/language/test_marlin.py +++ /dev/null @@ -1,69 +0,0 @@ -"""Compare the outputs of a GPTQ model to a Marlin model. 
- -Note: GPTQ and Marlin do not have bitwise correctness. -As a result, in this test, we just confirm that the top selected tokens of the -Marlin/GPTQ models are in the top 3 selections of each other. - -Note: Marlin internally uses locks to synchronize the threads. This can -result in very slight nondeterminism for Marlin. As a result, we re-run the test -up to 3 times to see if we pass. - -Run `pytest tests/models/test_marlin.py`. -""" -from dataclasses import dataclass - -import pytest - -from tests.quantization.utils import is_quant_method_supported - -from ...utils import check_logprobs_close - - -@dataclass -class ModelPair: - model_marlin: str - model_gptq: str - - -model_pairs = [ - ModelPair(model_marlin="nm-testing/zephyr-beta-7b-marlin-g128", - model_gptq="nm-testing/zephyr-beta-7b-gptq-g128"), - ModelPair(model_marlin="robertgshaw2/zephyr-7b-beta-channelwise-marlin", - model_gptq="robertgshaw2/zephyr-7b-beta-channelwise-gptq"), - ModelPair(model_marlin="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", - model_gptq="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-gptq") -] - - -@pytest.mark.flaky(reruns=2) -@pytest.mark.skipif(not is_quant_method_supported("marlin"), - reason="Marlin is not supported on this GPU type.") -@pytest.mark.parametrize("model_pair", model_pairs) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models( - vllm_runner, - example_prompts, - model_pair: ModelPair, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: - with vllm_runner(model_pair.model_marlin, - dtype=dtype, - quantization="marlin") as marlin_model: - marlin_outputs = marlin_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - - with vllm_runner(model_pair.model_gptq, dtype=dtype, - quantization="gptq") as gptq_model: - gptq_outputs = gptq_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - - check_logprobs_close( - outputs_0_lst=gptq_outputs, - outputs_1_lst=marlin_outputs, - name_0="gptq", - name_1="marlin", - ) diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py index 174b905d9cbb9..5be44c54a717c 100644 --- a/tests/models/decoder_only/language/test_mistral.py +++ b/tests/models/decoder_only/language/test_mistral.py @@ -4,7 +4,7 @@ """ import pytest -from vllm import LLM, SamplingParams +from vllm import SamplingParams from ...utils import check_logprobs_close @@ -15,6 +15,10 @@ # "mistralai/Mistral-Nemo-Instruct-2407" ] +MISTRAL_FORMAT_MODELS = [ + "mistralai/Mistral-7B-Instruct-v0.3", +] + SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5) SYMBOLIC_LANG_PROMPTS = [ "勇敢な船乗りについての詩を書く", # japanese @@ -95,7 +99,7 @@ def test_models( ) -@pytest.mark.parametrize("model", MODELS[1:]) +@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) @@ -135,28 +139,29 @@ def test_mistral_format( ) -@pytest.mark.parametrize("model", MODELS[1:]) +@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("prompt", SYMBOLIC_LANG_PROMPTS) def test_mistral_symbolic_languages( + vllm_runner, model: str, dtype: str, - prompt: str, ) -> None: - prompt = "hi" - msg = {"role": "user", "content": prompt} - llm = LLM(model=model, - dtype=dtype, - 
max_model_len=8192, - tokenizer_mode="mistral", - config_format="mistral", - load_format="mistral") - outputs = llm.chat([msg], sampling_params=SAMPLING_PARAMS) - assert "�" not in outputs[0].outputs[0].text.strip() + with vllm_runner(model, + dtype=dtype, + max_model_len=8192, + tokenizer_mode="mistral", + config_format="mistral", + load_format="mistral") as vllm_model: + for prompt in SYMBOLIC_LANG_PROMPTS: + msg = {"role": "user", "content": prompt} + outputs = vllm_model.model.chat([msg], + sampling_params=SAMPLING_PARAMS) + assert "�" not in outputs[0].outputs[0].text.strip() @pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("model", MODELS[1:]) # v1 can't do func calling +@pytest.mark.parametrize("model", + MISTRAL_FORMAT_MODELS) # v1 can't do func calling def test_mistral_function_calling( vllm_runner, model: str, diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index 68055cbe29095..05117666f8c3f 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -7,25 +7,39 @@ """ import pytest -from ...utils import check_outputs_equal +from vllm.platforms import current_platform + +from ...utils import check_logprobs_close MODELS = [ - "facebook/opt-125m", - "gpt2", - "bigcode/tiny_starcoder_py", - "EleutherAI/pythia-70m", - "bigscience/bloom-560m", # Testing alibi slopes. - "microsoft/phi-2", - "stabilityai/stablelm-3b-4e1t", - # "allenai/OLMo-1B", # Broken - "bigcode/starcoder2-3b", - "google/gemma-1.1-2b-it", + "facebook/opt-125m", # opt + "openai-community/gpt2", # gpt2 + # "Milos/slovak-gpt-j-405M", # gptj + # "bigcode/tiny_starcoder_py", # gpt_bigcode + # "EleutherAI/pythia-70m", # gpt_neox + "bigscience/bloom-560m", # bloom - testing alibi slopes + "microsoft/phi-2", # phi + # "stabilityai/stablelm-3b-4e1t", # stablelm + # "bigcode/starcoder2-3b", # starcoder2 + "google/gemma-1.1-2b-it", # gemma + "Qwen/Qwen2.5-0.5B-Instruct", # qwen2 + "meta-llama/Llama-3.2-1B-Instruct", # llama ] +if not current_platform.is_cpu(): + MODELS += [ + # fused_moe which not supported on CPU + "openbmb/MiniCPM3-4B", + ] + +# TODO: remove this after CPU float16 support ready +target_dtype = "float" if current_platform.is_cpu() else "half" + @pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [96]) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [5]) def test_models( hf_runner, vllm_runner, @@ -33,33 +47,24 @@ def test_models( model: str, dtype: str, max_tokens: int, + num_logprobs: int, ) -> None: - # To pass the small model tests, we need full precision. - assert dtype == "float" with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + hf_outputs = hf_model.generate_greedy_logprobs_limit( + example_prompts, max_tokens, num_logprobs) with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + vllm_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. 
+ model_runner.model) - check_outputs_equal( + check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_outputs, name_0="hf", name_1="vllm", ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -def test_model_print( - vllm_runner, - model: str, - dtype: str, -) -> None: - with vllm_runner(model, dtype=dtype) as vllm_model: - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) diff --git a/tests/models/decoder_only/language/test_qwen.py b/tests/models/decoder_only/language/test_qwen.py deleted file mode 100644 index 128fe65afbb84..0000000000000 --- a/tests/models/decoder_only/language/test_qwen.py +++ /dev/null @@ -1,34 +0,0 @@ -"""Ensure that a text-only Qwen model can be run without throwing an error. -We explicitly test this because Qwen is implemented as a multimodal and -supports a visual encoder for models like Qwen-VL. -""" -from typing import List, Type - -import pytest - -from ....conftest import VllmRunner - -models = [ - "Qwen/Qwen-7B-Chat" # Has no visual encoder -] - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_text_only_qwen_model_can_be_loaded_and_run( - vllm_runner: Type[VllmRunner], - example_prompts: List[str], - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, -): - with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_model.generate_greedy_logprobs( - example_prompts, - max_tokens, - num_logprobs=num_logprobs, - ) From 235366fe2eb3144321978e181af94487f0215595 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 5 Nov 2024 16:02:32 -0500 Subject: [PATCH 09/24] [CI] Prune back the number of tests in tests/kernels/* (#9932) Signed-off-by: mgoin --- tests/kernels/test_activation.py | 2 +- tests/kernels/test_attention.py | 2 +- tests/kernels/test_awq_marlin.py | 16 ++++++----- tests/kernels/test_blocksparse_attention.py | 6 ++--- tests/kernels/test_cache.py | 2 +- tests/kernels/test_cutlass.py | 30 ++++++++++++++++----- tests/kernels/test_int8_quant.py | 7 +++-- tests/kernels/test_marlin_gemm.py | 2 +- tests/kernels/test_moe.py | 23 +++++++++------- tests/kernels/test_pos_encoding.py | 6 ++--- 10 files changed, 60 insertions(+), 36 deletions(-) diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index 057a11746014c..a84501f9c303f 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -14,7 +14,7 @@ DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing -D = [512, 4096, 5120, 13824] # Arbitrary values for testing +D = [512, 13824] # Arbitrary values for testing SEEDS = [0] CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 4ecd0fc1a21ad..3e3c0668198ad 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -33,7 +33,7 @@ # FlashAttention forward only supports head dimension at most 128 # https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62 -HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256] +HEAD_SIZES = [64, 80, 120, 256] BLOCK_SIZES = [16, 32] USE_ALIBI = [False, True] 
diff --git a/tests/kernels/test_awq_marlin.py b/tests/kernels/test_awq_marlin.py index 59917dd2c58ad..238d6426bf099 100644 --- a/tests/kernels/test_awq_marlin.py +++ b/tests/kernels/test_awq_marlin.py @@ -14,13 +14,17 @@ awq_marlin_quantize) from vllm.scalar_type import scalar_types +NUM_EXPERTS = [8, 64] +TOP_KS = [2, 6] +GROUP_SIZES = [-1, 32, 128] -@pytest.mark.parametrize("m", [64, 512, 222, 33, 1]) -@pytest.mark.parametrize("n", [128, 2048, 256, 1024]) -@pytest.mark.parametrize("k", [128, 1024, 512]) -@pytest.mark.parametrize("e", [8, 64]) -@pytest.mark.parametrize("topk", [2, 6]) -@pytest.mark.parametrize("group_size", [-1, 32, 64, 128]) + +@pytest.mark.parametrize("m", [1, 33, 64, 222]) +@pytest.mark.parametrize("n", [128, 2048]) +@pytest.mark.parametrize("k", [128, 1024]) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("group_size", GROUP_SIZES) @pytest.mark.skipif(not (ops.supports_moe_ops and hasattr(torch.ops._moe_C, "marlin_gemm_moe")), reason="Marlin is not supported on this GPU type.") diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py index fb601852dd523..fad342d1b5923 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/test_blocksparse_attention.py @@ -25,10 +25,10 @@ DTYPES = [torch.half, torch.bfloat16] NUM_GEN_SEQS = [3] # Arbitrary values for testing NUM_PREFILL_SEQS = [3] # Arbitrary values for testing -NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing +NUM_HEADS = [(40, 40)] # Arbitrary values for testing HEAD_SIZES = [64, 112] -BLOCK_SIZES = [16, 32] +BLOCK_SIZES = [16] USE_ALIBI = [False, True] KV_CACHE_DTYPE = ["auto", "fp8"] SEEDS = [0] @@ -37,7 +37,7 @@ BLOCKSPARSE_VERT_STRIDES = [8] BLOCKSPARSE_BLOCK_SIZES = [64] -BLOCKSPARSE_HEADS_SLIDINGS = [0, 2, -1] +BLOCKSPARSE_HEADS_SLIDINGS = [2, -1] BLOCKSPARSE_HOMO_HEADS = [True, False] diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index e2b4778b94b9e..40550ed51e2c7 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -13,7 +13,7 @@ NUM_TOKENS = [42] # Arbitrary values for testing NUM_LAYERS = [1] # Arbitrary values for testing NUM_HEADS = [8] # Arbitrary values for testing -HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256] +HEAD_SIZES = [64, 80, 120, 256] BLOCK_SIZES = [8, 16, 32] # Arbitrary values for testing diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index 993e67e827ea0..afe53797322f9 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -11,6 +11,28 @@ from vllm import _custom_ops as ops from vllm.platforms import current_platform +MNK_FACTORS = [ + (1, 256, 128), + (1, 16384, 1024), + (1, 24576, 496), + (16, 256, 496), + (16, 16384, 128), + (16, 24576, 4096), + (32, 8192, 4096), + (32, 16384, 4096), + (33, 1024, 1024), + (33, 8192, 128), + (64, 2048, 496), + (64, 16384, 1024), + (100, 8192, 496), + (128, 32768, 4096), + (256, 4096, 4096), + (512, 256, 1024), + (512, 8192, 4096), + (512, 16384, 128), + (512, 24576, 128), +] + CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] @@ -116,9 +138,7 @@ def cutlass_int8_gemm_helper(m: int, (out, a, b, scale_a, scale_b, bias)) -@pytest.mark.parametrize("m", [1, 16, 32, 64, 128, 256, 512, 222, 100, 33]) -@pytest.mark.parametrize("n", [2048, 4096, 8192, 16384, 24576, 256, 1024]) -@pytest.mark.parametrize("k", [128, 496, 1024]) +@pytest.mark.parametrize("m,n,k", MNK_FACTORS) 
@pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("use_bias", [True, False]) @@ -129,9 +149,7 @@ def test_cutlass_fp8_gemm(m: int, n: int, k: int, per_act_token: bool, cutlass_fp8_gemm_helper(m, n, k, per_act_token, per_out_ch, use_bias) -@pytest.mark.parametrize("m", [1, 16, 32, 64, 128, 256, 512, 222, 33, 1]) -@pytest.mark.parametrize("n", [2048, 8192, 16384, 256, 1024]) -@pytest.mark.parametrize("k", [128, 496, 1024]) +@pytest.mark.parametrize("m,n,k", MNK_FACTORS) @pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("use_bias", [True, False]) diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py index 8db6a0d0d9fa4..12c578db0893c 100644 --- a/tests/kernels/test_int8_quant.py +++ b/tests/kernels/test_int8_quant.py @@ -7,11 +7,10 @@ from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] -HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 5137, 8192, - 8193] # Arbitrary values for testing +HIDDEN_SIZES = [16, 67, 768, 5137, 8193] # Arbitrary values for testing NUM_TOKENS = [1, 7, 83, 4096] # Arbitrary values for testing SEEDS = [0] -SCALE = [0.1, 0.5, 0.8, 1.2, 2.1] +SCALE = [0.1, 2.1] def opcheck_int8_quant_static(output, input, scale, azp=None): @@ -132,7 +131,7 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int, @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("scale", SCALE[2:]) # Reduce test time +@pytest.mark.parametrize("scale", SCALE) @pytest.mark.parametrize("azp", [-255, 54]) @torch.inference_mode() def test_static_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/test_marlin_gemm.py index 5cfd4d6da7a86..b6dd68cc51a9f 100644 --- a/tests/kernels/test_marlin_gemm.py +++ b/tests/kernels/test_marlin_gemm.py @@ -35,7 +35,7 @@ USE_FP32_REDUCE_OPTS = [False, True] MARLIN_K_CHUNKS = [128] -MARLIN_N_CHUNKS = [64, 128, 256] +MARLIN_N_CHUNKS = [64, 256] MARLIN_24_K_CHUNKS = [128] MARLIN_24_N_CHUNKS = [512] diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 19c3fc1e1fe3a..17428ebfc2e28 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -20,12 +20,15 @@ from vllm.platforms import current_platform from vllm.scalar_type import scalar_types +NUM_EXPERTS = [8, 64] +TOP_KS = [2, 6] -@pytest.mark.parametrize("m", [1024 * 128, 512, 222, 33, 1]) -@pytest.mark.parametrize("n", [2048, 256, 1024]) + +@pytest.mark.parametrize("m", [1, 33, 64, 222, 1024 * 128]) +@pytest.mark.parametrize("n", [128, 1024, 2048]) @pytest.mark.parametrize("k", [128, 511, 1024]) -@pytest.mark.parametrize("e", [8, 64]) -@pytest.mark.parametrize("topk", [2, 6]) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) def test_fused_moe( m: int, @@ -93,12 +96,12 @@ def test_mixtral_moe(dtype: torch.dtype): atol=mixtral_moe_tol[dtype]) -@pytest.mark.parametrize("m", [64, 512, 222, 33, 1]) -@pytest.mark.parametrize("n", [128, 2048, 256, 1024]) -@pytest.mark.parametrize("k", [128, 1024, 512]) -@pytest.mark.parametrize("e", [8, 64]) -@pytest.mark.parametrize("topk", [2, 6]) -@pytest.mark.parametrize("group_size", [-1, 32, 64, 128]) +@pytest.mark.parametrize("m", [1, 33, 64, 
222]) +@pytest.mark.parametrize("n", [128, 2048]) +@pytest.mark.parametrize("k", [128, 1024]) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("group_size", [-1, 32, 128]) @pytest.mark.parametrize("act_order", [True, False]) @pytest.mark.parametrize("num_bits", [4, 8]) @pytest.mark.parametrize("is_k_full", [True, False]) diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index b408559cc0b07..eee77c22ab81a 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -11,10 +11,10 @@ IS_NEOX_STYLE = [True, False] DTYPES = [torch.half, torch.bfloat16, torch.float] -HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256] +HEAD_SIZES = [64, 80, 112, 120, 256] ROTARY_DIMS = [None, 32] # None means rotary dim == head size -NUM_HEADS = [7, 17] # Arbitrary values for testing -BATCH_SIZES = [1, 5] # Arbitrary values for testing +NUM_HEADS = [17] # Arbitrary values for testing +BATCH_SIZES = [5] # Arbitrary values for testing SEQ_LENS = [11, 8192] # Arbitrary values for testing SEEDS = [0] CUDA_DEVICES = [ From ca9844b340f45f23f8d30fdce23777d215ad987c Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 5 Nov 2024 14:49:20 -0800 Subject: [PATCH 10/24] [bugfix] fix weak ref in piecewise cudagraph and tractable test (#10048) Signed-off-by: youkaichao --- tests/compile/piecewise/test_toy_llama.py | 111 ++++++++++++++++++++-- vllm/compilation/backends.py | 82 +++++++++++++--- 2 files changed, 168 insertions(+), 25 deletions(-) diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index e3e5a7d0fc5a5..9c65059c6b348 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -1,6 +1,10 @@ """ Test the piecewise compilation with a simple model, comparing the output with and without the piecewise compilation. + +This is a tractable model, the weights and computation are specially designed +if the config `tractable_init` is set to True. Otherwise, the weights are +initialized randomly with a fixed seed. 
""" import os from dataclasses import dataclass @@ -49,6 +53,12 @@ class LlamaConfig: mlp_size: int = 256 vocab_size: int = 128 num_layers: int = 2 + init_value: float = 1.0 + tractable_init: bool = False + random_seed: int = 0 + + def __post_init__(self): + assert self.mlp_size >= self.hidden_size class LlamaMLP(nn.Module): @@ -66,10 +76,23 @@ def __init__(self, config: LlamaConfig) -> None: bias=False, ) - self.gate_up_projection.weight.data.fill_(0.0) - self.down_projection.weight.data.fill_(0.0) + if config.tractable_init: + nn.init.eye_(self.gate_up_projection.weight.data[:config.mlp_size]) + nn.init.eye_(self.gate_up_projection.weight.data[config.mlp_size:]) + nn.init.eye_(self.down_projection.weight.data) + else: + nn.init.xavier_normal_(self.gate_up_projection.weight.data, + generator=torch.Generator().manual_seed( + config.random_seed), + gain=0.001) + nn.init.xavier_normal_(self.down_projection.weight.data, + generator=torch.Generator().manual_seed( + config.random_seed), + gain=0.001) def forward(self, x): + # for tractable_init and positive input, this is + # essentially an elementwise-square x = self.gate_up_projection(x) x = x[:, :x.size(1) // 2] * torch.nn.functional.relu( x[:, x.size(1) // 2:]) @@ -84,21 +107,39 @@ def __init__(self, config: LlamaConfig) -> None: self.qkv_projection = nn.Linear( in_features=config.hidden_size, out_features=config.hidden_size * 3, + bias=False, ) self.output_projection = nn.Linear( in_features=config.hidden_size, out_features=config.hidden_size, + bias=False, ) - self.qkv_projection.weight.data.fill_(0.0) - self.output_projection.weight.data.fill_(0.0) + if config.tractable_init: + nn.init.eye_(self.qkv_projection.weight.data[:config.hidden_size]) + nn.init.eye_(self.qkv_projection.weight.data[config.hidden_size:2 * + config.hidden_size]) + nn.init.eye_(self.qkv_projection.weight.data[2 * + config.hidden_size:]) + nn.init.eye_(self.output_projection.weight.data) + else: + nn.init.xavier_normal_(self.qkv_projection.weight.data, + generator=torch.Generator().manual_seed( + config.random_seed), + gain=0.001) + nn.init.xavier_normal_(self.output_projection.weight.data, + generator=torch.Generator().manual_seed( + config.random_seed), + gain=0.001) def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, ) -> torch.Tensor: + # for tractable_init, this is: + # output = (hidden_states * 3 + positions * 2) qkv = self.qkv_projection(hidden_states) hidden_size = qkv.size(-1) // 3 q, k, v = qkv.split([hidden_size, hidden_size, hidden_size], dim=-1) @@ -126,20 +167,29 @@ def forward( hidden_states: torch.Tensor, residual: Optional[torch.Tensor], ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + For tractable computation: + - if residual is None, the outputs are: + - residual = (hidden_states + 1) * 3 + positions * 2 + hidden_states = hidden_states * 4 + positions * 2 + 3 + - hidden_states = (residual + 1) ** 2 + - if residual is not None, the outputs are: + - residual = (hidden_states + residual + 1) * 3 + positions * 2 + hidden_states + residual = (hidden_states + residual) * 4 + positions * 2 + 3 + - hidden_states = (residual + 1) ** 2 + """ # noqa if residual is None: residual = hidden_states - hidden_states = hidden_states / 2 + hidden_states = hidden_states + 1 else: hidden_states = hidden_states + residual residual = hidden_states - hidden_states = hidden_states / 2 + hidden_states = hidden_states + 1 hidden_states = self.self_attention(positions=positions, hidden_states=hidden_states) hidden_states = hidden_states + residual 
residual = hidden_states - hidden_states = hidden_states / 2 + hidden_states = hidden_states + 1 hidden_states = self.mlp(hidden_states) return hidden_states, residual @@ -156,7 +206,8 @@ def __init__(self, config: LlamaConfig) -> None: self.layers = nn.ModuleList( [LlamaDecoderLayer(config) for _ in range(config.num_layers)]) - self.embedding_tokens.weight.data.fill_(0.0) + # this is the initial value of the hidden states + self.embedding_tokens.weight.data.fill_(config.init_value) def forward( self, @@ -170,6 +221,28 @@ def forward( return hidden_states +def tractable_computation(input_ids: torch.Tensor, + positions: torch.Tensor, + config: LlamaConfig, + init_value: float = 1.0) -> torch.Tensor: + hidden_states = torch.ones(input_ids.size(0), + config.hidden_size, + device=input_ids.device, + dtype=input_ids.dtype) * init_value + + # first layer + residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3 + hidden_states = (residual + 1)**2 + + # following layers + for _ in range(config.num_layers - 1): + hidden_states = hidden_states + residual + residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3 + hidden_states = (residual + 1)**2 + + return hidden_states + + @torch.inference_mode def run_model(llama_config, use_compile: bool, @@ -213,7 +286,15 @@ def run_model(llama_config, del os.environ["VLLM_TORCH_COMPILE_LEVEL"] set_compilation_config(None) - return output.cpu() + output = output.cpu() + + if llama_config.tractable_init: + expected_output = tractable_computation(input_ids[:2], positions[:2], + llama_config).cpu() + + assert torch.allclose(output, expected_output) + else: + return output.cpu() def test_toy_llama(): @@ -222,7 +303,13 @@ def test_toy_llama(): llama_config = LlamaConfig(hidden_size=128, mlp_size=256, vocab_size=128, - num_layers=2) + num_layers=12) + + tractable_config = LlamaConfig(hidden_size=128, + mlp_size=256, + vocab_size=128, + num_layers=2, + tractable_init=True) outputs = [] with compilation_counter.expect( @@ -233,6 +320,8 @@ def test_toy_llama(): num_cudagraph_caputured=0, ): outputs.append(run_model(llama_config, use_compile=False)) + run_model(tractable_config, use_compile=False) + with compilation_counter.expect( num_graphs_seen=1, # one graph for the model num_piecewise_graphs_seen=1, @@ -242,6 +331,7 @@ def test_toy_llama(): 2, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen ): outputs.append(run_model(llama_config, use_compile=True)) + run_model(tractable_config, use_compile=True) with compilation_counter.expect( num_graphs_seen=1, # one graph for the model @@ -257,6 +347,7 @@ def test_toy_llama(): ): outputs.append( run_model(llama_config, use_compile=True, split_attn=True)) + run_model(tractable_config, use_compile=True, split_attn=True) for i in range(1, len(outputs)): assert torch.allclose(outputs[0], outputs[i]) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 96ddcba467c5b..de32cabbe6d07 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -6,6 +6,7 @@ import torch import torch.fx as fx +import vllm.envs as envs from vllm.logger import init_logger from vllm.utils import weak_ref_tensors @@ -193,6 +194,7 @@ def wrap_inductor(graph, @dataclasses.dataclass class SplitItem: submod_name: str + graph_id: int is_splitting_graph: bool graph: fx.GraphModule @@ -226,9 +228,7 @@ def split_graph(graph: fx.GraphModule, outputs = [] - # sort the names to make sure the order is deterministic names = [name for (name, module) in split_gm.named_modules()] - names.sort() for name 
in names: if "." in name or name == "": @@ -238,7 +238,11 @@ def split_graph(graph: fx.GraphModule, module = getattr(split_gm, name) graph_id = int(name.replace("submod_", "")) - outputs.append(SplitItem(name, graph_id in split_op_graphs, module)) + outputs.append( + SplitItem(name, graph_id, (graph_id in split_op_graphs), module)) + + # sort by intetger graph_id, rather than string name + outputs.sort(key=lambda x: x.graph_id) return split_gm, outputs @@ -252,6 +256,11 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): It runs the given graph with fake inputs, and compile some submodules specified by `compile_submod_names` with the given compilation configs. + + NOTE: the order in `compile_submod_names` matters, because + it will be used to determine the order of the compiled piecewise + graphs. The first graph will handle logging, and the last graph + has some special cudagraph output handling. """ def __init__(self, module: torch.fx.GraphModule, @@ -263,7 +272,6 @@ def __init__(self, module: torch.fx.GraphModule, self.compile_submod_names = compile_submod_names self.compilation_configs = compilation_configs self.graph_pool = graph_pool - self.have_seen_first_graph = False def run(self, *args): fake_args = [ @@ -279,6 +287,7 @@ def call_module(self, target: torch.fx.node.Target, output = super().call_module(target, args, kwargs) if target in self.compile_submod_names: + index = self.compile_submod_names.index(target) submod = self.fetch_attr(target) sym_shape_indices = [ i for i, x in enumerate(args) if isinstance(x, torch.SymInt) @@ -288,15 +297,14 @@ def call_module(self, target: torch.fx.node.Target, args, self.compilation_configs.inductor_compile_config, runtime_shape=None, - do_logging=not self.have_seen_first_graph, + do_logging=index == 0, use_inductor=self.compilation_configs.use_inductor) self.module.__dict__[target] = PiecewiseBackend( - submod, self.compilation_configs, self.graph_pool, - not self.have_seen_first_graph, sym_shape_indices, + submod, self.compilation_configs, self.graph_pool, index, + len(self.compile_submod_names), sym_shape_indices, compiled_graph_for_general_shape) - self.have_seen_first_graph = True compilation_counter.num_piecewise_capturable_graphs_seen += 1 return output @@ -352,8 +360,9 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: graph, self.compilation_configs.non_cudagraph_ops) from torch._dynamo.utils import lazy_format_graph_code - logger.debug("%s", - lazy_format_graph_code("stiching module", self.split_gm)) + logger.debug("%s", lazy_format_graph_code("before split", self.graph)) + logger.debug("%s", lazy_format_graph_code("after split", + self.split_gm)) compilation_counter.num_piecewise_graphs_seen += len( self.piecewise_graphs) @@ -385,12 +394,17 @@ class ConcreteSizeEntry: cudagraph: Optional[torch.cuda.CUDAGraph] = None output: Optional[Any] = None + # for cudagraph debugging, track the input addresses + # during capture, and check if they are the same during replay + input_addresses: Optional[List[int]] = None + class PiecewiseBackend: def __init__(self, graph: fx.GraphModule, compilation_configs: CompilationConfig, graph_pool: Any, - is_first_graph: bool, sym_shape_indices: List[int], + piecewise_compile_index: int, total_piecewise_compiles: int, + sym_shape_indices: List[int], compiled_graph_for_general_shape: Callable): """ The backend for piecewise compilation. 
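An aside on the ordering change above: `torch.fx`'s splitter names the pieces `submod_<k>`, and sorting those names as strings stops matching execution order once there are ten or more pieces, which is presumably why `SplitItem` now carries the integer `graph_id` and the list is sorted on it. A minimal standalone illustration (not vLLM code):

# standalone sketch: lexicographic vs. numeric ordering of split submodules
names = [f"submod_{i}" for i in range(12)]

assert sorted(names)[:4] == ["submod_0", "submod_1", "submod_10", "submod_11"]

by_graph_id = sorted(names, key=lambda n: int(n.split("_")[-1]))
assert by_graph_id[:4] == ["submod_0", "submod_1", "submod_2", "submod_3"]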
@@ -408,7 +422,12 @@ def __init__(self, graph: fx.GraphModule, self.graph = graph self.compilation_configs = compilation_configs self.graph_pool = graph_pool - self.is_first_graph = is_first_graph + self.piecewise_compile_index = piecewise_compile_index + self.total_piecewise_compiles = total_piecewise_compiles + + self.is_first_graph = piecewise_compile_index == 0 + self.is_last_graph = ( + piecewise_compile_index == total_piecewise_compiles - 1) self.compile_sizes: Set[int] = set( self.compilation_configs.compile_sizes) @@ -422,6 +441,8 @@ def __init__(self, graph: fx.GraphModule, self.sym_shape_indices = sym_shape_indices + self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG" + # the entries for different shapes that we need to either # compile or capture cudagraph self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {} @@ -476,14 +497,45 @@ def __call__(self, *args) -> Any: logger.info("Capturing a cudagraph for shape %s", runtime_shape) + input_addresses = [ + x.data_ptr() for x in args if isinstance(x, torch.Tensor) + ] + entry.input_addresses = input_addresses cudagraph = torch.cuda.CUDAGraph() + + # mind-exploding: carefully manage the reference and memory. with torch.cuda.graph(cudagraph, pool=self.graph_pool): - entry.output = weak_ref_tensors(entry.runnable(*args)) + # `output` is managed by pytorch's cudagraph pool + output = entry.runnable(*args) + if self.is_last_graph: + # by converting it to weak ref, + # the original `output` will immediately be released + # to save memory. It is only safe to do this for + # the last graph, because the output of the last graph + # will not be used by any other cuda graph. + output = weak_ref_tensors(output) + + # here we always use weak ref for the output + # to save memory + entry.output = weak_ref_tensors(output) + entry.cudagraph = cudagraph compilation_counter.num_cudagraph_caputured += 1 - entry.cudagraph = cudagraph - return entry.output + # important: we need to return the output, rather than + # the weak ref of the output, so that pytorch can correctly + # manage the memory during cuda graph capture + return output + + if self.is_debugging_mode: + # check if the input addresses are the same + new_input_addresses = [ + x.data_ptr() for x in args if isinstance(x, torch.Tensor) + ] + assert new_input_addresses == entry.input_addresses, ( + "Input addresses for cudagraphs are different during replay." 
+ f" Expected {entry.input_addresses}, got {new_input_addresses}" + ) entry.cudagraph.replay() return entry.output From 43300bd98a54d48e97d9fb78c9db88eda3a88c64 Mon Sep 17 00:00:00 2001 From: zifeitong Date: Tue, 5 Nov 2024 16:34:40 -0800 Subject: [PATCH 11/24] [Bugfix] Properly propagate trust_remote_code settings (#10047) Signed-off-by: Zifei Tong --- vllm/model_executor/models/chatglm.py | 7 ++++--- vllm/model_executor/models/molmo.py | 22 ++++++++++++---------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index c3c9ec703c1e6..181f3c2b0fc35 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -54,8 +54,9 @@ def mm_input_mapper_for_glmv( data: MultiModalData[object], ) -> Dict: model_config = ctx.model_config - tokenizer = cached_get_tokenizer(model_config.tokenizer, - trust_remote_code=True) + tokenizer = cached_get_tokenizer( + model_config.tokenizer, + trust_remote_code=model_config.trust_remote_code) if tokenizer is None: raise RuntimeError("No HuggingFace processor is available " "to process the image object") @@ -525,7 +526,7 @@ def _parse_and_validate_image_input( elif isinstance(pixel_values, list): return torch.concat(pixel_values) else: - raise TypeError("""pixel_values must be a torch.Tensor + raise TypeError("""pixel_values must be a torch.Tensor or a list of torch.Tensor """) return GLMImagePixelInputs(pixel_values=pixel_values) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index ba798833e26a9..07c06149f0206 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -844,9 +844,10 @@ def get_max_tokens(max_crops: int, crop_patches: int, left_margin: int, def get_max_molmo_image_tokens(ctx: InputContext) -> int: - processor = cached_get_processor(ctx.model_config.model, - trust_remote_code=True, - revision=ctx.model_config.code_revision) + processor = cached_get_processor( + ctx.model_config.model, + trust_remote_code=ctx.model_config.trust_remote_code, + revision=ctx.model_config.code_revision) image_processor = processor.image_processor max_llm_image_tokens = get_max_tokens( image_processor.max_crops, @@ -870,9 +871,10 @@ def image_input_mapper_for_molmo( def dummy_data_for_molmo(ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int]): - processor = cached_get_processor(ctx.model_config.model, - trust_remote_code=True, - revision=ctx.model_config.code_revision) + processor = cached_get_processor( + ctx.model_config.model, + trust_remote_code=ctx.model_config.trust_remote_code, + revision=ctx.model_config.code_revision) image_processor = processor.image_processor base_image_input_d = image_processor.image_patch_size @@ -935,11 +937,11 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs): multi_modal_data = inputs.get("multi_modal_data") image = None if multi_modal_data is None else multi_modal_data.get("image") - processor = cached_get_processor(ctx.model_config.model, - trust_remote_code=True, - revision=ctx.model_config.code_revision) - model_config = ctx.model_config + processor = cached_get_processor( + ctx.model_config.model, + trust_remote_code=model_config.trust_remote_code, + revision=ctx.model_config.code_revision) tokenizer = cached_get_tokenizer( model_config.tokenizer, trust_remote_code=model_config.trust_remote_code) From 966e31697bdeb47b33b3e26b4aab5999c85f3e90 Mon Sep 17 00:00:00 2001 From: Wallas Henrique Date: 
Tue, 5 Nov 2024 21:39:26 -0300 Subject: [PATCH 12/24] [Bugfix] Fix pickle of input when async output processing is on (#9931) Signed-off-by: Wallas Santos --- .../test_basic_correctness.py | 26 +++++++++++++++++++ vllm/worker/model_runner.py | 12 +++++++++ 2 files changed, 38 insertions(+) diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 79647589d5204..7f16baa65a644 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -156,3 +156,29 @@ def test_model_with_failure(vllm_runner) -> None: ModelInputForGPUWithSamplingMetadata) finally: os.remove(filename) + + +def test_failure_with_async_out_proc(vllm_runner) -> None: + + filename = None + try: + with vllm_runner("facebook/opt-125m", + dtype="half", + enforce_eager=False, + gpu_memory_utilization=0.7) as vllm_model,\ + patch("vllm.model_executor.models.opt.OPTForCausalLM.forward", + side_effect=ValueError()): + model_config = vllm_model.model.llm_engine.model_config + assert model_config.use_async_output_proc + with pytest.raises(ValueError) as exc_info: + vllm_model.generate_greedy('how to make pizza?', 250) + matches = re.search(r"input dumped to (.+).pkl", + str(exc_info.value)) + assert matches is not None + + filename = f"{matches.group(1)}.pkl" + finally: + # Clean up + if filename is not None: + os.remove(filename) + pass diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 2447eecf7957d..1e8ea4e8e79cf 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -136,6 +136,18 @@ def from_broadcasted_tensor_dict( attn_backend, tensor_dict) return cls(**tensor_dict) + # Exclude `async_callback` to be able to pickle this object + def __getstate__(self): + state = self.__dict__.copy() + del state["async_callback"] + return state + + # TODO: What happens when we depickle this object? + # How can we update this callback to properly pass it to the engine? 
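For context on why `async_callback` has to be dropped in `__getstate__`: the callback is typically a callable bound to live engine state and is not picklable, so the input dump exercised by the test above would fail without the exclusion. A toy standalone sketch of the failure mode (hypothetical class, not vLLM code):

# toy sketch only: a lambda stands in for the engine-bound callback
import pickle

class ToyModelInput:
    def __init__(self):
        self.tokens = [1, 2, 3]
        self.async_callback = lambda: None

try:
    pickle.dumps(ToyModelInput())
except Exception as exc:
    # fails because the callback attribute cannot be serialized
    print(f"cannot pickle: {exc!r}")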
+ def __setstate__(self, state): + self.__dict__.update(state) + self.__dict__.update({'async_callback': None}) + @dataclass(frozen=True) class ModelInputForGPUWithSamplingMetadata(ModelInputForGPU): From 0c63c34f725f0b519fa094fbeca6e3cf12c911c1 Mon Sep 17 00:00:00 2001 From: Sungjae Lee <33976427+llsj14@users.noreply.github.com> Date: Wed, 6 Nov 2024 10:45:45 +0900 Subject: [PATCH 13/24] [Bugfix][SpecDecode] kv corruption with bonus tokens in spec decode (#9730) Co-authored-by: LiuXiaoxuanPKU --- tests/spec_decode/test_multi_step_worker.py | 107 ++++++++++++++++++++ tests/spec_decode/utils.py | 4 +- vllm/spec_decode/draft_model_runner.py | 35 ++++++- vllm/spec_decode/multi_step_worker.py | 23 ++++- 4 files changed, 159 insertions(+), 10 deletions(-) diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py index e6f7f480eebb2..0b5d82b6610ca 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -5,6 +5,8 @@ import pytest import torch +from vllm.attention.selector import (_Backend, + global_force_attn_backend_context_manager) from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.utils import set_random_seed from vllm.sequence import (ExecuteModelRequest, HiddenStates, Logprob, @@ -303,6 +305,7 @@ def test_multi_step_with_batch_expansion_correct_output(): seed, model_runner_cls=TP1DraftModelRunner, ) + multi_step_worker.set_include_gpu_probs_tensor() worker = create_worker( Worker, model_name, @@ -397,6 +400,7 @@ def test_multi_step_with_batch_expansion_incorrect_output(): seed, model_runner_cls=TP1DraftModelRunner, ) + multi_step_worker.set_include_gpu_probs_tensor() worker = create_worker( Worker, model_name, @@ -477,6 +481,109 @@ def test_multi_step_with_batch_expansion_incorrect_output(): assert (num_mismatch > 0) +@torch.inference_mode() +@pytest.mark.parametrize('num_steps', [1, 2, 3, 4]) +# The choice of backends forces the multi_step_worker to choose between +# the vanilla model_runner and TP1DraftModelRunner and that we can test +# both code paths. +@pytest.mark.parametrize('attn_backend', + [_Backend.XFORMERS, _Backend.FLASH_ATTN]) +def test_multi_step_correct_kvcache(num_steps, attn_backend): + """Verify that the KV cache of the draft model + is correctly updated for sequences with bonus token. 
+ """ + seed = 100 + model_name = "JackFram/llama-68m" + + block_size = 16 + num_gpu_blocks = 2048 // block_size + batch_size = 1 + + with global_force_attn_backend_context_manager(attn_backend): + dtype = 'float16' if attn_backend == _Backend.FLASH_ATTN else 'float32' + multi_step_worker = create_worker(MultiStepWorker, + model_name, + block_size, + num_gpu_blocks, + seed, + model_runner_cls=TP1DraftModelRunner, + dtype=dtype) + multi_step_worker.set_include_gpu_probs_tensor() + worker = create_worker(Worker, + model_name, + block_size, + num_gpu_blocks, + seed, + dtype=dtype) + + prompts = [[0] for _ in range(batch_size)] + # Already generate two tokens for the sequence + # so that we can simulate the bonus token case + multi_step_continuations = [[ + random.randint(0, 1000), + random.randint(0, 1000) + ] for _ in prompts] + final_prompt_lens = [len(prompt) + 2 + num_steps for prompt in prompts] + + seq_ids_with_bonus_token_in_last_step = set(range(batch_size)) + seq_group_metadata_list = create_seq_group_metadata_from_prompts( + prompts, + num_gpu_blocks, + block_size, + continuations=multi_step_continuations, + final_prompt_lens=final_prompt_lens) + + # Run multi-step. + zero_kv_cache(multi_step_worker.cache_engine) + multi_step_worker.sampler_output(execute_model_req=ExecuteModelRequest( + seq_group_metadata_list=seq_group_metadata_list), + sample_len=num_steps, + seq_ids_with_bonus_token_in_last_step= + seq_ids_with_bonus_token_in_last_step) + + # Run single-step repeatedly. + zero_kv_cache(worker.cache_engine) + # Generate the kv cache for the bonus token first + single_step_continuations = [c[:1] for c in multi_step_continuations] + seq_group_metadata_list = create_seq_group_metadata_from_prompts( + prompts, + num_gpu_blocks, + block_size, + continuations=single_step_continuations, + final_prompt_lens=final_prompt_lens) + single_step_output = worker.execute_model( + execute_model_req=ExecuteModelRequest( + seq_group_metadata_list=seq_group_metadata_list)) + for _ in range(num_steps): + seq_group_metadata_list = create_seq_group_metadata_from_prompts( + prompts, + num_gpu_blocks, + block_size, + continuations=multi_step_continuations, + final_prompt_lens=final_prompt_lens) + + single_step_output = worker.execute_model( + execute_model_req=ExecuteModelRequest( + seq_group_metadata_list=seq_group_metadata_list)) + + for i, seq_group_output in enumerate(single_step_output[-1]): + multi_step_continuations[i].append( + seq_group_output.samples[0].output_token) + + # Verify that the KV cache of the single-step and + # multi-step workers are the same. 
+ single_step_gpu_cache = worker.cache_engine[0].gpu_cache + multi_step_gpu_cache = multi_step_worker.cache_engine[0].gpu_cache + num_layers = len(single_step_gpu_cache) + allclose = lambda a, b: torch.allclose( + a.cuda(), b.cuda(), rtol=1e-2, atol=1e-2) + for i in range(num_layers): + assert allclose(single_step_gpu_cache[i][0], + multi_step_gpu_cache[i][0]) + assert allclose(single_step_gpu_cache[i][1], + multi_step_gpu_cache[i][1]) + + @torch.inference_mode() def test_draft_proposals_full_speculation_len(): """Verify Top1Proposer correctly handles case where all sequences diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index 6cf0cfb09b8fa..e5cb0530f9961 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -68,12 +68,14 @@ def create_worker(cls: Callable[..., T], seed: int, is_driver_worker: bool = True, enforce_eager: bool = True, - model_runner_cls: Optional[ModelRunner] = None) -> T: + model_runner_cls: Optional[ModelRunner] = None, + dtype: Optional[str] = "auto") -> T: engine_args = EngineArgs( model=model_name, seed=seed, block_size=block_size, enforce_eager=enforce_eager, + dtype=dtype, ) engine_config = engine_args.create_engine_config() diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index 17cc0ad1a4a3a..6330ac027db74 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -54,6 +54,8 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self.indices_of_seq_with_bonus_tokens = None + def _update_sampling_metadata(self, sampling_metadata, num_seqs, num_queries): @@ -159,6 +161,10 @@ def supports_gpu_multi_step(self, execute_model_req: ExecuteModelRequest): # TODO: Add soft-tuning prompt adapter support return not self.prompt_adapter_config + def set_indices_of_seq_with_bonus_tokens(self, + indices_of_seq_with_bonus_tokens): + self.indices_of_seq_with_bonus_tokens = indices_of_seq_with_bonus_tokens + @torch.inference_mode() def execute_model( self, @@ -284,11 +290,30 @@ def execute_model( model_input.sampling_metadata) # Sample the next token. - outputs.append( - self.model.sample( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - )) + output = self.model.sample( + logits=logits, + sampling_metadata=model_input.sampling_metadata, + ) + outputs.append(output) + + if model_input.attn_metadata.num_prefills == 0 \ + and self.indices_of_seq_with_bonus_tokens is not None: + assert output.sampled_token_ids is not None + # output.sampled_token_ids should be of shape (num_seqs, 1) + nums_seqs, num_tokens_per_seq = output.sampled_token_ids.shape + assert num_tokens_per_seq == 1 + count = 0 + for i in range(nums_seqs): + bonus_seq_idx = self.indices_of_seq_with_bonus_tokens[ + count] + if i != bonus_seq_idx: + # The following might cause a cpu->gpu sync + # However, the performance impact is negligible as we + # benchmarked on H100. 
+ output.sampled_token_ids[ + i, :] = model_input.input_tokens[bonus_seq_idx] + else: + count += 1 # Prepare inputs for the next step if step != num_steps - 1: diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 4b53fbe056c47..f49b98f5c9528 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -81,6 +81,8 @@ def sampler_output( # Here we run the draft_model_runner with multi-step prepare # on the GPU directly expanded_request.num_steps = sample_len + self.model_runner.set_indices_of_seq_with_bonus_tokens( + indices_of_seq_with_bonus_tokens) model_outputs = self.execute_model( execute_model_req=expanded_request) else: @@ -97,7 +99,8 @@ def sampler_output( model_output = model_output[0] self._append_new_tokens( - model_output, expanded_request.seq_group_metadata_list) + model_output, expanded_request.seq_group_metadata_list, + indices_of_seq_with_bonus_tokens) model_outputs.append(model_output) filtered_model_outputs = self._filter_model_output( @@ -221,13 +224,15 @@ def get_spec_proposals( @staticmethod def _append_new_tokens( model_output: List[SamplerOutput], - seq_group_metadata_list: List[SequenceGroupMetadata]) -> None: + seq_group_metadata_list: List[SequenceGroupMetadata], + indices_of_seq_with_bonus_tokens: List[int]) -> None: """Given model output from a single run, append the tokens to the sequences. This is normally done outside of the worker, but it is required if the worker is to perform multiple forward passes. """ - for seq_group_metadata, sequence_group_outputs in zip( - seq_group_metadata_list, model_output): + count = 0 + for index, (seq_group_metadata, sequence_group_outputs) in enumerate( + zip(seq_group_metadata_list, model_output)): seq_group_metadata.is_prompt = False for seq_output in sequence_group_outputs.samples: @@ -237,6 +242,16 @@ def _append_new_tokens( token_id = seq_output.output_token token_logprob = seq_output.logprobs[token_id] + # Determine the actual token ID to be generated, + # considering bonus tokens + if index != indices_of_seq_with_bonus_tokens[count]: + bonus_seq_metadata = seq_group_metadata_list[ + indices_of_seq_with_bonus_tokens[count]] + _, bonus_token_seq_data = next( + iter(bonus_seq_metadata.seq_data.items())) + token_id = bonus_token_seq_data.output_token_ids[-1] + else: + count += 1 seq.append_token_id(token_id, token_logprob.logprob) seq.update_num_computed_tokens(1) From c4cacbaa7faf9d0d3b2aa26e5df496724e80cb05 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 5 Nov 2024 18:19:50 -0800 Subject: [PATCH 14/24] [v1] reduce graph capture time for piecewise cudagraph (#10059) Signed-off-by: youkaichao --- vllm/compilation/backends.py | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index de32cabbe6d07..05deee7bd5473 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -1,7 +1,9 @@ import copy import dataclasses import operator +from contextlib import ExitStack from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union +from unittest.mock import patch import torch import torch.fx as fx @@ -503,17 +505,29 @@ def __call__(self, *args) -> Any: entry.input_addresses = input_addresses cudagraph = torch.cuda.CUDAGraph() - # mind-exploding: carefully manage the reference and memory. 
- with torch.cuda.graph(cudagraph, pool=self.graph_pool): - # `output` is managed by pytorch's cudagraph pool - output = entry.runnable(*args) - if self.is_last_graph: - # by converting it to weak ref, - # the original `output` will immediately be released - # to save memory. It is only safe to do this for - # the last graph, because the output of the last graph - # will not be used by any other cuda graph. - output = weak_ref_tensors(output) + with ExitStack() as stack: + if not self.is_first_graph: + # during every model forward, we will capture + # many pieces of cudagraphs (roughly one per layer). + # running gc again and again across layers will + # make the cudagraph capture very slow. + # therefore, we only run gc for the first graph, + # and disable gc for the rest of the graphs. + stack.enter_context(patch("gc.collect", lambda: None)) + stack.enter_context( + patch("torch.cuda.empty_cache", lambda: None)) + + # mind-exploding: carefully manage the reference and memory. + with torch.cuda.graph(cudagraph, pool=self.graph_pool): + # `output` is managed by pytorch's cudagraph pool + output = entry.runnable(*args) + if self.is_last_graph: + # by converting it to weak ref, + # the original `output` will immediately be released + # to save memory. It is only safe to do this for + # the last graph, because the output of the last graph + # will not be used by any other cuda graph. + output = weak_ref_tensors(output) # here we always use weak ref for the output # to save memory From 82bfc38d079b1ef5f4b88ac7094a00029d2e99af Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 6 Nov 2024 12:05:05 +0800 Subject: [PATCH 15/24] [Misc] Sort the list of embedding models (#10037) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/registry.py | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index af52fbffba19e..792c6cec34ae0 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -94,33 +94,23 @@ _EMBEDDING_MODELS = { # [Text-only] "BertModel": ("bert", "BertEmbeddingModel"), + "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), "Gemma2Model": ("gemma2", "Gemma2EmbeddingModel"), "LlamaModel": ("llama", "LlamaEmbeddingModel"), + **{ + # Multiple models share the same architecture, so we include them all + k: (mod, arch) for k, (mod, arch) in _TEXT_GENERATION_MODELS.items() + if arch == "LlamaForCausalLM" + }, "MistralModel": ("llama", "LlamaEmbeddingModel"), - "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), - "Qwen2ForSequenceClassification": ( - "qwen2_cls", "Qwen2ForSequenceClassification"), - "LlamaForCausalLM": ("llama", "LlamaForCausalLM"), "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"), - "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), + "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), + "Qwen2ForSequenceClassification": ("qwen2_cls", "Qwen2ForSequenceClassification"), # noqa: E501 # [Multimodal] "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), } -def add_embedding_models(base_models, embedding_models): - with_pooler_method_models = {} - embedding_models_name = embedding_models.keys() - for name, (path, arch) in base_models.items(): - if arch in embedding_models_name: - with_pooler_method_models[name] = (path, arch) - return with_pooler_method_models - -_EMBEDDING_MODELS = { - 
**add_embedding_models(_TEXT_GENERATION_MODELS, _EMBEDDING_MODELS), - **_EMBEDDING_MODELS, -} - _MULTIMODAL_MODELS = { # [Decoder-only] "Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"), From ffc0f2b47add6e0f70e2b5d4b4aaac64ee97f8ad Mon Sep 17 00:00:00 2001 From: Peter Salas Date: Tue, 5 Nov 2024 20:19:15 -0800 Subject: [PATCH 16/24] [Model][OpenVINO] Fix regressions from #8346 (#10045) Signed-off-by: Peter Salas --- .buildkite/run-openvino-test.sh | 2 +- vllm/attention/backends/openvino.py | 12 +++++++++++- vllm/model_executor/models/molmo.py | 6 +++--- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh index 70e56596c4a86..35ad5c0ddde77 100755 --- a/.buildkite/run-openvino-test.sh +++ b/.buildkite/run-openvino-test.sh @@ -11,4 +11,4 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and launch offline inference -docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py +docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py diff --git a/vllm/attention/backends/openvino.py b/vllm/attention/backends/openvino.py index 6fddfc2002120..be06d16009988 100644 --- a/vllm/attention/backends/openvino.py +++ b/vllm/attention/backends/openvino.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import List, Tuple, Type +from typing import Dict, List, Optional, Tuple, Type import openvino as ov import torch @@ -7,6 +7,7 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionMetadata) from vllm.attention.backends.utils import CommonAttentionState +from vllm.multimodal import MultiModalPlaceholderMap def copy_cache_block(src_tensor: ov.Tensor, dst_tensor: ov.Tensor, @@ -128,3 +129,12 @@ class OpenVINOAttentionMetadata: # Shape: scalar # Type: i32 max_context_len: torch.Tensor + + # The index maps that relate multi-modal embeddings to the corresponding + # placeholders. + # + # N.B. These aren't really related to attention and don't belong on this + # type -- this is just a temporary solution to make them available to + # `model_executable`. 
+ multi_modal_placeholder_index_maps: Optional[Dict[ + str, MultiModalPlaceholderMap.IndexMap]] diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 07c06149f0206..522aa748f78b6 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -21,8 +21,8 @@ get_tensor_model_parallel_world_size, split_tensor_along_last_dim, tensor_model_parallel_all_gather) -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm @@ -915,7 +915,7 @@ def dummy_data_for_molmo(ctx: InputContext, seq_len: int, if "image_masks" in out: dummy_imgdata["image_masks"] = out["image_masks"] dummy_imgdata["seq_len"] = torch.tensor(seq_len, dtype=torch.long) - return dummy_seqdata, {"image": dummy_imgdata} + return DummyData(dummy_seqdata, {"image": dummy_imgdata}) def pad_images( From 2bcbae704c0d52913c6a2887260fc6bde6c20361 Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Tue, 5 Nov 2024 21:28:29 -0700 Subject: [PATCH 17/24] [Bugfix] Fix edge-case crash when using chat with the Mistral Tekken Tokenizer (#10051) Signed-off-by: Travis Johnson --- tests/models/decoder_only/language/test_mistral.py | 9 ++++++--- vllm/transformers_utils/tokenizers/mistral.py | 8 ++++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py index 5be44c54a717c..6ec4b7e7e3f71 100644 --- a/tests/models/decoder_only/language/test_mistral.py +++ b/tests/models/decoder_only/language/test_mistral.py @@ -10,19 +10,22 @@ MODELS = [ "mistralai/Mistral-7B-Instruct-v0.1", - "mistralai/Mistral-7B-Instruct-v0.3", - # Mistral-Nemo is to big for CI, but passes locally - # "mistralai/Mistral-Nemo-Instruct-2407" ] MISTRAL_FORMAT_MODELS = [ "mistralai/Mistral-7B-Instruct-v0.3", + # uses the v3-Tekken tokenizer + "mistralai/Ministral-8B-Instruct-2410", + # Mistral-Nemo is to big for CI, but passes locally + # "mistralai/Mistral-Nemo-Instruct-2407" ] SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5) SYMBOLIC_LANG_PROMPTS = [ "勇敢な船乗りについての詩を書く", # japanese "寫一首關於勇敢的水手的詩", # chinese + "ပုံပြင်လေးပြောပြပါ်:\n", # burmese + "Repeat the phrase 'URGENCY🌶️':\nURGENCY🌶️\nURGENCY🌶️\n", # see https://github.com/vllm-project/vllm/pull/9625 ] # for function calling diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 896f70bc1dafd..ccffdcc2a4df2 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -254,7 +254,7 @@ def decode(self, skip_special_tokens: bool = True) -> str: assert ( skip_special_tokens - ), "Skipping special tokens is not supported for Mistral tokenizers." + ), "skip_special_tokens=False is not supported for Mistral tokenizers." if isinstance(ids, int): ids = [ids] @@ -268,12 +268,16 @@ def convert_ids_to_tokens( # TODO(Patrick) - potentially allow special tokens to not be skipped assert ( skip_special_tokens - ), "Skipping special tokens is not supported for Mistral tokenizers." + ), "skip_special_tokens=False is not supported for Mistral tokenizers." 
assert isinstance(self.tokenizer, (Tekkenizer, SentencePieceTokenizer)), type( self.tokenizer) + if isinstance(self.tokenizer, Tekkenizer): + # skip special tokens + ids = [i for i in ids if i > self.tokenizer.num_special_tokens] + tokens = [self.tokenizer.id_to_piece(id) for id in ids] if any("�" in t for t in tokens): From ea928f608c44b825d28609460e0d375a5f877940 Mon Sep 17 00:00:00 2001 From: arakowsk-amd <182798202+arakowsk-amd@users.noreply.github.com> Date: Tue, 5 Nov 2024 21:10:40 -0800 Subject: [PATCH 18/24] [Bugfix] Gpt-j-6B patch kv_scale to k_scale path (#10063) Signed-off-by: Alex Rakowski Signed-off-by: Alex Rakowski <182798202+arakowsk-amd@users.noreply.github.com> --- vllm/model_executor/models/gpt_j.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 0451d16b6c738..9a42b359ae44f 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -36,7 +36,8 @@ from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors @@ -308,6 +309,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader(param, loaded_weight, shard_id) break else: + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue From 9d59b755934899b7ec5d7bb5b90d15bfd2302475 Mon Sep 17 00:00:00 2001 From: zifeitong Date: Tue, 5 Nov 2024 21:13:09 -0800 Subject: [PATCH 19/24] [Bugfix] Remove CustomChatCompletionContentPartParam multimodal input type (#10054) Signed-off-by: Zifei Tong --- vllm/entrypoints/chat_utils.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 8da08d4b2c93c..2b339ab6d44e4 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -22,7 +22,6 @@ ChatCompletionToolMessageParam) # yapf: enable # pydantic needs the TypedDict from typing_extensions -from pydantic import ConfigDict from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast from typing_extensions import Required, TypeAlias, TypedDict @@ -52,17 +51,10 @@ class ChatCompletionContentPartAudioParam(TypedDict, total=False): """The type of the content part.""" -class CustomChatCompletionContentPartParam(TypedDict, total=False): - __pydantic_config__ = ConfigDict(extra="allow") # type: ignore - - type: Required[str] - """The type of the content part.""" - - class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): """A simpler version of the param that only accepts a plain image_url. This is supported by OpenAI API, although it is not documented. - + Example: { "image_url": "https://example.com/image.jpg" @@ -73,7 +65,7 @@ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): """A simpler version of the param that only accepts a plain audio_url. 
- + Example: { "audio_url": "https://example.com/audio.mp3" @@ -85,7 +77,6 @@ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): ChatCompletionContentPartParam: TypeAlias = Union[ OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam, ChatCompletionContentPartRefusalParam, - CustomChatCompletionContentPartParam, CustomChatCompletionContentSimpleImageParam, CustomChatCompletionContentSimpleAudioParam, str] From 40899855520eb9497606bdb2b1b4e619233e598a Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 5 Nov 2024 22:16:04 -0800 Subject: [PATCH 20/24] [V1] Integrate Piecewise CUDA graphs (#10058) Signed-off-by: Woosuk Kwon --- vllm/compilation/backends.py | 7 +- vllm/v1/attention/backends/flash_attn.py | 35 ++++--- vllm/v1/worker/gpu_model_runner.py | 127 +++++++++++++++++++---- 3 files changed, 133 insertions(+), 36 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 05deee7bd5473..abd1d16accaf7 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -496,8 +496,11 @@ def __call__(self, *args) -> Any: return entry.runnable(*args) if self.is_first_graph: - logger.info("Capturing a cudagraph for shape %s", - runtime_shape) + # Since we capture cudagraph for many different shapes and + # capturing is fast, we don't need to log it for every shape. + # We only log it in the debug mode. + logger.debug("Capturing a cudagraph for shape %s", + runtime_shape) input_addresses = [ x.data_ptr() for x in args if isinstance(x, torch.Tensor) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index b2af89ebf854a..906f06777a136 100644 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -51,6 +51,7 @@ class FlashAttentionMetadata: # |-------------------- seq_len ---------------------| # |-- query_len ---| + num_actual_tokens: int # Number of tokens excluding padding. max_query_len: int query_start_loc: torch.Tensor max_seq_len: int @@ -134,7 +135,9 @@ def forward( assert k_scale == 1.0 and v_scale == 1.0, ( "key/v_scale is not supported in FlashAttention.") - output = torch.ops.vllm.unified_flash_attention( + output = torch.empty_like(query) + torch.ops.vllm.unified_flash_attention( + output, query, key, value, @@ -154,6 +157,7 @@ def forward( def unified_flash_attention( + output: torch.Tensor, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, @@ -168,17 +172,17 @@ def unified_flash_attention( window_size: Optional[List[int]] = None, alibi_slopes: Optional[torch.Tensor] = None, logits_soft_cap: Optional[float] = None, -) -> torch.Tensor: +) -> None: current_metadata = get_forward_context() if current_metadata is None: # Profiling run. - return torch.empty_like(query) + return assert current_metadata is not None assert isinstance(current_metadata, FlashAttentionMetadata) attn_metadata: FlashAttentionMetadata = current_metadata + num_actual_tokens = attn_metadata.num_actual_tokens - num_tokens, hidden_size = query.shape # Reshape the query, key, and value tensors. 
query = query.view(-1, num_heads, head_size) key = key.view(-1, num_kv_heads, head_size) @@ -188,18 +192,18 @@ def unified_flash_attention( key_cache = kv_cache[0] value_cache = kv_cache[1] torch.ops._C_cache_ops.reshape_and_cache_flash( - key, - value, - kv_cache[0], - kv_cache[1], + key[:num_actual_tokens], + value[:num_actual_tokens], + key_cache, + value_cache, attn_metadata.slot_mapping, kv_cache_dtype, k_scale, v_scale, ) - output = flash_attn_varlen_func( - q=query, + attn_output = flash_attn_varlen_func( + q=query[:num_actual_tokens], k=key_cache, v=value_cache, cu_seqlens_q=attn_metadata.query_start_loc, @@ -213,10 +217,13 @@ def unified_flash_attention( block_table=attn_metadata.block_table, softcap=logits_soft_cap, ) - return output.view(num_tokens, hidden_size) + attn_output = attn_output.view(num_actual_tokens, -1) + # TODO(woosuk): Optimize this. + output[:num_actual_tokens].copy_(attn_output) def unified_flash_attention_fake( + output: torch.Tensor, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, @@ -231,13 +238,13 @@ def unified_flash_attention_fake( window_size: Optional[List[int]] = None, alibi_slopes: Optional[torch.Tensor] = None, logits_soft_cap: Optional[float] = None, -) -> torch.Tensor: - return torch.empty_like(query) +) -> None: + return direct_register_custom_op( op_name="unified_flash_attention", op_func=unified_flash_attention, - mutates_args=["kv_cache"], + mutates_args=["kv_cache", "output"], fake_impl=unified_flash_attention_fake, ) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ae4239f8e1fab..63bf7c2e605a2 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,3 +1,5 @@ +import os +import time from dataclasses import dataclass from typing import TYPE_CHECKING, Dict, List, Optional, Set from unittest.mock import patch @@ -7,11 +9,16 @@ import torch.distributed import torch.nn as nn +from vllm import envs +from vllm.compilation.compile_context import set_compile_context +from vllm.compilation.config import CompilationConfig +from vllm.compilation.levels import CompilationLevel from vllm.config import VllmConfig from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model from vllm.multimodal import MultiModalDataDict +from vllm.plugins import set_compilation_config from vllm.sampling_params import SamplingParams, SamplingType from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, cdiv, is_pin_memory_available) @@ -86,6 +93,18 @@ def __init__( pin_memory=self.pin_memory, ) + self.use_cuda_graph = (envs.VLLM_TORCH_COMPILE_LEVEL + == CompilationLevel.PIECEWISE + and not self.model_config.enforce_eager) + # TODO(woosuk): Provide an option to tune the max cudagraph batch size. + self.cudagraph_batch_sizes = [1, 2, 4] + [i for i in range(8, 513, 8)] + self.input_ids = torch.zeros(self.max_num_tokens, + dtype=torch.int32, + device=self.device) + self.positions = torch.zeros(self.max_num_tokens, + dtype=torch.int64, + device=self.device) + def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Remove stopped requests from the cached states. # Keep the states of the pre-empted requests. 
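The persistent `input_ids`/`positions` buffers above are what make the piecewise CUDA graphs replayable: replay requires each input tensor to sit at the same address it had at capture time (the debug check added to `PiecewiseBackend.__call__` earlier in this series asserts exactly that), so the hunks below copy each step's tokens into the prefix of a preallocated tensor and pass a padded view rather than allocating new inputs. A standalone sketch of the pattern, assuming a 512-token cap for illustration:

import torch

MAX_NUM_TOKENS = 512  # assumed cap; the real value comes from the scheduler config
CUDAGRAPH_BATCH_SIZES = [1, 2, 4] + list(range(8, MAX_NUM_TOKENS + 1, 8))

# allocated once, so capture and replay always see the same address
input_ids = torch.zeros(MAX_NUM_TOKENS, dtype=torch.int32)

def prepare(step_token_ids: torch.Tensor) -> torch.Tensor:
    n = step_token_ids.numel()
    input_ids[:n].copy_(step_token_ids)                       # write in place
    padded = next(s for s in CUDAGRAPH_BATCH_SIZES if s >= n)
    return input_ids[:padded]                                 # padded view, same storage

batch = prepare(torch.arange(5, dtype=torch.int32))
assert batch.shape[0] == 8
assert batch.data_ptr() == input_ids.data_ptr()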
@@ -268,12 +287,16 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): seq_start_loc_np[0] = 0 np.cumsum(seq_lens, out=seq_start_loc_np[1:]) - input_ids = input_ids.to(self.device, non_blocking=True) - positions = positions.to(self.device, non_blocking=True).long() + self.input_ids[:total_num_scheduled_tokens].copy_(input_ids, + non_blocking=True) + self.positions[:total_num_scheduled_tokens].copy_(positions, + non_blocking=True) + query_start_loc = query_start_loc.to(self.device, non_blocking=True) seq_start_loc = seq_start_loc.to(self.device, non_blocking=True) slot_mapping = slot_mapping.to(self.device, non_blocking=True).long() attn_metadata = FlashAttentionMetadata( + num_actual_tokens=total_num_scheduled_tokens, max_query_len=max_num_scheduled_tokens, query_start_loc=query_start_loc, max_seq_len=max_seq_len, @@ -287,7 +310,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # token from the partial request. # TODO: Support prompt logprobs. logits_indices = query_start_loc[1:] - 1 - return input_ids, positions, attn_metadata, logits_indices + return attn_metadata, logits_indices def _prepare_sampling( self, @@ -310,16 +333,26 @@ def execute_model( scheduler_output: "SchedulerOutput", ) -> ModelRunnerOutput: self._update_states(scheduler_output) - inputs = self._prepare_inputs(scheduler_output) - input_ids, positions, attn_metadata, logits_indices = inputs + attn_metadata, logits_indices = self._prepare_inputs(scheduler_output) + num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens + if (self.use_cuda_graph + and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): + # Use piecewise CUDA graphs. + # Add padding to the batch size. + num_input_tokens = self._get_padded_batch_size( + num_scheduled_tokens) + else: + # Eager mode. + num_input_tokens = num_scheduled_tokens with set_forward_context(attn_metadata): hidden_states = self.model( - input_ids=input_ids, - positions=positions, + input_ids=self.input_ids[:num_input_tokens], + positions=self.positions[:num_input_tokens], kv_caches=self.kv_caches, - attn_metadata=attn_metadata, + attn_metadata=None, ) + hidden_states = hidden_states[:num_scheduled_tokens] hidden_states = hidden_states[logits_indices] logits = self.model.compute_logits(hidden_states, None) @@ -371,6 +404,18 @@ def execute_model( return model_runner_output def load_model(self) -> None: + if self.use_cuda_graph: + # FIXME(woosuk): Currently, the custom ops are not supported + # in the piecewise compilation mode. We rely on TorchInductor + # to optimize the model. + os.environ["VLLM_CUSTOM_OPS"] = "none" + set_compilation_config( + CompilationConfig( + use_cudagraph=True, + non_cudagraph_ops=["vllm.unified_flash_attention"], + use_inductor=True, + )) + logger.info("Starting to load model %s...", self.model_config.model) with DeviceMemoryProfiler() as m: # noqa: SIM117 with patch("vllm.model_executor.layers.sampler.Sampler", Sampler): @@ -381,26 +426,61 @@ def load_model(self) -> None: self.model_memory_usage / float(2**30)) def _dummy_run(self, model: nn.Module, num_tokens: int) -> None: - input_ids = torch.zeros(num_tokens, - dtype=torch.int32, - device=self.device) - positions = torch.zeros(num_tokens, - dtype=torch.long, - device=self.device) - kv_caches = [None for _ in range(self.num_attn_layers)] - model(input_ids, positions, kv_caches, attn_metadata=None) - return + # use an empty tensor instead of `None`` to force Dynamo to pass + # it by reference, rather by specializing on the value `None`. 
+ # the `dtype` argument does not matter, and we use `float32` as + # a placeholder (it has wide hardware support). + # it is important to create tensors inside the loop, rather than + # multiplying the list, to avoid Dynamo from treating them as + # tensor aliasing. + dummy_kv_caches = [ + torch.tensor([], dtype=torch.float32, device=self.device) + for _ in range(self.num_attn_layers) + ] + with set_forward_context(None): # noqa: SIM117 + with set_compile_context(self.cudagraph_batch_sizes): + # Trigger compilation for general shape. + model(self.input_ids, + self.positions, + dummy_kv_caches, + attn_metadata=None) @torch.inference_mode() def profile_run(self) -> None: self._dummy_run(self.model, self.max_num_tokens) torch.cuda.synchronize() - return @torch.inference_mode() def capture_model(self) -> None: - # TODO: Implement CUDA graph support. - return + if not self.use_cuda_graph: + logger.warning( + "Skipping CUDA graph capture. Please set " + "VLLM_TORCH_COMPILE_LEVEL=%d to use CUDA graphs.", + CompilationLevel.PIECEWISE) + return + + start_time = time.perf_counter() + start_free_gpu_memory = torch.cuda.mem_get_info()[0] + + with set_forward_context(None): + # Trigger CUDA graph capture for specific shapes. + # Capture the large shapes first so that the smaller shapes + # can reuse the memory pool allocated for the large shapes. + for num_tokens in reversed(self.cudagraph_batch_sizes): + self.model( + self.input_ids[:num_tokens], + self.positions[:num_tokens], + kv_caches=self.kv_caches, + attn_metadata=None, + ) + + end_time = time.perf_counter() + end_free_gpu_memory = torch.cuda.mem_get_info()[0] + elapsed_time = end_time - start_time + cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory + # This usually takes 5~20 seconds. + logger.info("Graph capturing finished in %.0f secs, took %.2f GiB", + elapsed_time, cuda_graph_size / (1 << 30)) def initialize_kv_cache(self, num_blocks: int) -> None: assert len(self.kv_caches) == 0 @@ -412,6 +492,13 @@ def initialize_kv_cache(self, num_blocks: int) -> None: dtype=self.kv_cache_dtype, device=self.device)) + def _get_padded_batch_size(self, batch_size: int) -> Optional[int]: + # TODO: Optimize this? 
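Relating to the TODO above: `cudagraph_batch_sizes` is sorted ascending, so the linear scan that follows could equally be a binary search. A standalone sketch, shown only as an illustration rather than a change to the patch:

from bisect import bisect_left

def padded_batch_size(sizes, batch_size):
    # sizes must be sorted ascending, as cudagraph_batch_sizes is
    i = bisect_left(sizes, batch_size)
    return sizes[i] if i < len(sizes) else None

assert padded_batch_size([1, 2, 4, 8, 16], 5) == 8
assert padded_batch_size([1, 2, 4, 8, 16], 16) == 16
assert padded_batch_size([1, 2, 4, 8, 16], 17) is None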
+ for size in self.cudagraph_batch_sizes: + if batch_size <= size: + return size + return None + @dataclass class CachedRequestState: From 4be3a45158a7fb707973d4b00410e0d2981e6825 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 5 Nov 2024 22:35:03 -0800 Subject: [PATCH 21/24] [distributed] add function to create ipc buffers directly (#10064) Signed-off-by: youkaichao --- .buildkite/test-pipeline.yaml | 1 + tests/distributed/test_ca_buffer_sharing.py | 59 +++++++++++++++++++ .../device_communicators/custom_all_reduce.py | 31 ++++++++++ 3 files changed, 91 insertions(+) create mode 100644 tests/distributed/test_ca_buffer_sharing.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 1eb749f64d36b..3e940549862ea 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -510,6 +510,7 @@ steps: # NOTE: don't test llama model here, it seems hf implementation is buggy # see https://github.com/vllm-project/vllm/pull/5689 for details - pytest -v -s distributed/test_custom_all_reduce.py + - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus - pytest -v -s -x lora/test_mixtral.py diff --git a/tests/distributed/test_ca_buffer_sharing.py b/tests/distributed/test_ca_buffer_sharing.py new file mode 100644 index 0000000000000..fc4043cd3014e --- /dev/null +++ b/tests/distributed/test_ca_buffer_sharing.py @@ -0,0 +1,59 @@ +# can only run on machines with p2p access across GPUs +# can only run with torchrun: +# torchrun --nproc_per_node=2 tests/distributed/test_ca_buffer_sharing.py + +import ctypes + +import torch +import torch.distributed as dist + +from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary +from vllm.distributed.device_communicators.custom_all_reduce import ( # noqa + CustomAllreduce) + +# create a cpu process group for communicating metadata (ipc handle) +dist.init_process_group(backend="gloo") +rank = local_rank = dist.get_rank() +world_size = dist.get_world_size() + +# every process sets its own device (differently) +lib = CudaRTLibrary() +lib.cudaSetDevice(rank) + +buffer_size_in_bytes = 1024 +byte_value = 2 # the value we write to the buffer for verification + +pointers = CustomAllreduce.create_shared_buffer(buffer_size_in_bytes) + +print(f"Rank {rank} has pointers {pointers}") + +dist.barrier() +torch.cuda.synchronize() + +if rank == 0: + # the first rank tries to write to all buffers + for p in pointers: + pointer = ctypes.c_void_p(p) + lib.cudaMemset(pointer, byte_value, buffer_size_in_bytes) + +dist.barrier() +torch.cuda.synchronize() + +host_data = (ctypes.c_char * buffer_size_in_bytes)() + +# all ranks read from all buffers, and check if the data is correct +for p in pointers: + pointer = ctypes.c_void_p(p) + lib.cudaMemcpy(host_data, pointer, buffer_size_in_bytes) + for i in range(buffer_size_in_bytes): + assert ord(host_data[i]) == byte_value, ( + f"Rank {rank} failed" + f" to verify buffer {p}. 
Expected {byte_value}, " + f"got {ord(host_data[i])}") + +print(f"Rank {rank} verified all buffers") + +dist.barrier() +torch.cuda.synchronize() + +CustomAllreduce.free_shared_buffer(pointers) diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index c3632aee6d11a..3b5d92561cf25 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -1,3 +1,4 @@ +import ctypes from contextlib import contextmanager from typing import Any, List, Optional, Union @@ -7,6 +8,7 @@ import vllm.envs as envs from vllm import _custom_ops as ops +from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary from vllm.distributed.device_communicators.custom_all_reduce_utils import ( gpu_p2p_access_check) from vllm.distributed.parallel_state import in_the_same_node_as @@ -174,6 +176,35 @@ def __init__(self, offsets, rank, self.full_nvlink) self.register_buffer(self.buffer) + @staticmethod + def create_shared_buffer( + size_in_bytes: int, + group: Optional[ProcessGroup] = None) -> List[int]: + lib = CudaRTLibrary() + pointer = lib.cudaMalloc(size_in_bytes) + handle = lib.cudaIpcGetMemHandle(pointer) + world_size = dist.get_world_size(group=group) + rank = dist.get_rank(group=group) + handles = [None] * world_size + dist.all_gather_object(handles, handle, group=group) + + pointers: List[int] = [] + for i, h in enumerate(handles): + if i == rank: + pointers.append(pointer.value) # type: ignore + else: + pointers.append( + lib.cudaIpcOpenMemHandle(h).value) # type: ignore + + return pointers + + @staticmethod + def free_shared_buffer(pointers: List[int], + group: Optional[ProcessGroup] = None) -> None: + rank = dist.get_rank(group=group) + lib = CudaRTLibrary() + lib.cudaFree(ctypes.c_void_p(pointers[rank])) + @contextmanager def capture(self): """ From 21063c11c7d340dbb01460e22d98d3619737cd4d Mon Sep 17 00:00:00 2001 From: Aaron Pham Date: Wed, 6 Nov 2024 02:11:55 -0500 Subject: [PATCH 22/24] [CI/Build] drop support for Python 3.8 EOL (#8464) Signed-off-by: Aaron Pham --- .../convert-results-json-to-markdown.py | 10 +-- .../scripts/generate-nightly-markdown.py | 4 +- .../scripts/summary-nightly-results.py | 4 +- .github/workflows/mypy.yaml | 2 +- .github/workflows/publish.yml | 2 +- .github/workflows/ruff.yml | 32 ++++----- .github/workflows/yapf.yml | 26 ++++---- .readthedocs.yaml | 11 ++-- CMakeLists.txt | 36 +++++----- benchmarks/backend_request_func.py | 22 ++----- benchmarks/kernels/benchmark_machete.py | 6 +- csrc/quantization/machete/generate.py | 8 +-- docs/source/getting_started/installation.rst | 10 +-- pyproject.toml | 4 +- setup.py | 9 ++- tests/compile/piecewise/test_toy_llama.py | 4 +- tests/conftest.py | 29 +++----- tests/core/block/test_prefix_caching_block.py | 12 ++-- tests/kernels/test_mamba_ssm.py | 2 +- .../mm_processor_kwargs/test_qwen.py | 2 +- tests/samplers/test_rejection_sampler.py | 10 ++- tests/test_logger.py | 2 +- tests/tokenization/test_detokenize.py | 4 +- tools/profiler/print_layerwise_table.py | 2 +- tools/profiler/visualize_layerwise_profile.py | 2 +- tools/report_build_time_ninja.py | 32 ++++----- use_existing_torch.py | 2 +- .../ops/blocksparse_attention/interface.py | 6 +- vllm/config.py | 7 +- vllm/core/evictor.py | 2 +- .../custom_all_reduce_utils.py | 2 +- vllm/engine/async_llm_engine.py | 2 +- vllm/engine/llm_engine.py | 4 +- vllm/engine/metrics_types.py | 2 +- vllm/engine/output_processor/multi_step.py 
| 2 +- vllm/entrypoints/chat_utils.py | 2 +- vllm/entrypoints/openai/run_batch.py | 2 +- vllm/executor/ray_gpu_executor.py | 2 +- vllm/logger.py | 3 +- vllm/lora/models.py | 4 +- vllm/model_executor/custom_op.py | 2 +- vllm/model_executor/layers/resampler.py | 1 - .../model_executor/layers/rotary_embedding.py | 1 - vllm/model_executor/model_loader/loader.py | 2 +- vllm/model_executor/model_loader/openvino.py | 2 +- .../model_executor/model_loader/tensorizer.py | 5 +- .../model_loader/weight_utils.py | 9 ++- vllm/model_executor/models/arctic.py | 4 +- vllm/model_executor/models/baichuan.py | 1 - vllm/model_executor/models/bloom.py | 1 - vllm/model_executor/models/chatglm.py | 1 - vllm/model_executor/models/commandr.py | 1 - vllm/model_executor/models/dbrx.py | 1 - vllm/model_executor/models/decilm.py | 1 - vllm/model_executor/models/deepseek.py | 1 - vllm/model_executor/models/deepseek_v2.py | 1 - vllm/model_executor/models/exaone.py | 1 - vllm/model_executor/models/falcon.py | 1 - vllm/model_executor/models/fuyu.py | 1 - vllm/model_executor/models/gemma.py | 1 - vllm/model_executor/models/gemma2.py | 1 - .../models/glm4_vision_encoder.py | 1 - vllm/model_executor/models/gpt2.py | 1 - vllm/model_executor/models/gpt_bigcode.py | 1 - vllm/model_executor/models/gpt_j.py | 1 - vllm/model_executor/models/gpt_neox.py | 1 - vllm/model_executor/models/granite.py | 1 - vllm/model_executor/models/granitemoe.py | 1 - .../models/idefics2_vision_model.py | 2 - vllm/model_executor/models/internlm2.py | 1 - vllm/model_executor/models/internlm2_ve.py | 1 - vllm/model_executor/models/jais.py | 1 - vllm/model_executor/models/jamba.py | 1 - vllm/model_executor/models/llama.py | 1 - vllm/model_executor/models/mamba.py | 1 - vllm/model_executor/models/minicpm.py | 1 - vllm/model_executor/models/minicpm3.py | 1 - vllm/model_executor/models/minicpmv.py | 1 - vllm/model_executor/models/mixtral.py | 1 - vllm/model_executor/models/mixtral_quant.py | 1 - vllm/model_executor/models/mllama.py | 1 - vllm/model_executor/models/mlp_speculator.py | 2 +- vllm/model_executor/models/molmo.py | 6 +- vllm/model_executor/models/mpt.py | 1 - vllm/model_executor/models/nemotron.py | 1 - vllm/model_executor/models/olmo.py | 1 - vllm/model_executor/models/opt.py | 1 - vllm/model_executor/models/orion.py | 1 - vllm/model_executor/models/persimmon.py | 1 - vllm/model_executor/models/phi.py | 1 - vllm/model_executor/models/phi3.py | 1 - vllm/model_executor/models/phi3v.py | 1 - vllm/model_executor/models/phimoe.py | 1 - vllm/model_executor/models/pixtral.py | 10 +-- vllm/model_executor/models/qwen.py | 1 - vllm/model_executor/models/qwen2.py | 7 +- vllm/model_executor/models/qwen2_audio.py | 1 - vllm/model_executor/models/qwen2_cls.py | 7 +- vllm/model_executor/models/qwen2_moe.py | 1 - vllm/model_executor/models/qwen2_rm.py | 7 +- vllm/model_executor/models/qwen2_vl.py | 10 ++- vllm/model_executor/models/solar.py | 1 - vllm/model_executor/models/stablelm.py | 1 - vllm/model_executor/models/starcoder2.py | 1 - vllm/model_executor/models/xverse.py | 1 - vllm/multimodal/base.py | 66 +++++++++++-------- vllm/prompt_adapter/utils.py | 17 +++-- vllm/transformers_utils/config.py | 2 +- vllm/transformers_utils/configs/chatglm.py | 1 - vllm/transformers_utils/configs/exaone.py | 1 - vllm/transformers_utils/configs/jais.py | 1 - vllm/transformers_utils/configs/mpt.py | 7 +- vllm/transformers_utils/configs/nemotron.py | 7 +- vllm/transformers_utils/configs/solar.py | 1 - vllm/utils.py | 4 +- 115 files changed, 240 insertions(+), 322 deletions(-) 
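The diffs below drop the Python 3.8 shims (the hand-rolled `remove_prefix` helper, the `sys.version_info` guards around subscripting `UserList`) and apply pyupgrade-style cleanups. As a rough sketch only — the class and function names here are invented for illustration and are not part of this patch — the 3.9+ idioms the updated code leans on look like this:

    # Illustrative only; `IntList` and `strip_sse_prefix` are made-up names,
    # not symbols from this patch.
    from collections import UserList
    from functools import lru_cache


    class IntList(UserList[int]):  # PEP 585: UserList is subscriptable since 3.9
        pass


    @lru_cache  # bare decorator form, no parentheses needed
    def expensive_constant() -> int:
        return 42


    def strip_sse_prefix(chunk: str) -> str:
        # str.removeprefix() is new in Python 3.9 and replaces the old helper.
        return chunk.removeprefix("data: ")


    print(strip_sse_prefix("data: [DONE]"), IntList([1, 2]), expensive_constant())

Because `removeprefix` and parameterized `UserList` do not exist on 3.8, the version bump in `setup.py` and in the CI matrices is a prerequisite for these cleanups.
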
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index f90e464288cf1..7cf05610b9953 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -56,7 +56,7 @@ def read_markdown(file): if os.path.exists(file): - with open(file, "r") as f: + with open(file) as f: return f.read() + "\n" else: return f"{file} not found.\n" @@ -75,14 +75,14 @@ def results_to_json(latency, throughput, serving): # collect results for test_file in results_folder.glob("*.json"): - with open(test_file, "r") as f: + with open(test_file) as f: raw_result = json.loads(f.read()) if "serving" in str(test_file): # this result is generated via `benchmark_serving.py` # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands"), "r") as f: + with open(test_file.with_suffix(".commands")) as f: command = json.loads(f.read()) raw_result.update(command) @@ -97,7 +97,7 @@ def results_to_json(latency, throughput, serving): # this result is generated via `benchmark_latency.py` # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands"), "r") as f: + with open(test_file.with_suffix(".commands")) as f: command = json.loads(f.read()) raw_result.update(command) @@ -119,7 +119,7 @@ def results_to_json(latency, throughput, serving): # this result is generated via `benchmark_throughput.py` # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands"), "r") as f: + with open(test_file.with_suffix(".commands")) as f: command = json.loads(f.read()) raw_result.update(command) diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py index 6059588fe7277..052060c576300 100644 --- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py @@ -72,7 +72,7 @@ def main(args): # collect results for test_file in results_folder.glob("*_nightly_results.json"): - with open(test_file, "r") as f: + with open(test_file) as f: results = results + json.loads(f.read()) # generate markdown table @@ -80,7 +80,7 @@ def main(args): md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False) - with open(args.description, "r") as f: + with open(args.description) as f: description = f.read() description = description.format( diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py index 4e4d4cd4ca3c6..92d6fad73a94c 100644 --- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py @@ -36,11 +36,11 @@ # collect results for test_file in results_folder.glob("*.json"): - with open(test_file, "r") as f: + with open(test_file) as f: raw_result = json.loads(f.read()) # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands"), "r") as f: + with open(test_file.with_suffix(".commands")) as f: command = json.loads(f.read()) raw_result.update(command) diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 18b354948f0cc..28d2e5fb8dbd9 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -25,7 +25,7 @@ jobs: runs-on: 
ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Set up Python ${{ matrix.python-version }} diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index f959a1cacf866..578c3fbd4e816 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -48,7 +48,7 @@ jobs: fail-fast: false matrix: os: ['ubuntu-20.04'] - python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] + python-version: ['3.9', '3.10', '3.11', '3.12'] pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt. cuda-version: ['11.8', '12.1'] diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index 197f918765e7d..edf98ce2fcab0 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -29,19 +29,19 @@ jobs: matrix: python-version: ["3.12"] steps: - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Analysing the code with ruff - run: | - echo "::add-matcher::.github/workflows/matchers/ruff.json" - ruff check --output-format github . - - name: Run isort - run: | - isort . --check-only + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-lint.txt + - name: Analysing the code with ruff + run: | + echo "::add-matcher::.github/workflows/matchers/ruff.json" + ruff check --output-format github . + - name: Run isort + run: | + isort . --check-only diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml index 35579302c5c14..4221c139ccf79 100644 --- a/.github/workflows/yapf.yml +++ b/.github/workflows/yapf.yml @@ -23,16 +23,16 @@ jobs: matrix: python-version: ["3.12"] steps: - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install yapf==0.32.0 - pip install toml==0.10.2 - - name: Running yapf - run: | - yapf --diff --recursive . + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install yapf==0.32.0 + pip install toml==0.10.2 + - name: Running yapf + run: | + yapf --diff --recursive . 
diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 42cbf18a0f712..34735700a224e 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -6,17 +6,16 @@ version: 2 build: os: ubuntu-22.04 tools: - python: "3.8" + python: '3.9' sphinx: - configuration: docs/source/conf.py - fail_on_warning: true + configuration: docs/source/conf.py + fail_on_warning: true # If using Sphinx, optionally build your docs in additional formats such as PDF formats: [] # Optionally declare the Python requirements required to build your docs python: - install: - - requirements: docs/requirements-docs.txt - + install: + - requirements: docs/requirements-docs.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 943424bc4edfa..c372ba98befbf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -128,9 +128,9 @@ endif() if(VLLM_GPU_LANG STREQUAL "CUDA") # - # For cuda we want to be able to control which architectures we compile for on + # For cuda we want to be able to control which architectures we compile for on # a per-file basis in order to cut down on compile time. So here we extract - # the set of architectures we want to compile for and remove the from the + # the set of architectures we want to compile for and remove the from the # CMAKE_CUDA_FLAGS so that they are not applied globally. # clear_cuda_arches(CUDA_ARCH_FLAGS) @@ -138,7 +138,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "CUDA target architectures: ${CUDA_ARCHS}") # Filter the target architectures by the supported supported archs # since for some files we will build for all CUDA_ARCHS. - cuda_archs_loose_intersection(CUDA_ARCHS + cuda_archs_loose_intersection(CUDA_ARCHS "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}") message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}") else() @@ -236,7 +236,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # are not supported by Machete yet. cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS}) if (MARLIN_ARCHS) - set(MARLIN_SRCS + set(MARLIN_SRCS "csrc/quantization/fp8/fp8_marlin.cu" "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu" "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" @@ -277,7 +277,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "in CUDA target architectures") endif() - # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't + # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't # build any 3x kernels set(SCALED_MM_3X_ARCHS) endif() @@ -285,7 +285,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x) # kernels for the remaining archs that are not already built for 3x. - cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS + cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS "7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}") # subtract out the archs that are already built for 3x list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) @@ -316,10 +316,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS) # - # For the Machete kernels we automatically generate sources for various + # For the Machete kernels we automatically generate sources for various # preselected input type pairs and schedules. 
# Generate sources: - set(MACHETE_GEN_SCRIPT + set(MACHETE_GEN_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py) file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH) @@ -329,8 +329,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH} OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH}) execute_process( - COMMAND ${CMAKE_COMMAND} -E env - PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH + COMMAND ${CMAKE_COMMAND} -E env + PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT} RESULT_VARIABLE machete_generation_result OUTPUT_VARIABLE machete_generation_output @@ -340,11 +340,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if (NOT machete_generation_result EQUAL 0) message(FATAL_ERROR "Machete generation failed." - " Result: \"${machete_generation_result}\"" + " Result: \"${machete_generation_result}\"" "\nCheck the log for details: " "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log") else() - set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH} + set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH} CACHE STRING "Last run machete generate script hash" FORCE) message(STATUS "Machete generation completed successfully.") endif() @@ -366,7 +366,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}") else() - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS) message(STATUS "Not building Machete kernels as CUDA Compiler version is " "not >= 12.0, we recommend upgrading to CUDA 12.0 or " @@ -392,8 +392,8 @@ define_gpu_extension_target( USE_SABI 3 WITH_SOABI) -# If CUTLASS is compiled on NVCC >= 12.5, it by default uses -# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the +# If CUTLASS is compiled on NVCC >= 12.5, it by default uses +# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the # driver API. This causes problems when linking with earlier versions of CUDA. # Setting this variable sidesteps the issue by calling the driver directly. target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) @@ -471,9 +471,9 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda") return() endif () -# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target -# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the -# arches in the CUDA case (and instead set the gencodes on a per file basis) +# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target +# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the +# arches in the CUDA case (and instead set the gencodes on a per file basis) # we need to manually set VLLM_GPU_ARCHES here. if(VLLM_GPU_LANG STREQUAL "CUDA") foreach(_ARCH ${CUDA_ARCHS}) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 0a903877f000d..a42e70170ba28 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -79,7 +79,7 @@ async def async_request_tgi( # any data, we should skip it. 
if chunk_bytes.startswith(":"): continue - chunk = remove_prefix(chunk_bytes, "data:") + chunk = chunk_bytes.removeprefix("data:") data = json.loads(chunk) timestamp = time.perf_counter() @@ -144,8 +144,8 @@ async def async_request_trt_llm( if not chunk_bytes: continue - chunk = remove_prefix(chunk_bytes.decode("utf-8"), - "data:") + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data:") data = json.loads(chunk) output.generated_text += data["text_output"] @@ -261,8 +261,8 @@ async def async_request_openai_completions( if not chunk_bytes: continue - chunk = remove_prefix(chunk_bytes.decode("utf-8"), - "data: ") + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data: ") if chunk == "[DONE]": latency = time.perf_counter() - st else: @@ -349,8 +349,8 @@ async def async_request_openai_chat_completions( if not chunk_bytes: continue - chunk = remove_prefix(chunk_bytes.decode("utf-8"), - "data: ") + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data: ") if chunk == "[DONE]": latency = time.perf_counter() - st else: @@ -389,14 +389,6 @@ async def async_request_openai_chat_completions( return output -# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix) -# introduced in Python 3.9 -def remove_prefix(text: str, prefix: str) -> str: - if text.startswith(prefix): - return text[len(prefix):] - return text - - def get_model(pretrained_model_name_or_path: str) -> str: if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true': from modelscope import snapshot_download diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index b70c4b94c97a1..665b50bf18cf0 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -269,10 +269,10 @@ def run_square_bench(args): def run_range_bench(args): - m_start, k_start, n_start = [int(x) for x in args.dim_start.split(",")] - m_end, k_end, n_end = [int(x) for x in args.dim_end.split(",")] + m_start, k_start, n_start = (int(x) for x in args.dim_start.split(",")) + m_end, k_end, n_end = (int(x) for x in args.dim_end.split(",")) m_increment, k_increment, n_increment = \ - [int(x) for x in args.dim_increment.split(",")] + (int(x) for x in args.dim_increment.split(",")) Ms = list(range(m_start, m_end + 1, m_increment)) Ks = list(range(k_start, k_end + 1, k_increment)) Ns = list(range(n_start, n_end + 1, n_increment)) diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index ebbe76cfb944a..d126af1849024 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -468,7 +468,7 @@ def generate(): impl_configs = [] GPTQ_kernel_type_configs = list( - (TypeConfig( + TypeConfig( element_a=element_a, element_b=element_b, element_b_scale=element_a, @@ -476,7 +476,7 @@ def generate(): element_d=element_a, accumulator=DataType.f32, ) for element_b in (VLLMDataType.u4b8, VLLMDataType.u8b128) - for element_a in (DataType.f16, DataType.bf16))) + for element_a in (DataType.f16, DataType.bf16)) GPTQ_kernel_specializations = [ Specialization(with_C=False, with_zeropoints=False, with_scales=True) @@ -490,7 +490,7 @@ def generate(): ] AWQ_kernel_type_configs = list( - (TypeConfig( + TypeConfig( element_a=element_a, element_b=element_b, element_b_scale=element_a, @@ -498,7 +498,7 @@ def generate(): element_d=element_a, accumulator=DataType.f32, ) for element_b in (DataType.u4, DataType.u8) - for element_a in (DataType.f16, DataType.bf16))) + for element_a in (DataType.f16, DataType.bf16)) 
AWQ_kernel_specializations = [ Specialization(with_C=False, with_zeropoints=True, with_scales=True) diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index a706b285edede..61871cdf41125 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -10,7 +10,7 @@ Requirements ============ * OS: Linux -* Python: 3.8 - 3.12 +* Python: 3.9 -- 3.12 * GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) Install released versions @@ -148,7 +148,7 @@ If you want to modify C++ or CUDA code, you'll need to build vLLM from source. T .. tip:: Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results. - For example, you can install `ccache `_ using ``conda install ccache`` or ``apt install ccache`` . + For example, you can install `ccache `_ using ``conda install ccache`` or ``apt install ccache`` . As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster. @@ -181,8 +181,8 @@ to be run simultaneously, via the environment variable ``MAX_JOBS``. For example $ export MAX_JOBS=6 $ pip install -e . -This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default `_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory. -A side effect is a much slower build process. +This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default `_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory. +A side effect is a much slower build process. Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. @@ -209,7 +209,7 @@ Here is a sanity check to verify that the CUDA Toolkit is correctly installed: Unsupported OS build -------------------- -vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. +vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. 
Simply disable the ``VLLM_TARGET_DEVICE`` environment variable before installing: diff --git a/pyproject.toml b/pyproject.toml index 0bbab3cd3fbc3..3562569647391 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ select = [ # Pyflakes "F", # pyupgrade - # "UP", + "UP", # flake8-bugbear "B", # flake8-simplify @@ -55,7 +55,7 @@ ignore = [ ] [tool.mypy] -python_version = "3.8" +python_version = "3.9" ignore_missing_imports = true check_untyped_defs = true diff --git a/setup.py b/setup.py index 8abeb0ba739db..f145a33258d70 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,4 @@ import importlib.util -import io import logging import os import re @@ -327,7 +326,7 @@ def get_neuronxcc_version(): "__init__.py") # Check if the command was executed successfully - with open(version_file, "rt") as fp: + with open(version_file) as fp: content = fp.read() # Extract the version using a regular expression @@ -404,7 +403,8 @@ def read_readme() -> str: """Read the README file if present.""" p = get_path("README.md") if os.path.isfile(p): - return io.open(get_path("README.md"), "r", encoding="utf-8").read() + with open(get_path("README.md"), encoding="utf-8") as f: + return f.read() else: return "" @@ -498,7 +498,6 @@ def _read_requirements(filename: str) -> List[str]: "Documentation": "https://vllm.readthedocs.io/en/latest/", }, classifiers=[ - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", @@ -512,7 +511,7 @@ def _read_requirements(filename: str) -> List[str]: ], packages=find_packages(exclude=("benchmarks", "csrc", "docs", "examples", "tests*")), - python_requires=">=3.8", + python_requires=">=3.9", install_requires=get_requirements(), ext_modules=ext_modules, extras_require={ diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 9c65059c6b348..73fa9e9906936 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -429,8 +429,8 @@ def benchmark(): # print in tabular format print("batch size\teager mode\tfull cudagraph\tpiecewise cudagraph") for b in cudagraph_sizes: - print((f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}" - f"\t{piecewise_cudagraph_time[b]:.3f}")) + print(f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}" + f"\t{piecewise_cudagraph_time[b]:.3f}") if __name__ == "__main__": diff --git a/tests/conftest.py b/tests/conftest.py index bdc6ffb148602..f9dfabc82639b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,5 @@ import json import os -import sys import tempfile from collections import UserList from enum import Enum @@ -52,7 +51,7 @@ def _read_prompts(filename: str) -> List[str]: - with open(filename, "r") as f: + with open(filename) as f: prompts = f.readlines() return prompts @@ -62,14 +61,8 @@ class _ImageAssetPrompts(TypedDict): cherry_blossom: str -if sys.version_info < (3, 9): - # UserList cannot be subscripted - class _ImageAssetsBase(UserList): - pass -else: - - class _ImageAssetsBase(UserList[ImageAsset]): - pass +class _ImageAssetsBase(UserList[ImageAsset]): + pass class _ImageAssets(_ImageAssetsBase): @@ -94,14 +87,8 @@ class _VideoAssetPrompts(TypedDict): sample_demo_1: str -if sys.version_info < (3, 9): - # UserList cannot be subscripted - class _VideoAssetsBase(UserList): - pass -else: - - class _VideoAssetsBase(UserList[VideoAsset]): - pass +class _VideoAssetsBase(UserList[VideoAsset]): + pass class 
_VideoAssets(_VideoAssetsBase): @@ -958,7 +945,7 @@ def dummy_opt_path(): "*.msgpack" ]) assert os.path.exists(json_path) - with open(json_path, "r") as f: + with open(json_path) as f: config = json.load(f) config["architectures"] = ["MyOPTForCausalLM"] with open(json_path, "w") as f: @@ -977,7 +964,7 @@ def dummy_llava_path(): "*.msgpack" ]) assert os.path.exists(json_path) - with open(json_path, "r") as f: + with open(json_path) as f: config = json.load(f) config["architectures"] = ["MyLlava"] with open(json_path, "w") as f: @@ -996,7 +983,7 @@ def dummy_gemma2_embedding_path(): "*.msgpack" ]) assert os.path.exists(json_path) - with open(json_path, "r") as f: + with open(json_path) as f: config = json.load(f) config["architectures"] = ["MyGemma2Embedding"] with open(json_path, "w") as f: diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py index 1a6e17ef7b445..d325b9606843e 100644 --- a/tests/core/block/test_prefix_caching_block.py +++ b/tests/core/block/test_prefix_caching_block.py @@ -99,13 +99,11 @@ def test_blocks_have_correct_hash_in_chain(block_size: int, token_ids = [random.randint(0, 50_000) for _ in range(num_tokens)] - first_chain, second_chain = [ - TestPrefixCachingBlock.create_chain( - block_size=block_size, - token_ids=token_ids, - num_empty_trailing_blocks=num_empty_trailing_blocks) - for _ in range(2) - ] + first_chain, second_chain = (TestPrefixCachingBlock.create_chain( + block_size=block_size, + token_ids=token_ids, + num_empty_trailing_blocks=num_empty_trailing_blocks) + for _ in range(2)) for first_chain_block, second_chain_block in zip( first_chain, second_chain): diff --git a/tests/kernels/test_mamba_ssm.py b/tests/kernels/test_mamba_ssm.py index ad05a97685351..19d1158c79c73 100644 --- a/tests/kernels/test_mamba_ssm.py +++ b/tests/kernels/test_mamba_ssm.py @@ -510,7 +510,7 @@ def test_selective_scan_varlen(with_padding, is_variable_B, is_variable_C, for var in (u_ref, delta_ref, B_ref, C_ref, z_ref) ] for i in range(len(seqlens[0])): - u_s, delta_s, B_s, C_s, z_s = [v[i].unsqueeze(0) for v in splits] + u_s, delta_s, B_s, C_s, z_s = (v[i].unsqueeze(0) for v in splits) if padded_state_indices[i] == PAD_SLOT_ID: continue out_ref_s, _ = selective_scan_ref( diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py index a01651b171d60..6ae8a6a704b0a 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py @@ -104,7 +104,7 @@ def test_input_mapper_valid_mm_data(input_mapper_for_qwen, # Sad path tests for the multimodal input processor and mapper, respectively @pytest.mark.parametrize("mm_data", [ { - "image": torch.rand((5)) + "image": torch.rand(5) }, { "image": torch.rand((5, 5, 5, 5, 5)) diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index a8deab3718be1..f5497976faf7a 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -413,12 +413,10 @@ def __init__(self, vocab_size: int, rejection_sampler: RejectionSampler): def generate_probs_for_test( self, draft_and_target_probs_equal: bool ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - draft_probs, target_probs = [ - F.softmax( - torch.rand(self.vocab_size, dtype=torch.float32), - dim=-1, - ) for _ in range(2) - ] + draft_probs, target_probs = 
(F.softmax( + torch.rand(self.vocab_size, dtype=torch.float32), + dim=-1, + ) for _ in range(2)) num_reference_probs = 100 reference_probs = F.softmax( diff --git a/tests/test_logger.py b/tests/test_logger.py index fadf66f2b61d4..a937b0812ed0c 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -29,7 +29,7 @@ def test_trace_function_call(): cur_dir = os.path.dirname(__file__) enable_trace_function_call(path, cur_dir) f1(1) - with open(path, 'r') as f: + with open(path) as f: content = f.read() assert "f1" in content diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index a3e70a40db979..84348cbc0bced 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -93,10 +93,10 @@ def test_mistral_edge_case(tokenizer, truth): def skip_special_tokens(request, tokenizer_name) -> Generator[bool, Any, None]: if "mistral" in tokenizer_name: yield ( - bool(True) if request.param else + True if request.param else pytest.skip("mistral doesn't support skip_special_tokens=False")) else: - yield bool(True) if request.param else bool(False) + yield bool(request.param) @pytest.mark.parametrize("truth", TRUTH) diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py index bbd24b085e3a7..081076ad7dbdc 100644 --- a/tools/profiler/print_layerwise_table.py +++ b/tools/profiler/print_layerwise_table.py @@ -46,7 +46,7 @@ def get_entries(node, curr_depth=0): args = parser.parse_args() - with open(args.json_trace, "r") as f: + with open(args.json_trace) as f: profile_data = json.load(f) if args.table == "summary": diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py index 65ee3ae108ae1..efd6beee865c2 100644 --- a/tools/profiler/visualize_layerwise_profile.py +++ b/tools/profiler/visualize_layerwise_profile.py @@ -434,7 +434,7 @@ def make_plot_title_suffix(profile_json: dict) -> str: f"{', Sparsity ' + sparsity if sparsity else ''}") profile_json = None - with open(json_trace, "r") as f: + with open(json_trace) as f: profile_json = json.load(f) assert profile_json is not None diff --git a/tools/report_build_time_ninja.py b/tools/report_build_time_ninja.py index 33431a33ac837..51ad2adc74fe1 100644 --- a/tools/report_build_time_ninja.py +++ b/tools/report_build_time_ninja.py @@ -81,7 +81,7 @@ def WeightedDuration(self): # Allow for modest floating-point errors epsilon = 0.000002 if (self.weighted_duration > self.Duration() + epsilon): - print('%s > %s?' % (self.weighted_duration, self.Duration())) + print('{} > {}?'.format(self.weighted_duration, self.Duration())) assert (self.weighted_duration <= self.Duration() + epsilon) return self.weighted_duration @@ -104,7 +104,7 @@ def ReadTargets(log, show_all): The result is a list of Target objects.""" header = log.readline() assert header == '# ninja log v5\n', \ - 'unrecognized ninja log version %r' % header + 'unrecognized ninja log version {!r}'.format(header) targets_dict = {} last_end_seen = 0.0 for line in log: @@ -254,8 +254,8 @@ def SummarizeEntries(entries, extra_step_types): # Warn if the sum of weighted times is off by more than half a second. if abs(length - weighted_total) > 500: print('Warning: Possible corrupt ninja log, results may be ' - 'untrustworthy. Length = %.3f, weighted total = %.3f' % - (length, weighted_total)) + 'untrustworthy. 
Length = {:.3f}, weighted total = {:.3f}'.format( + length, weighted_total)) entries_by_ext = defaultdict(list) for target in entries: @@ -263,16 +263,17 @@ def SummarizeEntries(entries, extra_step_types): entries_by_ext[extension].append(target) for key, values in entries_by_ext.items(): - print(' Longest build steps for %s:' % key) + print(' Longest build steps for {}:'.format(key)) values.sort(key=lambda x: x.WeightedDuration()) for target in values[-long_count:]: - print(' %8.1f weighted s to build %s (%.1f s elapsed time)' % - (target.WeightedDuration(), target.DescribeTargets(), - target.Duration())) - - print(' %.1f s weighted time (%.1f s elapsed time sum, %1.1fx ' - 'parallelism)' % - (length, total_cpu_time, total_cpu_time * 1.0 / length)) + print( + ' {:8.1f} weighted s to build {} ({:.1f} s elapsed time)'. + format(target.WeightedDuration(), target.DescribeTargets(), + target.Duration())) + + print(' {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x ' + 'parallelism)'.format(length, total_cpu_time, + total_cpu_time * 1.0 / length)) print(' %d build steps completed, average of %1.2f/s' % (len(entries), len(entries) / (length))) @@ -298,11 +299,12 @@ def main(): long_ext_count += len(args.step_types.split(';')) try: - with open(log_file, 'r') as log: + with open(log_file) as log: entries = ReadTargets(log, False) SummarizeEntries(entries, args.step_types) - except IOError: - print('Log file %r not found, no build summary created.' % log_file) + except OSError: + print('Log file {!r} not found, no build summary created.'.format( + log_file)) return errno.ENOENT diff --git a/use_existing_torch.py b/use_existing_torch.py index e11746459908b..319d262898fe3 100644 --- a/use_existing_torch.py +++ b/use_existing_torch.py @@ -4,7 +4,7 @@ requires_files += ["pyproject.toml"] for file in requires_files: print(f">>> cleaning {file}") - with open(file, 'r') as f: + with open(file) as f: lines = f.readlines() if "torch" in "".join(lines).lower(): print("removed:") diff --git a/vllm/attention/ops/blocksparse_attention/interface.py b/vllm/attention/ops/blocksparse_attention/interface.py index a98eb431ac7fc..350f88c8f9740 100644 --- a/vllm/attention/ops/blocksparse_attention/interface.py +++ b/vllm/attention/ops/blocksparse_attention/interface.py @@ -192,10 +192,8 @@ def spda(self, q, k, v, cu_seqlens_k, cu_seqlens_q=None, sm_scale=None): attn_mask = self.dense_attn_mask[None, :, :maxlen, :maxlen] q2 = self.transpose_and_pad(q, cu_seqlens, maxlen, 1) - k2, v2 = [ - self.transpose_and_pad(x, cu_seqlens, maxlen, q_k_ratio) - for x in [k, v] - ] + k2, v2 = (self.transpose_and_pad(x, cu_seqlens, maxlen, q_k_ratio) + for x in [k, v]) spda_output = torch.nn.functional.scaled_dot_product_attention( q2, k2, v2, attn_mask=attn_mask, scale=sm_scale) return self.transpose_and_unpad(spda_output, cu_seqlens) diff --git a/vllm/config.py b/vllm/config.py index 814e00c8785f0..851d35dfd9fb0 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -668,9 +668,10 @@ def get_multimodal_config(self) -> "MultiModalConfig": @property def is_encoder_decoder_model(self) -> bool: """Extract the HF encoder/decoder model flag.""" - return getattr(self.hf_config, "is_encoder_decoder", False) or ( - (hasattr(self.hf_config, "text_config") and getattr( - self.hf_config.text_config, "is_encoder_decoder", False))) + return getattr( + self.hf_config, "is_encoder_decoder", + False) or (hasattr(self.hf_config, "text_config") and getattr( + self.hf_config.text_config, "is_encoder_decoder", False)) @property def 
is_multimodal_model(self) -> bool: diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index 0b943e6e65f1c..ed7e06cab2996 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -52,7 +52,7 @@ def num_blocks(self) -> int: pass -class BlockMetaData(): +class BlockMetaData: """Data structure for storing key data describe cached block, so that evitor could use to make its decision which one to choose for eviction diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index 983e772a3f79b..1f78e10cc1dcd 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -240,7 +240,7 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool: if is_distributed: get_world_group().barrier() logger.info("reading GPU P2P access cache from %s", path) - with open(path, "r") as f: + with open(path) as f: cache = json.load(f) _gpu_p2p_access_cache = cache return _gpu_p2p_access_cache[f"{src}->{tgt}"] diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index b0fdc67776bbd..161b85646b6e8 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -812,7 +812,7 @@ async def _engine_abort(self, request_ids: Iterable[str]): async def run_engine_loop(engine_ref: ReferenceType): """We use a weakref to the engine so that the running loop doesn't prevent the engine being garbage collected.""" - engine: Optional["AsyncLLMEngine"] = engine_ref() + engine: Optional[AsyncLLMEngine] = engine_ref() if not engine: return diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index a1809b1a9dd26..404e7ed2c6ef9 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1541,8 +1541,8 @@ def _has_remaining_steps( seq_group.state.remaining_steps != ref_remaining_steps for seq_group in seq_group_metadata_list[1:] ]): - raise AssertionError(("All running sequence groups should " - "have the same remaining steps.")) + raise AssertionError("All running sequence groups should " + "have the same remaining steps.") return ref_remaining_steps > 0 diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index 25b7a7479672a..19dcbfe57d112 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -77,7 +77,7 @@ def __init__(self, local_interval: float) -> None: self.num_generation_tokens: List[int] = [] self.last_local_log = time.time() self.local_interval = local_interval - self.spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None + self.spec_decode_metrics: Optional[SpecDecodeWorkerMetrics] = None @abstractmethod def log(self, stats: Stats) -> None: diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 3ed37a269c4b4..223790806ab18 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -63,7 +63,7 @@ def process_prompt_logprob(self, seq_group: SequenceGroup, single_step_process_prompt_logprob(self, seq_group, output) @staticmethod - @functools.lru_cache() + @functools.lru_cache def _log_prompt_logprob_unsupported_warning_once(): # Reminder: Please update docs/source/serving/compatibility_matrix.rst # If the feature combo become valid diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 2b339ab6d44e4..0ada0aaacda24 100644 --- a/vllm/entrypoints/chat_utils.py +++ 
b/vllm/entrypoints/chat_utils.py @@ -362,7 +362,7 @@ def load_chat_template( if chat_template is None: return None try: - with open(chat_template, "r") as f: + with open(chat_template) as f: resolved_chat_template = f.read() except OSError as e: if isinstance(chat_template, Path): diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index a64467a311523..0d016d949d22b 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -120,7 +120,7 @@ async def read_file(path_or_url: str) -> str: session.get(path_or_url) as resp: return await resp.text() else: - with open(path_or_url, "r", encoding="utf-8") as f: + with open(path_or_url, encoding="utf-8") as f: return f.read() diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 9433dce842b09..66bab2c686c67 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -32,7 +32,7 @@ class RayGPUExecutor(DistributedGPUExecutor): uses_ray: bool = True def _init_executor(self) -> None: - self.forward_dag: Optional["ray.dag.CompiledDAG"] = None + self.forward_dag: Optional[ray.dag.CompiledDAG] = None # If the env var is set, it uses the Ray's compiled DAG API # which optimizes the control plane overhead. # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. diff --git a/vllm/logger.py b/vllm/logger.py index ccf09691a052a..d6fcda02a0fb3 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -67,8 +67,7 @@ def _configure_vllm_root_logger() -> None: raise RuntimeError( "Could not load logging config. File does not exist: %s", VLLM_LOGGING_CONFIG_PATH) - with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8", - mode="r") as file: + with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8") as file: custom_config = json.loads(file.read()) if not isinstance(custom_config, dict): diff --git a/vllm/lora/models.py b/vllm/lora/models.py index d0279f273db7a..81e274612b73b 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -343,7 +343,7 @@ def __init__( # text modules (e.g. ChatGLM) and hasattr(self.model, "get_mm_mapping")) self.packed_modules: Dict[str, List[str]] = {} - self.modules: Dict[str, "BaseLayerWithLoRA"] = {} + self.modules: Dict[str, BaseLayerWithLoRA] = {} # Dict instead of a Set for compatibility with LRUCache. self._last_mapping: Optional[LoRAMapping] = None self._create_lora_modules() @@ -548,7 +548,7 @@ def create_dummy_lora( else: parts = module_name.split(".") replacements = self.packed_modules_mapping[parts[-1]] - subloras: List[Optional["LoRALayerWeights"]] = [] + subloras: List[Optional[LoRALayerWeights]] = [] for i, r in enumerate(replacements): lora = LoRALayerWeights.create_dummy_lora_weights( module_name + "." + r, diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 764f4e9c99df8..bfca15c2b6a3e 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -103,7 +103,7 @@ def enabled(cls) -> bool: # On by default if VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.PIECEWISE # Specifying 'all' or 'none' in VLLM_CUSTOM_OPS takes precedence. 
@staticmethod - @lru_cache() + @lru_cache def default_on() -> bool: count_none = envs.VLLM_CUSTOM_OPS.count("none") count_all = envs.VLLM_CUSTOM_OPS.count("all") diff --git a/vllm/model_executor/layers/resampler.py b/vllm/model_executor/layers/resampler.py index bca44d2bf2e28..aae806f6af323 100644 --- a/vllm/model_executor/layers/resampler.py +++ b/vllm/model_executor/layers/resampler.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 2158ad3339673..ac60e0e6d48a0 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 1f8d531198324..464915248c9ad 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -746,7 +746,7 @@ def __init__(self, load_config: LoadConfig): config_file_path = self._get_config_file(qlora_adapter) - with open(config_file_path, "r") as f: + with open(config_file_path) as f: config = json.load(f) self.target_modules = config["target_modules"] diff --git a/vllm/model_executor/model_loader/openvino.py b/vllm/model_executor/model_loader/openvino.py index 573f2a04895d9..e6299295c85a2 100644 --- a/vllm/model_executor/model_loader/openvino.py +++ b/vllm/model_executor/model_loader/openvino.py @@ -190,7 +190,7 @@ def get_model( kv_cache_dtype: ov.Type, **kwargs, ) -> torch.nn.Module: - lora_config = kwargs.get("lora_config", None) + lora_config = kwargs.get("lora_config") ov_core = kwargs.get("ov_core") if lora_config: raise ValueError( diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 36f33d6d139ee..437d2772e1f28 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -280,7 +280,7 @@ def __init__(self, tensorizer_config: TensorizerConfig, self.tensorizer_args = ( self.tensorizer_config._construct_tensorizer_args()) self.extra_kwargs = extra_kwargs - if extra_kwargs.get("quant_config", None) is not None: + if extra_kwargs.get("quant_config") is not None: self.quant_config = extra_kwargs["quant_config"] else: self.quant_config = quant_config @@ -380,8 +380,7 @@ def tensorizer_weights_iterator( stream = open_stream(tensorizer_args.tensorizer_uri, **stream_params) with TensorDeserializer(stream, **deserializer_args, device="cpu") as state: - for name, param in state.items(): - yield name, param + yield from state.items() del state diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 0c51314bc90df..9488d54edf365 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -188,7 +188,7 @@ def get_quant_config(model_config: ModelConfig, f"{quant_config_files}") quant_config_file = quant_config_files[0] - with open(quant_config_file, "r") as f: + with open(quant_config_file) as f: config = json.load(f) if model_config.quantization == "bitsandbytes": @@ -306,7 
+306,7 @@ def filter_duplicate_safetensors_files(hf_weights_files: List[str], # Iterate through the weight_map (weight_name: safetensors files) # to identify weights that we should use. - with open(index_file_name, "r") as f: + with open(index_file_name) as f: weight_map = json.load(f)["weight_map"] weight_files_in_index = set() for weight_name in weight_map: @@ -382,7 +382,7 @@ def np_cache_weights_iterator( with open(weight_names_file, "w") as f: json.dump(weight_names, f) - with open(weight_names_file, "r") as f: + with open(weight_names_file) as f: weight_names = json.load(f) for name in weight_names: @@ -423,8 +423,7 @@ def pt_weights_iterator( bar_format=_BAR_FORMAT, ): state = torch.load(bin_file, map_location="cpu") - for name, param in state.items(): - yield name, param + yield from state.items() del state torch.cuda.empty_cache() diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index fd29d4ccc59d8..5b712ba83c25a 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -48,7 +48,7 @@ def __init__(self, is_residual_mlp: bool = False, quant_config: Optional[QuantizationConfig] = None, reduce_results: bool = True): - super(ArcticMLP, self).__init__() + super().__init__() self.hidden_size = config.hidden_size self.expert_id = expert_id self.layer_id = layer_id @@ -89,7 +89,7 @@ def __init__(self, params_dtype: Optional[torch.dtype] = None, quant_config: Optional[QuantizationConfig] = None, reduce_results: bool = True): - super(ArcticMoE, self).__init__() + super().__init__() self.tp_size = tp_size or get_tensor_model_parallel_world_size() self.hidden_size = config.hidden_size diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index f2cfdf8ffd30a..1fbf4135add7a 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. # # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 77ab7de6165fb..83ff39a30fbe3 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/bloom/modeling_bloom.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 181f3c2b0fc35..881b86564e811 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/THUDM/GLM-4 """Inference-only ChatGLM model compatible with THUDM weights.""" diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 348e6d20f3297..835682ca3b379 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 Cohere and the HuggingFace Inc. team. All rights reserved. 
# # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index aae7ab7370b74..3e60eee2d8fe2 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -1,4 +1,3 @@ -# coding=utf-8 from typing import Iterable, List, Optional, Tuple, Union import torch diff --git a/vllm/model_executor/models/decilm.py b/vllm/model_executor/models/decilm.py index 7ed2b96e65c49..8c9653463858b 100644 --- a/vllm/model_executor/models/decilm.py +++ b/vllm/model_executor/models/decilm.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 DeciAI Research Team. All rights reserved. diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 5b4db8f258711..d278ea5b6a991 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index d4ad0c6b5c99e..834be78bce87b 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 22f194c776b69..23efe0359cb4a 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/modeling_exaone.py # Copyright 2024 The LG U+ CTO AI Tech Lab. diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index c376347811965..ad07fc3b3776e 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/a5cc30d72ae2dc19af534e4b35c986cc28db1275/src/transformers/models/falcon/modeling_falcon.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 0de590d1d8372..3db82a898159b 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -1,4 +1,3 @@ -# coding=utf-8 # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/fuyu/modeling_fuyu.py # Copyright 2023 The vLLM team. # Copyright 2023 HuggingFace Inc. team. All rights reserved. diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 029178af61da0..fc3f5cb20afb0 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2023 The vLLM team. # Copyright (c) Google Inc. # diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 9238ed839c9de..c365880109ef8 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 The vLLM team. 
# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. # diff --git a/vllm/model_executor/models/glm4_vision_encoder.py b/vllm/model_executor/models/glm4_vision_encoder.py index 3213a8b29a104..025615b0920fd 100644 --- a/vllm/model_executor/models/glm4_vision_encoder.py +++ b/vllm/model_executor/models/glm4_vision_encoder.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/THUDM/GLM-4 """Inference-only GLM-4v model visual encoder compatible with THUDM weights.""" diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 3330d84021368..a06200c4b7e08 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 24c79a8855475..7612ea641d95c 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 9a42b359ae44f..b28a6081b868f 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gptj/modeling_gptj.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 1bccef7a5f173..931052c7cccf0 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index c968817747754..bee48f377e0f5 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 5307bb21adb96..691a6e77c46c4 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index 43f4f29814e6d..53869b8fa6bd8 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -1,5 +1,3 @@ -# coding=utf-8 - # adapted from https://github.com/huggingface/transformers/blob/v4.43.2/src/transformers/models/idefics2/modeling_idefics2.py # Copyright 2024 The vLLM team. # Copyright 2024 the HuggingFace Inc. team. All rights reserved. 
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 313d98b649b48..afefb6cd9fa96 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- from functools import partial from typing import Any, Dict, Iterable, List, Optional, Tuple, Union diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index edd867e4b6457..108fc8382049d 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- from typing import List, Optional, Tuple, Union import torch diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index b947f24a693b5..301893f74cb87 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/inceptionai/jais-30b-chat-v3/blob/main/modeling_jais.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 6f7949c880e61..81d88a47c1941 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -1,4 +1,3 @@ -# coding=utf-8 """Inference-only Jamba model.""" from typing import Iterable, List, Optional, Tuple diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 38a31f420cec9..6c0a8b5ef8451 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index 985ba6f3c60c1..aac4b7aa2661d 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -1,4 +1,3 @@ -# coding=utf-8 """PyTorch MAMBA model.""" from typing import Iterable, List, Optional, Tuple diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 03fb036020f2f..acf03cd8cb8ad 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index 3b5fd95328d74..eeedf55cf3e57 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2024 The ModelBest team. diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index f90df6b7df036..5acd3f65896c7 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 1514243ad59c9..e9b9c4d838faa 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 63e2c60a84271..9647d69be8a0a 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 251bfc079684e..5fa8d19b97fe8 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 the HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index 42ccd01298169..ae218d749fc0b 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -37,7 +37,7 @@ def __init__( eps=1e-06, elementwise_scale_and_shift=True, ): - super(MLPSpeculatorLayerNorm, self).__init__() + super().__init__() self.elementwise_scale_and_shift = elementwise_scale_and_shift if self.elementwise_scale_and_shift: self.weight = nn.Parameter(torch.empty(normalized_shape)) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 522aa748f78b6..785b53670542f 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1121,9 +1121,9 @@ def _merge_multimodal_embeddings( batch_size * num_image * num_patch, -1).contiguous() image_input_idx = image_input_idx * valid.to(image_input_idx.dtype) - offset = torch.cat( - [seq_len.new_zeros( - (1)), seq_len.cumsum(dim=0)[:-1]], dim=0)[:, None] + offset = torch.cat([seq_len.new_zeros(1), + seq_len.cumsum(dim=0)[:-1]], + dim=0)[:, None] image_input_idx = image_input_idx + offset.to(image_input_idx.dtype) image_input_idx = image_input_idx.flatten()[:, None] mat = image_input_idx == torch.arange( diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index ee802030a5ef3..fdd8af79b5470 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main import math from typing import Iterable, List, Optional, Tuple, Union diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 72a09129fed63..b649064536dc2 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 90ab8abcb84b4..dd3f58289a227 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/olmo/modeling_olmo.py # Copyright 2024 The vLLM team. diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 7521ab749e10f..7a76e4a0906db 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 055407587c598..a338a93c2dd9a 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/modeling_orion.py # Copyright (c) OrionStar Inc. diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index fc9ef15db26c0..bd4a9f698bacd 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -1,4 +1,3 @@ -# coding=utf-8 # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/persimmon/modeling_persimmon.py # Copyright 2023 The vLLM team. # Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 4e7935a7636c5..492122450b237 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_phi.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/phi3.py b/vllm/model_executor/models/phi3.py index 02b2ff01c3832..34141511ea791 100644 --- a/vllm/model_executor/models/phi3.py +++ b/vllm/model_executor/models/phi3.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from llama.py """Inference-only Phi3 model code inherit from Llama.py""" diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 5b477a8ed5f49..1c41891ced416 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 The vLLM team. # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. # diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index bb8a9327b4ac8..59843ae3dfd59 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index ee9f150b17cfc..6e9092432467a 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -136,11 +136,11 @@ def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs): if image_token_id not in inputs['prompt_token_ids']: raise ValueError( - (f"You've passed {inputs=} without {image_token_id=}" - " Make sure to process your input via mistral_common's" - " tokenizer or pass a chat completion request. For more" - " For more info, see: " - "https://github.com/vllm-project/vllm/issues/8411.")) + f"You've passed {inputs=} without {image_token_id=}" + " Make sure to process your input via mistral_common's" + " tokenizer or pass a chat completion request. For more" + " For more info, see: " + "https://github.com/vllm-project/vllm/issues/8411.") return inputs diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index b2b5c70182135..3a0e33e8a3eff 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py # Copyright (c) Alibaba Cloud. diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 72b286fe6f6d6..49b3de1304cca 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py # Copyright 2024 The Qwen team. @@ -417,9 +416,9 @@ def __init__( and hasattr(config, "max_window_layers")): raise ValueError("Sliding window for some but all layers is not " "supported. This model uses sliding window " - "but `max_window_layers` = %s is less than " - "`num_hidden_layers` = %s. Please open an issue " - "to discuss this feature." % ( + "but `max_window_layers` = {} is less than " + "`num_hidden_layers` = {}. Please open an issue " + "to discuss this feature.".format( config.max_window_layers, config.num_hidden_layers, )) diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 6114548bda42c..556c09400ee83 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. diff --git a/vllm/model_executor/models/qwen2_cls.py b/vllm/model_executor/models/qwen2_cls.py index 2d6f3e90f761c..b9e3b74c477e2 100644 --- a/vllm/model_executor/models/qwen2_cls.py +++ b/vllm/model_executor/models/qwen2_cls.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py # Copyright 2024 Kakao Corp. (Kanana-X Team) @@ -60,9 +59,9 @@ def __init__( and hasattr(config, "max_window_layers")): raise ValueError("Sliding window for some but all layers is not " "supported. This model uses sliding window " - "but `max_window_layers` = %s is less than " - "`num_hidden_layers` = %s. Please open an issue " - "to discuss this feature." % ( + "but `max_window_layers` = {} is less than " + "`num_hidden_layers` = {}. 
Please open an issue " + "to discuss this feature.".format( config.max_window_layers, config.num_hidden_layers, )) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index dac85e35d369d..98bb48a274e49 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py # Copyright 2024 The Qwen team. diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 901b1daaa14a4..0fbf305da8b94 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py # Copyright 2024 The Qwen team. @@ -71,9 +70,9 @@ def __init__( and hasattr(config, "max_window_layers")): raise ValueError("Sliding window for some but all layers is not " "supported. This model uses sliding window " - "but `max_window_layers` = %s is less than " - "`num_hidden_layers` = %s. Please open an issue " - "to discuss this feature." % ( + "but `max_window_layers` = {} is less than " + "`num_hidden_layers` = {}. Please open an issue " + "to discuss this feature.".format( config.max_window_layers, config.num_hidden_layers, )) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index d801903f8f9fe..e30b84e8dd44c 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/19e6e80e10118f855137b90740936c0b11ac397f/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py # Copyright 2024 The Qwen team. @@ -246,9 +245,8 @@ def forward( q, k, v = dist_utils.split_tensor_along_last_dim(x, 3) batch_size = q.shape[1] - q, k, v = [ - rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v) - ] + q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() + for x in (q, k, v)) if rotary_pos_emb is not None: q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) @@ -258,7 +256,7 @@ def forward( # flash_attn_varlen_func) from flash_attn import flash_attn_varlen_func - q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]] + q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() output = flash_attn_varlen_func(q, @@ -276,7 +274,7 @@ def forward( b=batch_size) elif self.attn_backend == _Backend.TORCH_SDPA: seq_length = q.size(1) - q, k, v = [rearrange(x, "b s h d -> b h s d") for x in [q, k, v]] + q, k, v = (rearrange(x, "b s h d -> b h s d") for x in [q, k, v]) attention_mask = torch.zeros([1, seq_length, seq_length], device=q.device, dtype=torch.bool) diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index e3e7ccb5cf179..1b233ac7427dd 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 083a48588d01a..34389b645a7c1 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team. # All rights reserved. # diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 8f0644bca3e2e..b24c5dadb2b2b 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 BigCode and the HuggingFace Inc. team. All rights reserved. # # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py index 036789642d3c4..e559988ada753 100644 --- a/vllm/model_executor/models/xverse.py +++ b/vllm/model_executor/models/xverse.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/xverse/XVERSE-7B/blob/main/modeling_xverse.py # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 6b10d0c609f13..5ff6f93fb25b4 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -1,4 +1,3 @@ -import sys from abc import ABC, abstractmethod from collections import UserDict, defaultdict from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Mapping, @@ -34,14 +33,9 @@ :meth:`MultiModalInputs.batch`. """ -if sys.version_info < (3, 9): - # UserDict cannot be subscripted - class _MultiModalInputsBase(UserDict): - pass -else: - class _MultiModalInputsBase(UserDict[str, NestedTensors]): - pass +class _MultiModalInputsBase(UserDict[str, NestedTensors]): + pass class MultiModalInputs(_MultiModalInputsBase): @@ -262,18 +256,23 @@ def wrapper(model_cls: N) -> N: logger.warning( "Model class %s already has an input mapper " "registered to %s. It is overwritten by the new one.", - model_cls, self) + model_cls, + self, + ) - self._input_mappers[model_cls] = mapper \ - or self._default_input_mapper + self._input_mappers[model_cls] = (mapper + or self._default_input_mapper) return model_cls return wrapper - def map_input(self, model_config: "ModelConfig", - data: MultiModalData[object], - mm_processor_kwargs: Dict[str, Any]) -> MultiModalInputs: + def map_input( + self, + model_config: "ModelConfig", + data: MultiModalData[object], + mm_processor_kwargs: Dict[str, Any], + ) -> MultiModalInputs: """ Transform the data into a dictionary of model inputs using the input mapper registered for that model. @@ -348,13 +347,15 @@ def wrapper(model_cls: N) -> N: logger.warning( "Model class %s already calculates maximum number of " "tokens in %s. 
It is overwritten by the new one.", - model_cls, self) + model_cls, + self, + ) if isinstance(max_mm_tokens, int): self._validate_max_multimodal_tokens(max_mm_tokens) - self._max_mm_tokens[model_cls] = max_mm_tokens \ - or self._default_max_multimodal_tokens + self._max_mm_tokens[model_cls] = ( + max_mm_tokens or self._default_max_multimodal_tokens) return model_cls @@ -482,8 +483,10 @@ def from_seq_group( placeholder_maps: Dict[str, MultiModalPlaceholderMap] = defaultdict( MultiModalPlaceholderMap) - for modality, placeholders in seq_group.multi_modal_placeholders.items( - ): + for ( + modality, + placeholders, + ) in seq_group.multi_modal_placeholders.items(): mm_items = mm_data.pop(modality) if not isinstance(mm_items, list): mm_items = [mm_items] @@ -499,8 +502,11 @@ def from_seq_group( return mm_data, placeholder_maps def append_items_from_seq_group( - self, positions: range, multi_modal_items: List[_T], - multi_modal_placeholders: List[PlaceholderRange]) -> List[_T]: + self, + positions: range, + multi_modal_items: List[_T], + multi_modal_placeholders: List[PlaceholderRange], + ) -> List[_T]: """ Adds the multi-modal items that intersect ```positions`` to this placeholder map and returns the intersecting items. @@ -515,20 +521,26 @@ def append_items_from_seq_group( multi_modal_items): placeholder = range( placeholder_dict["offset"], - placeholder_dict["offset"] + placeholder_dict["length"]) - intersection = range(max(positions.start, placeholder.start), - min(positions.stop, placeholder.stop)) + placeholder_dict["offset"] + placeholder_dict["length"], + ) + intersection = range( + max(positions.start, placeholder.start), + min(positions.stop, placeholder.stop), + ) if not intersection: # Skip this multi-modal item. continue - token_embedding_range = range(intersection.start - positions.start, - intersection.stop - positions.start) + token_embedding_range = range( + intersection.start - positions.start, + intersection.stop - positions.start, + ) multimodal_embedding_range = range( intersection.start - placeholder.start + self.src_len, - intersection.stop - placeholder.start + self.src_len) + intersection.stop - placeholder.start + self.src_len, + ) intersecting_items.append(mm_item) self.dest_ranges.append(token_embedding_range) diff --git a/vllm/prompt_adapter/utils.py b/vllm/prompt_adapter/utils.py index 4cde2a0254b90..473b87c89c21d 100644 --- a/vllm/prompt_adapter/utils.py +++ b/vllm/prompt_adapter/utils.py @@ -37,9 +37,8 @@ def load_peft_weights(model_id: str, Additional arguments to pass to the `hf_hub_download` method when loading from the HuggingFace Hub. 
""" - path = (os.path.join(model_id, hf_hub_download_kwargs["subfolder"]) - if hf_hub_download_kwargs.get("subfolder", None) is not None else - model_id) + path = (os.path.join(model_id, hf_hub_download_kwargs["subfolder"]) if + hf_hub_download_kwargs.get("subfolder") is not None else model_id) if device is None: device = infer_device() @@ -51,19 +50,19 @@ def load_peft_weights(model_id: str, filename = os.path.join(path, WEIGHTS_NAME) use_safetensors = False else: - token = hf_hub_download_kwargs.get("token", None) + token = hf_hub_download_kwargs.get("token") if token is None: - token = hf_hub_download_kwargs.get("use_auth_token", None) + token = hf_hub_download_kwargs.get("use_auth_token") hub_filename = (os.path.join(hf_hub_download_kwargs["subfolder"], SAFETENSORS_WEIGHTS_NAME) - if hf_hub_download_kwargs.get("subfolder", None) - is not None else SAFETENSORS_WEIGHTS_NAME) + if hf_hub_download_kwargs.get("subfolder") is not None + else SAFETENSORS_WEIGHTS_NAME) has_remote_safetensors_file = file_exists( repo_id=model_id, filename=hub_filename, - revision=hf_hub_download_kwargs.get("revision", None), - repo_type=hf_hub_download_kwargs.get("repo_type", None), + revision=hf_hub_download_kwargs.get("revision"), + repo_type=hf_hub_download_kwargs.get("repo_type"), token=token, ) use_safetensors = has_remote_safetensors_file diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 08697274854e0..1a5870aa4f84c 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -308,7 +308,7 @@ def load_params_config(model, revision) -> PretrainedConfig: config_path = Path( hf_hub_download(model, config_file_name, revision=revision)) - with open(config_path, "r") as file: + with open(config_path) as file: config_dict = json.load(file) config_mapping = { diff --git a/vllm/transformers_utils/configs/chatglm.py b/vllm/transformers_utils/configs/chatglm.py index 49d2b8d8e21b1..e563bf6268d72 100644 --- a/vllm/transformers_utils/configs/chatglm.py +++ b/vllm/transformers_utils/configs/chatglm.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/THUDM/ChatGLM2-6B from transformers import PretrainedConfig diff --git a/vllm/transformers_utils/configs/exaone.py b/vllm/transformers_utils/configs/exaone.py index 805b8ad930039..f60a59f554133 100644 --- a/vllm/transformers_utils/configs/exaone.py +++ b/vllm/transformers_utils/configs/exaone.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copied from # https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/configuration_exaone.py # Copyright 2021 The LG AI Research EXAONE Lab. All rights reserved. diff --git a/vllm/transformers_utils/configs/jais.py b/vllm/transformers_utils/configs/jais.py index b06a946f34a47..82f129eb2018e 100644 --- a/vllm/transformers_utils/configs/jais.py +++ b/vllm/transformers_utils/configs/jais.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # Copyright 2023 Cerebras Systems. 
diff --git a/vllm/transformers_utils/configs/mpt.py b/vllm/transformers_utils/configs/mpt.py index 497db0ae48c96..0f047c8b0361c 100644 --- a/vllm/transformers_utils/configs/mpt.py +++ b/vllm/transformers_utils/configs/mpt.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copied from # https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py """A HuggingFace-style model configuration.""" @@ -117,10 +116,10 @@ def _validate_config(self) -> None: init_config_defaults) if self.d_model % self.n_heads != 0: raise ValueError('d_model must be divisible by n_heads') - if any(( + if any( prob < 0 or prob > 1 for prob in - [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop] - )): + [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop + ]): raise ValueError( "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are " "probabilities and must be between 0 and 1") diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py index 139e6b3cdacbe..93fec667d1cf3 100644 --- a/vllm/transformers_utils/configs/nemotron.py +++ b/vllm/transformers_utils/configs/nemotron.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 HuggingFace Inc. team. All rights reserved. # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # @@ -144,7 +143,7 @@ def __init__( self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - head_dim = head_dim or kwargs.get("kv_channels", None) + head_dim = head_dim or kwargs.get("kv_channels") self.head_dim = head_dim if head_dim is not None else ( hidden_size // num_attention_heads) @@ -160,8 +159,8 @@ def __init__( self.rope_theta = rope_theta self.rope_scaling = rope_scaling # for backward compatibility - partial_rotary_factor = kwargs.get("rope_percent", None) or kwargs.get( - "rope_percentage", None) or partial_rotary_factor + partial_rotary_factor = kwargs.get("rope_percent") or kwargs.get( + "rope_percentage") or partial_rotary_factor self.partial_rotary_factor = partial_rotary_factor self._rope_scaling_validation() self.attention_bias = attention_bias diff --git a/vllm/transformers_utils/configs/solar.py b/vllm/transformers_utils/configs/solar.py index d5113bf01695a..0c1c048f670ee 100644 --- a/vllm/transformers_utils/configs/solar.py +++ b/vllm/transformers_utils/configs/solar.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 
# # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX diff --git a/vllm/utils.py b/vllm/utils.py index 0b75e8761c916..6edc8d72f6bcf 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1153,7 +1153,7 @@ class SortedHelpFormatter(argparse.HelpFormatter): def add_arguments(self, actions): actions = sorted(actions, key=lambda x: x.option_strings) - super(SortedHelpFormatter, self).add_arguments(actions) + super().add_arguments(actions) class FlexibleArgumentParser(argparse.ArgumentParser): @@ -1279,7 +1279,7 @@ def _load_config_file(self, file_path: str) -> List[str]: config: Dict[str, Union[int, str]] = {} try: - with open(file_path, 'r') as config_file: + with open(file_path) as config_file: config = yaml.safe_load(config_file) except Exception as ex: logger.error( From a5fda50a10641e47c0c290907f30ef2add6d4e7a Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Wed, 6 Nov 2024 16:50:37 +0800 Subject: [PATCH 23/24] [CI/Build] Fix large_gpu_mark reason (#10070) Signed-off-by: Isotr0py <2037008807@qq.com> --- tests/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utils.py b/tests/utils.py index 16e21f68c7c96..00c7dabe16a7b 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -699,7 +699,7 @@ def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator: return pytest.mark.skipif( memory_gb < min_gb, - reason=f"Need at least {memory_gb}GB GPU memory to run the test.", + reason=f"Need at least {min_gb}GB GPU memory to run the test.", ) From 8e62377e52c804e368f020e76220a261ebfe300a Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 6 Nov 2024 14:45:06 +0200 Subject: [PATCH 24/24] format.sh --- .../test_lm_eval_correctness.py | 20 ++++++++----------- setup.py | 3 +-- vllm/executor/ray_hpu_executor.py | 2 +- vllm/model_executor/layers/sampler.py | 2 +- vllm/worker/hpu_model_runner.py | 7 +++---- 5 files changed, 14 insertions(+), 20 deletions(-) diff --git a/.jenkins/lm-eval-harness/test_lm_eval_correctness.py b/.jenkins/lm-eval-harness/test_lm_eval_correctness.py index 3df0621f49a72..9a31f59b828a9 100644 --- a/.jenkins/lm-eval-harness/test_lm_eval_correctness.py +++ b/.jenkins/lm-eval-harness/test_lm_eval_correctness.py @@ -76,18 +76,14 @@ def report_performance(task, input_lens, output_lens, time, record_property): context_lens = [i + o for i, o in zip(input_lens, output_lens)] gen_tput = sum(output_lens) / time all_lens = [input_lens, output_lens, context_lens] - min_input_tokens, min_output_tokens, min_context_tokens = [ - min(x) for x in all_lens - ] - max_input_tokens, max_output_tokens, max_context_tokens = [ - max(x) for x in all_lens - ] - mean_input_tokens, mean_output_tokens, mean_context_tokens = [ - statistics.mean(x) for x in all_lens - ] - stddev_input_tokens, stddev_output_tokens, stddev_context_tokens = [ - statistics.stdev(x) for x in all_lens - ] + min_input_tokens, min_output_tokens, min_context_tokens = ( + min(x) for x in all_lens) + max_input_tokens, max_output_tokens, max_context_tokens = ( + max(x) for x in all_lens) + mean_input_tokens, mean_output_tokens, mean_context_tokens = ( + statistics.mean(x) for x in all_lens) + stddev_input_tokens, stddev_output_tokens, stddev_context_tokens = ( + statistics.stdev(x) for x in all_lens) msg = ( f'{task} | estimated average generation throughput: {gen_tput:.2f} tokens/s \n' # noqa: G004, E501 f'{task} | input_tokens | min: {min_input_tokens} | max: {max_input_tokens} | mean: {mean_input_tokens:.2f} | stddev: {stddev_input_tokens:.2f}\n' # noqa: E501 diff --git a/setup.py 
b/setup.py index 17ed12009af0f..5939d9ec89457 100644 --- a/setup.py +++ b/setup.py @@ -382,8 +382,7 @@ def get_gaudi_sw_version(): output = subprocess.run("hl-smi", shell=True, text=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + capture_output=True, env={"ENABLE_CONSOLE": "true"}) if output.returncode == 0 and output.stdout: return output.stdout.split("\n")[2].replace( diff --git a/vllm/executor/ray_hpu_executor.py b/vllm/executor/ray_hpu_executor.py index 58b1447531841..ebfaafd29f92c 100644 --- a/vllm/executor/ray_hpu_executor.py +++ b/vllm/executor/ray_hpu_executor.py @@ -34,7 +34,7 @@ class RayHPUExecutor(DistributedGPUExecutor): uses_ray: bool = True def _init_executor(self) -> None: - self.forward_dag: Optional["ray.dag.CompiledDAG"] = None + self.forward_dag: Optional[ray.dag.CompiledDAG] = None # If the env var is set, it uses the Ray's compiled DAG API # which optimizes the control plane overhead. # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 388a871c40d2e..56ac01680ee45 100755 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -358,7 +358,7 @@ def _get_bin_counts_and_mask( return bin_counts, mask -class ApplyToppTopkScalar(): +class ApplyToppTopkScalar: """ The original implementation of _apply_top_k_top_p is more general as it uses vector topp, topk diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 60a3ba2650377..9e2ee861ebbd4 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -73,8 +73,7 @@ class Singleton(type): def __call__(cls, *args, **kwargs): if cls not in cls._instances: - cls._instances[cls] = super(Singleton, - cls).__call__(*args, **kwargs) + cls._instances[cls] = super().__call__(*args, **kwargs) return cls._instances[cls] @@ -281,7 +280,7 @@ def precompute_indices_and_offsets(block_size, slot_mapping, is_prompt): return indices, offsets -class HpuModelAdapter(): +class HpuModelAdapter: def __init__(self, model, block_size, dtype, enforce_eager): self.model = model @@ -1788,7 +1787,7 @@ def _maybe_wrap_in_hpu_graph(*args, **kwargs): ) if htorch.utils.internal.is_lazy() else HpuModelAdapter(*args, **kwargs) -class HabanaProfilerCounterHelper(): +class HabanaProfilerCounterHelper: def __init__(self): self.niter = 0