From 5952d811398d3a22f30d72d2d2943787a78f66ea Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Tue, 5 Nov 2024 10:50:57 -0500 Subject: [PATCH 01/24] [Frontend] Fix tcp port reservation for api server (#10012) Signed-off-by: Russell Bryant --- vllm/entrypoints/openai/api_server.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index bef36ffdbfcd3..917b347ff1161 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -569,7 +569,8 @@ async def run_server(args, **uvicorn_kwargs) -> None: # This avoids race conditions with ray. # see https://github.com/vllm-project/vllm/issues/8204 sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.bind(("", args.port)) + sock.bind((args.host or "", args.port)) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) def signal_handler(*_) -> None: # Interrupt server on sigterm while initializing @@ -593,13 +594,14 @@ def signal_handler(*_) -> None: ssl_certfile=args.ssl_certfile, ssl_ca_certs=args.ssl_ca_certs, ssl_cert_reqs=args.ssl_cert_reqs, - fd=sock.fileno(), **uvicorn_kwargs, ) # NB: Await server shutdown only after the backend context is exited await shutdown_task + sock.close() + if __name__ == "__main__": # NOTE(simon): From cd34029e91ad2d38a58d190331a65f9096c0b157 Mon Sep 17 00:00:00 2001 From: Richard Liu <39319471+richardsliu@users.noreply.github.com> Date: Tue, 5 Nov 2024 08:48:44 -0800 Subject: [PATCH 02/24] Refactor TPU requirements file and pin build dependencies (#10010) Signed-off-by: Richard Liu --- Dockerfile.tpu | 7 --- .../getting_started/tpu-installation.rst | 57 ++----------------- requirements-tpu.txt | 20 ++++++- 3 files changed, 23 insertions(+), 61 deletions(-) diff --git a/Dockerfile.tpu b/Dockerfile.tpu index b43442e4c0af1..0a507b6ecdf60 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -9,12 +9,6 @@ RUN apt-get update && apt-get install -y \ git \ ffmpeg libsm6 libxext6 libgl1 -# Install the TPU and Pallas dependencies. -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html - # Build vLLM. COPY . . ARG GIT_REPO_CHECK=0 @@ -25,7 +19,6 @@ ENV VLLM_TARGET_DEVICE="tpu" RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=bind,source=.git,target=.git \ python3 -m pip install \ - 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ -r requirements-tpu.txt RUN python3 setup.py develop diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst index f0c812b941c1f..75ab2b6ba02dc 100644 --- a/docs/source/getting_started/tpu-installation.rst +++ b/docs/source/getting_started/tpu-installation.rst @@ -119,27 +119,19 @@ Uninstall the existing `torch` and `torch_xla` packages: pip uninstall torch torch-xla -y -Install `torch` and `torch_xla` +Install build dependencies: .. 
code-block:: bash - pip install --pre torch==2.6.0.dev20241028+cpu torchvision==0.20.0.dev20241028+cpu --index-url https://download.pytorch.org/whl/nightly/cpu - pip install 'torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev-cp310-cp310-linux_x86_64.whl' -f https://storage.googleapis.com/libtpu-releases/index.html + pip install -r requirements-tpu.txt + sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev -Install JAX and Pallas: +Run the setup script: .. code-block:: bash - pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html - pip install jaxlib==0.4.32.dev20240829 jax==0.4.32.dev20240829 -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html - -Install other build dependencies: + VLLM_TARGET_DEVICE="tpu" python setup.py develop -.. code-block:: bash - - pip install -r requirements-tpu.txt - VLLM_TARGET_DEVICE="tpu" python setup.py develop - sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev Provision Cloud TPUs with GKE ----------------------------- @@ -168,45 +160,6 @@ Run the Docker image with the following command: $ # Make sure to add `--privileged --net host --shm-size=16G`. $ docker run --privileged --net host --shm-size=16G -it vllm-tpu - -.. _build_from_source_tpu: - -Build from source ------------------ - -You can also build and install the TPU backend from source. - -First, install the dependencies: - -.. code-block:: console - - $ # (Recommended) Create a new conda environment. - $ conda create -n myenv python=3.10 -y - $ conda activate myenv - - $ # Clean up the existing torch and torch-xla packages. - $ pip uninstall torch torch-xla -y - - $ # Install PyTorch and PyTorch XLA. - $ export DATE="20241017" - $ export TORCH_VERSION="2.6.0" - $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl - $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl - - $ # Install JAX and Pallas. - $ pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html - $ pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html - - $ # Install other build dependencies. - $ pip install -r requirements-tpu.txt - - -Next, build vLLM from source. This will only take a few seconds: - -.. code-block:: console - - $ VLLM_TARGET_DEVICE="tpu" python setup.py develop - .. note:: Since TPU relies on XLA which requires static shapes, vLLM bucketizes the possible input shapes and compiles an XLA graph for each different shape. diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 4c606cf0a9105..f9a0770804e55 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -2,6 +2,22 @@ -r requirements-common.txt # Dependencies for TPU -# Currently, the TPU backend uses a nightly version of PyTorch XLA. -# You can install the dependencies in Dockerfile.tpu. 
+cmake>=3.26 +ninja +packaging +setuptools-scm>=8 +wheel +jinja2 ray[default] + +# Install torch_xla +--pre +--extra-index-url https://download.pytorch.org/whl/nightly/cpu +--find-links https://storage.googleapis.com/libtpu-releases/index.html +--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html +--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html +torch==2.6.0.dev20241028+cpu +torchvision==0.20.0.dev20241028+cpu +torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241028-cp310-cp310-linux_x86_64.whl +jaxlib==0.4.32.dev20240829 +jax==0.4.32.dev20240829 From 09d3550372db10f8c75fddd437325a863265fd82 Mon Sep 17 00:00:00 2001 From: "Chenghao (Alan) Yang" Date: Tue, 5 Nov 2024 11:50:50 -0600 Subject: [PATCH 03/24] [Misc] Add logging for CUDA memory (#10027) Signed-off-by: Chenghao Yang Signed-off-by: youkaichao Co-authored-by: Chenghao Yang Co-authored-by: youkaichao --- vllm/worker/model_runner.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 328dab598f8ef..2447eecf7957d 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -48,9 +48,10 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import IntermediateTensors, SequenceGroupMetadata from vllm.transformers_utils.config import uses_mrope -from vllm.utils import (DeviceMemoryProfiler, PyObjectCache, async_tensor_h2d, - flatten_2d_lists, is_pin_memory_available, - supports_dynamo, weak_ref_tensor) +from vllm.utils import (DeviceMemoryProfiler, GiB_bytes, PyObjectCache, + async_tensor_h2d, flatten_2d_lists, + is_pin_memory_available, supports_dynamo, + weak_ref_tensor) from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase, _add_attn_metadata_broadcastable_dict, @@ -1383,16 +1384,16 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: per sequence in the batch. """ assert not self.model_config.enforce_eager - logger.info("Capturing the model for CUDA graphs. This may lead to " + logger.info("Capturing cudagraphs for decoding. This may lead to " "unexpected consequences if the model is not static. To " "run the model in eager mode, set 'enforce_eager=True' or " "use '--enforce-eager' in the CLI.") - logger.info("CUDA graphs can take additional 1~3 GiB memory per GPU. " - "If you are running out of memory, consider decreasing " - "`gpu_memory_utilization` or enforcing eager mode. " - "You can also reduce the `max_num_seqs` as needed " - "to decrease memory usage.") + logger.info("If out-of-memory error occurs during cudagraph capture," + " consider decreasing `gpu_memory_utilization` or " + "switching to eager mode. You can also reduce the " + "`max_num_seqs` as needed to decrease memory usage.") start_time = time.perf_counter() + start_free_gpu_memory = torch.cuda.mem_get_info()[0] # Prepare dummy inputs. These will be reused for all batch sizes. max_batch_size = self.max_batchsize_to_capture @@ -1497,9 +1498,12 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: graph_runner) end_time = time.perf_counter() + end_free_gpu_memory = torch.cuda.mem_get_info()[0] elapsed_time = end_time - start_time + cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory # This usually takes < 10 seconds. 
- logger.info("Graph capturing finished in %.0f secs.", elapsed_time) + logger.info("Graph capturing finished in %.0f secs, took %.2f GiB", + elapsed_time, cuda_graph_size / GiB_bytes) def _update_inputs_to_capture_for_enc_dec_model(self, capture_inputs: Dict[str, From 731aec5be713a89dccf1d7106290da17621af816 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Tue, 5 Nov 2024 13:30:42 -0500 Subject: [PATCH 04/24] [CI/Build] Limit github CI jobs based on files changed (#9928) Signed-off-by: Russell Bryant --- .github/workflows/actionlint.yml | 2 ++ .github/workflows/clang-format.yml | 12 ++++++++++++ .github/workflows/mypy.yaml | 10 ++++++++++ .github/workflows/ruff.yml | 17 +++++++++++++---- .github/workflows/yapf.yml | 9 ++++++++- 5 files changed, 45 insertions(+), 5 deletions(-) diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index b80749aaa8fec..5eddf6b7c649b 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -6,12 +6,14 @@ on: paths: - '.github/workflows/*.ya?ml' - '.github/workflows/actionlint.*' + - '.github/workflows/matchers/actionlint.json' pull_request: branches: - "main" paths: - '.github/workflows/*.ya?ml' - '.github/workflows/actionlint.*' + - '.github/workflows/matchers/actionlint.json' env: LC_ALL: en_US.UTF-8 diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml index 68d60d7365ed1..167c115d8956f 100644 --- a/.github/workflows/clang-format.yml +++ b/.github/workflows/clang-format.yml @@ -6,9 +6,21 @@ on: push: branches: - main + paths: + - '**/*.h' + - '**/*.cpp' + - '**/*.cu' + - '**/*.cuh' + - '.github/workflows/clang-format.yml' pull_request: branches: - main + paths: + - '**/*.h' + - '**/*.cpp' + - '**/*.cu' + - '**/*.cuh' + - '.github/workflows/clang-format.yml' jobs: clang-format: diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 5f1e5f8eeaf7d..18b354948f0cc 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -6,9 +6,19 @@ on: push: branches: - main + paths: + - '**/*.py' + - '.github/workflows/mypy.yaml' + - 'tools/mypy.sh' + - 'pyproject.toml' pull_request: branches: - main + paths: + - '**/*.py' + - '.github/workflows/mypy.yaml' + - 'tools/mypy.sh' + - 'pyproject.toml' jobs: mypy: diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index 9cc8a9e914474..197f918765e7d 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -6,16 +6,28 @@ on: push: branches: - main + paths: + - "**/*.py" + - pyproject.toml + - requirements-lint.txt + - .github/workflows/matchers/ruff.json + - .github/workflows/ruff.yml pull_request: branches: - main + paths: + - "**/*.py" + - pyproject.toml + - requirements-lint.txt + - .github/workflows/matchers/ruff.json + - .github/workflows/ruff.yml jobs: ruff: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.12"] steps: - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Set up Python ${{ matrix.python-version }} @@ -30,9 +42,6 @@ jobs: run: | echo "::add-matcher::.github/workflows/matchers/ruff.json" ruff check --output-format github . - - name: Spelling check with codespell - run: | - codespell --toml pyproject.toml - name: Run isort run: | isort . 
--check-only diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml index 9f06b35c19e32..35579302c5c14 100644 --- a/.github/workflows/yapf.yml +++ b/.github/workflows/yapf.yml @@ -6,15 +6,22 @@ on: push: branches: - main + paths: + - "**/*.py" + - .github/workflows/yapf.yml pull_request: branches: - main + paths: + - "**/*.py" + - .github/workflows/yapf.yml + jobs: yapf: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.12"] steps: - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Set up Python ${{ matrix.python-version }} From a53046b16fd11436eb2b15421079b7c5b353f955 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 5 Nov 2024 13:42:20 -0500 Subject: [PATCH 05/24] [Model] Support quantization of PixtralHFTransformer for PixtralHF (#9921) Signed-off-by: mgoin --- vllm/model_executor/layers/activation.py | 30 +++++++ vllm/model_executor/models/pixtral.py | 100 ++++++++++++++--------- 2 files changed, 90 insertions(+), 40 deletions(-) diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 658a3700f33d6..e347ca80ff765 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -299,3 +299,33 @@ def get_act_fn( return ScaledActivation(act_fn, intermediate_size, input_is_parallel, params_dtype) return act_fn + + +_ACTIVATION_AND_MUL_REGISTRY = LazyDict({ + "gelu": lambda: GeluAndMul(), + "silu": lambda: SiluAndMul(), +}) + + +def get_act_and_mul_fn( + act_fn_name: str, + quant_config: Optional[QuantizationConfig] = None, + intermediate_size: Optional[int] = None, + input_is_parallel: bool = True, + params_dtype: Optional[torch.dtype] = None, +) -> nn.Module: + """Get an activation-and-mul (i.e. 
SiluAndMul) function by name.""" + act_fn_name = act_fn_name.lower() + if act_fn_name not in _ACTIVATION_AND_MUL_REGISTRY: + raise ValueError( + f"Activation function {act_fn_name!r} is not supported.") + + act_fn = _ACTIVATION_AND_MUL_REGISTRY[act_fn_name] + if (quant_config is not None + and act_fn_name in quant_config.get_scaled_act_names()): + if intermediate_size is None: + raise ValueError("intermediate_size must be specified for scaled " + "activation functions.") + return ScaledActivation(act_fn, intermediate_size, input_is_parallel, + params_dtype) + return act_fn diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 051454c49bff8..ee9f150b17cfc 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -19,8 +19,11 @@ from vllm.config import CacheConfig, ModelConfig, MultiModalConfig from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) -from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.activation import get_act_and_mul_fn from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -798,20 +801,24 @@ def __init__( super().__init__() assert config.intermediate_size is not None - # TODO: Use quant_config and prefix after optimizing this - self.gate_proj = nn.Linear(config.hidden_size, - config.intermediate_size, - bias=False) - self.up_proj = nn.Linear(config.hidden_size, - config.intermediate_size, - bias=False) - self.down_proj = nn.Linear(config.intermediate_size, - config.hidden_size, - bias=False) - self.act = get_act_fn(config.hidden_act) + self.gate_up_proj = MergedColumnParallelLinear( + input_size=config.hidden_size, + output_sizes=[config.intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") + self.down_proj = RowParallelLinear(input_size=config.intermediate_size, + output_size=config.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.down_proj") + self.act_and_mul = get_act_and_mul_fn(config.hidden_act) def forward(self, x: torch.Tensor) -> torch.Tensor: - return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x)) + gate_up, _ = self.gate_up_proj(x) + x = self.act_and_mul(gate_up) + x, _ = self.down_proj(x) + return x class PixtralHFAttention(nn.Module): @@ -830,21 +837,21 @@ def __init__( self.n_heads = config.num_attention_heads self.head_dim = config.hidden_size // config.num_attention_heads - self.scale = self.head_dim**-0.5 - - # TODO: Use quant_config and prefix after optimizing this - self.q_proj = nn.Linear(config.hidden_size, - config.hidden_size, - bias=False) - self.k_proj = nn.Linear(config.hidden_size, - config.hidden_size, - bias=False) - self.v_proj = nn.Linear(config.hidden_size, - config.hidden_size, - bias=False) - self.o_proj = nn.Linear(config.hidden_size, - config.hidden_size, - bias=False) + self.qkv_proj = QKVParallelLinear( + hidden_size=config.hidden_size, + head_size=self.head_dim, + total_num_heads=self.n_heads, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + self.o_proj = RowParallelLinear( + input_size=config.hidden_size, + 
output_size=config.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) def forward( self, @@ -854,13 +861,13 @@ def forward( ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: batch, patches, _ = hidden_states.size() - q = self.q_proj(hidden_states) - k = self.k_proj(hidden_states) - v = self.v_proj(hidden_states) + qkv_states, _ = self.qkv_proj(hidden_states) + q, k, v = qkv_states.chunk(3, dim=-1) # Transpose q and k to apply HF's Rotary Position Embedding q = q.view(batch, patches, self.n_heads, self.head_dim).transpose(1, 2) k = k.view(batch, patches, self.n_heads, self.head_dim).transpose(1, 2) + v = v.view(batch, patches, self.n_heads, self.head_dim) cos, sin = position_embeddings q, k = apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=0) @@ -868,22 +875,21 @@ def forward( # Transpose q and k back for attention q = q.transpose(1, 2).contiguous() k = k.transpose(1, 2).contiguous() - v = v.reshape(batch, patches, self.n_heads, self.head_dim) out = xops.memory_efficient_attention(q, k, v, attn_bias=attention_mask) else: - v = v.reshape(batch, patches, self.n_heads, - self.head_dim).transpose(1, 2) + v = v.transpose(1, 2) out = nn.functional.scaled_dot_product_attention( q, k, v, attn_mask=attention_mask) out = out.transpose(1, 2) - out = out.reshape(batch, patches, self.n_heads * self.head_dim) + out = out.view(batch, patches, self.n_heads * self.head_dim) + attn_output, _ = self.o_proj(out) - return self.o_proj(out) + return attn_output, None class PixtralHFTransformerBlock(nn.Module): @@ -912,9 +918,9 @@ def forward( attention_mask: torch.Tensor, position_embeddings: torch.Tensor, ) -> torch.Tensor: - r = self.attention.forward(self.attention_norm(hidden_states), - attention_mask=attention_mask, - position_embeddings=position_embeddings) + r, _ = self.attention.forward(self.attention_norm(hidden_states), + attention_mask=attention_mask, + position_embeddings=position_embeddings) h = hidden_states + r r = self.feed_forward.forward(self.ffn_norm(h)) out = h + r @@ -1053,10 +1059,24 @@ def forward( # (TODO) Add prefix argument for filtering out weights to be loaded # ref: https://github.com/vllm-project/vllm/pull/7186#discussion_r1734163986 def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - stacked_params_mapping = [] + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", ".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + (".gate_up_proj", ".gate_proj", 0), + (".gate_up_proj", ".up_proj", 1), + ] params_dict = dict(self.named_parameters()) + layer_count = len(self.transformer.layers) for name, loaded_weight in weights: + # omit layers when num_hidden_layers_override is set + if name.startswith("transformer.layers"): + layer_idx = int(name.split(".")[2]) + if layer_idx >= layer_count: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue From d2e80332a7cedcfd23ec705b109c5fa3ad94fcc0 Mon Sep 17 00:00:00 2001 From: lkchen Date: Tue, 5 Nov 2024 11:30:02 -0800 Subject: [PATCH 06/24] [Feature] Update benchmark_throughput.py to support image input (#9851) Signed-off-by: Linkun Chen Co-authored-by: Linkun Chen --- benchmarks/README.md | 11 ++++ benchmarks/benchmark_throughput.py | 82 +++++++++++++++++++++++------- 2 files changed, 75 insertions(+), 18 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 192d6c4022c83..2aa4a285021f1 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md 
@@ -6,3 +6,14 @@ You can download the dataset by running: ```bash wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json ``` + +## Downloading the ShareGPT4V dataset + +The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts +will ignore a datapoint if the referred image is missing. +```bash +wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json +mkdir coco -p +wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip +unzip coco/train2017.zip -d coco/ +``` diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 262b8652e49ff..159cf055737ce 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -8,6 +8,7 @@ import torch import uvloop +from PIL import Image from tqdm import tqdm from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) @@ -38,12 +39,33 @@ class SampleRequest: multi_modal_data: Optional[MultiModalDataDict] = None -def sample_requests( - dataset_path: str, - num_requests: int, - tokenizer: PreTrainedTokenizerBase, - fixed_output_len: Optional[int], -) -> List[SampleRequest]: +def _get_prompt_for_image_model(question: str, *, model: str) -> str: + """Prepend and append special tokens around the question to form a prompt. + + Args: + question: The input question text to wrap with special tokens + model: The name of the model being used, to determine which special + tokens to add + + Returns: + The formatted prompt string with appropriate special tokens for the + model + + Raises: + ValueError: If an unsupported model name is provided + """ + model = model.lower() + if "pixtral" in model: + return f"[INST]{question}\n[IMG][/INST]" + raise ValueError(f"Unsupported model {model}") + + +def sample_requests(tokenizer: PreTrainedTokenizerBase, + args: argparse.Namespace) -> List[SampleRequest]: + dataset_path: str = args.dataset + num_requests: int = args.num_prompts + fixed_output_len: Optional[int] = args.output_len + model: str = args.model if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") @@ -52,23 +74,36 @@ def sample_requests( dataset = json.load(f) # Filter out the conversations with less than 2 turns. dataset = [data for data in dataset if len(data["conversations"]) >= 2] - # Only keep the first two turns of each conversation. - dataset = [(data["conversations"][0]["value"], - data["conversations"][1]["value"]) for data in dataset] - # Shuffle the dataset. random.shuffle(dataset) # Filter out sequences that are too long or too short filtered_dataset: List[SampleRequest] = [] - for i in range(len(dataset)): + for data in dataset: if len(filtered_dataset) == num_requests: break + # Only keep the first two turns of each conversation. + prompt = data["conversations"][0]["value"] + completion = data["conversations"][1]["value"] + + multi_modal_data: Optional[MultiModalDataDict] = None + if "image" in data: + multi_modal_data = multi_modal_data or {} + image_path = data["image"] + # TODO(vllm-project/vllm/issues/9778): Support multiple images. 
+ assert isinstance(image_path, + str), "Only support single image input" + try: + multi_modal_data["image"] = Image.open(image_path).convert( + "RGB") + except FileNotFoundError: + # Ignore datapoint where asset is missing + continue + prompt = _get_prompt_for_image_model(question=prompt, model=model) + # Tokenize the prompts and completions. - prompt = dataset[i][0] prompt_token_ids = tokenizer(prompt).input_ids - completion = dataset[i][1] completion_token_ids = tokenizer(completion).input_ids prompt_len = len(prompt_token_ids) output_len = len(completion_token_ids @@ -82,7 +117,8 @@ def sample_requests( filtered_dataset.append( SampleRequest(prompt=prompt, prompt_len=prompt_len, - expected_output_len=output_len)) + expected_output_len=output_len, + multi_modal_data=multi_modal_data)) return filtered_dataset @@ -99,7 +135,9 @@ def run_vllm( prompts: List[TextPrompt] = [] sampling_params: List[SamplingParams] = [] for request in requests: - prompts.append(TextPrompt(prompt=request.prompt)) + prompts.append( + TextPrompt(prompt=request.prompt, + multi_modal_data=request.multi_modal_data)) sampling_params.append( SamplingParams( n=n, @@ -148,7 +186,9 @@ async def run_vllm_async( prompts: List[TextPrompt] = [] sampling_params: List[SamplingParams] = [] for request in requests: - prompts.append(TextPrompt(prompt=request.prompt)) + prompts.append( + TextPrompt(prompt=request.prompt, + multi_modal_data=request.multi_modal_data)) sampling_params.append( SamplingParams( n=n, @@ -272,9 +312,10 @@ def main(args: argparse.Namespace): for _ in range(args.num_prompts) ] else: - requests = sample_requests(args.dataset, args.num_prompts, tokenizer, - args.output_len) + requests = sample_requests(tokenizer, args) + is_multi_modal = any(request.multi_modal_data is not None + for request in requests) if args.backend == "vllm": if args.async_engine: elapsed_time = uvloop.run( @@ -300,6 +341,11 @@ def main(args: argparse.Namespace): for request in requests) total_output_tokens = sum(request.expected_output_len for request in requests) + if is_multi_modal: + print("\033[91mWARNING\033[0m: Multi-modal request detected. The " + "following metrics are not accurate because image tokens are not" + " counted. See vllm-project/vllm/issues/9778 for details.") + # TODO(vllm-project/vllm/issues/9778): Count molti-modal token length. print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " f"{total_output_tokens / elapsed_time:.2f} output tokens/s") From b9c64c0ca79ccdea608f337fbb7e4b0c75fe3aac Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 6 Nov 2024 03:40:08 +0800 Subject: [PATCH 07/24] [Misc] Modify BNB parameter name (#9997) Signed-off-by: Jee Jee Li --- .../layers/quantization/bitsandbytes.py | 9 +++++---- vllm/model_executor/layers/resampler.py | 2 +- vllm/model_executor/model_loader/loader.py | 14 +++++--------- 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 718967a065192..78965d7b9495c 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -203,8 +203,9 @@ def create_qweight_for_4bit(): qweight = create_qweight_for_8bit() else: qweight = create_qweight_for_4bit() - - layer.register_parameter("qweight", qweight) + # Enable parameters to have the same name as in the BNB + # checkpoint format. 
+ layer.register_parameter("weight", qweight) set_weight_attrs(qweight, extra_weight_attrs) def apply(self, @@ -234,7 +235,7 @@ def _apply_8bit_weight( reshape_after_matmul = True bf_x = x.to(torch.bfloat16) - qweight = layer.qweight + qweight = layer.weight offsets = qweight.bnb_shard_offsets quant_states = qweight.bnb_quant_state matmul_states = qweight.matmul_state @@ -313,7 +314,7 @@ def _apply_4bit_weight( reshape_after_matmul = True bf_x = x.to(torch.bfloat16) - qweight = layer.qweight + qweight = layer.weight quant_states = qweight.bnb_quant_state offsets = qweight.bnb_shard_offsets diff --git a/vllm/model_executor/layers/resampler.py b/vllm/model_executor/layers/resampler.py index bce91f1d7fd5e..bca44d2bf2e28 100644 --- a/vllm/model_executor/layers/resampler.py +++ b/vllm/model_executor/layers/resampler.py @@ -177,7 +177,7 @@ def __init__(self, embed_dim, bias=False, quant_config=quant_config, - prefix=prefix) + prefix=f"{prefix}.kv_proj") else: # Maintain the same return value with ReplicatedLinear.forward self.kv_proj = lambda *args, **kwargs: ( # type: ignore # noqa diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index c3e0290f270ae..1f8d531198324 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -892,7 +892,7 @@ def _quantized_8bit_generator(self, hf_weights_files, use_safetensors, if not weight_name.lower().endswith(".scb"): continue - weight_key = weight_name.lower().replace(".scb", ".qweight") + weight_key = weight_name.lower().replace(".scb", ".weight") quant_state_dict[weight_key] = weight_tensor for weight_name, weight_tensor in self._hf_weight_iter( @@ -901,11 +901,9 @@ def _quantized_8bit_generator(self, hf_weights_files, use_safetensors, if self._is_8bit_weight_name(weight_name): continue - qweight_name = weight_name.replace(".weight", ".qweight") - - if qweight_name in quant_state_dict: + if weight_name in quant_state_dict: set_weight_attrs(weight_tensor, {"load_in_8bit": True}) - yield qweight_name, weight_tensor + yield weight_name, weight_tensor else: yield weight_name, weight_tensor @@ -950,9 +948,8 @@ def _parse_quant_state(param_name: str, (f"{weight_name}.quant_state.bitsandbytes__fp4" \ in temp_state_dict): quant_state = _parse_quant_state(weight_name, temp_state_dict) - weight_name = weight_name.replace(".weight", ".qweight") quant_state_dict[weight_name] = quant_state - yield weight_name.replace(".weight", ".qweight"), weight_tensor + yield weight_name, weight_tensor else: yield weight_name, weight_tensor @@ -967,7 +964,6 @@ def _unquantized_generator(self, hf_weights_files, use_safetensors, if any(target_module in weight_name for target_module in self.target_modules) and weight_name.endswith(".weight"): - weight_name = weight_name.replace(".weight", ".qweight") # Without sharding if any( weight_name.startswith(module) @@ -1093,7 +1089,7 @@ def _load_weights(self, model_config: ModelConfig, # Some models, such as MiniCPM V2.5/2.6, contain both # module names 'kv_proj' and 'qkv_proj'. 
To prevent 'kv_proj' # from being incorrectly identified as being present in - # 'vpm.encoder.layers.0.self_attn.qkv_proj.qweight + # 'vpm.encoder.layers.0.self_attn.qkv_proj.weight if shard_pos > 0 and quant_param_name[shard_pos - 1] == ".": shard_index = index quant_param_name = quant_param_name.replace( From 02462465ea1c45163fde632fb94e0e4939ee8a59 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 5 Nov 2024 16:02:23 -0500 Subject: [PATCH 08/24] [CI] Prune tests/models/decoder_only/language/* tests (#9940) Signed-off-by: mgoin --- .buildkite/test-pipeline.yaml | 3 +- .../decoder_only/language/test_big_models.py | 93 ------------------- .../models/decoder_only/language/test_fp8.py | 10 +- .../decoder_only/language/test_gptq_marlin.py | 13 --- .../language/test_gptq_marlin_24.py | 12 +-- .../decoder_only/language/test_marlin.py | 69 -------------- .../decoder_only/language/test_mistral.py | 37 ++++---- .../decoder_only/language/test_models.py | 69 +++++++------- .../models/decoder_only/language/test_qwen.py | 34 ------- 9 files changed, 70 insertions(+), 270 deletions(-) delete mode 100644 tests/models/decoder_only/language/test_big_models.py delete mode 100644 tests/models/decoder_only/language/test_marlin.py delete mode 100644 tests/models/decoder_only/language/test_qwen.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 9444dc43ea97e..1eb749f64d36b 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -321,7 +321,6 @@ steps: - tests/models/decoder_only/language commands: - pytest -v -s models/decoder_only/language/test_models.py - - pytest -v -s models/decoder_only/language/test_big_models.py - label: Decoder-only Language Models Test (Extended) # 1h20min nightly: true @@ -329,7 +328,7 @@ steps: - vllm/ - tests/models/decoder_only/language commands: - - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py --ignore=models/decoder_only/language/test_big_models.py + - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py - label: Decoder-only Multi-Modal Models Test (Standard) #mirror_hardwares: [amd] diff --git a/tests/models/decoder_only/language/test_big_models.py b/tests/models/decoder_only/language/test_big_models.py deleted file mode 100644 index fcfc159e4f5a0..0000000000000 --- a/tests/models/decoder_only/language/test_big_models.py +++ /dev/null @@ -1,93 +0,0 @@ -"""Compare the outputs of HF and vLLM when using greedy sampling. - -This tests bigger models and use half precision. - -Run `pytest tests/models/test_big_models.py`. 
-""" -import pytest - -from vllm.platforms import current_platform - -from ...utils import check_logprobs_close, check_outputs_equal - -MODELS = [ - "meta-llama/Llama-2-7b-hf", - # "mistralai/Mistral-7B-v0.1", # Tested by test_mistral.py - # "Deci/DeciLM-7b", # Broken - # "tiiuae/falcon-7b", # Broken - "EleutherAI/gpt-j-6b", - # "mosaicml/mpt-7b", # Broken - # "Qwen/Qwen1.5-0.5B" # Broken, -] - -if not current_platform.is_cpu(): - MODELS += [ - # fused_moe which not supported on CPU - "openbmb/MiniCPM3-4B", - # Head size isn't supported on CPU - "h2oai/h2o-danube3-4b-base", - ] - -# TODO: remove this after CPU float16 support ready -target_dtype = "float" if current_platform.is_cpu() else "half" - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [32]) -def test_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - - if model == "openbmb/MiniCPM3-4B": - # the output becomes slightly different when upgrading to - # pytorch 2.5 . Changing to logprobs checks instead of exact - # output checks. - NUM_LOG_PROBS = 8 - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy_logprobs_limit( - example_prompts, max_tokens, NUM_LOG_PROBS) - - with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: - vllm_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) - - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - else: - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, - max_tokens) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", [target_dtype]) -def test_model_print( - vllm_runner, - model: str, - dtype: str, -) -> None: - with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) diff --git a/tests/models/decoder_only/language/test_fp8.py b/tests/models/decoder_only/language/test_fp8.py index 5a947ce62c785..f874bf6c73142 100644 --- a/tests/models/decoder_only/language/test_fp8.py +++ b/tests/models/decoder_only/language/test_fp8.py @@ -21,11 +21,11 @@ "kv_cache_dtype,base_model,test_model,scale_path", [ # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors. - ("fp8_e4m3", "meta-llama/Meta-Llama-3-8B-Instruct", - "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", None), + ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct", + "nm-testing/Llama-3.2-1B-Instruct-FP8-KV", None), # Test FP16 checkpoint w. fp8_e5m2 kv-cache. - ("fp8_e5m2", "meta-llama/Meta-Llama-3-8B-Instruct", - "meta-llama/Meta-Llama-3-8B-Instruct", None), + ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct", + "meta-llama/Llama-3.2-1B-Instruct", None), # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json. 
("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf", "meta-llama/Llama-2-7b-chat-hf", @@ -33,7 +33,7 @@ ]) # Due to low-precision numerical divergence, we only test logprob of 4 tokens @pytest.mark.parametrize("max_tokens", [4]) -@pytest.mark.parametrize("enforce_eager", [False, True]) +@pytest.mark.parametrize("enforce_eager", [True]) @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"]) # NOTE: Increasing this in this suite will fail CI because we currently cannot # reset distributed env properly. Use a value > 1 just when you test. diff --git a/tests/models/decoder_only/language/test_gptq_marlin.py b/tests/models/decoder_only/language/test_gptq_marlin.py index 2155e83dbe915..a896f145c11f1 100644 --- a/tests/models/decoder_only/language/test_gptq_marlin.py +++ b/tests/models/decoder_only/language/test_gptq_marlin.py @@ -22,24 +22,11 @@ MAX_MODEL_LEN = 1024 MODELS = [ - # act_order==False, group_size=channelwise - ("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"), - # act_order==False, group_size=128 - ("TheBloke/Llama-2-7B-GPTQ", "main"), - # act_order==True, group_size=128 ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"), - # act_order==True, group_size=64 - ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-64g-actorder_True"), - # act_order==True, group_size=32 - ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-32g-actorder_True"), # 8-bit, act_order==True, group_size=channelwise ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"), - # 8-bit, act_order==True, group_size=128 - ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-128g-actorder_True"), - # 8-bit, act_order==True, group_size=32 - ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-32g-actorder_True"), # 4-bit, act_order==True, group_size=128 ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main") diff --git a/tests/models/decoder_only/language/test_gptq_marlin_24.py b/tests/models/decoder_only/language/test_gptq_marlin_24.py index d65be05f141b4..aa63f9f36a3a8 100644 --- a/tests/models/decoder_only/language/test_gptq_marlin_24.py +++ b/tests/models/decoder_only/language/test_gptq_marlin_24.py @@ -25,16 +25,16 @@ class ModelPair: # 4-bit, group_size == 128 ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128", model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128"), - # 4-bit, group_size == channelwise - ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise", - model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"), + # # 4-bit, group_size == channelwise + # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise", + # model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"), # 8-bit, group_size == 128 ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128", model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128"), - # 8-bit, group_size == channelwise - ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise", - model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"), + # # 8-bit, group_size == channelwise + # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise", + # model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"), ] diff --git a/tests/models/decoder_only/language/test_marlin.py b/tests/models/decoder_only/language/test_marlin.py deleted file mode 100644 index c802346dee8af..0000000000000 --- a/tests/models/decoder_only/language/test_marlin.py +++ /dev/null @@ -1,69 +0,0 @@ -"""Compare the outputs of a GPTQ model to a Marlin model. 
- -Note: GPTQ and Marlin do not have bitwise correctness. -As a result, in this test, we just confirm that the top selected tokens of the -Marlin/GPTQ models are in the top 3 selections of each other. - -Note: Marlin internally uses locks to synchronize the threads. This can -result in very slight nondeterminism for Marlin. As a result, we re-run the test -up to 3 times to see if we pass. - -Run `pytest tests/models/test_marlin.py`. -""" -from dataclasses import dataclass - -import pytest - -from tests.quantization.utils import is_quant_method_supported - -from ...utils import check_logprobs_close - - -@dataclass -class ModelPair: - model_marlin: str - model_gptq: str - - -model_pairs = [ - ModelPair(model_marlin="nm-testing/zephyr-beta-7b-marlin-g128", - model_gptq="nm-testing/zephyr-beta-7b-gptq-g128"), - ModelPair(model_marlin="robertgshaw2/zephyr-7b-beta-channelwise-marlin", - model_gptq="robertgshaw2/zephyr-7b-beta-channelwise-gptq"), - ModelPair(model_marlin="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", - model_gptq="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-gptq") -] - - -@pytest.mark.flaky(reruns=2) -@pytest.mark.skipif(not is_quant_method_supported("marlin"), - reason="Marlin is not supported on this GPU type.") -@pytest.mark.parametrize("model_pair", model_pairs) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models( - vllm_runner, - example_prompts, - model_pair: ModelPair, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: - with vllm_runner(model_pair.model_marlin, - dtype=dtype, - quantization="marlin") as marlin_model: - marlin_outputs = marlin_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - - with vllm_runner(model_pair.model_gptq, dtype=dtype, - quantization="gptq") as gptq_model: - gptq_outputs = gptq_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - - check_logprobs_close( - outputs_0_lst=gptq_outputs, - outputs_1_lst=marlin_outputs, - name_0="gptq", - name_1="marlin", - ) diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py index 174b905d9cbb9..5be44c54a717c 100644 --- a/tests/models/decoder_only/language/test_mistral.py +++ b/tests/models/decoder_only/language/test_mistral.py @@ -4,7 +4,7 @@ """ import pytest -from vllm import LLM, SamplingParams +from vllm import SamplingParams from ...utils import check_logprobs_close @@ -15,6 +15,10 @@ # "mistralai/Mistral-Nemo-Instruct-2407" ] +MISTRAL_FORMAT_MODELS = [ + "mistralai/Mistral-7B-Instruct-v0.3", +] + SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5) SYMBOLIC_LANG_PROMPTS = [ "勇敢な船乗りについての詩を書く", # japanese @@ -95,7 +99,7 @@ def test_models( ) -@pytest.mark.parametrize("model", MODELS[1:]) +@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) @@ -135,28 +139,29 @@ def test_mistral_format( ) -@pytest.mark.parametrize("model", MODELS[1:]) +@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("prompt", SYMBOLIC_LANG_PROMPTS) def test_mistral_symbolic_languages( + vllm_runner, model: str, dtype: str, - prompt: str, ) -> None: - prompt = "hi" - msg = {"role": "user", "content": prompt} - llm = LLM(model=model, - dtype=dtype, - 
max_model_len=8192, - tokenizer_mode="mistral", - config_format="mistral", - load_format="mistral") - outputs = llm.chat([msg], sampling_params=SAMPLING_PARAMS) - assert "�" not in outputs[0].outputs[0].text.strip() + with vllm_runner(model, + dtype=dtype, + max_model_len=8192, + tokenizer_mode="mistral", + config_format="mistral", + load_format="mistral") as vllm_model: + for prompt in SYMBOLIC_LANG_PROMPTS: + msg = {"role": "user", "content": prompt} + outputs = vllm_model.model.chat([msg], + sampling_params=SAMPLING_PARAMS) + assert "�" not in outputs[0].outputs[0].text.strip() @pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("model", MODELS[1:]) # v1 can't do func calling +@pytest.mark.parametrize("model", + MISTRAL_FORMAT_MODELS) # v1 can't do func calling def test_mistral_function_calling( vllm_runner, model: str, diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index 68055cbe29095..05117666f8c3f 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -7,25 +7,39 @@ """ import pytest -from ...utils import check_outputs_equal +from vllm.platforms import current_platform + +from ...utils import check_logprobs_close MODELS = [ - "facebook/opt-125m", - "gpt2", - "bigcode/tiny_starcoder_py", - "EleutherAI/pythia-70m", - "bigscience/bloom-560m", # Testing alibi slopes. - "microsoft/phi-2", - "stabilityai/stablelm-3b-4e1t", - # "allenai/OLMo-1B", # Broken - "bigcode/starcoder2-3b", - "google/gemma-1.1-2b-it", + "facebook/opt-125m", # opt + "openai-community/gpt2", # gpt2 + # "Milos/slovak-gpt-j-405M", # gptj + # "bigcode/tiny_starcoder_py", # gpt_bigcode + # "EleutherAI/pythia-70m", # gpt_neox + "bigscience/bloom-560m", # bloom - testing alibi slopes + "microsoft/phi-2", # phi + # "stabilityai/stablelm-3b-4e1t", # stablelm + # "bigcode/starcoder2-3b", # starcoder2 + "google/gemma-1.1-2b-it", # gemma + "Qwen/Qwen2.5-0.5B-Instruct", # qwen2 + "meta-llama/Llama-3.2-1B-Instruct", # llama ] +if not current_platform.is_cpu(): + MODELS += [ + # fused_moe which not supported on CPU + "openbmb/MiniCPM3-4B", + ] + +# TODO: remove this after CPU float16 support ready +target_dtype = "float" if current_platform.is_cpu() else "half" + @pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [96]) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [5]) def test_models( hf_runner, vllm_runner, @@ -33,33 +47,24 @@ def test_models( model: str, dtype: str, max_tokens: int, + num_logprobs: int, ) -> None: - # To pass the small model tests, we need full precision. - assert dtype == "float" with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + hf_outputs = hf_model.generate_greedy_logprobs_limit( + example_prompts, max_tokens, num_logprobs) with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + vllm_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. 
+ model_runner.model) - check_outputs_equal( + check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_outputs, name_0="hf", name_1="vllm", ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -def test_model_print( - vllm_runner, - model: str, - dtype: str, -) -> None: - with vllm_runner(model, dtype=dtype) as vllm_model: - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) diff --git a/tests/models/decoder_only/language/test_qwen.py b/tests/models/decoder_only/language/test_qwen.py deleted file mode 100644 index 128fe65afbb84..0000000000000 --- a/tests/models/decoder_only/language/test_qwen.py +++ /dev/null @@ -1,34 +0,0 @@ -"""Ensure that a text-only Qwen model can be run without throwing an error. -We explicitly test this because Qwen is implemented as a multimodal and -supports a visual encoder for models like Qwen-VL. -""" -from typing import List, Type - -import pytest - -from ....conftest import VllmRunner - -models = [ - "Qwen/Qwen-7B-Chat" # Has no visual encoder -] - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_text_only_qwen_model_can_be_loaded_and_run( - vllm_runner: Type[VllmRunner], - example_prompts: List[str], - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, -): - with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_model.generate_greedy_logprobs( - example_prompts, - max_tokens, - num_logprobs=num_logprobs, - ) From 235366fe2eb3144321978e181af94487f0215595 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 5 Nov 2024 16:02:32 -0500 Subject: [PATCH 09/24] [CI] Prune back the number of tests in tests/kernels/* (#9932) Signed-off-by: mgoin --- tests/kernels/test_activation.py | 2 +- tests/kernels/test_attention.py | 2 +- tests/kernels/test_awq_marlin.py | 16 ++++++----- tests/kernels/test_blocksparse_attention.py | 6 ++--- tests/kernels/test_cache.py | 2 +- tests/kernels/test_cutlass.py | 30 ++++++++++++++++----- tests/kernels/test_int8_quant.py | 7 +++-- tests/kernels/test_marlin_gemm.py | 2 +- tests/kernels/test_moe.py | 23 +++++++++------- tests/kernels/test_pos_encoding.py | 6 ++--- 10 files changed, 60 insertions(+), 36 deletions(-) diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index 057a11746014c..a84501f9c303f 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -14,7 +14,7 @@ DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing -D = [512, 4096, 5120, 13824] # Arbitrary values for testing +D = [512, 13824] # Arbitrary values for testing SEEDS = [0] CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 4ecd0fc1a21ad..3e3c0668198ad 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -33,7 +33,7 @@ # FlashAttention forward only supports head dimension at most 128 # https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62 -HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256] +HEAD_SIZES = [64, 80, 120, 256] BLOCK_SIZES = [16, 32] USE_ALIBI = [False, True] 
diff --git a/tests/kernels/test_awq_marlin.py b/tests/kernels/test_awq_marlin.py index 59917dd2c58ad..238d6426bf099 100644 --- a/tests/kernels/test_awq_marlin.py +++ b/tests/kernels/test_awq_marlin.py @@ -14,13 +14,17 @@ awq_marlin_quantize) from vllm.scalar_type import scalar_types +NUM_EXPERTS = [8, 64] +TOP_KS = [2, 6] +GROUP_SIZES = [-1, 32, 128] -@pytest.mark.parametrize("m", [64, 512, 222, 33, 1]) -@pytest.mark.parametrize("n", [128, 2048, 256, 1024]) -@pytest.mark.parametrize("k", [128, 1024, 512]) -@pytest.mark.parametrize("e", [8, 64]) -@pytest.mark.parametrize("topk", [2, 6]) -@pytest.mark.parametrize("group_size", [-1, 32, 64, 128]) + +@pytest.mark.parametrize("m", [1, 33, 64, 222]) +@pytest.mark.parametrize("n", [128, 2048]) +@pytest.mark.parametrize("k", [128, 1024]) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("group_size", GROUP_SIZES) @pytest.mark.skipif(not (ops.supports_moe_ops and hasattr(torch.ops._moe_C, "marlin_gemm_moe")), reason="Marlin is not supported on this GPU type.") diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py index fb601852dd523..fad342d1b5923 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/test_blocksparse_attention.py @@ -25,10 +25,10 @@ DTYPES = [torch.half, torch.bfloat16] NUM_GEN_SEQS = [3] # Arbitrary values for testing NUM_PREFILL_SEQS = [3] # Arbitrary values for testing -NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing +NUM_HEADS = [(40, 40)] # Arbitrary values for testing HEAD_SIZES = [64, 112] -BLOCK_SIZES = [16, 32] +BLOCK_SIZES = [16] USE_ALIBI = [False, True] KV_CACHE_DTYPE = ["auto", "fp8"] SEEDS = [0] @@ -37,7 +37,7 @@ BLOCKSPARSE_VERT_STRIDES = [8] BLOCKSPARSE_BLOCK_SIZES = [64] -BLOCKSPARSE_HEADS_SLIDINGS = [0, 2, -1] +BLOCKSPARSE_HEADS_SLIDINGS = [2, -1] BLOCKSPARSE_HOMO_HEADS = [True, False] diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index e2b4778b94b9e..40550ed51e2c7 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -13,7 +13,7 @@ NUM_TOKENS = [42] # Arbitrary values for testing NUM_LAYERS = [1] # Arbitrary values for testing NUM_HEADS = [8] # Arbitrary values for testing -HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256] +HEAD_SIZES = [64, 80, 120, 256] BLOCK_SIZES = [8, 16, 32] # Arbitrary values for testing diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index 993e67e827ea0..afe53797322f9 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -11,6 +11,28 @@ from vllm import _custom_ops as ops from vllm.platforms import current_platform +MNK_FACTORS = [ + (1, 256, 128), + (1, 16384, 1024), + (1, 24576, 496), + (16, 256, 496), + (16, 16384, 128), + (16, 24576, 4096), + (32, 8192, 4096), + (32, 16384, 4096), + (33, 1024, 1024), + (33, 8192, 128), + (64, 2048, 496), + (64, 16384, 1024), + (100, 8192, 496), + (128, 32768, 4096), + (256, 4096, 4096), + (512, 256, 1024), + (512, 8192, 4096), + (512, 16384, 128), + (512, 24576, 128), +] + CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] @@ -116,9 +138,7 @@ def cutlass_int8_gemm_helper(m: int, (out, a, b, scale_a, scale_b, bias)) -@pytest.mark.parametrize("m", [1, 16, 32, 64, 128, 256, 512, 222, 100, 33]) -@pytest.mark.parametrize("n", [2048, 4096, 8192, 16384, 24576, 256, 1024]) -@pytest.mark.parametrize("k", [128, 496, 1024]) +@pytest.mark.parametrize("m,n,k", MNK_FACTORS) 
@pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("use_bias", [True, False]) @@ -129,9 +149,7 @@ def test_cutlass_fp8_gemm(m: int, n: int, k: int, per_act_token: bool, cutlass_fp8_gemm_helper(m, n, k, per_act_token, per_out_ch, use_bias) -@pytest.mark.parametrize("m", [1, 16, 32, 64, 128, 256, 512, 222, 33, 1]) -@pytest.mark.parametrize("n", [2048, 8192, 16384, 256, 1024]) -@pytest.mark.parametrize("k", [128, 496, 1024]) +@pytest.mark.parametrize("m,n,k", MNK_FACTORS) @pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("use_bias", [True, False]) diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py index 8db6a0d0d9fa4..12c578db0893c 100644 --- a/tests/kernels/test_int8_quant.py +++ b/tests/kernels/test_int8_quant.py @@ -7,11 +7,10 @@ from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] -HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 5137, 8192, - 8193] # Arbitrary values for testing +HIDDEN_SIZES = [16, 67, 768, 5137, 8193] # Arbitrary values for testing NUM_TOKENS = [1, 7, 83, 4096] # Arbitrary values for testing SEEDS = [0] -SCALE = [0.1, 0.5, 0.8, 1.2, 2.1] +SCALE = [0.1, 2.1] def opcheck_int8_quant_static(output, input, scale, azp=None): @@ -132,7 +131,7 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int, @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("scale", SCALE[2:]) # Reduce test time +@pytest.mark.parametrize("scale", SCALE) @pytest.mark.parametrize("azp", [-255, 54]) @torch.inference_mode() def test_static_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/test_marlin_gemm.py index 5cfd4d6da7a86..b6dd68cc51a9f 100644 --- a/tests/kernels/test_marlin_gemm.py +++ b/tests/kernels/test_marlin_gemm.py @@ -35,7 +35,7 @@ USE_FP32_REDUCE_OPTS = [False, True] MARLIN_K_CHUNKS = [128] -MARLIN_N_CHUNKS = [64, 128, 256] +MARLIN_N_CHUNKS = [64, 256] MARLIN_24_K_CHUNKS = [128] MARLIN_24_N_CHUNKS = [512] diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 19c3fc1e1fe3a..17428ebfc2e28 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -20,12 +20,15 @@ from vllm.platforms import current_platform from vllm.scalar_type import scalar_types +NUM_EXPERTS = [8, 64] +TOP_KS = [2, 6] -@pytest.mark.parametrize("m", [1024 * 128, 512, 222, 33, 1]) -@pytest.mark.parametrize("n", [2048, 256, 1024]) + +@pytest.mark.parametrize("m", [1, 33, 64, 222, 1024 * 128]) +@pytest.mark.parametrize("n", [128, 1024, 2048]) @pytest.mark.parametrize("k", [128, 511, 1024]) -@pytest.mark.parametrize("e", [8, 64]) -@pytest.mark.parametrize("topk", [2, 6]) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) def test_fused_moe( m: int, @@ -93,12 +96,12 @@ def test_mixtral_moe(dtype: torch.dtype): atol=mixtral_moe_tol[dtype]) -@pytest.mark.parametrize("m", [64, 512, 222, 33, 1]) -@pytest.mark.parametrize("n", [128, 2048, 256, 1024]) -@pytest.mark.parametrize("k", [128, 1024, 512]) -@pytest.mark.parametrize("e", [8, 64]) -@pytest.mark.parametrize("topk", [2, 6]) -@pytest.mark.parametrize("group_size", [-1, 32, 64, 128]) +@pytest.mark.parametrize("m", [1, 33, 64, 
222]) +@pytest.mark.parametrize("n", [128, 2048]) +@pytest.mark.parametrize("k", [128, 1024]) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("group_size", [-1, 32, 128]) @pytest.mark.parametrize("act_order", [True, False]) @pytest.mark.parametrize("num_bits", [4, 8]) @pytest.mark.parametrize("is_k_full", [True, False]) diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index b408559cc0b07..eee77c22ab81a 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -11,10 +11,10 @@ IS_NEOX_STYLE = [True, False] DTYPES = [torch.half, torch.bfloat16, torch.float] -HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256] +HEAD_SIZES = [64, 80, 112, 120, 256] ROTARY_DIMS = [None, 32] # None means rotary dim == head size -NUM_HEADS = [7, 17] # Arbitrary values for testing -BATCH_SIZES = [1, 5] # Arbitrary values for testing +NUM_HEADS = [17] # Arbitrary values for testing +BATCH_SIZES = [5] # Arbitrary values for testing SEQ_LENS = [11, 8192] # Arbitrary values for testing SEEDS = [0] CUDA_DEVICES = [ From ca9844b340f45f23f8d30fdce23777d215ad987c Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 5 Nov 2024 14:49:20 -0800 Subject: [PATCH 10/24] [bugfix] fix weak ref in piecewise cudagraph and tractable test (#10048) Signed-off-by: youkaichao --- tests/compile/piecewise/test_toy_llama.py | 111 ++++++++++++++++++++-- vllm/compilation/backends.py | 82 +++++++++++++--- 2 files changed, 168 insertions(+), 25 deletions(-) diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index e3e5a7d0fc5a5..9c65059c6b348 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -1,6 +1,10 @@ """ Test the piecewise compilation with a simple model, comparing the output with and without the piecewise compilation. + +This is a tractable model, the weights and computation are specially designed +if the config `tractable_init` is set to True. Otherwise, the weights are +initialized randomly with a fixed seed. 
""" import os from dataclasses import dataclass @@ -49,6 +53,12 @@ class LlamaConfig: mlp_size: int = 256 vocab_size: int = 128 num_layers: int = 2 + init_value: float = 1.0 + tractable_init: bool = False + random_seed: int = 0 + + def __post_init__(self): + assert self.mlp_size >= self.hidden_size class LlamaMLP(nn.Module): @@ -66,10 +76,23 @@ def __init__(self, config: LlamaConfig) -> None: bias=False, ) - self.gate_up_projection.weight.data.fill_(0.0) - self.down_projection.weight.data.fill_(0.0) + if config.tractable_init: + nn.init.eye_(self.gate_up_projection.weight.data[:config.mlp_size]) + nn.init.eye_(self.gate_up_projection.weight.data[config.mlp_size:]) + nn.init.eye_(self.down_projection.weight.data) + else: + nn.init.xavier_normal_(self.gate_up_projection.weight.data, + generator=torch.Generator().manual_seed( + config.random_seed), + gain=0.001) + nn.init.xavier_normal_(self.down_projection.weight.data, + generator=torch.Generator().manual_seed( + config.random_seed), + gain=0.001) def forward(self, x): + # for tractable_init and positive input, this is + # essentially an elementwise-square x = self.gate_up_projection(x) x = x[:, :x.size(1) // 2] * torch.nn.functional.relu( x[:, x.size(1) // 2:]) @@ -84,21 +107,39 @@ def __init__(self, config: LlamaConfig) -> None: self.qkv_projection = nn.Linear( in_features=config.hidden_size, out_features=config.hidden_size * 3, + bias=False, ) self.output_projection = nn.Linear( in_features=config.hidden_size, out_features=config.hidden_size, + bias=False, ) - self.qkv_projection.weight.data.fill_(0.0) - self.output_projection.weight.data.fill_(0.0) + if config.tractable_init: + nn.init.eye_(self.qkv_projection.weight.data[:config.hidden_size]) + nn.init.eye_(self.qkv_projection.weight.data[config.hidden_size:2 * + config.hidden_size]) + nn.init.eye_(self.qkv_projection.weight.data[2 * + config.hidden_size:]) + nn.init.eye_(self.output_projection.weight.data) + else: + nn.init.xavier_normal_(self.qkv_projection.weight.data, + generator=torch.Generator().manual_seed( + config.random_seed), + gain=0.001) + nn.init.xavier_normal_(self.output_projection.weight.data, + generator=torch.Generator().manual_seed( + config.random_seed), + gain=0.001) def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, ) -> torch.Tensor: + # for tractable_init, this is: + # output = (hidden_states * 3 + positions * 2) qkv = self.qkv_projection(hidden_states) hidden_size = qkv.size(-1) // 3 q, k, v = qkv.split([hidden_size, hidden_size, hidden_size], dim=-1) @@ -126,20 +167,29 @@ def forward( hidden_states: torch.Tensor, residual: Optional[torch.Tensor], ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + For tractable computation: + - if residual is None, the outputs are: + - residual = (hidden_states + 1) * 3 + positions * 2 + hidden_states = hidden_states * 4 + positions * 2 + 3 + - hidden_states = (residual + 1) ** 2 + - if residual is not None, the outputs are: + - residual = (hidden_states + residual + 1) * 3 + positions * 2 + hidden_states + residual = (hidden_states + residual) * 4 + positions * 2 + 3 + - hidden_states = (residual + 1) ** 2 + """ # noqa if residual is None: residual = hidden_states - hidden_states = hidden_states / 2 + hidden_states = hidden_states + 1 else: hidden_states = hidden_states + residual residual = hidden_states - hidden_states = hidden_states / 2 + hidden_states = hidden_states + 1 hidden_states = self.self_attention(positions=positions, hidden_states=hidden_states) hidden_states = hidden_states + residual 
residual = hidden_states - hidden_states = hidden_states / 2 + hidden_states = hidden_states + 1 hidden_states = self.mlp(hidden_states) return hidden_states, residual @@ -156,7 +206,8 @@ def __init__(self, config: LlamaConfig) -> None: self.layers = nn.ModuleList( [LlamaDecoderLayer(config) for _ in range(config.num_layers)]) - self.embedding_tokens.weight.data.fill_(0.0) + # this is the initial value of the hidden states + self.embedding_tokens.weight.data.fill_(config.init_value) def forward( self, @@ -170,6 +221,28 @@ def forward( return hidden_states +def tractable_computation(input_ids: torch.Tensor, + positions: torch.Tensor, + config: LlamaConfig, + init_value: float = 1.0) -> torch.Tensor: + hidden_states = torch.ones(input_ids.size(0), + config.hidden_size, + device=input_ids.device, + dtype=input_ids.dtype) * init_value + + # first layer + residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3 + hidden_states = (residual + 1)**2 + + # following layers + for _ in range(config.num_layers - 1): + hidden_states = hidden_states + residual + residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3 + hidden_states = (residual + 1)**2 + + return hidden_states + + @torch.inference_mode def run_model(llama_config, use_compile: bool, @@ -213,7 +286,15 @@ def run_model(llama_config, del os.environ["VLLM_TORCH_COMPILE_LEVEL"] set_compilation_config(None) - return output.cpu() + output = output.cpu() + + if llama_config.tractable_init: + expected_output = tractable_computation(input_ids[:2], positions[:2], + llama_config).cpu() + + assert torch.allclose(output, expected_output) + else: + return output.cpu() def test_toy_llama(): @@ -222,7 +303,13 @@ def test_toy_llama(): llama_config = LlamaConfig(hidden_size=128, mlp_size=256, vocab_size=128, - num_layers=2) + num_layers=12) + + tractable_config = LlamaConfig(hidden_size=128, + mlp_size=256, + vocab_size=128, + num_layers=2, + tractable_init=True) outputs = [] with compilation_counter.expect( @@ -233,6 +320,8 @@ def test_toy_llama(): num_cudagraph_caputured=0, ): outputs.append(run_model(llama_config, use_compile=False)) + run_model(tractable_config, use_compile=False) + with compilation_counter.expect( num_graphs_seen=1, # one graph for the model num_piecewise_graphs_seen=1, @@ -242,6 +331,7 @@ def test_toy_llama(): 2, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen ): outputs.append(run_model(llama_config, use_compile=True)) + run_model(tractable_config, use_compile=True) with compilation_counter.expect( num_graphs_seen=1, # one graph for the model @@ -257,6 +347,7 @@ def test_toy_llama(): ): outputs.append( run_model(llama_config, use_compile=True, split_attn=True)) + run_model(tractable_config, use_compile=True, split_attn=True) for i in range(1, len(outputs)): assert torch.allclose(outputs[0], outputs[i]) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 96ddcba467c5b..de32cabbe6d07 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -6,6 +6,7 @@ import torch import torch.fx as fx +import vllm.envs as envs from vllm.logger import init_logger from vllm.utils import weak_ref_tensors @@ -193,6 +194,7 @@ def wrap_inductor(graph, @dataclasses.dataclass class SplitItem: submod_name: str + graph_id: int is_splitting_graph: bool graph: fx.GraphModule @@ -226,9 +228,7 @@ def split_graph(graph: fx.GraphModule, outputs = [] - # sort the names to make sure the order is deterministic names = [name for (name, module) in split_gm.named_modules()] - names.sort() for name 
in names: if "." in name or name == "": @@ -238,7 +238,11 @@ def split_graph(graph: fx.GraphModule, module = getattr(split_gm, name) graph_id = int(name.replace("submod_", "")) - outputs.append(SplitItem(name, graph_id in split_op_graphs, module)) + outputs.append( + SplitItem(name, graph_id, (graph_id in split_op_graphs), module)) + + # sort by intetger graph_id, rather than string name + outputs.sort(key=lambda x: x.graph_id) return split_gm, outputs @@ -252,6 +256,11 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): It runs the given graph with fake inputs, and compile some submodules specified by `compile_submod_names` with the given compilation configs. + + NOTE: the order in `compile_submod_names` matters, because + it will be used to determine the order of the compiled piecewise + graphs. The first graph will handle logging, and the last graph + has some special cudagraph output handling. """ def __init__(self, module: torch.fx.GraphModule, @@ -263,7 +272,6 @@ def __init__(self, module: torch.fx.GraphModule, self.compile_submod_names = compile_submod_names self.compilation_configs = compilation_configs self.graph_pool = graph_pool - self.have_seen_first_graph = False def run(self, *args): fake_args = [ @@ -279,6 +287,7 @@ def call_module(self, target: torch.fx.node.Target, output = super().call_module(target, args, kwargs) if target in self.compile_submod_names: + index = self.compile_submod_names.index(target) submod = self.fetch_attr(target) sym_shape_indices = [ i for i, x in enumerate(args) if isinstance(x, torch.SymInt) @@ -288,15 +297,14 @@ def call_module(self, target: torch.fx.node.Target, args, self.compilation_configs.inductor_compile_config, runtime_shape=None, - do_logging=not self.have_seen_first_graph, + do_logging=index == 0, use_inductor=self.compilation_configs.use_inductor) self.module.__dict__[target] = PiecewiseBackend( - submod, self.compilation_configs, self.graph_pool, - not self.have_seen_first_graph, sym_shape_indices, + submod, self.compilation_configs, self.graph_pool, index, + len(self.compile_submod_names), sym_shape_indices, compiled_graph_for_general_shape) - self.have_seen_first_graph = True compilation_counter.num_piecewise_capturable_graphs_seen += 1 return output @@ -352,8 +360,9 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: graph, self.compilation_configs.non_cudagraph_ops) from torch._dynamo.utils import lazy_format_graph_code - logger.debug("%s", - lazy_format_graph_code("stiching module", self.split_gm)) + logger.debug("%s", lazy_format_graph_code("before split", self.graph)) + logger.debug("%s", lazy_format_graph_code("after split", + self.split_gm)) compilation_counter.num_piecewise_graphs_seen += len( self.piecewise_graphs) @@ -385,12 +394,17 @@ class ConcreteSizeEntry: cudagraph: Optional[torch.cuda.CUDAGraph] = None output: Optional[Any] = None + # for cudagraph debugging, track the input addresses + # during capture, and check if they are the same during replay + input_addresses: Optional[List[int]] = None + class PiecewiseBackend: def __init__(self, graph: fx.GraphModule, compilation_configs: CompilationConfig, graph_pool: Any, - is_first_graph: bool, sym_shape_indices: List[int], + piecewise_compile_index: int, total_piecewise_compiles: int, + sym_shape_indices: List[int], compiled_graph_for_general_shape: Callable): """ The backend for piecewise compilation. 
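An aside on the ordering change above: `torch.fx`'s splitter names the pieces `submod_<k>`, and sorting those names as strings stops matching execution order once there are ten or more pieces, which is presumably why `SplitItem` now carries the integer `graph_id` and the list is sorted on it. A minimal standalone illustration (not vLLM code):

# standalone sketch: lexicographic vs. numeric ordering of split submodules
names = [f"submod_{i}" for i in range(12)]

assert sorted(names)[:4] == ["submod_0", "submod_1", "submod_10", "submod_11"]

by_graph_id = sorted(names, key=lambda n: int(n.split("_")[-1]))
assert by_graph_id[:4] == ["submod_0", "submod_1", "submod_2", "submod_3"]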
@@ -408,7 +422,12 @@ def __init__(self, graph: fx.GraphModule, self.graph = graph self.compilation_configs = compilation_configs self.graph_pool = graph_pool - self.is_first_graph = is_first_graph + self.piecewise_compile_index = piecewise_compile_index + self.total_piecewise_compiles = total_piecewise_compiles + + self.is_first_graph = piecewise_compile_index == 0 + self.is_last_graph = ( + piecewise_compile_index == total_piecewise_compiles - 1) self.compile_sizes: Set[int] = set( self.compilation_configs.compile_sizes) @@ -422,6 +441,8 @@ def __init__(self, graph: fx.GraphModule, self.sym_shape_indices = sym_shape_indices + self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG" + # the entries for different shapes that we need to either # compile or capture cudagraph self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {} @@ -476,14 +497,45 @@ def __call__(self, *args) -> Any: logger.info("Capturing a cudagraph for shape %s", runtime_shape) + input_addresses = [ + x.data_ptr() for x in args if isinstance(x, torch.Tensor) + ] + entry.input_addresses = input_addresses cudagraph = torch.cuda.CUDAGraph() + + # mind-exploding: carefully manage the reference and memory. with torch.cuda.graph(cudagraph, pool=self.graph_pool): - entry.output = weak_ref_tensors(entry.runnable(*args)) + # `output` is managed by pytorch's cudagraph pool + output = entry.runnable(*args) + if self.is_last_graph: + # by converting it to weak ref, + # the original `output` will immediately be released + # to save memory. It is only safe to do this for + # the last graph, because the output of the last graph + # will not be used by any other cuda graph. + output = weak_ref_tensors(output) + + # here we always use weak ref for the output + # to save memory + entry.output = weak_ref_tensors(output) + entry.cudagraph = cudagraph compilation_counter.num_cudagraph_caputured += 1 - entry.cudagraph = cudagraph - return entry.output + # important: we need to return the output, rather than + # the weak ref of the output, so that pytorch can correctly + # manage the memory during cuda graph capture + return output + + if self.is_debugging_mode: + # check if the input addresses are the same + new_input_addresses = [ + x.data_ptr() for x in args if isinstance(x, torch.Tensor) + ] + assert new_input_addresses == entry.input_addresses, ( + "Input addresses for cudagraphs are different during replay." 
+ f" Expected {entry.input_addresses}, got {new_input_addresses}" + ) entry.cudagraph.replay() return entry.output From 43300bd98a54d48e97d9fb78c9db88eda3a88c64 Mon Sep 17 00:00:00 2001 From: zifeitong Date: Tue, 5 Nov 2024 16:34:40 -0800 Subject: [PATCH 11/24] [Bugfix] Properly propagate trust_remote_code settings (#10047) Signed-off-by: Zifei Tong --- vllm/model_executor/models/chatglm.py | 7 ++++--- vllm/model_executor/models/molmo.py | 22 ++++++++++++---------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index c3c9ec703c1e6..181f3c2b0fc35 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -54,8 +54,9 @@ def mm_input_mapper_for_glmv( data: MultiModalData[object], ) -> Dict: model_config = ctx.model_config - tokenizer = cached_get_tokenizer(model_config.tokenizer, - trust_remote_code=True) + tokenizer = cached_get_tokenizer( + model_config.tokenizer, + trust_remote_code=model_config.trust_remote_code) if tokenizer is None: raise RuntimeError("No HuggingFace processor is available " "to process the image object") @@ -525,7 +526,7 @@ def _parse_and_validate_image_input( elif isinstance(pixel_values, list): return torch.concat(pixel_values) else: - raise TypeError("""pixel_values must be a torch.Tensor + raise TypeError("""pixel_values must be a torch.Tensor or a list of torch.Tensor """) return GLMImagePixelInputs(pixel_values=pixel_values) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index ba798833e26a9..07c06149f0206 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -844,9 +844,10 @@ def get_max_tokens(max_crops: int, crop_patches: int, left_margin: int, def get_max_molmo_image_tokens(ctx: InputContext) -> int: - processor = cached_get_processor(ctx.model_config.model, - trust_remote_code=True, - revision=ctx.model_config.code_revision) + processor = cached_get_processor( + ctx.model_config.model, + trust_remote_code=ctx.model_config.trust_remote_code, + revision=ctx.model_config.code_revision) image_processor = processor.image_processor max_llm_image_tokens = get_max_tokens( image_processor.max_crops, @@ -870,9 +871,10 @@ def image_input_mapper_for_molmo( def dummy_data_for_molmo(ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int]): - processor = cached_get_processor(ctx.model_config.model, - trust_remote_code=True, - revision=ctx.model_config.code_revision) + processor = cached_get_processor( + ctx.model_config.model, + trust_remote_code=ctx.model_config.trust_remote_code, + revision=ctx.model_config.code_revision) image_processor = processor.image_processor base_image_input_d = image_processor.image_patch_size @@ -935,11 +937,11 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs): multi_modal_data = inputs.get("multi_modal_data") image = None if multi_modal_data is None else multi_modal_data.get("image") - processor = cached_get_processor(ctx.model_config.model, - trust_remote_code=True, - revision=ctx.model_config.code_revision) - model_config = ctx.model_config + processor = cached_get_processor( + ctx.model_config.model, + trust_remote_code=model_config.trust_remote_code, + revision=ctx.model_config.code_revision) tokenizer = cached_get_tokenizer( model_config.tokenizer, trust_remote_code=model_config.trust_remote_code) From 966e31697bdeb47b33b3e26b4aab5999c85f3e90 Mon Sep 17 00:00:00 2001 From: Wallas Henrique Date: 
Tue, 5 Nov 2024 21:39:26 -0300 Subject: [PATCH 12/24] [Bugfix] Fix pickle of input when async output processing is on (#9931) Signed-off-by: Wallas Santos --- .../test_basic_correctness.py | 26 +++++++++++++++++++ vllm/worker/model_runner.py | 12 +++++++++ 2 files changed, 38 insertions(+) diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 79647589d5204..7f16baa65a644 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -156,3 +156,29 @@ def test_model_with_failure(vllm_runner) -> None: ModelInputForGPUWithSamplingMetadata) finally: os.remove(filename) + + +def test_failure_with_async_out_proc(vllm_runner) -> None: + + filename = None + try: + with vllm_runner("facebook/opt-125m", + dtype="half", + enforce_eager=False, + gpu_memory_utilization=0.7) as vllm_model,\ + patch("vllm.model_executor.models.opt.OPTForCausalLM.forward", + side_effect=ValueError()): + model_config = vllm_model.model.llm_engine.model_config + assert model_config.use_async_output_proc + with pytest.raises(ValueError) as exc_info: + vllm_model.generate_greedy('how to make pizza?', 250) + matches = re.search(r"input dumped to (.+).pkl", + str(exc_info.value)) + assert matches is not None + + filename = f"{matches.group(1)}.pkl" + finally: + # Clean up + if filename is not None: + os.remove(filename) + pass diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 2447eecf7957d..1e8ea4e8e79cf 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -136,6 +136,18 @@ def from_broadcasted_tensor_dict( attn_backend, tensor_dict) return cls(**tensor_dict) + # Exclude `async_callback` to be able to pickle this object + def __getstate__(self): + state = self.__dict__.copy() + del state["async_callback"] + return state + + # TODO: What happens when we depickle this object? + # How can we update this callback to properly pass it to the engine? 
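For context on why `async_callback` has to be dropped in `__getstate__`: the callback is typically a callable bound to live engine state and is not picklable, so the input dump exercised by the test above would fail without the exclusion. A toy standalone sketch of the failure mode (hypothetical class, not vLLM code):

# toy sketch only: a lambda stands in for the engine-bound callback
import pickle

class ToyModelInput:
    def __init__(self):
        self.tokens = [1, 2, 3]
        self.async_callback = lambda: None

try:
    pickle.dumps(ToyModelInput())
except Exception as exc:
    # fails because the callback attribute cannot be serialized
    print(f"cannot pickle: {exc!r}")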
+ def __setstate__(self, state): + self.__dict__.update(state) + self.__dict__.update({'async_callback': None}) + @dataclass(frozen=True) class ModelInputForGPUWithSamplingMetadata(ModelInputForGPU): From 0c63c34f725f0b519fa094fbeca6e3cf12c911c1 Mon Sep 17 00:00:00 2001 From: Sungjae Lee <33976427+llsj14@users.noreply.github.com> Date: Wed, 6 Nov 2024 10:45:45 +0900 Subject: [PATCH 13/24] [Bugfix][SpecDecode] kv corruption with bonus tokens in spec decode (#9730) Co-authored-by: LiuXiaoxuanPKU --- tests/spec_decode/test_multi_step_worker.py | 107 ++++++++++++++++++++ tests/spec_decode/utils.py | 4 +- vllm/spec_decode/draft_model_runner.py | 35 ++++++- vllm/spec_decode/multi_step_worker.py | 23 ++++- 4 files changed, 159 insertions(+), 10 deletions(-) diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py index e6f7f480eebb2..0b5d82b6610ca 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -5,6 +5,8 @@ import pytest import torch +from vllm.attention.selector import (_Backend, + global_force_attn_backend_context_manager) from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.utils import set_random_seed from vllm.sequence import (ExecuteModelRequest, HiddenStates, Logprob, @@ -303,6 +305,7 @@ def test_multi_step_with_batch_expansion_correct_output(): seed, model_runner_cls=TP1DraftModelRunner, ) + multi_step_worker.set_include_gpu_probs_tensor() worker = create_worker( Worker, model_name, @@ -397,6 +400,7 @@ def test_multi_step_with_batch_expansion_incorrect_output(): seed, model_runner_cls=TP1DraftModelRunner, ) + multi_step_worker.set_include_gpu_probs_tensor() worker = create_worker( Worker, model_name, @@ -477,6 +481,109 @@ def test_multi_step_with_batch_expansion_incorrect_output(): assert (num_mismatch > 0) +@torch.inference_mode() +@pytest.mark.parametrize('num_steps', [1, 2, 3, 4]) +# The choice of backends forces the multi_step_worker to choose between +# the vanilla model_runner and TP1DraftModelRunner and that we can test +# both code paths. +@pytest.mark.parametrize('attn_backend', + [_Backend.XFORMERS, _Backend.FLASH_ATTN]) +def test_multi_step_correct_kvcache(num_steps, attn_backend): + """Verify that the KV cache of the draft model + is correctly updated for sequences with bonus token. 
+ """ + seed = 100 + model_name = "JackFram/llama-68m" + + block_size = 16 + num_gpu_blocks = 2048 // block_size + batch_size = 1 + + with global_force_attn_backend_context_manager(attn_backend): + dtype = 'float16' if attn_backend == _Backend.FLASH_ATTN else 'float32' + multi_step_worker = create_worker(MultiStepWorker, + model_name, + block_size, + num_gpu_blocks, + seed, + model_runner_cls=TP1DraftModelRunner, + dtype=dtype) + multi_step_worker.set_include_gpu_probs_tensor() + worker = create_worker(Worker, + model_name, + block_size, + num_gpu_blocks, + seed, + dtype=dtype) + + prompts = [[0] for _ in range(batch_size)] + # Already generate two tokens for the sequence + # so that we can simulate the bonus token case + multi_step_continuations = [[ + random.randint(0, 1000), + random.randint(0, 1000) + ] for _ in prompts] + final_prompt_lens = [len(prompt) + 2 + num_steps for prompt in prompts] + + seq_ids_with_bonus_token_in_last_step = set(range(batch_size)) + seq_group_metadata_list = create_seq_group_metadata_from_prompts( + prompts, + num_gpu_blocks, + block_size, + continuations=multi_step_continuations, + final_prompt_lens=final_prompt_lens) + + # Run multi-step. + zero_kv_cache(multi_step_worker.cache_engine) + multi_step_worker.sampler_output(execute_model_req=ExecuteModelRequest( + seq_group_metadata_list=seq_group_metadata_list), + sample_len=num_steps, + seq_ids_with_bonus_token_in_last_step= + seq_ids_with_bonus_token_in_last_step) + + # Run single-step repeatedly. + zero_kv_cache(worker.cache_engine) + # Generate the kv cache for the bonus token first + single_step_continuations = [c[:1] for c in multi_step_continuations] + seq_group_metadata_list = create_seq_group_metadata_from_prompts( + prompts, + num_gpu_blocks, + block_size, + continuations=single_step_continuations, + final_prompt_lens=final_prompt_lens) + single_step_output = worker.execute_model( + execute_model_req=ExecuteModelRequest( + seq_group_metadata_list=seq_group_metadata_list)) + for _ in range(num_steps): + seq_group_metadata_list = create_seq_group_metadata_from_prompts( + prompts, + num_gpu_blocks, + block_size, + continuations=multi_step_continuations, + final_prompt_lens=final_prompt_lens) + + single_step_output = worker.execute_model( + execute_model_req=ExecuteModelRequest( + seq_group_metadata_list=seq_group_metadata_list)) + + for i, seq_group_output in enumerate(single_step_output[-1]): + multi_step_continuations[i].append( + seq_group_output.samples[0].output_token) + + # Verify that the KV cache of the single-step and + # multi-step workers are the same. 
+ single_step_gpu_cache = worker.cache_engine[0].gpu_cache + multi_step_gpu_cache = multi_step_worker.cache_engine[0].gpu_cache + num_layers = len(single_step_gpu_cache) + allclose = lambda a, b: torch.allclose( + a.cuda(), b.cuda(), rtol=1e-2, atol=1e-2) + for i in range(num_layers): + assert allclose(single_step_gpu_cache[i][0], + multi_step_gpu_cache[i][0]) + assert allclose(single_step_gpu_cache[i][1], + multi_step_gpu_cache[i][1]) + + @torch.inference_mode() def test_draft_proposals_full_speculation_len(): """Verify Top1Proposer correctly handles case where all sequences diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index 6cf0cfb09b8fa..e5cb0530f9961 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -68,12 +68,14 @@ def create_worker(cls: Callable[..., T], seed: int, is_driver_worker: bool = True, enforce_eager: bool = True, - model_runner_cls: Optional[ModelRunner] = None) -> T: + model_runner_cls: Optional[ModelRunner] = None, + dtype: Optional[str] = "auto") -> T: engine_args = EngineArgs( model=model_name, seed=seed, block_size=block_size, enforce_eager=enforce_eager, + dtype=dtype, ) engine_config = engine_args.create_engine_config() diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index 17cc0ad1a4a3a..6330ac027db74 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -54,6 +54,8 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self.indices_of_seq_with_bonus_tokens = None + def _update_sampling_metadata(self, sampling_metadata, num_seqs, num_queries): @@ -159,6 +161,10 @@ def supports_gpu_multi_step(self, execute_model_req: ExecuteModelRequest): # TODO: Add soft-tuning prompt adapter support return not self.prompt_adapter_config + def set_indices_of_seq_with_bonus_tokens(self, + indices_of_seq_with_bonus_tokens): + self.indices_of_seq_with_bonus_tokens = indices_of_seq_with_bonus_tokens + @torch.inference_mode() def execute_model( self, @@ -284,11 +290,30 @@ def execute_model( model_input.sampling_metadata) # Sample the next token. - outputs.append( - self.model.sample( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - )) + output = self.model.sample( + logits=logits, + sampling_metadata=model_input.sampling_metadata, + ) + outputs.append(output) + + if model_input.attn_metadata.num_prefills == 0 \ + and self.indices_of_seq_with_bonus_tokens is not None: + assert output.sampled_token_ids is not None + # output.sampled_token_ids should be of shape (num_seqs, 1) + nums_seqs, num_tokens_per_seq = output.sampled_token_ids.shape + assert num_tokens_per_seq == 1 + count = 0 + for i in range(nums_seqs): + bonus_seq_idx = self.indices_of_seq_with_bonus_tokens[ + count] + if i != bonus_seq_idx: + # The following might cause a cpu->gpu sync + # However, the performance impact is negligible as we + # benchmarked on H100. 
+ output.sampled_token_ids[ + i, :] = model_input.input_tokens[bonus_seq_idx] + else: + count += 1 # Prepare inputs for the next step if step != num_steps - 1: diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 4b53fbe056c47..f49b98f5c9528 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -81,6 +81,8 @@ def sampler_output( # Here we run the draft_model_runner with multi-step prepare # on the GPU directly expanded_request.num_steps = sample_len + self.model_runner.set_indices_of_seq_with_bonus_tokens( + indices_of_seq_with_bonus_tokens) model_outputs = self.execute_model( execute_model_req=expanded_request) else: @@ -97,7 +99,8 @@ def sampler_output( model_output = model_output[0] self._append_new_tokens( - model_output, expanded_request.seq_group_metadata_list) + model_output, expanded_request.seq_group_metadata_list, + indices_of_seq_with_bonus_tokens) model_outputs.append(model_output) filtered_model_outputs = self._filter_model_output( @@ -221,13 +224,15 @@ def get_spec_proposals( @staticmethod def _append_new_tokens( model_output: List[SamplerOutput], - seq_group_metadata_list: List[SequenceGroupMetadata]) -> None: + seq_group_metadata_list: List[SequenceGroupMetadata], + indices_of_seq_with_bonus_tokens: List[int]) -> None: """Given model output from a single run, append the tokens to the sequences. This is normally done outside of the worker, but it is required if the worker is to perform multiple forward passes. """ - for seq_group_metadata, sequence_group_outputs in zip( - seq_group_metadata_list, model_output): + count = 0 + for index, (seq_group_metadata, sequence_group_outputs) in enumerate( + zip(seq_group_metadata_list, model_output)): seq_group_metadata.is_prompt = False for seq_output in sequence_group_outputs.samples: @@ -237,6 +242,16 @@ def _append_new_tokens( token_id = seq_output.output_token token_logprob = seq_output.logprobs[token_id] + # Determine the actual token ID to be generated, + # considering bonus tokens + if index != indices_of_seq_with_bonus_tokens[count]: + bonus_seq_metadata = seq_group_metadata_list[ + indices_of_seq_with_bonus_tokens[count]] + _, bonus_token_seq_data = next( + iter(bonus_seq_metadata.seq_data.items())) + token_id = bonus_token_seq_data.output_token_ids[-1] + else: + count += 1 seq.append_token_id(token_id, token_logprob.logprob) seq.update_num_computed_tokens(1) From c4cacbaa7faf9d0d3b2aa26e5df496724e80cb05 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 5 Nov 2024 18:19:50 -0800 Subject: [PATCH 14/24] [v1] reduce graph capture time for piecewise cudagraph (#10059) Signed-off-by: youkaichao --- vllm/compilation/backends.py | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index de32cabbe6d07..05deee7bd5473 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -1,7 +1,9 @@ import copy import dataclasses import operator +from contextlib import ExitStack from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union +from unittest.mock import patch import torch import torch.fx as fx @@ -503,17 +505,29 @@ def __call__(self, *args) -> Any: entry.input_addresses = input_addresses cudagraph = torch.cuda.CUDAGraph() - # mind-exploding: carefully manage the reference and memory. 
- with torch.cuda.graph(cudagraph, pool=self.graph_pool): - # `output` is managed by pytorch's cudagraph pool - output = entry.runnable(*args) - if self.is_last_graph: - # by converting it to weak ref, - # the original `output` will immediately be released - # to save memory. It is only safe to do this for - # the last graph, because the output of the last graph - # will not be used by any other cuda graph. - output = weak_ref_tensors(output) + with ExitStack() as stack: + if not self.is_first_graph: + # during every model forward, we will capture + # many pieces of cudagraphs (roughly one per layer). + # running gc again and again across layers will + # make the cudagraph capture very slow. + # therefore, we only run gc for the first graph, + # and disable gc for the rest of the graphs. + stack.enter_context(patch("gc.collect", lambda: None)) + stack.enter_context( + patch("torch.cuda.empty_cache", lambda: None)) + + # mind-exploding: carefully manage the reference and memory. + with torch.cuda.graph(cudagraph, pool=self.graph_pool): + # `output` is managed by pytorch's cudagraph pool + output = entry.runnable(*args) + if self.is_last_graph: + # by converting it to weak ref, + # the original `output` will immediately be released + # to save memory. It is only safe to do this for + # the last graph, because the output of the last graph + # will not be used by any other cuda graph. + output = weak_ref_tensors(output) # here we always use weak ref for the output # to save memory From 82bfc38d079b1ef5f4b88ac7094a00029d2e99af Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 6 Nov 2024 12:05:05 +0800 Subject: [PATCH 15/24] [Misc] Sort the list of embedding models (#10037) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/registry.py | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index af52fbffba19e..792c6cec34ae0 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -94,33 +94,23 @@ _EMBEDDING_MODELS = { # [Text-only] "BertModel": ("bert", "BertEmbeddingModel"), + "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), "Gemma2Model": ("gemma2", "Gemma2EmbeddingModel"), "LlamaModel": ("llama", "LlamaEmbeddingModel"), + **{ + # Multiple models share the same architecture, so we include them all + k: (mod, arch) for k, (mod, arch) in _TEXT_GENERATION_MODELS.items() + if arch == "LlamaForCausalLM" + }, "MistralModel": ("llama", "LlamaEmbeddingModel"), - "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), - "Qwen2ForSequenceClassification": ( - "qwen2_cls", "Qwen2ForSequenceClassification"), - "LlamaForCausalLM": ("llama", "LlamaForCausalLM"), "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"), - "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), + "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), + "Qwen2ForSequenceClassification": ("qwen2_cls", "Qwen2ForSequenceClassification"), # noqa: E501 # [Multimodal] "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), } -def add_embedding_models(base_models, embedding_models): - with_pooler_method_models = {} - embedding_models_name = embedding_models.keys() - for name, (path, arch) in base_models.items(): - if arch in embedding_models_name: - with_pooler_method_models[name] = (path, arch) - return with_pooler_method_models - -_EMBEDDING_MODELS = { - 
**add_embedding_models(_TEXT_GENERATION_MODELS, _EMBEDDING_MODELS), - **_EMBEDDING_MODELS, -} - _MULTIMODAL_MODELS = { # [Decoder-only] "Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"), From ffc0f2b47add6e0f70e2b5d4b4aaac64ee97f8ad Mon Sep 17 00:00:00 2001 From: Peter Salas Date: Tue, 5 Nov 2024 20:19:15 -0800 Subject: [PATCH 16/24] [Model][OpenVINO] Fix regressions from #8346 (#10045) Signed-off-by: Peter Salas --- .buildkite/run-openvino-test.sh | 2 +- vllm/attention/backends/openvino.py | 12 +++++++++++- vllm/model_executor/models/molmo.py | 6 +++--- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh index 70e56596c4a86..35ad5c0ddde77 100755 --- a/.buildkite/run-openvino-test.sh +++ b/.buildkite/run-openvino-test.sh @@ -11,4 +11,4 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and launch offline inference -docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py +docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py diff --git a/vllm/attention/backends/openvino.py b/vllm/attention/backends/openvino.py index 6fddfc2002120..be06d16009988 100644 --- a/vllm/attention/backends/openvino.py +++ b/vllm/attention/backends/openvino.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import List, Tuple, Type +from typing import Dict, List, Optional, Tuple, Type import openvino as ov import torch @@ -7,6 +7,7 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionMetadata) from vllm.attention.backends.utils import CommonAttentionState +from vllm.multimodal import MultiModalPlaceholderMap def copy_cache_block(src_tensor: ov.Tensor, dst_tensor: ov.Tensor, @@ -128,3 +129,12 @@ class OpenVINOAttentionMetadata: # Shape: scalar # Type: i32 max_context_len: torch.Tensor + + # The index maps that relate multi-modal embeddings to the corresponding + # placeholders. + # + # N.B. These aren't really related to attention and don't belong on this + # type -- this is just a temporary solution to make them available to + # `model_executable`. 
+ multi_modal_placeholder_index_maps: Optional[Dict[ + str, MultiModalPlaceholderMap.IndexMap]] diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 07c06149f0206..522aa748f78b6 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -21,8 +21,8 @@ get_tensor_model_parallel_world_size, split_tensor_along_last_dim, tensor_model_parallel_all_gather) -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm @@ -915,7 +915,7 @@ def dummy_data_for_molmo(ctx: InputContext, seq_len: int, if "image_masks" in out: dummy_imgdata["image_masks"] = out["image_masks"] dummy_imgdata["seq_len"] = torch.tensor(seq_len, dtype=torch.long) - return dummy_seqdata, {"image": dummy_imgdata} + return DummyData(dummy_seqdata, {"image": dummy_imgdata}) def pad_images( From 2bcbae704c0d52913c6a2887260fc6bde6c20361 Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Tue, 5 Nov 2024 21:28:29 -0700 Subject: [PATCH 17/24] [Bugfix] Fix edge-case crash when using chat with the Mistral Tekken Tokenizer (#10051) Signed-off-by: Travis Johnson --- tests/models/decoder_only/language/test_mistral.py | 9 ++++++--- vllm/transformers_utils/tokenizers/mistral.py | 8 ++++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py index 5be44c54a717c..6ec4b7e7e3f71 100644 --- a/tests/models/decoder_only/language/test_mistral.py +++ b/tests/models/decoder_only/language/test_mistral.py @@ -10,19 +10,22 @@ MODELS = [ "mistralai/Mistral-7B-Instruct-v0.1", - "mistralai/Mistral-7B-Instruct-v0.3", - # Mistral-Nemo is to big for CI, but passes locally - # "mistralai/Mistral-Nemo-Instruct-2407" ] MISTRAL_FORMAT_MODELS = [ "mistralai/Mistral-7B-Instruct-v0.3", + # uses the v3-Tekken tokenizer + "mistralai/Ministral-8B-Instruct-2410", + # Mistral-Nemo is to big for CI, but passes locally + # "mistralai/Mistral-Nemo-Instruct-2407" ] SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5) SYMBOLIC_LANG_PROMPTS = [ "勇敢な船乗りについての詩を書く", # japanese "寫一首關於勇敢的水手的詩", # chinese + "ပုံပြင်လေးပြောပြပါ်:\n", # burmese + "Repeat the phrase 'URGENCY🌶️':\nURGENCY🌶️\nURGENCY🌶️\n", # see https://github.com/vllm-project/vllm/pull/9625 ] # for function calling diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 896f70bc1dafd..ccffdcc2a4df2 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -254,7 +254,7 @@ def decode(self, skip_special_tokens: bool = True) -> str: assert ( skip_special_tokens - ), "Skipping special tokens is not supported for Mistral tokenizers." + ), "skip_special_tokens=False is not supported for Mistral tokenizers." if isinstance(ids, int): ids = [ids] @@ -268,12 +268,16 @@ def convert_ids_to_tokens( # TODO(Patrick) - potentially allow special tokens to not be skipped assert ( skip_special_tokens - ), "Skipping special tokens is not supported for Mistral tokenizers." + ), "skip_special_tokens=False is not supported for Mistral tokenizers." 
assert isinstance(self.tokenizer, (Tekkenizer, SentencePieceTokenizer)), type( self.tokenizer) + if isinstance(self.tokenizer, Tekkenizer): + # skip special tokens + ids = [i for i in ids if i > self.tokenizer.num_special_tokens] + tokens = [self.tokenizer.id_to_piece(id) for id in ids] if any("�" in t for t in tokens): From ea928f608c44b825d28609460e0d375a5f877940 Mon Sep 17 00:00:00 2001 From: arakowsk-amd <182798202+arakowsk-amd@users.noreply.github.com> Date: Tue, 5 Nov 2024 21:10:40 -0800 Subject: [PATCH 18/24] [Bugfix] Gpt-j-6B patch kv_scale to k_scale path (#10063) Signed-off-by: Alex Rakowski Signed-off-by: Alex Rakowski <182798202+arakowsk-amd@users.noreply.github.com> --- vllm/model_executor/models/gpt_j.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 0451d16b6c738..9a42b359ae44f 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -36,7 +36,8 @@ from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors @@ -308,6 +309,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader(param, loaded_weight, shard_id) break else: + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue From 9d59b755934899b7ec5d7bb5b90d15bfd2302475 Mon Sep 17 00:00:00 2001 From: zifeitong Date: Tue, 5 Nov 2024 21:13:09 -0800 Subject: [PATCH 19/24] [Bugfix] Remove CustomChatCompletionContentPartParam multimodal input type (#10054) Signed-off-by: Zifei Tong --- vllm/entrypoints/chat_utils.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 8da08d4b2c93c..2b339ab6d44e4 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -22,7 +22,6 @@ ChatCompletionToolMessageParam) # yapf: enable # pydantic needs the TypedDict from typing_extensions -from pydantic import ConfigDict from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast from typing_extensions import Required, TypeAlias, TypedDict @@ -52,17 +51,10 @@ class ChatCompletionContentPartAudioParam(TypedDict, total=False): """The type of the content part.""" -class CustomChatCompletionContentPartParam(TypedDict, total=False): - __pydantic_config__ = ConfigDict(extra="allow") # type: ignore - - type: Required[str] - """The type of the content part.""" - - class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): """A simpler version of the param that only accepts a plain image_url. This is supported by OpenAI API, although it is not documented. - + Example: { "image_url": "https://example.com/image.jpg" @@ -73,7 +65,7 @@ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): """A simpler version of the param that only accepts a plain audio_url. 
- + Example: { "audio_url": "https://example.com/audio.mp3" @@ -85,7 +77,6 @@ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): ChatCompletionContentPartParam: TypeAlias = Union[ OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam, ChatCompletionContentPartRefusalParam, - CustomChatCompletionContentPartParam, CustomChatCompletionContentSimpleImageParam, CustomChatCompletionContentSimpleAudioParam, str] From 40899855520eb9497606bdb2b1b4e619233e598a Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 5 Nov 2024 22:16:04 -0800 Subject: [PATCH 20/24] [V1] Integrate Piecewise CUDA graphs (#10058) Signed-off-by: Woosuk Kwon --- vllm/compilation/backends.py | 7 +- vllm/v1/attention/backends/flash_attn.py | 35 ++++--- vllm/v1/worker/gpu_model_runner.py | 127 +++++++++++++++++++---- 3 files changed, 133 insertions(+), 36 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 05deee7bd5473..abd1d16accaf7 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -496,8 +496,11 @@ def __call__(self, *args) -> Any: return entry.runnable(*args) if self.is_first_graph: - logger.info("Capturing a cudagraph for shape %s", - runtime_shape) + # Since we capture cudagraph for many different shapes and + # capturing is fast, we don't need to log it for every shape. + # We only log it in the debug mode. + logger.debug("Capturing a cudagraph for shape %s", + runtime_shape) input_addresses = [ x.data_ptr() for x in args if isinstance(x, torch.Tensor) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index b2af89ebf854a..906f06777a136 100644 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -51,6 +51,7 @@ class FlashAttentionMetadata: # |-------------------- seq_len ---------------------| # |-- query_len ---| + num_actual_tokens: int # Number of tokens excluding padding. max_query_len: int query_start_loc: torch.Tensor max_seq_len: int @@ -134,7 +135,9 @@ def forward( assert k_scale == 1.0 and v_scale == 1.0, ( "key/v_scale is not supported in FlashAttention.") - output = torch.ops.vllm.unified_flash_attention( + output = torch.empty_like(query) + torch.ops.vllm.unified_flash_attention( + output, query, key, value, @@ -154,6 +157,7 @@ def forward( def unified_flash_attention( + output: torch.Tensor, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, @@ -168,17 +172,17 @@ def unified_flash_attention( window_size: Optional[List[int]] = None, alibi_slopes: Optional[torch.Tensor] = None, logits_soft_cap: Optional[float] = None, -) -> torch.Tensor: +) -> None: current_metadata = get_forward_context() if current_metadata is None: # Profiling run. - return torch.empty_like(query) + return assert current_metadata is not None assert isinstance(current_metadata, FlashAttentionMetadata) attn_metadata: FlashAttentionMetadata = current_metadata + num_actual_tokens = attn_metadata.num_actual_tokens - num_tokens, hidden_size = query.shape # Reshape the query, key, and value tensors. 
query = query.view(-1, num_heads, head_size) key = key.view(-1, num_kv_heads, head_size) @@ -188,18 +192,18 @@ def unified_flash_attention( key_cache = kv_cache[0] value_cache = kv_cache[1] torch.ops._C_cache_ops.reshape_and_cache_flash( - key, - value, - kv_cache[0], - kv_cache[1], + key[:num_actual_tokens], + value[:num_actual_tokens], + key_cache, + value_cache, attn_metadata.slot_mapping, kv_cache_dtype, k_scale, v_scale, ) - output = flash_attn_varlen_func( - q=query, + attn_output = flash_attn_varlen_func( + q=query[:num_actual_tokens], k=key_cache, v=value_cache, cu_seqlens_q=attn_metadata.query_start_loc, @@ -213,10 +217,13 @@ def unified_flash_attention( block_table=attn_metadata.block_table, softcap=logits_soft_cap, ) - return output.view(num_tokens, hidden_size) + attn_output = attn_output.view(num_actual_tokens, -1) + # TODO(woosuk): Optimize this. + output[:num_actual_tokens].copy_(attn_output) def unified_flash_attention_fake( + output: torch.Tensor, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, @@ -231,13 +238,13 @@ def unified_flash_attention_fake( window_size: Optional[List[int]] = None, alibi_slopes: Optional[torch.Tensor] = None, logits_soft_cap: Optional[float] = None, -) -> torch.Tensor: - return torch.empty_like(query) +) -> None: + return direct_register_custom_op( op_name="unified_flash_attention", op_func=unified_flash_attention, - mutates_args=["kv_cache"], + mutates_args=["kv_cache", "output"], fake_impl=unified_flash_attention_fake, ) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ae4239f8e1fab..63bf7c2e605a2 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,3 +1,5 @@ +import os +import time from dataclasses import dataclass from typing import TYPE_CHECKING, Dict, List, Optional, Set from unittest.mock import patch @@ -7,11 +9,16 @@ import torch.distributed import torch.nn as nn +from vllm import envs +from vllm.compilation.compile_context import set_compile_context +from vllm.compilation.config import CompilationConfig +from vllm.compilation.levels import CompilationLevel from vllm.config import VllmConfig from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model from vllm.multimodal import MultiModalDataDict +from vllm.plugins import set_compilation_config from vllm.sampling_params import SamplingParams, SamplingType from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, cdiv, is_pin_memory_available) @@ -86,6 +93,18 @@ def __init__( pin_memory=self.pin_memory, ) + self.use_cuda_graph = (envs.VLLM_TORCH_COMPILE_LEVEL + == CompilationLevel.PIECEWISE + and not self.model_config.enforce_eager) + # TODO(woosuk): Provide an option to tune the max cudagraph batch size. + self.cudagraph_batch_sizes = [1, 2, 4] + [i for i in range(8, 513, 8)] + self.input_ids = torch.zeros(self.max_num_tokens, + dtype=torch.int32, + device=self.device) + self.positions = torch.zeros(self.max_num_tokens, + dtype=torch.int64, + device=self.device) + def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Remove stopped requests from the cached states. # Keep the states of the pre-empted requests. 
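The persistent `input_ids`/`positions` buffers above are what make the piecewise CUDA graphs replayable: replay requires each input tensor to sit at the same address it had at capture time (the debug check added to `PiecewiseBackend.__call__` earlier in this series asserts exactly that), so the hunks below copy each step's tokens into the prefix of a preallocated tensor and pass a padded view rather than allocating new inputs. A standalone sketch of the pattern, assuming a 512-token cap for illustration:

import torch

MAX_NUM_TOKENS = 512  # assumed cap; the real value comes from the scheduler config
CUDAGRAPH_BATCH_SIZES = [1, 2, 4] + list(range(8, MAX_NUM_TOKENS + 1, 8))

# allocated once, so capture and replay always see the same address
input_ids = torch.zeros(MAX_NUM_TOKENS, dtype=torch.int32)

def prepare(step_token_ids: torch.Tensor) -> torch.Tensor:
    n = step_token_ids.numel()
    input_ids[:n].copy_(step_token_ids)                       # write in place
    padded = next(s for s in CUDAGRAPH_BATCH_SIZES if s >= n)
    return input_ids[:padded]                                 # padded view, same storage

batch = prepare(torch.arange(5, dtype=torch.int32))
assert batch.shape[0] == 8
assert batch.data_ptr() == input_ids.data_ptr()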
@@ -268,12 +287,16 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): seq_start_loc_np[0] = 0 np.cumsum(seq_lens, out=seq_start_loc_np[1:]) - input_ids = input_ids.to(self.device, non_blocking=True) - positions = positions.to(self.device, non_blocking=True).long() + self.input_ids[:total_num_scheduled_tokens].copy_(input_ids, + non_blocking=True) + self.positions[:total_num_scheduled_tokens].copy_(positions, + non_blocking=True) + query_start_loc = query_start_loc.to(self.device, non_blocking=True) seq_start_loc = seq_start_loc.to(self.device, non_blocking=True) slot_mapping = slot_mapping.to(self.device, non_blocking=True).long() attn_metadata = FlashAttentionMetadata( + num_actual_tokens=total_num_scheduled_tokens, max_query_len=max_num_scheduled_tokens, query_start_loc=query_start_loc, max_seq_len=max_seq_len, @@ -287,7 +310,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # token from the partial request. # TODO: Support prompt logprobs. logits_indices = query_start_loc[1:] - 1 - return input_ids, positions, attn_metadata, logits_indices + return attn_metadata, logits_indices def _prepare_sampling( self, @@ -310,16 +333,26 @@ def execute_model( scheduler_output: "SchedulerOutput", ) -> ModelRunnerOutput: self._update_states(scheduler_output) - inputs = self._prepare_inputs(scheduler_output) - input_ids, positions, attn_metadata, logits_indices = inputs + attn_metadata, logits_indices = self._prepare_inputs(scheduler_output) + num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens + if (self.use_cuda_graph + and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): + # Use piecewise CUDA graphs. + # Add padding to the batch size. + num_input_tokens = self._get_padded_batch_size( + num_scheduled_tokens) + else: + # Eager mode. + num_input_tokens = num_scheduled_tokens with set_forward_context(attn_metadata): hidden_states = self.model( - input_ids=input_ids, - positions=positions, + input_ids=self.input_ids[:num_input_tokens], + positions=self.positions[:num_input_tokens], kv_caches=self.kv_caches, - attn_metadata=attn_metadata, + attn_metadata=None, ) + hidden_states = hidden_states[:num_scheduled_tokens] hidden_states = hidden_states[logits_indices] logits = self.model.compute_logits(hidden_states, None) @@ -371,6 +404,18 @@ def execute_model( return model_runner_output def load_model(self) -> None: + if self.use_cuda_graph: + # FIXME(woosuk): Currently, the custom ops are not supported + # in the piecewise compilation mode. We rely on TorchInductor + # to optimize the model. + os.environ["VLLM_CUSTOM_OPS"] = "none" + set_compilation_config( + CompilationConfig( + use_cudagraph=True, + non_cudagraph_ops=["vllm.unified_flash_attention"], + use_inductor=True, + )) + logger.info("Starting to load model %s...", self.model_config.model) with DeviceMemoryProfiler() as m: # noqa: SIM117 with patch("vllm.model_executor.layers.sampler.Sampler", Sampler): @@ -381,26 +426,61 @@ def load_model(self) -> None: self.model_memory_usage / float(2**30)) def _dummy_run(self, model: nn.Module, num_tokens: int) -> None: - input_ids = torch.zeros(num_tokens, - dtype=torch.int32, - device=self.device) - positions = torch.zeros(num_tokens, - dtype=torch.long, - device=self.device) - kv_caches = [None for _ in range(self.num_attn_layers)] - model(input_ids, positions, kv_caches, attn_metadata=None) - return + # use an empty tensor instead of `None`` to force Dynamo to pass + # it by reference, rather by specializing on the value `None`. 
+ # the `dtype` argument does not matter, and we use `float32` as + # a placeholder (it has wide hardware support). + # it is important to create tensors inside the loop, rather than + # multiplying the list, to avoid Dynamo from treating them as + # tensor aliasing. + dummy_kv_caches = [ + torch.tensor([], dtype=torch.float32, device=self.device) + for _ in range(self.num_attn_layers) + ] + with set_forward_context(None): # noqa: SIM117 + with set_compile_context(self.cudagraph_batch_sizes): + # Trigger compilation for general shape. + model(self.input_ids, + self.positions, + dummy_kv_caches, + attn_metadata=None) @torch.inference_mode() def profile_run(self) -> None: self._dummy_run(self.model, self.max_num_tokens) torch.cuda.synchronize() - return @torch.inference_mode() def capture_model(self) -> None: - # TODO: Implement CUDA graph support. - return + if not self.use_cuda_graph: + logger.warning( + "Skipping CUDA graph capture. Please set " + "VLLM_TORCH_COMPILE_LEVEL=%d to use CUDA graphs.", + CompilationLevel.PIECEWISE) + return + + start_time = time.perf_counter() + start_free_gpu_memory = torch.cuda.mem_get_info()[0] + + with set_forward_context(None): + # Trigger CUDA graph capture for specific shapes. + # Capture the large shapes first so that the smaller shapes + # can reuse the memory pool allocated for the large shapes. + for num_tokens in reversed(self.cudagraph_batch_sizes): + self.model( + self.input_ids[:num_tokens], + self.positions[:num_tokens], + kv_caches=self.kv_caches, + attn_metadata=None, + ) + + end_time = time.perf_counter() + end_free_gpu_memory = torch.cuda.mem_get_info()[0] + elapsed_time = end_time - start_time + cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory + # This usually takes 5~20 seconds. + logger.info("Graph capturing finished in %.0f secs, took %.2f GiB", + elapsed_time, cuda_graph_size / (1 << 30)) def initialize_kv_cache(self, num_blocks: int) -> None: assert len(self.kv_caches) == 0 @@ -412,6 +492,13 @@ def initialize_kv_cache(self, num_blocks: int) -> None: dtype=self.kv_cache_dtype, device=self.device)) + def _get_padded_batch_size(self, batch_size: int) -> Optional[int]: + # TODO: Optimize this? 
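Relating to the TODO above: `cudagraph_batch_sizes` is sorted ascending, so the linear scan that follows could equally be a binary search. A standalone sketch, shown only as an illustration rather than a change to the patch:

from bisect import bisect_left

def padded_batch_size(sizes, batch_size):
    # sizes must be sorted ascending, as cudagraph_batch_sizes is
    i = bisect_left(sizes, batch_size)
    return sizes[i] if i < len(sizes) else None

assert padded_batch_size([1, 2, 4, 8, 16], 5) == 8
assert padded_batch_size([1, 2, 4, 8, 16], 16) == 16
assert padded_batch_size([1, 2, 4, 8, 16], 17) is None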
+ for size in self.cudagraph_batch_sizes: + if batch_size <= size: + return size + return None + @dataclass class CachedRequestState: From 4be3a45158a7fb707973d4b00410e0d2981e6825 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 5 Nov 2024 22:35:03 -0800 Subject: [PATCH 21/24] [distributed] add function to create ipc buffers directly (#10064) Signed-off-by: youkaichao --- .buildkite/test-pipeline.yaml | 1 + tests/distributed/test_ca_buffer_sharing.py | 59 +++++++++++++++++++ .../device_communicators/custom_all_reduce.py | 31 ++++++++++ 3 files changed, 91 insertions(+) create mode 100644 tests/distributed/test_ca_buffer_sharing.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 1eb749f64d36b..3e940549862ea 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -510,6 +510,7 @@ steps: # NOTE: don't test llama model here, it seems hf implementation is buggy # see https://github.com/vllm-project/vllm/pull/5689 for details - pytest -v -s distributed/test_custom_all_reduce.py + - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus - pytest -v -s -x lora/test_mixtral.py diff --git a/tests/distributed/test_ca_buffer_sharing.py b/tests/distributed/test_ca_buffer_sharing.py new file mode 100644 index 0000000000000..fc4043cd3014e --- /dev/null +++ b/tests/distributed/test_ca_buffer_sharing.py @@ -0,0 +1,59 @@ +# can only run on machines with p2p access across GPUs +# can only run with torchrun: +# torchrun --nproc_per_node=2 tests/distributed/test_ca_buffer_sharing.py + +import ctypes + +import torch +import torch.distributed as dist + +from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary +from vllm.distributed.device_communicators.custom_all_reduce import ( # noqa + CustomAllreduce) + +# create a cpu process group for communicating metadata (ipc handle) +dist.init_process_group(backend="gloo") +rank = local_rank = dist.get_rank() +world_size = dist.get_world_size() + +# every process sets its own device (differently) +lib = CudaRTLibrary() +lib.cudaSetDevice(rank) + +buffer_size_in_bytes = 1024 +byte_value = 2 # the value we write to the buffer for verification + +pointers = CustomAllreduce.create_shared_buffer(buffer_size_in_bytes) + +print(f"Rank {rank} has pointers {pointers}") + +dist.barrier() +torch.cuda.synchronize() + +if rank == 0: + # the first rank tries to write to all buffers + for p in pointers: + pointer = ctypes.c_void_p(p) + lib.cudaMemset(pointer, byte_value, buffer_size_in_bytes) + +dist.barrier() +torch.cuda.synchronize() + +host_data = (ctypes.c_char * buffer_size_in_bytes)() + +# all ranks read from all buffers, and check if the data is correct +for p in pointers: + pointer = ctypes.c_void_p(p) + lib.cudaMemcpy(host_data, pointer, buffer_size_in_bytes) + for i in range(buffer_size_in_bytes): + assert ord(host_data[i]) == byte_value, ( + f"Rank {rank} failed" + f" to verify buffer {p}. 
Expected {byte_value}, " + f"got {ord(host_data[i])}") + +print(f"Rank {rank} verified all buffers") + +dist.barrier() +torch.cuda.synchronize() + +CustomAllreduce.free_shared_buffer(pointers) diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index c3632aee6d11a..3b5d92561cf25 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -1,3 +1,4 @@ +import ctypes from contextlib import contextmanager from typing import Any, List, Optional, Union @@ -7,6 +8,7 @@ import vllm.envs as envs from vllm import _custom_ops as ops +from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary from vllm.distributed.device_communicators.custom_all_reduce_utils import ( gpu_p2p_access_check) from vllm.distributed.parallel_state import in_the_same_node_as @@ -174,6 +176,35 @@ def __init__(self, offsets, rank, self.full_nvlink) self.register_buffer(self.buffer) + @staticmethod + def create_shared_buffer( + size_in_bytes: int, + group: Optional[ProcessGroup] = None) -> List[int]: + lib = CudaRTLibrary() + pointer = lib.cudaMalloc(size_in_bytes) + handle = lib.cudaIpcGetMemHandle(pointer) + world_size = dist.get_world_size(group=group) + rank = dist.get_rank(group=group) + handles = [None] * world_size + dist.all_gather_object(handles, handle, group=group) + + pointers: List[int] = [] + for i, h in enumerate(handles): + if i == rank: + pointers.append(pointer.value) # type: ignore + else: + pointers.append( + lib.cudaIpcOpenMemHandle(h).value) # type: ignore + + return pointers + + @staticmethod + def free_shared_buffer(pointers: List[int], + group: Optional[ProcessGroup] = None) -> None: + rank = dist.get_rank(group=group) + lib = CudaRTLibrary() + lib.cudaFree(ctypes.c_void_p(pointers[rank])) + @contextmanager def capture(self): """ From 21063c11c7d340dbb01460e22d98d3619737cd4d Mon Sep 17 00:00:00 2001 From: Aaron Pham Date: Wed, 6 Nov 2024 02:11:55 -0500 Subject: [PATCH 22/24] [CI/Build] drop support for Python 3.8 EOL (#8464) Signed-off-by: Aaron Pham --- .../convert-results-json-to-markdown.py | 10 +-- .../scripts/generate-nightly-markdown.py | 4 +- .../scripts/summary-nightly-results.py | 4 +- .github/workflows/mypy.yaml | 2 +- .github/workflows/publish.yml | 2 +- .github/workflows/ruff.yml | 32 ++++----- .github/workflows/yapf.yml | 26 ++++---- .readthedocs.yaml | 11 ++-- CMakeLists.txt | 36 +++++----- benchmarks/backend_request_func.py | 22 ++----- benchmarks/kernels/benchmark_machete.py | 6 +- csrc/quantization/machete/generate.py | 8 +-- docs/source/getting_started/installation.rst | 10 +-- pyproject.toml | 4 +- setup.py | 9 ++- tests/compile/piecewise/test_toy_llama.py | 4 +- tests/conftest.py | 29 +++----- tests/core/block/test_prefix_caching_block.py | 12 ++-- tests/kernels/test_mamba_ssm.py | 2 +- .../mm_processor_kwargs/test_qwen.py | 2 +- tests/samplers/test_rejection_sampler.py | 10 ++- tests/test_logger.py | 2 +- tests/tokenization/test_detokenize.py | 4 +- tools/profiler/print_layerwise_table.py | 2 +- tools/profiler/visualize_layerwise_profile.py | 2 +- tools/report_build_time_ninja.py | 32 ++++----- use_existing_torch.py | 2 +- .../ops/blocksparse_attention/interface.py | 6 +- vllm/config.py | 7 +- vllm/core/evictor.py | 2 +- .../custom_all_reduce_utils.py | 2 +- vllm/engine/async_llm_engine.py | 2 +- vllm/engine/llm_engine.py | 4 +- vllm/engine/metrics_types.py | 2 +- vllm/engine/output_processor/multi_step.py 
| 2 +- vllm/entrypoints/chat_utils.py | 2 +- vllm/entrypoints/openai/run_batch.py | 2 +- vllm/executor/ray_gpu_executor.py | 2 +- vllm/logger.py | 3 +- vllm/lora/models.py | 4 +- vllm/model_executor/custom_op.py | 2 +- vllm/model_executor/layers/resampler.py | 1 - .../model_executor/layers/rotary_embedding.py | 1 - vllm/model_executor/model_loader/loader.py | 2 +- vllm/model_executor/model_loader/openvino.py | 2 +- .../model_executor/model_loader/tensorizer.py | 5 +- .../model_loader/weight_utils.py | 9 ++- vllm/model_executor/models/arctic.py | 4 +- vllm/model_executor/models/baichuan.py | 1 - vllm/model_executor/models/bloom.py | 1 - vllm/model_executor/models/chatglm.py | 1 - vllm/model_executor/models/commandr.py | 1 - vllm/model_executor/models/dbrx.py | 1 - vllm/model_executor/models/decilm.py | 1 - vllm/model_executor/models/deepseek.py | 1 - vllm/model_executor/models/deepseek_v2.py | 1 - vllm/model_executor/models/exaone.py | 1 - vllm/model_executor/models/falcon.py | 1 - vllm/model_executor/models/fuyu.py | 1 - vllm/model_executor/models/gemma.py | 1 - vllm/model_executor/models/gemma2.py | 1 - .../models/glm4_vision_encoder.py | 1 - vllm/model_executor/models/gpt2.py | 1 - vllm/model_executor/models/gpt_bigcode.py | 1 - vllm/model_executor/models/gpt_j.py | 1 - vllm/model_executor/models/gpt_neox.py | 1 - vllm/model_executor/models/granite.py | 1 - vllm/model_executor/models/granitemoe.py | 1 - .../models/idefics2_vision_model.py | 2 - vllm/model_executor/models/internlm2.py | 1 - vllm/model_executor/models/internlm2_ve.py | 1 - vllm/model_executor/models/jais.py | 1 - vllm/model_executor/models/jamba.py | 1 - vllm/model_executor/models/llama.py | 1 - vllm/model_executor/models/mamba.py | 1 - vllm/model_executor/models/minicpm.py | 1 - vllm/model_executor/models/minicpm3.py | 1 - vllm/model_executor/models/minicpmv.py | 1 - vllm/model_executor/models/mixtral.py | 1 - vllm/model_executor/models/mixtral_quant.py | 1 - vllm/model_executor/models/mllama.py | 1 - vllm/model_executor/models/mlp_speculator.py | 2 +- vllm/model_executor/models/molmo.py | 6 +- vllm/model_executor/models/mpt.py | 1 - vllm/model_executor/models/nemotron.py | 1 - vllm/model_executor/models/olmo.py | 1 - vllm/model_executor/models/opt.py | 1 - vllm/model_executor/models/orion.py | 1 - vllm/model_executor/models/persimmon.py | 1 - vllm/model_executor/models/phi.py | 1 - vllm/model_executor/models/phi3.py | 1 - vllm/model_executor/models/phi3v.py | 1 - vllm/model_executor/models/phimoe.py | 1 - vllm/model_executor/models/pixtral.py | 10 +-- vllm/model_executor/models/qwen.py | 1 - vllm/model_executor/models/qwen2.py | 7 +- vllm/model_executor/models/qwen2_audio.py | 1 - vllm/model_executor/models/qwen2_cls.py | 7 +- vllm/model_executor/models/qwen2_moe.py | 1 - vllm/model_executor/models/qwen2_rm.py | 7 +- vllm/model_executor/models/qwen2_vl.py | 10 ++- vllm/model_executor/models/solar.py | 1 - vllm/model_executor/models/stablelm.py | 1 - vllm/model_executor/models/starcoder2.py | 1 - vllm/model_executor/models/xverse.py | 1 - vllm/multimodal/base.py | 66 +++++++++++-------- vllm/prompt_adapter/utils.py | 17 +++-- vllm/transformers_utils/config.py | 2 +- vllm/transformers_utils/configs/chatglm.py | 1 - vllm/transformers_utils/configs/exaone.py | 1 - vllm/transformers_utils/configs/jais.py | 1 - vllm/transformers_utils/configs/mpt.py | 7 +- vllm/transformers_utils/configs/nemotron.py | 7 +- vllm/transformers_utils/configs/solar.py | 1 - vllm/utils.py | 4 +- 115 files changed, 240 insertions(+), 322 deletions(-) 
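The diffs below drop the Python 3.8 shims (the hand-rolled `remove_prefix` helper, the `sys.version_info` guards around subscripting `UserList`) and apply pyupgrade-style cleanups. As a rough sketch only — the class and function names here are invented for illustration and are not part of this patch — the 3.9+ idioms the updated code leans on look like this:

    # Illustrative only; `IntList` and `strip_sse_prefix` are made-up names,
    # not symbols from this patch.
    from collections import UserList
    from functools import lru_cache


    class IntList(UserList[int]):  # PEP 585: UserList is subscriptable since 3.9
        pass


    @lru_cache  # bare decorator form, no parentheses needed
    def expensive_constant() -> int:
        return 42


    def strip_sse_prefix(chunk: str) -> str:
        # str.removeprefix() is new in Python 3.9 and replaces the old helper.
        return chunk.removeprefix("data: ")


    print(strip_sse_prefix("data: [DONE]"), IntList([1, 2]), expensive_constant())

Because `removeprefix` and parameterized `UserList` do not exist on 3.8, the version bump in `setup.py` and in the CI matrices is a prerequisite for these cleanups.
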
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index f90e464288cf1..7cf05610b9953 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -56,7 +56,7 @@ def read_markdown(file): if os.path.exists(file): - with open(file, "r") as f: + with open(file) as f: return f.read() + "\n" else: return f"{file} not found.\n" @@ -75,14 +75,14 @@ def results_to_json(latency, throughput, serving): # collect results for test_file in results_folder.glob("*.json"): - with open(test_file, "r") as f: + with open(test_file) as f: raw_result = json.loads(f.read()) if "serving" in str(test_file): # this result is generated via `benchmark_serving.py` # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands"), "r") as f: + with open(test_file.with_suffix(".commands")) as f: command = json.loads(f.read()) raw_result.update(command) @@ -97,7 +97,7 @@ def results_to_json(latency, throughput, serving): # this result is generated via `benchmark_latency.py` # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands"), "r") as f: + with open(test_file.with_suffix(".commands")) as f: command = json.loads(f.read()) raw_result.update(command) @@ -119,7 +119,7 @@ def results_to_json(latency, throughput, serving): # this result is generated via `benchmark_throughput.py` # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands"), "r") as f: + with open(test_file.with_suffix(".commands")) as f: command = json.loads(f.read()) raw_result.update(command) diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py index 6059588fe7277..052060c576300 100644 --- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py @@ -72,7 +72,7 @@ def main(args): # collect results for test_file in results_folder.glob("*_nightly_results.json"): - with open(test_file, "r") as f: + with open(test_file) as f: results = results + json.loads(f.read()) # generate markdown table @@ -80,7 +80,7 @@ def main(args): md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False) - with open(args.description, "r") as f: + with open(args.description) as f: description = f.read() description = description.format( diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py index 4e4d4cd4ca3c6..92d6fad73a94c 100644 --- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py @@ -36,11 +36,11 @@ # collect results for test_file in results_folder.glob("*.json"): - with open(test_file, "r") as f: + with open(test_file) as f: raw_result = json.loads(f.read()) # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands"), "r") as f: + with open(test_file.with_suffix(".commands")) as f: command = json.loads(f.read()) raw_result.update(command) diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 18b354948f0cc..28d2e5fb8dbd9 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -25,7 +25,7 @@ jobs: runs-on: 
ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Set up Python ${{ matrix.python-version }} diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index f959a1cacf866..578c3fbd4e816 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -48,7 +48,7 @@ jobs: fail-fast: false matrix: os: ['ubuntu-20.04'] - python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] + python-version: ['3.9', '3.10', '3.11', '3.12'] pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt. cuda-version: ['11.8', '12.1'] diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index 197f918765e7d..edf98ce2fcab0 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -29,19 +29,19 @@ jobs: matrix: python-version: ["3.12"] steps: - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Analysing the code with ruff - run: | - echo "::add-matcher::.github/workflows/matchers/ruff.json" - ruff check --output-format github . - - name: Run isort - run: | - isort . --check-only + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-lint.txt + - name: Analysing the code with ruff + run: | + echo "::add-matcher::.github/workflows/matchers/ruff.json" + ruff check --output-format github . + - name: Run isort + run: | + isort . --check-only diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml index 35579302c5c14..4221c139ccf79 100644 --- a/.github/workflows/yapf.yml +++ b/.github/workflows/yapf.yml @@ -23,16 +23,16 @@ jobs: matrix: python-version: ["3.12"] steps: - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install yapf==0.32.0 - pip install toml==0.10.2 - - name: Running yapf - run: | - yapf --diff --recursive . + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install yapf==0.32.0 + pip install toml==0.10.2 + - name: Running yapf + run: | + yapf --diff --recursive . 
diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 42cbf18a0f712..34735700a224e 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -6,17 +6,16 @@ version: 2 build: os: ubuntu-22.04 tools: - python: "3.8" + python: '3.9' sphinx: - configuration: docs/source/conf.py - fail_on_warning: true + configuration: docs/source/conf.py + fail_on_warning: true # If using Sphinx, optionally build your docs in additional formats such as PDF formats: [] # Optionally declare the Python requirements required to build your docs python: - install: - - requirements: docs/requirements-docs.txt - + install: + - requirements: docs/requirements-docs.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 943424bc4edfa..c372ba98befbf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -128,9 +128,9 @@ endif() if(VLLM_GPU_LANG STREQUAL "CUDA") # - # For cuda we want to be able to control which architectures we compile for on + # For cuda we want to be able to control which architectures we compile for on # a per-file basis in order to cut down on compile time. So here we extract - # the set of architectures we want to compile for and remove the from the + # the set of architectures we want to compile for and remove the from the # CMAKE_CUDA_FLAGS so that they are not applied globally. # clear_cuda_arches(CUDA_ARCH_FLAGS) @@ -138,7 +138,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "CUDA target architectures: ${CUDA_ARCHS}") # Filter the target architectures by the supported supported archs # since for some files we will build for all CUDA_ARCHS. - cuda_archs_loose_intersection(CUDA_ARCHS + cuda_archs_loose_intersection(CUDA_ARCHS "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}") message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}") else() @@ -236,7 +236,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # are not supported by Machete yet. cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS}) if (MARLIN_ARCHS) - set(MARLIN_SRCS + set(MARLIN_SRCS "csrc/quantization/fp8/fp8_marlin.cu" "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu" "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" @@ -277,7 +277,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "in CUDA target architectures") endif() - # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't + # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't # build any 3x kernels set(SCALED_MM_3X_ARCHS) endif() @@ -285,7 +285,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x) # kernels for the remaining archs that are not already built for 3x. - cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS + cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS "7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}") # subtract out the archs that are already built for 3x list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) @@ -316,10 +316,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS) # - # For the Machete kernels we automatically generate sources for various + # For the Machete kernels we automatically generate sources for various # preselected input type pairs and schedules. 
# Generate sources: - set(MACHETE_GEN_SCRIPT + set(MACHETE_GEN_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py) file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH) @@ -329,8 +329,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH} OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH}) execute_process( - COMMAND ${CMAKE_COMMAND} -E env - PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH + COMMAND ${CMAKE_COMMAND} -E env + PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT} RESULT_VARIABLE machete_generation_result OUTPUT_VARIABLE machete_generation_output @@ -340,11 +340,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if (NOT machete_generation_result EQUAL 0) message(FATAL_ERROR "Machete generation failed." - " Result: \"${machete_generation_result}\"" + " Result: \"${machete_generation_result}\"" "\nCheck the log for details: " "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log") else() - set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH} + set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH} CACHE STRING "Last run machete generate script hash" FORCE) message(STATUS "Machete generation completed successfully.") endif() @@ -366,7 +366,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}") else() - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS) message(STATUS "Not building Machete kernels as CUDA Compiler version is " "not >= 12.0, we recommend upgrading to CUDA 12.0 or " @@ -392,8 +392,8 @@ define_gpu_extension_target( USE_SABI 3 WITH_SOABI) -# If CUTLASS is compiled on NVCC >= 12.5, it by default uses -# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the +# If CUTLASS is compiled on NVCC >= 12.5, it by default uses +# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the # driver API. This causes problems when linking with earlier versions of CUDA. # Setting this variable sidesteps the issue by calling the driver directly. target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) @@ -471,9 +471,9 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda") return() endif () -# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target -# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the -# arches in the CUDA case (and instead set the gencodes on a per file basis) +# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target +# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the +# arches in the CUDA case (and instead set the gencodes on a per file basis) # we need to manually set VLLM_GPU_ARCHES here. if(VLLM_GPU_LANG STREQUAL "CUDA") foreach(_ARCH ${CUDA_ARCHS}) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 0a903877f000d..a42e70170ba28 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -79,7 +79,7 @@ async def async_request_tgi( # any data, we should skip it. 
if chunk_bytes.startswith(":"): continue - chunk = remove_prefix(chunk_bytes, "data:") + chunk = chunk_bytes.removeprefix("data:") data = json.loads(chunk) timestamp = time.perf_counter() @@ -144,8 +144,8 @@ async def async_request_trt_llm( if not chunk_bytes: continue - chunk = remove_prefix(chunk_bytes.decode("utf-8"), - "data:") + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data:") data = json.loads(chunk) output.generated_text += data["text_output"] @@ -261,8 +261,8 @@ async def async_request_openai_completions( if not chunk_bytes: continue - chunk = remove_prefix(chunk_bytes.decode("utf-8"), - "data: ") + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data: ") if chunk == "[DONE]": latency = time.perf_counter() - st else: @@ -349,8 +349,8 @@ async def async_request_openai_chat_completions( if not chunk_bytes: continue - chunk = remove_prefix(chunk_bytes.decode("utf-8"), - "data: ") + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data: ") if chunk == "[DONE]": latency = time.perf_counter() - st else: @@ -389,14 +389,6 @@ async def async_request_openai_chat_completions( return output -# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix) -# introduced in Python 3.9 -def remove_prefix(text: str, prefix: str) -> str: - if text.startswith(prefix): - return text[len(prefix):] - return text - - def get_model(pretrained_model_name_or_path: str) -> str: if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true': from modelscope import snapshot_download diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index b70c4b94c97a1..665b50bf18cf0 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -269,10 +269,10 @@ def run_square_bench(args): def run_range_bench(args): - m_start, k_start, n_start = [int(x) for x in args.dim_start.split(",")] - m_end, k_end, n_end = [int(x) for x in args.dim_end.split(",")] + m_start, k_start, n_start = (int(x) for x in args.dim_start.split(",")) + m_end, k_end, n_end = (int(x) for x in args.dim_end.split(",")) m_increment, k_increment, n_increment = \ - [int(x) for x in args.dim_increment.split(",")] + (int(x) for x in args.dim_increment.split(",")) Ms = list(range(m_start, m_end + 1, m_increment)) Ks = list(range(k_start, k_end + 1, k_increment)) Ns = list(range(n_start, n_end + 1, n_increment)) diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index ebbe76cfb944a..d126af1849024 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -468,7 +468,7 @@ def generate(): impl_configs = [] GPTQ_kernel_type_configs = list( - (TypeConfig( + TypeConfig( element_a=element_a, element_b=element_b, element_b_scale=element_a, @@ -476,7 +476,7 @@ def generate(): element_d=element_a, accumulator=DataType.f32, ) for element_b in (VLLMDataType.u4b8, VLLMDataType.u8b128) - for element_a in (DataType.f16, DataType.bf16))) + for element_a in (DataType.f16, DataType.bf16)) GPTQ_kernel_specializations = [ Specialization(with_C=False, with_zeropoints=False, with_scales=True) @@ -490,7 +490,7 @@ def generate(): ] AWQ_kernel_type_configs = list( - (TypeConfig( + TypeConfig( element_a=element_a, element_b=element_b, element_b_scale=element_a, @@ -498,7 +498,7 @@ def generate(): element_d=element_a, accumulator=DataType.f32, ) for element_b in (DataType.u4, DataType.u8) - for element_a in (DataType.f16, DataType.bf16))) + for element_a in (DataType.f16, DataType.bf16)) 
AWQ_kernel_specializations = [ Specialization(with_C=False, with_zeropoints=True, with_scales=True) diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index a706b285edede..61871cdf41125 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -10,7 +10,7 @@ Requirements ============ * OS: Linux -* Python: 3.8 - 3.12 +* Python: 3.9 -- 3.12 * GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) Install released versions @@ -148,7 +148,7 @@ If you want to modify C++ or CUDA code, you'll need to build vLLM from source. T .. tip:: Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results. - For example, you can install `ccache `_ using ``conda install ccache`` or ``apt install ccache`` . + For example, you can install `ccache `_ using ``conda install ccache`` or ``apt install ccache`` . As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster. @@ -181,8 +181,8 @@ to be run simultaneously, via the environment variable ``MAX_JOBS``. For example $ export MAX_JOBS=6 $ pip install -e . -This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default `_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory. -A side effect is a much slower build process. +This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default `_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory. +A side effect is a much slower build process. Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. @@ -209,7 +209,7 @@ Here is a sanity check to verify that the CUDA Toolkit is correctly installed: Unsupported OS build -------------------- -vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. +vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. 
Simply disable the ``VLLM_TARGET_DEVICE`` environment variable before installing: diff --git a/pyproject.toml b/pyproject.toml index 0bbab3cd3fbc3..3562569647391 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ select = [ # Pyflakes "F", # pyupgrade - # "UP", + "UP", # flake8-bugbear "B", # flake8-simplify @@ -55,7 +55,7 @@ ignore = [ ] [tool.mypy] -python_version = "3.8" +python_version = "3.9" ignore_missing_imports = true check_untyped_defs = true diff --git a/setup.py b/setup.py index 8abeb0ba739db..f145a33258d70 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,4 @@ import importlib.util -import io import logging import os import re @@ -327,7 +326,7 @@ def get_neuronxcc_version(): "__init__.py") # Check if the command was executed successfully - with open(version_file, "rt") as fp: + with open(version_file) as fp: content = fp.read() # Extract the version using a regular expression @@ -404,7 +403,8 @@ def read_readme() -> str: """Read the README file if present.""" p = get_path("README.md") if os.path.isfile(p): - return io.open(get_path("README.md"), "r", encoding="utf-8").read() + with open(get_path("README.md"), encoding="utf-8") as f: + return f.read() else: return "" @@ -498,7 +498,6 @@ def _read_requirements(filename: str) -> List[str]: "Documentation": "https://vllm.readthedocs.io/en/latest/", }, classifiers=[ - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", @@ -512,7 +511,7 @@ def _read_requirements(filename: str) -> List[str]: ], packages=find_packages(exclude=("benchmarks", "csrc", "docs", "examples", "tests*")), - python_requires=">=3.8", + python_requires=">=3.9", install_requires=get_requirements(), ext_modules=ext_modules, extras_require={ diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 9c65059c6b348..73fa9e9906936 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -429,8 +429,8 @@ def benchmark(): # print in tabular format print("batch size\teager mode\tfull cudagraph\tpiecewise cudagraph") for b in cudagraph_sizes: - print((f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}" - f"\t{piecewise_cudagraph_time[b]:.3f}")) + print(f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}" + f"\t{piecewise_cudagraph_time[b]:.3f}") if __name__ == "__main__": diff --git a/tests/conftest.py b/tests/conftest.py index bdc6ffb148602..f9dfabc82639b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,5 @@ import json import os -import sys import tempfile from collections import UserList from enum import Enum @@ -52,7 +51,7 @@ def _read_prompts(filename: str) -> List[str]: - with open(filename, "r") as f: + with open(filename) as f: prompts = f.readlines() return prompts @@ -62,14 +61,8 @@ class _ImageAssetPrompts(TypedDict): cherry_blossom: str -if sys.version_info < (3, 9): - # UserList cannot be subscripted - class _ImageAssetsBase(UserList): - pass -else: - - class _ImageAssetsBase(UserList[ImageAsset]): - pass +class _ImageAssetsBase(UserList[ImageAsset]): + pass class _ImageAssets(_ImageAssetsBase): @@ -94,14 +87,8 @@ class _VideoAssetPrompts(TypedDict): sample_demo_1: str -if sys.version_info < (3, 9): - # UserList cannot be subscripted - class _VideoAssetsBase(UserList): - pass -else: - - class _VideoAssetsBase(UserList[VideoAsset]): - pass +class _VideoAssetsBase(UserList[VideoAsset]): + pass class 
_VideoAssets(_VideoAssetsBase): @@ -958,7 +945,7 @@ def dummy_opt_path(): "*.msgpack" ]) assert os.path.exists(json_path) - with open(json_path, "r") as f: + with open(json_path) as f: config = json.load(f) config["architectures"] = ["MyOPTForCausalLM"] with open(json_path, "w") as f: @@ -977,7 +964,7 @@ def dummy_llava_path(): "*.msgpack" ]) assert os.path.exists(json_path) - with open(json_path, "r") as f: + with open(json_path) as f: config = json.load(f) config["architectures"] = ["MyLlava"] with open(json_path, "w") as f: @@ -996,7 +983,7 @@ def dummy_gemma2_embedding_path(): "*.msgpack" ]) assert os.path.exists(json_path) - with open(json_path, "r") as f: + with open(json_path) as f: config = json.load(f) config["architectures"] = ["MyGemma2Embedding"] with open(json_path, "w") as f: diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py index 1a6e17ef7b445..d325b9606843e 100644 --- a/tests/core/block/test_prefix_caching_block.py +++ b/tests/core/block/test_prefix_caching_block.py @@ -99,13 +99,11 @@ def test_blocks_have_correct_hash_in_chain(block_size: int, token_ids = [random.randint(0, 50_000) for _ in range(num_tokens)] - first_chain, second_chain = [ - TestPrefixCachingBlock.create_chain( - block_size=block_size, - token_ids=token_ids, - num_empty_trailing_blocks=num_empty_trailing_blocks) - for _ in range(2) - ] + first_chain, second_chain = (TestPrefixCachingBlock.create_chain( + block_size=block_size, + token_ids=token_ids, + num_empty_trailing_blocks=num_empty_trailing_blocks) + for _ in range(2)) for first_chain_block, second_chain_block in zip( first_chain, second_chain): diff --git a/tests/kernels/test_mamba_ssm.py b/tests/kernels/test_mamba_ssm.py index ad05a97685351..19d1158c79c73 100644 --- a/tests/kernels/test_mamba_ssm.py +++ b/tests/kernels/test_mamba_ssm.py @@ -510,7 +510,7 @@ def test_selective_scan_varlen(with_padding, is_variable_B, is_variable_C, for var in (u_ref, delta_ref, B_ref, C_ref, z_ref) ] for i in range(len(seqlens[0])): - u_s, delta_s, B_s, C_s, z_s = [v[i].unsqueeze(0) for v in splits] + u_s, delta_s, B_s, C_s, z_s = (v[i].unsqueeze(0) for v in splits) if padded_state_indices[i] == PAD_SLOT_ID: continue out_ref_s, _ = selective_scan_ref( diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py index a01651b171d60..6ae8a6a704b0a 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py @@ -104,7 +104,7 @@ def test_input_mapper_valid_mm_data(input_mapper_for_qwen, # Sad path tests for the multimodal input processor and mapper, respectively @pytest.mark.parametrize("mm_data", [ { - "image": torch.rand((5)) + "image": torch.rand(5) }, { "image": torch.rand((5, 5, 5, 5, 5)) diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index a8deab3718be1..f5497976faf7a 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -413,12 +413,10 @@ def __init__(self, vocab_size: int, rejection_sampler: RejectionSampler): def generate_probs_for_test( self, draft_and_target_probs_equal: bool ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - draft_probs, target_probs = [ - F.softmax( - torch.rand(self.vocab_size, dtype=torch.float32), - dim=-1, - ) for _ in range(2) - ] + draft_probs, target_probs = 
(F.softmax( + torch.rand(self.vocab_size, dtype=torch.float32), + dim=-1, + ) for _ in range(2)) num_reference_probs = 100 reference_probs = F.softmax( diff --git a/tests/test_logger.py b/tests/test_logger.py index fadf66f2b61d4..a937b0812ed0c 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -29,7 +29,7 @@ def test_trace_function_call(): cur_dir = os.path.dirname(__file__) enable_trace_function_call(path, cur_dir) f1(1) - with open(path, 'r') as f: + with open(path) as f: content = f.read() assert "f1" in content diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index a3e70a40db979..84348cbc0bced 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -93,10 +93,10 @@ def test_mistral_edge_case(tokenizer, truth): def skip_special_tokens(request, tokenizer_name) -> Generator[bool, Any, None]: if "mistral" in tokenizer_name: yield ( - bool(True) if request.param else + True if request.param else pytest.skip("mistral doesn't support skip_special_tokens=False")) else: - yield bool(True) if request.param else bool(False) + yield bool(request.param) @pytest.mark.parametrize("truth", TRUTH) diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py index bbd24b085e3a7..081076ad7dbdc 100644 --- a/tools/profiler/print_layerwise_table.py +++ b/tools/profiler/print_layerwise_table.py @@ -46,7 +46,7 @@ def get_entries(node, curr_depth=0): args = parser.parse_args() - with open(args.json_trace, "r") as f: + with open(args.json_trace) as f: profile_data = json.load(f) if args.table == "summary": diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py index 65ee3ae108ae1..efd6beee865c2 100644 --- a/tools/profiler/visualize_layerwise_profile.py +++ b/tools/profiler/visualize_layerwise_profile.py @@ -434,7 +434,7 @@ def make_plot_title_suffix(profile_json: dict) -> str: f"{', Sparsity ' + sparsity if sparsity else ''}") profile_json = None - with open(json_trace, "r") as f: + with open(json_trace) as f: profile_json = json.load(f) assert profile_json is not None diff --git a/tools/report_build_time_ninja.py b/tools/report_build_time_ninja.py index 33431a33ac837..51ad2adc74fe1 100644 --- a/tools/report_build_time_ninja.py +++ b/tools/report_build_time_ninja.py @@ -81,7 +81,7 @@ def WeightedDuration(self): # Allow for modest floating-point errors epsilon = 0.000002 if (self.weighted_duration > self.Duration() + epsilon): - print('%s > %s?' % (self.weighted_duration, self.Duration())) + print('{} > {}?'.format(self.weighted_duration, self.Duration())) assert (self.weighted_duration <= self.Duration() + epsilon) return self.weighted_duration @@ -104,7 +104,7 @@ def ReadTargets(log, show_all): The result is a list of Target objects.""" header = log.readline() assert header == '# ninja log v5\n', \ - 'unrecognized ninja log version %r' % header + 'unrecognized ninja log version {!r}'.format(header) targets_dict = {} last_end_seen = 0.0 for line in log: @@ -254,8 +254,8 @@ def SummarizeEntries(entries, extra_step_types): # Warn if the sum of weighted times is off by more than half a second. if abs(length - weighted_total) > 500: print('Warning: Possible corrupt ninja log, results may be ' - 'untrustworthy. Length = %.3f, weighted total = %.3f' % - (length, weighted_total)) + 'untrustworthy. 
Length = {:.3f}, weighted total = {:.3f}'.format( + length, weighted_total)) entries_by_ext = defaultdict(list) for target in entries: @@ -263,16 +263,17 @@ def SummarizeEntries(entries, extra_step_types): entries_by_ext[extension].append(target) for key, values in entries_by_ext.items(): - print(' Longest build steps for %s:' % key) + print(' Longest build steps for {}:'.format(key)) values.sort(key=lambda x: x.WeightedDuration()) for target in values[-long_count:]: - print(' %8.1f weighted s to build %s (%.1f s elapsed time)' % - (target.WeightedDuration(), target.DescribeTargets(), - target.Duration())) - - print(' %.1f s weighted time (%.1f s elapsed time sum, %1.1fx ' - 'parallelism)' % - (length, total_cpu_time, total_cpu_time * 1.0 / length)) + print( + ' {:8.1f} weighted s to build {} ({:.1f} s elapsed time)'. + format(target.WeightedDuration(), target.DescribeTargets(), + target.Duration())) + + print(' {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x ' + 'parallelism)'.format(length, total_cpu_time, + total_cpu_time * 1.0 / length)) print(' %d build steps completed, average of %1.2f/s' % (len(entries), len(entries) / (length))) @@ -298,11 +299,12 @@ def main(): long_ext_count += len(args.step_types.split(';')) try: - with open(log_file, 'r') as log: + with open(log_file) as log: entries = ReadTargets(log, False) SummarizeEntries(entries, args.step_types) - except IOError: - print('Log file %r not found, no build summary created.' % log_file) + except OSError: + print('Log file {!r} not found, no build summary created.'.format( + log_file)) return errno.ENOENT diff --git a/use_existing_torch.py b/use_existing_torch.py index e11746459908b..319d262898fe3 100644 --- a/use_existing_torch.py +++ b/use_existing_torch.py @@ -4,7 +4,7 @@ requires_files += ["pyproject.toml"] for file in requires_files: print(f">>> cleaning {file}") - with open(file, 'r') as f: + with open(file) as f: lines = f.readlines() if "torch" in "".join(lines).lower(): print("removed:") diff --git a/vllm/attention/ops/blocksparse_attention/interface.py b/vllm/attention/ops/blocksparse_attention/interface.py index a98eb431ac7fc..350f88c8f9740 100644 --- a/vllm/attention/ops/blocksparse_attention/interface.py +++ b/vllm/attention/ops/blocksparse_attention/interface.py @@ -192,10 +192,8 @@ def spda(self, q, k, v, cu_seqlens_k, cu_seqlens_q=None, sm_scale=None): attn_mask = self.dense_attn_mask[None, :, :maxlen, :maxlen] q2 = self.transpose_and_pad(q, cu_seqlens, maxlen, 1) - k2, v2 = [ - self.transpose_and_pad(x, cu_seqlens, maxlen, q_k_ratio) - for x in [k, v] - ] + k2, v2 = (self.transpose_and_pad(x, cu_seqlens, maxlen, q_k_ratio) + for x in [k, v]) spda_output = torch.nn.functional.scaled_dot_product_attention( q2, k2, v2, attn_mask=attn_mask, scale=sm_scale) return self.transpose_and_unpad(spda_output, cu_seqlens) diff --git a/vllm/config.py b/vllm/config.py index 814e00c8785f0..851d35dfd9fb0 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -668,9 +668,10 @@ def get_multimodal_config(self) -> "MultiModalConfig": @property def is_encoder_decoder_model(self) -> bool: """Extract the HF encoder/decoder model flag.""" - return getattr(self.hf_config, "is_encoder_decoder", False) or ( - (hasattr(self.hf_config, "text_config") and getattr( - self.hf_config.text_config, "is_encoder_decoder", False))) + return getattr( + self.hf_config, "is_encoder_decoder", + False) or (hasattr(self.hf_config, "text_config") and getattr( + self.hf_config.text_config, "is_encoder_decoder", False)) @property def 
is_multimodal_model(self) -> bool: diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index 0b943e6e65f1c..ed7e06cab2996 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -52,7 +52,7 @@ def num_blocks(self) -> int: pass -class BlockMetaData(): +class BlockMetaData: """Data structure for storing key data describe cached block, so that evitor could use to make its decision which one to choose for eviction diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index 983e772a3f79b..1f78e10cc1dcd 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -240,7 +240,7 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool: if is_distributed: get_world_group().barrier() logger.info("reading GPU P2P access cache from %s", path) - with open(path, "r") as f: + with open(path) as f: cache = json.load(f) _gpu_p2p_access_cache = cache return _gpu_p2p_access_cache[f"{src}->{tgt}"] diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index b0fdc67776bbd..161b85646b6e8 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -812,7 +812,7 @@ async def _engine_abort(self, request_ids: Iterable[str]): async def run_engine_loop(engine_ref: ReferenceType): """We use a weakref to the engine so that the running loop doesn't prevent the engine being garbage collected.""" - engine: Optional["AsyncLLMEngine"] = engine_ref() + engine: Optional[AsyncLLMEngine] = engine_ref() if not engine: return diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index a1809b1a9dd26..404e7ed2c6ef9 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1541,8 +1541,8 @@ def _has_remaining_steps( seq_group.state.remaining_steps != ref_remaining_steps for seq_group in seq_group_metadata_list[1:] ]): - raise AssertionError(("All running sequence groups should " - "have the same remaining steps.")) + raise AssertionError("All running sequence groups should " + "have the same remaining steps.") return ref_remaining_steps > 0 diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index 25b7a7479672a..19dcbfe57d112 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -77,7 +77,7 @@ def __init__(self, local_interval: float) -> None: self.num_generation_tokens: List[int] = [] self.last_local_log = time.time() self.local_interval = local_interval - self.spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None + self.spec_decode_metrics: Optional[SpecDecodeWorkerMetrics] = None @abstractmethod def log(self, stats: Stats) -> None: diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 3ed37a269c4b4..223790806ab18 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -63,7 +63,7 @@ def process_prompt_logprob(self, seq_group: SequenceGroup, single_step_process_prompt_logprob(self, seq_group, output) @staticmethod - @functools.lru_cache() + @functools.lru_cache def _log_prompt_logprob_unsupported_warning_once(): # Reminder: Please update docs/source/serving/compatibility_matrix.rst # If the feature combo become valid diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 2b339ab6d44e4..0ada0aaacda24 100644 --- a/vllm/entrypoints/chat_utils.py +++ 
b/vllm/entrypoints/chat_utils.py @@ -362,7 +362,7 @@ def load_chat_template( if chat_template is None: return None try: - with open(chat_template, "r") as f: + with open(chat_template) as f: resolved_chat_template = f.read() except OSError as e: if isinstance(chat_template, Path): diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index a64467a311523..0d016d949d22b 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -120,7 +120,7 @@ async def read_file(path_or_url: str) -> str: session.get(path_or_url) as resp: return await resp.text() else: - with open(path_or_url, "r", encoding="utf-8") as f: + with open(path_or_url, encoding="utf-8") as f: return f.read() diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 9433dce842b09..66bab2c686c67 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -32,7 +32,7 @@ class RayGPUExecutor(DistributedGPUExecutor): uses_ray: bool = True def _init_executor(self) -> None: - self.forward_dag: Optional["ray.dag.CompiledDAG"] = None + self.forward_dag: Optional[ray.dag.CompiledDAG] = None # If the env var is set, it uses the Ray's compiled DAG API # which optimizes the control plane overhead. # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. diff --git a/vllm/logger.py b/vllm/logger.py index ccf09691a052a..d6fcda02a0fb3 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -67,8 +67,7 @@ def _configure_vllm_root_logger() -> None: raise RuntimeError( "Could not load logging config. File does not exist: %s", VLLM_LOGGING_CONFIG_PATH) - with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8", - mode="r") as file: + with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8") as file: custom_config = json.loads(file.read()) if not isinstance(custom_config, dict): diff --git a/vllm/lora/models.py b/vllm/lora/models.py index d0279f273db7a..81e274612b73b 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -343,7 +343,7 @@ def __init__( # text modules (e.g. ChatGLM) and hasattr(self.model, "get_mm_mapping")) self.packed_modules: Dict[str, List[str]] = {} - self.modules: Dict[str, "BaseLayerWithLoRA"] = {} + self.modules: Dict[str, BaseLayerWithLoRA] = {} # Dict instead of a Set for compatibility with LRUCache. self._last_mapping: Optional[LoRAMapping] = None self._create_lora_modules() @@ -548,7 +548,7 @@ def create_dummy_lora( else: parts = module_name.split(".") replacements = self.packed_modules_mapping[parts[-1]] - subloras: List[Optional["LoRALayerWeights"]] = [] + subloras: List[Optional[LoRALayerWeights]] = [] for i, r in enumerate(replacements): lora = LoRALayerWeights.create_dummy_lora_weights( module_name + "." + r, diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 764f4e9c99df8..bfca15c2b6a3e 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -103,7 +103,7 @@ def enabled(cls) -> bool: # On by default if VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.PIECEWISE # Specifying 'all' or 'none' in VLLM_CUSTOM_OPS takes precedence. 
@staticmethod - @lru_cache() + @lru_cache def default_on() -> bool: count_none = envs.VLLM_CUSTOM_OPS.count("none") count_all = envs.VLLM_CUSTOM_OPS.count("all") diff --git a/vllm/model_executor/layers/resampler.py b/vllm/model_executor/layers/resampler.py index bca44d2bf2e28..aae806f6af323 100644 --- a/vllm/model_executor/layers/resampler.py +++ b/vllm/model_executor/layers/resampler.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 2158ad3339673..ac60e0e6d48a0 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 1f8d531198324..464915248c9ad 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -746,7 +746,7 @@ def __init__(self, load_config: LoadConfig): config_file_path = self._get_config_file(qlora_adapter) - with open(config_file_path, "r") as f: + with open(config_file_path) as f: config = json.load(f) self.target_modules = config["target_modules"] diff --git a/vllm/model_executor/model_loader/openvino.py b/vllm/model_executor/model_loader/openvino.py index 573f2a04895d9..e6299295c85a2 100644 --- a/vllm/model_executor/model_loader/openvino.py +++ b/vllm/model_executor/model_loader/openvino.py @@ -190,7 +190,7 @@ def get_model( kv_cache_dtype: ov.Type, **kwargs, ) -> torch.nn.Module: - lora_config = kwargs.get("lora_config", None) + lora_config = kwargs.get("lora_config") ov_core = kwargs.get("ov_core") if lora_config: raise ValueError( diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 36f33d6d139ee..437d2772e1f28 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -280,7 +280,7 @@ def __init__(self, tensorizer_config: TensorizerConfig, self.tensorizer_args = ( self.tensorizer_config._construct_tensorizer_args()) self.extra_kwargs = extra_kwargs - if extra_kwargs.get("quant_config", None) is not None: + if extra_kwargs.get("quant_config") is not None: self.quant_config = extra_kwargs["quant_config"] else: self.quant_config = quant_config @@ -380,8 +380,7 @@ def tensorizer_weights_iterator( stream = open_stream(tensorizer_args.tensorizer_uri, **stream_params) with TensorDeserializer(stream, **deserializer_args, device="cpu") as state: - for name, param in state.items(): - yield name, param + yield from state.items() del state diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 0c51314bc90df..9488d54edf365 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -188,7 +188,7 @@ def get_quant_config(model_config: ModelConfig, f"{quant_config_files}") quant_config_file = quant_config_files[0] - with open(quant_config_file, "r") as f: + with open(quant_config_file) as f: config = json.load(f) if model_config.quantization == "bitsandbytes": @@ -306,7 
+306,7 @@ def filter_duplicate_safetensors_files(hf_weights_files: List[str], # Iterate through the weight_map (weight_name: safetensors files) # to identify weights that we should use. - with open(index_file_name, "r") as f: + with open(index_file_name) as f: weight_map = json.load(f)["weight_map"] weight_files_in_index = set() for weight_name in weight_map: @@ -382,7 +382,7 @@ def np_cache_weights_iterator( with open(weight_names_file, "w") as f: json.dump(weight_names, f) - with open(weight_names_file, "r") as f: + with open(weight_names_file) as f: weight_names = json.load(f) for name in weight_names: @@ -423,8 +423,7 @@ def pt_weights_iterator( bar_format=_BAR_FORMAT, ): state = torch.load(bin_file, map_location="cpu") - for name, param in state.items(): - yield name, param + yield from state.items() del state torch.cuda.empty_cache() diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index fd29d4ccc59d8..5b712ba83c25a 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -48,7 +48,7 @@ def __init__(self, is_residual_mlp: bool = False, quant_config: Optional[QuantizationConfig] = None, reduce_results: bool = True): - super(ArcticMLP, self).__init__() + super().__init__() self.hidden_size = config.hidden_size self.expert_id = expert_id self.layer_id = layer_id @@ -89,7 +89,7 @@ def __init__(self, params_dtype: Optional[torch.dtype] = None, quant_config: Optional[QuantizationConfig] = None, reduce_results: bool = True): - super(ArcticMoE, self).__init__() + super().__init__() self.tp_size = tp_size or get_tensor_model_parallel_world_size() self.hidden_size = config.hidden_size diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index f2cfdf8ffd30a..1fbf4135add7a 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. # # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 77ab7de6165fb..83ff39a30fbe3 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/bloom/modeling_bloom.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 181f3c2b0fc35..881b86564e811 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/THUDM/GLM-4 """Inference-only ChatGLM model compatible with THUDM weights.""" diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 348e6d20f3297..835682ca3b379 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 Cohere and the HuggingFace Inc. team. All rights reserved. 
# # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index aae7ab7370b74..3e60eee2d8fe2 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -1,4 +1,3 @@ -# coding=utf-8 from typing import Iterable, List, Optional, Tuple, Union import torch diff --git a/vllm/model_executor/models/decilm.py b/vllm/model_executor/models/decilm.py index 7ed2b96e65c49..8c9653463858b 100644 --- a/vllm/model_executor/models/decilm.py +++ b/vllm/model_executor/models/decilm.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 DeciAI Research Team. All rights reserved. diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 5b4db8f258711..d278ea5b6a991 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index d4ad0c6b5c99e..834be78bce87b 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 22f194c776b69..23efe0359cb4a 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/modeling_exaone.py # Copyright 2024 The LG U+ CTO AI Tech Lab. diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index c376347811965..ad07fc3b3776e 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/a5cc30d72ae2dc19af534e4b35c986cc28db1275/src/transformers/models/falcon/modeling_falcon.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 0de590d1d8372..3db82a898159b 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -1,4 +1,3 @@ -# coding=utf-8 # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/fuyu/modeling_fuyu.py # Copyright 2023 The vLLM team. # Copyright 2023 HuggingFace Inc. team. All rights reserved. diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 029178af61da0..fc3f5cb20afb0 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2023 The vLLM team. # Copyright (c) Google Inc. # diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 9238ed839c9de..c365880109ef8 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 The vLLM team. 
# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. # diff --git a/vllm/model_executor/models/glm4_vision_encoder.py b/vllm/model_executor/models/glm4_vision_encoder.py index 3213a8b29a104..025615b0920fd 100644 --- a/vllm/model_executor/models/glm4_vision_encoder.py +++ b/vllm/model_executor/models/glm4_vision_encoder.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/THUDM/GLM-4 """Inference-only GLM-4v model visual encoder compatible with THUDM weights.""" diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 3330d84021368..a06200c4b7e08 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 24c79a8855475..7612ea641d95c 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 9a42b359ae44f..b28a6081b868f 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gptj/modeling_gptj.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 1bccef7a5f173..931052c7cccf0 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index c968817747754..bee48f377e0f5 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 5307bb21adb96..691a6e77c46c4 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index 43f4f29814e6d..53869b8fa6bd8 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -1,5 +1,3 @@ -# coding=utf-8 - # adapted from https://github.com/huggingface/transformers/blob/v4.43.2/src/transformers/models/idefics2/modeling_idefics2.py # Copyright 2024 The vLLM team. # Copyright 2024 the HuggingFace Inc. team. All rights reserved. 
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 313d98b649b48..afefb6cd9fa96 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- from functools import partial from typing import Any, Dict, Iterable, List, Optional, Tuple, Union diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index edd867e4b6457..108fc8382049d 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- from typing import List, Optional, Tuple, Union import torch diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index b947f24a693b5..301893f74cb87 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/inceptionai/jais-30b-chat-v3/blob/main/modeling_jais.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 6f7949c880e61..81d88a47c1941 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -1,4 +1,3 @@ -# coding=utf-8 """Inference-only Jamba model.""" from typing import Iterable, List, Optional, Tuple diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 38a31f420cec9..6c0a8b5ef8451 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index 985ba6f3c60c1..aac4b7aa2661d 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -1,4 +1,3 @@ -# coding=utf-8 """PyTorch MAMBA model.""" from typing import Iterable, List, Optional, Tuple diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 03fb036020f2f..acf03cd8cb8ad 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index 3b5fd95328d74..eeedf55cf3e57 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2024 The ModelBest team. diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index f90df6b7df036..5acd3f65896c7 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 1514243ad59c9..e9b9c4d838faa 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 63e2c60a84271..9647d69be8a0a 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 251bfc079684e..5fa8d19b97fe8 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 the HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index 42ccd01298169..ae218d749fc0b 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -37,7 +37,7 @@ def __init__( eps=1e-06, elementwise_scale_and_shift=True, ): - super(MLPSpeculatorLayerNorm, self).__init__() + super().__init__() self.elementwise_scale_and_shift = elementwise_scale_and_shift if self.elementwise_scale_and_shift: self.weight = nn.Parameter(torch.empty(normalized_shape)) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 522aa748f78b6..785b53670542f 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1121,9 +1121,9 @@ def _merge_multimodal_embeddings( batch_size * num_image * num_patch, -1).contiguous() image_input_idx = image_input_idx * valid.to(image_input_idx.dtype) - offset = torch.cat( - [seq_len.new_zeros( - (1)), seq_len.cumsum(dim=0)[:-1]], dim=0)[:, None] + offset = torch.cat([seq_len.new_zeros(1), + seq_len.cumsum(dim=0)[:-1]], + dim=0)[:, None] image_input_idx = image_input_idx + offset.to(image_input_idx.dtype) image_input_idx = image_input_idx.flatten()[:, None] mat = image_input_idx == torch.arange( diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index ee802030a5ef3..fdd8af79b5470 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main import math from typing import Iterable, List, Optional, Tuple, Union diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 72a09129fed63..b649064536dc2 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 90ab8abcb84b4..dd3f58289a227 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/olmo/modeling_olmo.py # Copyright 2024 The vLLM team. diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 7521ab749e10f..7a76e4a0906db 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 055407587c598..a338a93c2dd9a 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/modeling_orion.py # Copyright (c) OrionStar Inc. diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index fc9ef15db26c0..bd4a9f698bacd 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -1,4 +1,3 @@ -# coding=utf-8 # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/persimmon/modeling_persimmon.py # Copyright 2023 The vLLM team. # Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 4e7935a7636c5..492122450b237 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_phi.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/phi3.py b/vllm/model_executor/models/phi3.py index 02b2ff01c3832..34141511ea791 100644 --- a/vllm/model_executor/models/phi3.py +++ b/vllm/model_executor/models/phi3.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from llama.py """Inference-only Phi3 model code inherit from Llama.py""" diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 5b477a8ed5f49..1c41891ced416 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 The vLLM team. # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. # diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index bb8a9327b4ac8..59843ae3dfd59 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index ee9f150b17cfc..6e9092432467a 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -136,11 +136,11 @@ def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs): if image_token_id not in inputs['prompt_token_ids']: raise ValueError( - (f"You've passed {inputs=} without {image_token_id=}" - " Make sure to process your input via mistral_common's" - " tokenizer or pass a chat completion request. For more" - " For more info, see: " - "https://github.com/vllm-project/vllm/issues/8411.")) + f"You've passed {inputs=} without {image_token_id=}" + " Make sure to process your input via mistral_common's" + " tokenizer or pass a chat completion request. For more" + " For more info, see: " + "https://github.com/vllm-project/vllm/issues/8411.") return inputs diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index b2b5c70182135..3a0e33e8a3eff 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py # Copyright (c) Alibaba Cloud. diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 72b286fe6f6d6..49b3de1304cca 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py # Copyright 2024 The Qwen team. @@ -417,9 +416,9 @@ def __init__( and hasattr(config, "max_window_layers")): raise ValueError("Sliding window for some but all layers is not " "supported. This model uses sliding window " - "but `max_window_layers` = %s is less than " - "`num_hidden_layers` = %s. Please open an issue " - "to discuss this feature." % ( + "but `max_window_layers` = {} is less than " + "`num_hidden_layers` = {}. Please open an issue " + "to discuss this feature.".format( config.max_window_layers, config.num_hidden_layers, )) diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 6114548bda42c..556c09400ee83 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. diff --git a/vllm/model_executor/models/qwen2_cls.py b/vllm/model_executor/models/qwen2_cls.py index 2d6f3e90f761c..b9e3b74c477e2 100644 --- a/vllm/model_executor/models/qwen2_cls.py +++ b/vllm/model_executor/models/qwen2_cls.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py # Copyright 2024 Kakao Corp. (Kanana-X Team) @@ -60,9 +59,9 @@ def __init__( and hasattr(config, "max_window_layers")): raise ValueError("Sliding window for some but all layers is not " "supported. This model uses sliding window " - "but `max_window_layers` = %s is less than " - "`num_hidden_layers` = %s. Please open an issue " - "to discuss this feature." % ( + "but `max_window_layers` = {} is less than " + "`num_hidden_layers` = {}. 
Please open an issue " + "to discuss this feature.".format( config.max_window_layers, config.num_hidden_layers, )) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index dac85e35d369d..98bb48a274e49 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py # Copyright 2024 The Qwen team. diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 901b1daaa14a4..0fbf305da8b94 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py # Copyright 2024 The Qwen team. @@ -71,9 +70,9 @@ def __init__( and hasattr(config, "max_window_layers")): raise ValueError("Sliding window for some but all layers is not " "supported. This model uses sliding window " - "but `max_window_layers` = %s is less than " - "`num_hidden_layers` = %s. Please open an issue " - "to discuss this feature." % ( + "but `max_window_layers` = {} is less than " + "`num_hidden_layers` = {}. Please open an issue " + "to discuss this feature.".format( config.max_window_layers, config.num_hidden_layers, )) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index d801903f8f9fe..e30b84e8dd44c 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/19e6e80e10118f855137b90740936c0b11ac397f/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py # Copyright 2024 The Qwen team. @@ -246,9 +245,8 @@ def forward( q, k, v = dist_utils.split_tensor_along_last_dim(x, 3) batch_size = q.shape[1] - q, k, v = [ - rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v) - ] + q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() + for x in (q, k, v)) if rotary_pos_emb is not None: q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) @@ -258,7 +256,7 @@ def forward( # flash_attn_varlen_func) from flash_attn import flash_attn_varlen_func - q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]] + q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() output = flash_attn_varlen_func(q, @@ -276,7 +274,7 @@ def forward( b=batch_size) elif self.attn_backend == _Backend.TORCH_SDPA: seq_length = q.size(1) - q, k, v = [rearrange(x, "b s h d -> b h s d") for x in [q, k, v]] + q, k, v = (rearrange(x, "b s h d -> b h s d") for x in [q, k, v]) attention_mask = torch.zeros([1, seq_length, seq_length], device=q.device, dtype=torch.bool) diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index e3e7ccb5cf179..1b233ac7427dd 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 083a48588d01a..34389b645a7c1 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team. # All rights reserved. # diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 8f0644bca3e2e..b24c5dadb2b2b 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 BigCode and the HuggingFace Inc. team. All rights reserved. # # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py index 036789642d3c4..e559988ada753 100644 --- a/vllm/model_executor/models/xverse.py +++ b/vllm/model_executor/models/xverse.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/xverse/XVERSE-7B/blob/main/modeling_xverse.py # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 6b10d0c609f13..5ff6f93fb25b4 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -1,4 +1,3 @@ -import sys from abc import ABC, abstractmethod from collections import UserDict, defaultdict from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Mapping, @@ -34,14 +33,9 @@ :meth:`MultiModalInputs.batch`. """ -if sys.version_info < (3, 9): - # UserDict cannot be subscripted - class _MultiModalInputsBase(UserDict): - pass -else: - class _MultiModalInputsBase(UserDict[str, NestedTensors]): - pass +class _MultiModalInputsBase(UserDict[str, NestedTensors]): + pass class MultiModalInputs(_MultiModalInputsBase): @@ -262,18 +256,23 @@ def wrapper(model_cls: N) -> N: logger.warning( "Model class %s already has an input mapper " "registered to %s. It is overwritten by the new one.", - model_cls, self) + model_cls, + self, + ) - self._input_mappers[model_cls] = mapper \ - or self._default_input_mapper + self._input_mappers[model_cls] = (mapper + or self._default_input_mapper) return model_cls return wrapper - def map_input(self, model_config: "ModelConfig", - data: MultiModalData[object], - mm_processor_kwargs: Dict[str, Any]) -> MultiModalInputs: + def map_input( + self, + model_config: "ModelConfig", + data: MultiModalData[object], + mm_processor_kwargs: Dict[str, Any], + ) -> MultiModalInputs: """ Transform the data into a dictionary of model inputs using the input mapper registered for that model. @@ -348,13 +347,15 @@ def wrapper(model_cls: N) -> N: logger.warning( "Model class %s already calculates maximum number of " "tokens in %s. 
It is overwritten by the new one.", - model_cls, self) + model_cls, + self, + ) if isinstance(max_mm_tokens, int): self._validate_max_multimodal_tokens(max_mm_tokens) - self._max_mm_tokens[model_cls] = max_mm_tokens \ - or self._default_max_multimodal_tokens + self._max_mm_tokens[model_cls] = ( + max_mm_tokens or self._default_max_multimodal_tokens) return model_cls @@ -482,8 +483,10 @@ def from_seq_group( placeholder_maps: Dict[str, MultiModalPlaceholderMap] = defaultdict( MultiModalPlaceholderMap) - for modality, placeholders in seq_group.multi_modal_placeholders.items( - ): + for ( + modality, + placeholders, + ) in seq_group.multi_modal_placeholders.items(): mm_items = mm_data.pop(modality) if not isinstance(mm_items, list): mm_items = [mm_items] @@ -499,8 +502,11 @@ def from_seq_group( return mm_data, placeholder_maps def append_items_from_seq_group( - self, positions: range, multi_modal_items: List[_T], - multi_modal_placeholders: List[PlaceholderRange]) -> List[_T]: + self, + positions: range, + multi_modal_items: List[_T], + multi_modal_placeholders: List[PlaceholderRange], + ) -> List[_T]: """ Adds the multi-modal items that intersect ```positions`` to this placeholder map and returns the intersecting items. @@ -515,20 +521,26 @@ def append_items_from_seq_group( multi_modal_items): placeholder = range( placeholder_dict["offset"], - placeholder_dict["offset"] + placeholder_dict["length"]) - intersection = range(max(positions.start, placeholder.start), - min(positions.stop, placeholder.stop)) + placeholder_dict["offset"] + placeholder_dict["length"], + ) + intersection = range( + max(positions.start, placeholder.start), + min(positions.stop, placeholder.stop), + ) if not intersection: # Skip this multi-modal item. continue - token_embedding_range = range(intersection.start - positions.start, - intersection.stop - positions.start) + token_embedding_range = range( + intersection.start - positions.start, + intersection.stop - positions.start, + ) multimodal_embedding_range = range( intersection.start - placeholder.start + self.src_len, - intersection.stop - placeholder.start + self.src_len) + intersection.stop - placeholder.start + self.src_len, + ) intersecting_items.append(mm_item) self.dest_ranges.append(token_embedding_range) diff --git a/vllm/prompt_adapter/utils.py b/vllm/prompt_adapter/utils.py index 4cde2a0254b90..473b87c89c21d 100644 --- a/vllm/prompt_adapter/utils.py +++ b/vllm/prompt_adapter/utils.py @@ -37,9 +37,8 @@ def load_peft_weights(model_id: str, Additional arguments to pass to the `hf_hub_download` method when loading from the HuggingFace Hub. 
""" - path = (os.path.join(model_id, hf_hub_download_kwargs["subfolder"]) - if hf_hub_download_kwargs.get("subfolder", None) is not None else - model_id) + path = (os.path.join(model_id, hf_hub_download_kwargs["subfolder"]) if + hf_hub_download_kwargs.get("subfolder") is not None else model_id) if device is None: device = infer_device() @@ -51,19 +50,19 @@ def load_peft_weights(model_id: str, filename = os.path.join(path, WEIGHTS_NAME) use_safetensors = False else: - token = hf_hub_download_kwargs.get("token", None) + token = hf_hub_download_kwargs.get("token") if token is None: - token = hf_hub_download_kwargs.get("use_auth_token", None) + token = hf_hub_download_kwargs.get("use_auth_token") hub_filename = (os.path.join(hf_hub_download_kwargs["subfolder"], SAFETENSORS_WEIGHTS_NAME) - if hf_hub_download_kwargs.get("subfolder", None) - is not None else SAFETENSORS_WEIGHTS_NAME) + if hf_hub_download_kwargs.get("subfolder") is not None + else SAFETENSORS_WEIGHTS_NAME) has_remote_safetensors_file = file_exists( repo_id=model_id, filename=hub_filename, - revision=hf_hub_download_kwargs.get("revision", None), - repo_type=hf_hub_download_kwargs.get("repo_type", None), + revision=hf_hub_download_kwargs.get("revision"), + repo_type=hf_hub_download_kwargs.get("repo_type"), token=token, ) use_safetensors = has_remote_safetensors_file diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 08697274854e0..1a5870aa4f84c 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -308,7 +308,7 @@ def load_params_config(model, revision) -> PretrainedConfig: config_path = Path( hf_hub_download(model, config_file_name, revision=revision)) - with open(config_path, "r") as file: + with open(config_path) as file: config_dict = json.load(file) config_mapping = { diff --git a/vllm/transformers_utils/configs/chatglm.py b/vllm/transformers_utils/configs/chatglm.py index 49d2b8d8e21b1..e563bf6268d72 100644 --- a/vllm/transformers_utils/configs/chatglm.py +++ b/vllm/transformers_utils/configs/chatglm.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/THUDM/ChatGLM2-6B from transformers import PretrainedConfig diff --git a/vllm/transformers_utils/configs/exaone.py b/vllm/transformers_utils/configs/exaone.py index 805b8ad930039..f60a59f554133 100644 --- a/vllm/transformers_utils/configs/exaone.py +++ b/vllm/transformers_utils/configs/exaone.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copied from # https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/configuration_exaone.py # Copyright 2021 The LG AI Research EXAONE Lab. All rights reserved. diff --git a/vllm/transformers_utils/configs/jais.py b/vllm/transformers_utils/configs/jais.py index b06a946f34a47..82f129eb2018e 100644 --- a/vllm/transformers_utils/configs/jais.py +++ b/vllm/transformers_utils/configs/jais.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # Copyright 2023 Cerebras Systems. 
diff --git a/vllm/transformers_utils/configs/mpt.py b/vllm/transformers_utils/configs/mpt.py index 497db0ae48c96..0f047c8b0361c 100644 --- a/vllm/transformers_utils/configs/mpt.py +++ b/vllm/transformers_utils/configs/mpt.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copied from # https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py """A HuggingFace-style model configuration.""" @@ -117,10 +116,10 @@ def _validate_config(self) -> None: init_config_defaults) if self.d_model % self.n_heads != 0: raise ValueError('d_model must be divisible by n_heads') - if any(( + if any( prob < 0 or prob > 1 for prob in - [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop] - )): + [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop + ]): raise ValueError( "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are " "probabilities and must be between 0 and 1") diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py index 139e6b3cdacbe..93fec667d1cf3 100644 --- a/vllm/transformers_utils/configs/nemotron.py +++ b/vllm/transformers_utils/configs/nemotron.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 HuggingFace Inc. team. All rights reserved. # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # @@ -144,7 +143,7 @@ def __init__( self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - head_dim = head_dim or kwargs.get("kv_channels", None) + head_dim = head_dim or kwargs.get("kv_channels") self.head_dim = head_dim if head_dim is not None else ( hidden_size // num_attention_heads) @@ -160,8 +159,8 @@ def __init__( self.rope_theta = rope_theta self.rope_scaling = rope_scaling # for backward compatibility - partial_rotary_factor = kwargs.get("rope_percent", None) or kwargs.get( - "rope_percentage", None) or partial_rotary_factor + partial_rotary_factor = kwargs.get("rope_percent") or kwargs.get( + "rope_percentage") or partial_rotary_factor self.partial_rotary_factor = partial_rotary_factor self._rope_scaling_validation() self.attention_bias = attention_bias diff --git a/vllm/transformers_utils/configs/solar.py b/vllm/transformers_utils/configs/solar.py index d5113bf01695a..0c1c048f670ee 100644 --- a/vllm/transformers_utils/configs/solar.py +++ b/vllm/transformers_utils/configs/solar.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 
# # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX diff --git a/vllm/utils.py b/vllm/utils.py index 0b75e8761c916..6edc8d72f6bcf 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1153,7 +1153,7 @@ class SortedHelpFormatter(argparse.HelpFormatter): def add_arguments(self, actions): actions = sorted(actions, key=lambda x: x.option_strings) - super(SortedHelpFormatter, self).add_arguments(actions) + super().add_arguments(actions) class FlexibleArgumentParser(argparse.ArgumentParser): @@ -1279,7 +1279,7 @@ def _load_config_file(self, file_path: str) -> List[str]: config: Dict[str, Union[int, str]] = {} try: - with open(file_path, 'r') as config_file: + with open(file_path) as config_file: config = yaml.safe_load(config_file) except Exception as ex: logger.error( From a5fda50a10641e47c0c290907f30ef2add6d4e7a Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Wed, 6 Nov 2024 16:50:37 +0800 Subject: [PATCH 23/24] [CI/Build] Fix large_gpu_mark reason (#10070) Signed-off-by: Isotr0py <2037008807@qq.com> --- tests/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utils.py b/tests/utils.py index 16e21f68c7c96..00c7dabe16a7b 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -699,7 +699,7 @@ def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator: return pytest.mark.skipif( memory_gb < min_gb, - reason=f"Need at least {memory_gb}GB GPU memory to run the test.", + reason=f"Need at least {min_gb}GB GPU memory to run the test.", ) From 8e62377e52c804e368f020e76220a261ebfe300a Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 6 Nov 2024 14:45:06 +0200 Subject: [PATCH 24/24] format.sh --- .../test_lm_eval_correctness.py | 20 ++++++++----------- setup.py | 3 +-- vllm/executor/ray_hpu_executor.py | 2 +- vllm/model_executor/layers/sampler.py | 2 +- vllm/worker/hpu_model_runner.py | 7 +++---- 5 files changed, 14 insertions(+), 20 deletions(-) diff --git a/.jenkins/lm-eval-harness/test_lm_eval_correctness.py b/.jenkins/lm-eval-harness/test_lm_eval_correctness.py index 3df0621f49a72..9a31f59b828a9 100644 --- a/.jenkins/lm-eval-harness/test_lm_eval_correctness.py +++ b/.jenkins/lm-eval-harness/test_lm_eval_correctness.py @@ -76,18 +76,14 @@ def report_performance(task, input_lens, output_lens, time, record_property): context_lens = [i + o for i, o in zip(input_lens, output_lens)] gen_tput = sum(output_lens) / time all_lens = [input_lens, output_lens, context_lens] - min_input_tokens, min_output_tokens, min_context_tokens = [ - min(x) for x in all_lens - ] - max_input_tokens, max_output_tokens, max_context_tokens = [ - max(x) for x in all_lens - ] - mean_input_tokens, mean_output_tokens, mean_context_tokens = [ - statistics.mean(x) for x in all_lens - ] - stddev_input_tokens, stddev_output_tokens, stddev_context_tokens = [ - statistics.stdev(x) for x in all_lens - ] + min_input_tokens, min_output_tokens, min_context_tokens = ( + min(x) for x in all_lens) + max_input_tokens, max_output_tokens, max_context_tokens = ( + max(x) for x in all_lens) + mean_input_tokens, mean_output_tokens, mean_context_tokens = ( + statistics.mean(x) for x in all_lens) + stddev_input_tokens, stddev_output_tokens, stddev_context_tokens = ( + statistics.stdev(x) for x in all_lens) msg = ( f'{task} | estimated average generation throughput: {gen_tput:.2f} tokens/s \n' # noqa: G004, E501 f'{task} | input_tokens | min: {min_input_tokens} | max: {max_input_tokens} | mean: {mean_input_tokens:.2f} | stddev: {stddev_input_tokens:.2f}\n' # noqa: E501 diff --git a/setup.py 
b/setup.py index 17ed12009af0f..5939d9ec89457 100644 --- a/setup.py +++ b/setup.py @@ -382,8 +382,7 @@ def get_gaudi_sw_version(): output = subprocess.run("hl-smi", shell=True, text=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + capture_output=True, env={"ENABLE_CONSOLE": "true"}) if output.returncode == 0 and output.stdout: return output.stdout.split("\n")[2].replace( diff --git a/vllm/executor/ray_hpu_executor.py b/vllm/executor/ray_hpu_executor.py index 58b1447531841..ebfaafd29f92c 100644 --- a/vllm/executor/ray_hpu_executor.py +++ b/vllm/executor/ray_hpu_executor.py @@ -34,7 +34,7 @@ class RayHPUExecutor(DistributedGPUExecutor): uses_ray: bool = True def _init_executor(self) -> None: - self.forward_dag: Optional["ray.dag.CompiledDAG"] = None + self.forward_dag: Optional[ray.dag.CompiledDAG] = None # If the env var is set, it uses the Ray's compiled DAG API # which optimizes the control plane overhead. # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 388a871c40d2e..56ac01680ee45 100755 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -358,7 +358,7 @@ def _get_bin_counts_and_mask( return bin_counts, mask -class ApplyToppTopkScalar(): +class ApplyToppTopkScalar: """ The original implementation of _apply_top_k_top_p is more general as it uses vector topp, topk diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 60a3ba2650377..9e2ee861ebbd4 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -73,8 +73,7 @@ class Singleton(type): def __call__(cls, *args, **kwargs): if cls not in cls._instances: - cls._instances[cls] = super(Singleton, - cls).__call__(*args, **kwargs) + cls._instances[cls] = super().__call__(*args, **kwargs) return cls._instances[cls] @@ -281,7 +280,7 @@ def precompute_indices_and_offsets(block_size, slot_mapping, is_prompt): return indices, offsets -class HpuModelAdapter(): +class HpuModelAdapter: def __init__(self, model, block_size, dtype, enforce_eager): self.model = model @@ -1788,7 +1787,7 @@ def _maybe_wrap_in_hpu_graph(*args, **kwargs): ) if htorch.utils.internal.is_lazy() else HpuModelAdapter(*args, **kwargs) -class HabanaProfilerCounterHelper(): +class HabanaProfilerCounterHelper: def __init__(self): self.niter = 0