diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
new file mode 100644
index 0000000000000..78347f63fa793
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
+model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
+tasks:
+- name: "gsm8k"
+ metrics:
+ - name: "exact_match,strict-match"
+ value: 0.356
+ - name: "exact_match,flexible-extract"
+ value: 0.358
+limit: 1000
+num_fewshot: 5
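
The baseline values above are presumably what the lm-eval harness compares measured GSM8K scores against. A minimal sketch of that comparison, assuming a `measured` dict keyed by task and metric name (the real logic lives in the `.buildkite/lm-eval-harness` scripts, which are not part of this diff):

```python
# Sketch only: load a baseline YAML like the one added above and check measured
# lm-eval scores against it. `measured` is assumed to map task -> metric -> value.
import yaml

def check_baseline(config_path: str, measured: dict, rel_tol: float = 0.05) -> None:
    with open(config_path) as f:
        cfg = yaml.safe_load(f)
    for task in cfg["tasks"]:
        for metric in task["metrics"]:
            expected = metric["value"]
            got = measured[task["name"]][metric["name"]]
            # allow a small relative tolerance around the recorded baseline
            assert abs(got - expected) <= rel_tol * expected, (
                f"{task['name']}/{metric['name']}: got {got}, expected ~{expected}")
```
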
diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt
index 64a0f428587af..6057229ac50f3 100644
--- a/.buildkite/lm-eval-harness/configs/models-small.txt
+++ b/.buildkite/lm-eval-harness/configs/models-small.txt
@@ -1,6 +1,6 @@
Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
-Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
+Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index e72138e29dd65..3b7fa0f2d94b3 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -3,7 +3,7 @@ steps:
agents:
queue: cpu_queue
commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
# rename the files to change linux -> manylinux1
@@ -22,7 +22,7 @@ steps:
agents:
queue: cpu_queue
commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
# rename the files to change linux -> manylinux1
diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh
index df201cdc7c554..860272e71fd84 100755
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@@ -31,8 +31,8 @@ cleanup_docker() {
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
- # Remove unused volumes
- docker volume prune -f
+ # Remove unused volumes and force a system prune that also drops images older than 72h.
+ docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
@@ -107,11 +107,12 @@ fi
PARALLEL_JOB_COUNT=8
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
+ # use the parallel job count as the number of shards
+ commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
- #replace shard arguments
- commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
- commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
- echo "Shard ${GPU} commands:$commands"
+ # assign the shard-id for this GPU's copy of the commands
+ commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
+ echo "Shard ${GPU} commands:$commands_gpu"
docker run \
--device /dev/kfd --device /dev/dri \
--network host \
@@ -123,7 +124,7 @@ if [[ $commands == *"--shard-id="* ]]; then
-e HF_HOME=${HF_MOUNT} \
--name ${container_name}_${GPU} \
${image_name} \
- /bin/bash -c "${commands}" \
+ /bin/bash -c "${commands_gpu}" \
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
PIDS+=($!)
done
diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index c2818c38965ea..c331a9c49c0d0 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -32,10 +32,10 @@ docker exec cpu-test bash -c "
--ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
# Run compressed-tensor test
-# docker exec cpu-test bash -c "
-# pytest -s -v \
-# tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
-# tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token"
+docker exec cpu-test bash -c "
+ pytest -s -v \
+ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
+ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
# Run AWQ test
docker exec cpu-test bash -c "
diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh
index 6989c94d46a89..988d5aef5fb8c 100644
--- a/.buildkite/run-tpu-test.sh
+++ b/.buildkite/run-tpu-test.sh
@@ -12,4 +12,4 @@ remove_docker_container
# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
-docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
+docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 4c2fe41c739b1..9444dc43ea97e 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -9,6 +9,7 @@
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only
+# nightly(bool): run this test in nightly pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually)
# command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for test. incompatible with command.
@@ -77,8 +78,8 @@ steps:
- vllm/
- tests/basic_correctness/test_chunked_prefill
commands:
- - VLLM_ATTENTION_BACKEND=XFORMERS VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py
- - VLLM_ATTENTION_BACKEND=FLASH_ATTN VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py
+ - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
+ - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- label: Core Test # 10min
mirror_hardwares: [amd]
@@ -88,11 +89,7 @@ steps:
- vllm/distributed
- tests/core
commands:
- - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core/test_scheduler.py
- - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/test_chunked_prefill_scheduler.py
- - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/block/e2e/test_correctness.py
- - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/block/e2e/test_correctness_sliding_window.py
- - pytest -v -s core --ignore=core/block/e2e/test_correctness.py --ignore=core/test_scheduler.py --ignore=core/test_chunked_prefill_scheduler.py --ignore=core/block/e2e/test_correctness.py --ignore=core/block/e2e/test_correctness_sliding_window.py
+ - pytest -v -s core
- label: Entrypoints Test # 40min
working_dir: "/vllm-workspace/tests"
@@ -184,6 +181,7 @@ steps:
- python3 offline_inference_vision_language_multi_image.py
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference_encoder_decoder.py
+ - python3 offline_profile.py --model facebook/opt-125m
- label: Prefix Caching Test # 9min
#mirror_hardwares: [amd]
@@ -191,8 +189,7 @@ steps:
- vllm/
- tests/prefix_caching
commands:
- - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s prefix_caching/test_prefix_caching.py
- - pytest -v -s prefix_caching --ignore=prefix_caching/test_prefix_caching.py
+ - pytest -v -s prefix_caching
- label: Samplers Test # 36min
source_file_dependencies:
@@ -216,8 +213,7 @@ steps:
- tests/spec_decode
commands:
- pytest -v -s spec_decode/e2e/test_multistep_correctness.py
- - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s spec_decode/e2e/test_compatibility.py
- - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_compatibility.py
+ - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
- label: LoRA Test %N # 15min each
mirror_hardwares: [amd]
@@ -234,15 +230,16 @@ steps:
- tests/compile
commands:
- pytest -v -s compile/test_basic_correctness.py
+ # these tests need to be run separately; they cannot be combined into one pytest invocation
+ - pytest -v -s compile/piecewise/test_simple.py
+ - pytest -v -s compile/piecewise/test_toy_llama.py
-# TODO: re-write in comparison tests, and fix symbolic shape
-# for quantization ops.
-# - label: "PyTorch Fullgraph Test" # 18min
-# source_file_dependencies:
-# - vllm/
-# - tests/compile
-# commands:
-# - pytest -v -s compile/test_full_graph.py
+- label: "PyTorch Fullgraph Test" # 18min
+ source_file_dependencies:
+ - vllm/
+ - tests/compile
+ commands:
+ - pytest -v -s compile/test_full_graph.py
- label: Kernels Test %N # 1h each
mirror_hardwares: [amd]
@@ -317,33 +314,57 @@ steps:
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s models/*.py --ignore=models/test_oot_registration.py
-- label: Decoder-only Language Models Test # 1h36min
+- label: Decoder-only Language Models Test (Standard) # 35min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/decoder_only/language
commands:
- - pytest -v -s models/decoder_only/language
+ - pytest -v -s models/decoder_only/language/test_models.py
+ - pytest -v -s models/decoder_only/language/test_big_models.py
-- label: Decoder-only Multi-Modal Models Test # 1h31min
+- label: Decoder-only Language Models Test (Extended) # 1h20min
+ nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/decoder_only/language
+ commands:
+ - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py --ignore=models/decoder_only/language/test_big_models.py
+
+- label: Decoder-only Multi-Modal Models Test (Standard)
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
commands:
- - pytest -v -s models/decoder_only/audio_language
- - pytest -v -s models/decoder_only/vision_language
+ - pytest -v -s models/decoder_only/audio_language -m core_model
+ - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m core_model
+
+- label: Decoder-only Multi-Modal Models Test (Extended)
+ nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/decoder_only/audio_language
+ - tests/models/decoder_only/vision_language
+ commands:
+ - pytest -v -s models/decoder_only/audio_language -m 'not core_model'
+ # HACK - run phi3v tests separately to sidestep this transformers bug
+ # https://github.com/huggingface/transformers/issues/34307
+ - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
+ - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model'
- label: Other Models Test # 6min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/embedding/language
+ - tests/models/embedding/vision_language
- tests/models/encoder_decoder/language
- tests/models/encoder_decoder/vision_language
commands:
- pytest -v -s models/embedding/language
+ - pytest -v -s models/embedding/vision_language
- pytest -v -s models/encoder_decoder/language
- pytest -v -s models/encoder_decoder/vision_language
@@ -402,11 +423,11 @@ steps:
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
- - TARGET_TEST_SUITE=L4 VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest basic_correctness/ -v -s -m distributed_2_gpus
+ - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
- - pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
+ - pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed_2_gpus
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py
diff --git a/.dockerignore b/.dockerignore
index 17ed0d97c88b3..3863656915d03 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,6 +1,33 @@
-/.github/
/.venv
/build
dist
-Dockerfile*
vllm/*.so
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+.mypy_cache
+
+# Distribution / packaging
+.Python
+/build/
+cmake-build-*/
+CMakeUserPresets.json
+develop-eggs/
+/dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 6fddca0d6e4b9..4f54eea564ecb 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -5,3 +5,28 @@ updates:
directory: "/"
schedule:
interval: "weekly"
+ - package-ecosystem: "pip"
+ directory: "/"
+ schedule:
+ interval: "weekly"
+ labels: ["dependencies"]
+ open-pull-requests-limit: 5
+ reviewers: ["khluu", "simon-mo"]
+ allow:
+ - dependency-type: "all"
+ ignore:
+ - dependency-name: "torch"
+ - dependency-name: "torchvision"
+ - dependency-name: "xformers"
+ - dependency-name: "lm-format-enforcer"
+ - dependency-name: "gguf"
+ - dependency-name: "compressed-tensors"
+ - dependency-name: "ray[adag]"
+ - dependency-name: "lm-eval"
+ groups:
+ patch-update:
+ applies-to: version-updates
+ update-types: ["patch"]
+ minor-update:
+ applies-to: version-updates
+ update-types: ["minor"]
diff --git a/.github/mergify.yml b/.github/mergify.yml
new file mode 100644
index 0000000000000..1ce5039a061b2
--- /dev/null
+++ b/.github/mergify.yml
@@ -0,0 +1,58 @@
+pull_request_rules:
+- name: label-documentation
+ description: Automatically apply documentation label
+ conditions:
+ - or:
+ - files~=^[^/]+\.md$
+ - files~=^docs/
+ actions:
+ label:
+ add:
+ - documentation
+
+- name: label-ci-build
+ description: Automatically apply ci/build label
+ conditions:
+ - or:
+ - files~=^\.github/
+ - files~=\.buildkite/
+ - files~=^cmake/
+ - files=CMakeLists.txt
+ - files~=^Dockerfile
+ - files~=^requirements.*\.txt
+ - files=setup.py
+ actions:
+ label:
+ add:
+ - ci/build
+
+- name: label-frontend
+ description: Automatically apply frontend label
+ conditions:
+ - files~=^vllm/entrypoints/
+ actions:
+ label:
+ add:
+ - frontend
+
+- name: ping author on conflicts and add 'needs-rebase' label
+ conditions:
+ - conflict
+ - -closed
+ actions:
+ label:
+ add:
+ - needs-rebase
+ comment:
+ message: |
+ This pull request has merge conflicts that must be resolved before it can be
+ merged. @{{author}} please rebase it. https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
+
+- name: remove 'needs-rebase' label when conflict is resolved
+ conditions:
+ - -conflict
+ - -closed
+ actions:
+ label:
+ remove:
+ - needs-rebase
diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml
index 2a0e3239f58da..b80749aaa8fec 100644
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -34,4 +34,5 @@ jobs:
- name: "Run actionlint"
run: |
+ echo "::add-matcher::.github/workflows/matchers/actionlint.json"
tools/actionlint.sh -color
diff --git a/.github/workflows/add_label_automerge.yml b/.github/workflows/add_label_automerge.yml
index 2e7c7f7f087af..c9d6d4259df99 100644
--- a/.github/workflows/add_label_automerge.yml
+++ b/.github/workflows/add_label_automerge.yml
@@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Add label
- uses: actions/github-script@v7
+ uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
with:
script: |
github.rest.issues.addLabels({
diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml
index 341fc0665a402..77c007c2ad1ad 100644
--- a/.github/workflows/clang-format.yml
+++ b/.github/workflows/clang-format.yml
@@ -17,9 +17,9 @@ jobs:
matrix:
python-version: ["3.11"]
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v5
+ uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
@@ -38,4 +38,4 @@ jobs:
)
find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
| grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
- | xargs clang-format --dry-run --Werror
\ No newline at end of file
+ | xargs clang-format --dry-run --Werror
diff --git a/.github/workflows/matchers/mypy.json b/.github/workflows/matchers/mypy.json
new file mode 100644
index 0000000000000..f048fce528941
--- /dev/null
+++ b/.github/workflows/matchers/mypy.json
@@ -0,0 +1,16 @@
+{
+ "problemMatcher": [
+ {
+ "owner": "mypy",
+ "pattern": [
+ {
+ "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$",
+ "file": 1,
+ "line": 2,
+ "severity": 3,
+ "message": 4
+ }
+ ]
+ }
+ ]
+}
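
The matcher above turns mypy output into GitHub annotations. As a quick sanity check (not part of the patch), the regexp can be exercised against a typical mypy diagnostic; the path and message below are illustrative:

```python
# Exercise the problem-matcher regex against an illustrative mypy-style line.
import re

pattern = re.compile(r"^(.+):(\d+):\s(error|warning):\s(.+)$")
sample = "vllm/engine/arg_utils.py:42: error: Incompatible return value type"
match = pattern.match(sample)
assert match is not None
file, line, severity, message = match.groups()
print(file, line, severity)  # vllm/engine/arg_utils.py 42 error
```
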
diff --git a/.github/workflows/matchers/ruff.json b/.github/workflows/matchers/ruff.json
new file mode 100644
index 0000000000000..f6d4479ee1996
--- /dev/null
+++ b/.github/workflows/matchers/ruff.json
@@ -0,0 +1,17 @@
+{
+ "problemMatcher": [
+ {
+ "owner": "ruff",
+ "pattern": [
+ {
+ "regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$",
+ "file": 1,
+ "line": 2,
+ "column": 3,
+ "code": 4,
+ "message": 5
+ }
+ ]
+ }
+ ]
+ }
diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
index 053684bebb6f2..5d73daf09b1ce 100644
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@@ -17,9 +17,9 @@ jobs:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v5
+ uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
@@ -32,4 +32,5 @@ jobs:
pip install types-setuptools
- name: Mypy
run: |
- tools/mypy.sh
+ echo "::add-matcher::.github/workflows/matchers/mypy.json"
+ tools/mypy.sh 1
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 96549b3f99181..f959a1cacf866 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -21,7 +21,7 @@ jobs:
upload_url: ${{ steps.create_release.outputs.upload_url }}
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
- name: Extract branch info
shell: bash
@@ -30,7 +30,7 @@ jobs:
- name: Create Release
id: create_release
- uses: "actions/github-script@v7"
+ uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
env:
RELEASE_TAG: ${{ env.release_tag }}
with:
@@ -54,10 +54,10 @@ jobs:
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
- name: Setup ccache
- uses: hendrikmuhs/ccache-action@v1.2
+ uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
with:
create-symlink: true
key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
@@ -68,7 +68,7 @@ jobs:
bash -x .github/workflows/scripts/env.sh
- name: Set up Python
- uses: actions/setup-python@v5
+ uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
with:
python-version: ${{ matrix.python-version }}
@@ -92,7 +92,7 @@ jobs:
echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
- name: Upload Release Asset
- uses: actions/upload-release-asset@v1
+ uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
index 98c5570f61aff..c65730f77a6a6 100644
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@@ -17,9 +17,9 @@ jobs:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v5
+ uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
@@ -28,7 +28,8 @@ jobs:
pip install -r requirements-lint.txt
- name: Analysing the code with ruff
run: |
- ruff check .
+ echo "::add-matcher::.github/workflows/matchers/ruff.json"
+ ruff check --output-format github .
- name: Spelling check with codespell
run: |
codespell --toml pyproject.toml
diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh
index cda0c28c75c2a..122e4e101e201 100644
--- a/.github/workflows/scripts/build.sh
+++ b/.github/workflows/scripts/build.sh
@@ -1,4 +1,5 @@
#!/bin/bash
+set -eux
python_executable=python$1
cuda_home=/usr/local/cuda-$2
@@ -8,13 +9,15 @@ PATH=${cuda_home}/bin:$PATH
LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
# Install requirements
-$python_executable -m pip install wheel packaging 'setuptools-scm>=8'
-$python_executable -m pip install -r requirements-cuda.txt
+$python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt
# Limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1
# Make sure release wheels are built for the following architectures
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"
+
+bash tools/check_repo.sh
+
# Build
$python_executable setup.py bdist_wheel --dist-dir=dist
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
new file mode 100644
index 0000000000000..81e7c9b050760
--- /dev/null
+++ b/.github/workflows/stale.yml
@@ -0,0 +1,52 @@
+name: 'Close inactive issues and PRs'
+
+on:
+ schedule:
+ # Daily at 1:30 AM UTC
+ - cron: '30 1 * * *'
+
+jobs:
+ close-issues-and-pull-requests:
+ permissions:
+ issues: write
+ pull-requests: write
+ actions: write
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
+ with:
+ # Increasing this value ensures that changes to this workflow
+ # propagate to all issues and PRs in days rather than months
+ operations-per-run: 1000
+
+ exempt-draft-pr: true
+ exempt-issue-labels: 'keep-open'
+ exempt-pr-labels: 'keep-open'
+
+ labels-to-add-when-unstale: 'unstale'
+ labels-to-remove-when-stale: 'unstale'
+
+ days-before-issue-stale: 90
+ days-before-issue-close: 30
+ stale-issue-label: 'stale'
+ stale-issue-message: >
+ This issue has been automatically marked as stale because it has not
+ had any activity within 90 days. It will be automatically closed if no
+ further activity occurs within 30 days. Leave a comment if
+ you feel this issue should remain open. Thank you!
+ close-issue-message: >
+ This issue has been automatically closed due to inactivity. Please
+ feel free to reopen if you feel it is still relevant. Thank you!
+
+ days-before-pr-stale: 90
+ days-before-pr-close: 30
+ stale-pr-label: 'stale'
+ stale-pr-message: >
+ This pull request has been automatically marked as stale because it
+ has not had any activity within 90 days. It will be automatically
+ closed if no further activity occurs within 30 days. Leave a comment
+ if you feel this pull request should remain open. Thank you!
+ close-pr-message: >
+ This pull request has been automatically closed due to inactivity.
+ Please feel free to reopen if you intend to continue working on it.
+ Thank you!
diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml
index 68eb06dea47a3..f422588a5f73b 100644
--- a/.github/workflows/yapf.yml
+++ b/.github/workflows/yapf.yml
@@ -17,9 +17,9 @@ jobs:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v5
+ uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index f1959ad2743f3..42cbf18a0f712 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -13,10 +13,10 @@ sphinx:
fail_on_warning: true
# If using Sphinx, optionally build your docs in additional formats such as PDF
-formats:
- - pdf
+formats: []
# Optionally declare the Python requirements required to build your docs
python:
install:
- requirements: docs/requirements-docs.txt
+
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3a424ad7b110f..943424bc4edfa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -49,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm
#
-set(TORCH_SUPPORTED_VERSION_CUDA "2.4.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")
#
# Try to find python package with an executable that exactly matches
@@ -83,24 +83,6 @@ endif()
#
find_package(Torch REQUIRED)
-#
-message(STATUS "Enabling core extension.")
-
-# Define _core_C extension
-# built for (almost) every target platform, (excludes TPU and Neuron)
-
-set(VLLM_EXT_SRC
- "csrc/core/torch_bindings.cpp")
-
-define_gpu_extension_target(
- _core_C
- DESTINATION vllm
- LANGUAGE CXX
- SOURCES ${VLLM_EXT_SRC}
- COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
- USE_SABI 3
- WITH_SOABI)
-
#
# Forward the non-CUDA device extensions to external CMake scripts.
#
@@ -187,12 +169,12 @@ endif()
#
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
-# Configure it to place files in vllm/.deps, in order to play nicely with sccache.
+# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
+# Each dependency that produces build artifacts should override its BINARY_DIR to avoid
+# conflicts between build types. It should instead be set to a subdirectory of ${CMAKE_BINARY_DIR}.
#
include(FetchContent)
-get_filename_component(PROJECT_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
-file(MAKE_DIRECTORY "${FETCHCONTENT_BASE_DIR}")
-set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT_DIR}/.deps")
+file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
#
@@ -213,7 +195,6 @@ set(VLLM_EXT_SRC
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
"csrc/quantization/fp8/common.cu"
"csrc/cuda_utils_kernels.cu"
- "csrc/moe_align_block_size_kernels.cu"
"csrc/prepare_inputs/advance_step.cu"
"csrc/torch_bindings.cpp")
@@ -270,7 +251,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
else()
message(STATUS "Not building Marlin kernels as no compatible archs found"
- "in CUDA target architectures")
+ " in CUDA target architectures")
endif()
#
@@ -286,10 +267,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
else()
- # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
- # build any 3x kernels
- set(SCALED_MM_3X_ARCHS)
-
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
@@ -299,13 +276,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
"in CUDA target architectures")
endif()
+
+ # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
+ # build any 3x kernels
+ set(SCALED_MM_3X_ARCHS)
endif()
#
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
# kernels for the remaining archs that are not already built for 3x.
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
- "7.5;8.0;8.6;8.9;9.0;9.0a" "${CUDA_ARCHS}")
+ "7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS)
@@ -423,6 +404,7 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp"
+ "csrc/moe/moe_align_sum_kernels.cu"
"csrc/moe/topk_softmax_kernels.cu")
set_gencode_flags_for_srcs(
@@ -450,7 +432,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
else()
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
- "in CUDA target architectures")
+ " in CUDA target architectures")
endif()
endif()
@@ -525,8 +507,10 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
- GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd
+ GIT_TAG 5259c586c403a4e4d8bf69973c159b40cc346fb9
GIT_PROGRESS TRUE
+ # Don't share the vllm-flash-attn build between build types
+ BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
)
endif()
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 5f79356bd32f7..b39fd75b5fb70 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -11,12 +11,14 @@ We also believe in the power of community support; thus, answering queries, offe
Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository!
+## License
+
+See [LICENSE](LICENSE).
## Developing
Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the [building from source](https://docs.vllm.ai/en/latest/getting_started/installation.html#build-from-source) documentation for details.
-
## Testing
```bash
@@ -33,6 +35,14 @@ pytest tests/
## Contribution Guidelines
+### DCO and Signed-off-by
+
+When contributing changes to this project, you must agree to the [DCO](DCO).
+Commits must include a `Signed-off-by:` header which certifies agreement with
+the terms of the [DCO](DCO).
+
+Using `-s` with `git commit` will automatically add this header.
+
### Issues
If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
diff --git a/DCO b/DCO
new file mode 100644
index 0000000000000..49b8cb0549267
--- /dev/null
+++ b/DCO
@@ -0,0 +1,34 @@
+Developer Certificate of Origin
+Version 1.1
+
+Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
+
+Everyone is permitted to copy and distribute verbatim copies of this
+license document, but changing it is not allowed.
+
+
+Developer's Certificate of Origin 1.1
+
+By making a contribution to this project, I certify that:
+
+(a) The contribution was created in whole or in part by me and I
+ have the right to submit it under the open source license
+ indicated in the file; or
+
+(b) The contribution is based upon previous work that, to the best
+ of my knowledge, is covered under an appropriate open source
+ license and I have the right under that license to submit that
+ work with modifications, whether created in whole or in part
+ by me, under the same open source license (unless I am
+ permitted to submit under a different license), as indicated
+ in the file; or
+
+(c) The contribution was provided directly to me by some other
+ person who certified (a), (b) or (c) and I have not modified
+ it.
+
+(d) I understand and agree that this project and the contribution
+ are public and that a record of the contribution (including all
+ personal information I submit with it, including my sign-off) is
+ maintained indefinitely and may be redistributed consistent with
+ this project or the open source license(s) involved.
diff --git a/Dockerfile b/Dockerfile
index 8405e0a88a106..343364da2ebf5 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -70,16 +70,10 @@ COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-build.txt
-# files and directories related to build wheels
-COPY csrc csrc
-COPY setup.py setup.py
-COPY cmake cmake
-COPY CMakeLists.txt CMakeLists.txt
-COPY README.md README.md
-COPY requirements-common.txt requirements-common.txt
-COPY requirements-cuda.txt requirements-cuda.txt
-COPY pyproject.toml pyproject.toml
-COPY vllm vllm
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
# max jobs used by Ninja to build extensions
ARG max_jobs=2
@@ -212,7 +206,7 @@ FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
- pip install accelerate hf_transfer 'modelscope!=1.15.0' bitsandbytes>=0.44.0 timm==0.9.10
+ pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' timm==0.9.10
ENV VLLM_USAGE_SOURCE production-docker-image
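
Several Dockerfiles in this patch gain the same `GIT_REPO_CHECK` guard around `tools/check_repo.sh`. The script itself is not shown in this diff; as a loose illustration only, a repo-consistency gate of this kind could be approximated in Python as:

```python
# Hypothetical stand-in for a repo check (tools/check_repo.sh is not part of this
# diff): fail the build if the bind-mounted .git shows a dirty working tree, so
# the wheel is not built from a build context that drifted from the checkout.
import subprocess
import sys

def main() -> int:
    status = subprocess.run(["git", "status", "--porcelain"],
                            capture_output=True, text=True, check=True)
    if status.stdout.strip():
        sys.stderr.write("Build context differs from the git checkout:\n")
        sys.stderr.write(status.stdout)
        return 1
    return 0

if __name__ == "__main__":
    sys.exit(main())
```
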
diff --git a/Dockerfile.cpu b/Dockerfile.cpu
index b9134d4ae41cb..f1a21d6bd13fc 100644
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -33,19 +33,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \
pip install --upgrade pip && \
pip install -r requirements-build.txt
-# install oneDNN
-RUN git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git
-
-RUN --mount=type=cache,target=/root/.cache/ccache \
- cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \
- -DONEDNN_BUILD_DOC=OFF \
- -DONEDNN_BUILD_EXAMPLES=OFF \
- -DONEDNN_BUILD_TESTS=OFF \
- -DONEDNN_BUILD_GRAPH=OFF \
- -DONEDNN_ENABLE_WORKLOAD=INFERENCE \
- -DONEDNN_ENABLE_PRIMITIVE=MATMUL && \
- cmake --build ./oneDNN/build --target install --config Release
-
FROM cpu-test-1 AS build
WORKDIR /workspace/vllm
@@ -55,7 +42,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
pip install -v -r requirements-cpu.txt
-COPY ./ ./
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512
diff --git a/Dockerfile.neuron b/Dockerfile.neuron
index adae6db87ba87..2143315d2a078 100644
--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
@@ -17,7 +17,7 @@ RUN apt-get update && \
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/app
VOLUME [ ${APP_MOUNT} ]
-WORKDIR ${APP_MOUNT}
+WORKDIR ${APP_MOUNT}/vllm
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
@@ -25,17 +25,17 @@ RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
-COPY . /app/vllm
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
-RUN cd /app/vllm \
- && python3 -m pip install -U \
- cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
+RUN python3 -m pip install -U \
+ 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
-r requirements-neuron.txt
ENV VLLM_TARGET_DEVICE neuron
RUN --mount=type=bind,source=.git,target=.git \
- cd /app/vllm \
- && pip install --no-build-isolation -v -e . \
- && cd ..
+ pip install --no-build-isolation -v -e .
CMD ["/bin/bash"]
diff --git a/Dockerfile.openvino b/Dockerfile.openvino
index 95714a3d17188..a05ff452cd36e 100644
--- a/Dockerfile.openvino
+++ b/Dockerfile.openvino
@@ -9,23 +9,17 @@ RUN apt-get update -y && \
ffmpeg libsm6 libxext6 libgl1
WORKDIR /workspace
-# copy requirements
-COPY requirements-build.txt /workspace/vllm/
-COPY requirements-common.txt /workspace/vllm/
-COPY requirements-openvino.txt /workspace/vllm/
-
-COPY vllm/ /workspace/vllm/vllm
-COPY csrc/core /workspace/vllm/csrc/core
-COPY cmake/utils.cmake /workspace/vllm/cmake/
-COPY CMakeLists.txt /workspace/vllm/
-COPY setup.py /workspace/vllm/
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
# install build requirements
-RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
+RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt
# build vLLM with OpenVINO backend
-RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
+RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace
-COPY examples/ /workspace/vllm/examples
-COPY benchmarks/ /workspace/vllm/benchmarks
+COPY examples/ /workspace/examples
+COPY benchmarks/ /workspace/benchmarks
CMD ["/bin/bash"]
diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le
index 1f374b01b9bc0..b19c6ddec7948 100644
--- a/Dockerfile.ppc64le
+++ b/Dockerfile.ppc64le
@@ -14,11 +14,14 @@ RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
# These packages will be in rocketce eventually
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
- cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
+ 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
torch==2.3.1 \
-r requirements-cpu.txt \
xformers uvloop==0.20.0
@@ -30,4 +33,4 @@ WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 496e6bed7c022..8fb79afaebe97 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -52,7 +52,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip uninstall -y torch torchvision \
&& python3 -m pip install --pre \
torch==2.6.0.dev20240918 \
- setuptools-scm>=8 \
+ 'setuptools-scm>=8' \
torchvision==0.20.0.dev20240918 \
--extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \
*) ;; esac
@@ -117,6 +117,11 @@ RUN --mount=type=cache,target=${CCACHE_DIR} \
FROM base AS final
# Import the vLLM development directory from the build context
COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
+
+RUN python3 -m pip install --upgrade pip
# Package upgrades for useful functionality or to avoid dependency issues
RUN --mount=type=cache,target=/root/.cache/pip \
diff --git a/Dockerfile.tpu b/Dockerfile.tpu
index d8f1a42c45177..b43442e4c0af1 100644
--- a/Dockerfile.tpu
+++ b/Dockerfile.tpu
@@ -1,8 +1,8 @@
-ARG NIGHTLY_DATE="20240828"
+ARG NIGHTLY_DATE="20241017"
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
FROM $BASE_IMAGE
-WORKDIR /workspace
+WORKDIR /workspace/vllm
# Install some basic utilities
RUN apt-get update && apt-get install -y \
@@ -16,14 +16,17 @@ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
# Build vLLM.
-COPY . /workspace/vllm
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
+
ENV VLLM_TARGET_DEVICE="tpu"
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
- cd /workspace/vllm && \
python3 -m pip install \
- cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
+ 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
-r requirements-tpu.txt
-RUN cd /workspace/vllm && python3 setup.py develop
+RUN python3 setup.py develop
CMD ["/bin/bash"]
diff --git a/Dockerfile.xpu b/Dockerfile.xpu
index 83db341556eaf..0ecb46df6256c 100644
--- a/Dockerfile.xpu
+++ b/Dockerfile.xpu
@@ -33,7 +33,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
-r requirements-xpu.txt
-COPY ./ /workspace/vllm
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
ENV VLLM_TARGET_DEVICE=xpu
diff --git a/README.md b/README.md
index 7768cbfa06749..f41749ecb1148 100644
--- a/README.md
+++ b/README.md
@@ -13,9 +13,19 @@ Easy, fast, and cheap LLM serving for everyone
| Intel® Gaudi® README | Documentation | Blog | Paper | Discord | Twitter/X | Developer Slack |
+---
+
+**vLLM x Snowflake Meetup (Wednesday, November 13th, 5:30-8PM PT) at Snowflake HQ, San Mateo**
+
+We are excited to announce the last in-person vLLM meetup of the year!
+Join the vLLM developers and engineers from Snowflake AI Research to chat about the latest LLM inference optimizations and your 2025 vLLM wishlist!
+Register [here](https://lu.ma/h0qvrajz) and be a part of the event!
+
+---
+
*Latest News* 🔥
-- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
+- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/sessioncatalog?tab.day=20241001&search.sessiontracks=1719251906298001uzJ2) from other vLLM contributors and users!
- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
@@ -43,7 +53,7 @@ vLLM is fast with:
- Speculative decoding
- Chunked prefill
-**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script.
+**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script.
vLLM is flexible and easy to use with:
@@ -128,5 +138,6 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
* For technical questions and feature requests, please use Github issues or discussions.
* For discussing with fellow users, please use Discord.
+* For coordinating contributions and development, please use Slack.
* For security disclosures, please use Github's security advisory feature.
* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 4813fde27f0bc..0a903877f000d 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -324,7 +324,7 @@ async def async_request_openai_chat_completions(
},
],
"temperature": 0.0,
- "max_tokens": request_func_input.output_len,
+ "max_completion_tokens": request_func_input.output_len,
"stream": True,
"ignore_eos": request_func_input.ignore_eos,
}
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 30373b119a2ca..0a14aedd5feba 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -1,5 +1,6 @@
"""Benchmark the latency of processing a single batch of requests."""
import argparse
+import dataclasses
import json
import time
from pathlib import Path
@@ -10,45 +11,19 @@
from tqdm import tqdm
from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs
+from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptType
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import FlexibleArgumentParser
def main(args: argparse.Namespace):
print(args)
+ engine_args = EngineArgs.from_cli_args(args)
+
# NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches.
- llm = LLM(
- model=args.model,
- speculative_model=args.speculative_model,
- num_speculative_tokens=args.num_speculative_tokens,
- speculative_draft_tensor_parallel_size=\
- args.speculative_draft_tensor_parallel_size,
- tokenizer=args.tokenizer,
- quantization=args.quantization,
- tensor_parallel_size=args.tensor_parallel_size,
- trust_remote_code=args.trust_remote_code,
- dtype=args.dtype,
- max_model_len=args.max_model_len,
- enforce_eager=args.enforce_eager,
- kv_cache_dtype=args.kv_cache_dtype,
- quantization_param_path=args.quantization_param_path,
- device=args.device,
- ray_workers_use_nsight=args.ray_workers_use_nsight,
- use_v2_block_manager=args.use_v2_block_manager,
- enable_chunked_prefill=args.enable_chunked_prefill,
- download_dir=args.download_dir,
- block_size=args.block_size,
- gpu_memory_utilization=args.gpu_memory_utilization,
- load_format=args.load_format,
- distributed_executor_backend=args.distributed_executor_backend,
- otlp_traces_endpoint=args.otlp_traces_endpoint,
- enable_prefix_caching=args.enable_prefix_caching,
- max_num_seqs=args.batch_size,
- )
+ llm = LLM(**dataclasses.asdict(engine_args))
sampling_params = SamplingParams(
n=args.n,
@@ -127,19 +102,6 @@ def run_to_completion(profile_dir: Optional[str] = None):
parser = FlexibleArgumentParser(
description='Benchmark the latency of processing a single batch of '
'requests till completion.')
- parser.add_argument('--model', type=str, default='facebook/opt-125m')
- parser.add_argument('--speculative-model', type=str, default=None)
- parser.add_argument('--num-speculative-tokens', type=int, default=None)
- parser.add_argument('--speculative-draft-tensor-parallel-size',
- '-spec-draft-tp',
- type=int,
- default=None)
- parser.add_argument('--tokenizer', type=str, default=None)
- parser.add_argument('--quantization',
- '-q',
- choices=[*QUANTIZATION_METHODS, None],
- default=None)
- parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
parser.add_argument('--input-len', type=int, default=32)
parser.add_argument('--output-len', type=int, default=128)
parser.add_argument('--batch-size', type=int, default=8)
@@ -156,45 +118,6 @@ def run_to_completion(profile_dir: Optional[str] = None):
type=int,
default=30,
help='Number of iterations to run.')
- parser.add_argument('--trust-remote-code',
- action='store_true',
- help='trust remote code from huggingface')
- parser.add_argument(
- '--max-model-len',
- type=int,
- default=None,
- help='Maximum length of a sequence (including prompt and output). '
- 'If None, will be derived from the model.')
- parser.add_argument(
- '--dtype',
- type=str,
- default='auto',
- choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
- help='data type for model weights and activations. '
- 'The "auto" option will use FP16 precision '
- 'for FP32 and FP16 models, and BF16 precision '
- 'for BF16 models.')
- parser.add_argument('--enforce-eager',
- action='store_true',
- help='enforce eager mode and disable CUDA graph')
- parser.add_argument(
- '--kv-cache-dtype',
- type=str,
- choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3', 'fp8_inc'],
- default="auto",
- help='Data type for kv cache storage. If "auto", will use model '
- 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
- 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
- parser.add_argument(
- '--quantization-param-path',
- type=str,
- default=None,
- help='Path to the JSON file containing the KV cache scaling factors. '
- 'This should generally be supplied, when KV cache dtype is FP8. '
- 'Otherwise, KV cache scaling factors default to 1.0, which may cause '
- 'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
- 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
- 'instead supported for common inference criteria.')
parser.add_argument(
'--profile',
action='store_true',
@@ -205,81 +128,12 @@ def run_to_completion(profile_dir: Optional[str] = None):
default=None,
help=('path to save the pytorch profiler output. Can be visualized '
'with ui.perfetto.dev or Tensorboard.'))
- parser.add_argument("--device",
- type=str,
- default="auto",
- choices=DEVICE_OPTIONS,
- help='device type for vLLM execution')
- parser.add_argument('--block-size',
- type=int,
- default=16,
- help='block size of key/value cache')
- parser.add_argument(
- '--enable-chunked-prefill',
- action='store_true',
- help='If True, the prefill requests can be chunked based on the '
- 'max_num_batched_tokens')
- parser.add_argument("--enable-prefix-caching",
- action='store_true',
- help="Enable automatic prefix caching")
- parser.add_argument('--use-v2-block-manager',
- action='store_true',
- default=EngineArgs.use_v2_block_manager)
- parser.add_argument(
- "--ray-workers-use-nsight",
- action='store_true',
- help="If specified, use nsight to profile ray workers",
- )
- parser.add_argument('--download-dir',
- type=str,
- default=None,
- help='directory to download and load the weights, '
- 'default to the default cache dir of huggingface')
parser.add_argument(
'--output-json',
type=str,
default=None,
help='Path to save the latency results in JSON format.')
- parser.add_argument('--gpu-memory-utilization',
- type=float,
- default=0.9,
- help='the fraction of GPU memory to be used for '
- 'the model executor, which can range from 0 to 1.'
- 'If unspecified, will use the default value of 0.9.')
- parser.add_argument(
- '--load-format',
- type=str,
- default=EngineArgs.load_format,
- choices=[
- 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
- 'bitsandbytes'
- ],
- help='The format of the model weights to load.\n\n'
- '* "auto" will try to load the weights in the safetensors format '
- 'and fall back to the pytorch bin format if safetensors format '
- 'is not available.\n'
- '* "pt" will load the weights in the pytorch bin format.\n'
- '* "safetensors" will load the weights in the safetensors format.\n'
- '* "npcache" will load the weights in pytorch format and store '
- 'a numpy cache to speed up the loading.\n'
- '* "dummy" will initialize the weights with random values, '
- 'which is mainly for profiling.\n'
- '* "tensorizer" will load the weights using tensorizer from '
- 'CoreWeave. See the Tensorize vLLM Model script in the Examples'
- 'section for more information.\n'
- '* "bitsandbytes" will load the weights using bitsandbytes '
- 'quantization.\n')
- parser.add_argument(
- '--distributed-executor-backend',
- choices=['ray', 'mp'],
- default=None,
- help='Backend to use for distributed serving. When more than 1 GPU '
- 'is used, will be automatically set to "ray" if installed '
- 'or "mp" (multiprocessing) otherwise.')
- parser.add_argument(
- '--otlp-traces-endpoint',
- type=str,
- default=None,
- help='Target URL to which OpenTelemetry traces will be sent.')
+
+ parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()
main(args)
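
The refactor above replaces the hand-maintained flag list with EngineArgs' own CLI plumbing. A toy illustration of that pattern (the fields here are placeholders, not the real EngineArgs surface):

```python
# Toy version of the add_cli_args / from_cli_args pattern the benchmark now uses.
import argparse
import dataclasses

@dataclasses.dataclass
class ToyEngineArgs:
    model: str = "facebook/opt-125m"
    tensor_parallel_size: int = 1

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        parser.add_argument("--model", type=str, default="facebook/opt-125m")
        parser.add_argument("--tensor-parallel-size", type=int, default=1)
        return parser

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> "ToyEngineArgs":
        return cls(model=args.model,
                   tensor_parallel_size=args.tensor_parallel_size)

parser = ToyEngineArgs.add_cli_args(argparse.ArgumentParser())
engine_args = ToyEngineArgs.from_cli_args(parser.parse_args([]))
engine_kwargs = dataclasses.asdict(engine_args)  # then forwarded as LLM(**engine_kwargs)
```
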
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index f14092d347343..1aac029992dbf 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -25,6 +25,7 @@
--input-length-range 128:256
"""
+import dataclasses
import json
import random
import time
@@ -130,13 +131,9 @@ def main(args):
filtered_datasets = [(PROMPT, prompt_len, args.output_len)
] * args.num_prompts
- llm = LLM(model=args.model,
- tokenizer_mode='auto',
- trust_remote_code=True,
- enforce_eager=True,
- use_v2_block_manager=args.use_v2_block_manager,
- tensor_parallel_size=args.tensor_parallel_size,
- enable_prefix_caching=args.enable_prefix_caching)
+ engine_args = EngineArgs.from_cli_args(args)
+
+ llm = LLM(**dataclasses.asdict(engine_args))
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
@@ -164,22 +161,11 @@ def main(args):
parser = FlexibleArgumentParser(
description=
'Benchmark the performance with or without automatic prefix caching.')
- parser.add_argument('--model',
- type=str,
- default='baichuan-inc/Baichuan2-13B-Chat')
parser.add_argument("--dataset-path",
type=str,
default=None,
help="Path to the dataset.")
- parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
parser.add_argument('--output-len', type=int, default=10)
- parser.add_argument('--enable-prefix-caching',
- action='store_true',
- help='enable prefix caching')
- parser.add_argument('--use-v2-block-manager',
- action='store_true',
- default=EngineArgs.use_v2_block_manager,
- help='Use BlockSpaceMangerV2')
parser.add_argument('--num-prompts',
type=int,
default=1,
@@ -196,9 +182,7 @@ def main(args):
default='128:256',
help='Range of input lengths for sampling prompts,'
'specified as "min:max" (e.g., "128:256").')
- parser.add_argument("--seed",
- type=int,
- default=0,
- help='Random seed for reproducibility')
+
+ parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()
main(args)
diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py
index 8843e3a927a01..e0c9e6a6db502 100644
--- a/benchmarks/benchmark_prioritization.py
+++ b/benchmarks/benchmark_prioritization.py
@@ -1,5 +1,6 @@
"""Benchmark offline prioritization."""
import argparse
+import dataclasses
import json
import random
import time
@@ -7,7 +8,8 @@
from transformers import AutoTokenizer, PreTrainedTokenizerBase
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.engine.arg_utils import EngineArgs
+from vllm.utils import FlexibleArgumentParser
def sample_requests(
@@ -62,46 +64,11 @@ def sample_requests(
def run_vllm(
requests: List[Tuple[str, int, int]],
- model: str,
- tokenizer: str,
- quantization: Optional[str],
- tensor_parallel_size: int,
- seed: int,
n: int,
- trust_remote_code: bool,
- dtype: str,
- max_model_len: Optional[int],
- enforce_eager: bool,
- kv_cache_dtype: str,
- quantization_param_path: Optional[str],
- device: str,
- enable_prefix_caching: bool,
- enable_chunked_prefill: bool,
- max_num_batched_tokens: int,
- gpu_memory_utilization: float = 0.9,
- download_dir: Optional[str] = None,
+ engine_args: EngineArgs,
) -> float:
from vllm import LLM, SamplingParams
- llm = LLM(
- model=model,
- tokenizer=tokenizer,
- quantization=quantization,
- tensor_parallel_size=tensor_parallel_size,
- seed=seed,
- trust_remote_code=trust_remote_code,
- dtype=dtype,
- max_model_len=max_model_len,
- gpu_memory_utilization=gpu_memory_utilization,
- enforce_eager=enforce_eager,
- kv_cache_dtype=kv_cache_dtype,
- quantization_param_path=quantization_param_path,
- device=device,
- enable_prefix_caching=enable_prefix_caching,
- download_dir=download_dir,
- enable_chunked_prefill=enable_chunked_prefill,
- max_num_batched_tokens=max_num_batched_tokens,
- disable_log_stats=False,
- )
+ llm = LLM(**dataclasses.asdict(engine_args))
# Add the requests to the engine.
prompts = []
@@ -142,16 +109,8 @@ def main(args: argparse.Namespace):
args.output_len)
if args.backend == "vllm":
- elapsed_time = run_vllm(requests, args.model, args.tokenizer,
- args.quantization, args.tensor_parallel_size,
- args.seed, args.n, args.trust_remote_code,
- args.dtype, args.max_model_len,
- args.enforce_eager, args.kv_cache_dtype,
- args.quantization_param_path, args.device,
- args.enable_prefix_caching,
- args.enable_chunked_prefill,
- args.max_num_batched_tokens,
- args.gpu_memory_utilization, args.download_dir)
+ elapsed_time = run_vllm(requests, args.n,
+ EngineArgs.from_cli_args(args))
else:
raise ValueError(f"Unknown backend: {args.backend}")
total_num_tokens = sum(prompt_len + output_len
@@ -173,7 +132,7 @@ def main(args: argparse.Namespace):
if __name__ == "__main__":
- parser = argparse.ArgumentParser(description="Benchmark the throughput.")
+ parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser.add_argument("--backend",
type=str,
choices=["vllm", "hf", "mii"],
@@ -191,13 +150,6 @@ def main(args: argparse.Namespace):
default=None,
help="Output length for each request. Overrides the "
"output length from the dataset.")
- parser.add_argument("--model", type=str, default="facebook/opt-125m")
- parser.add_argument("--tokenizer", type=str, default=None)
- parser.add_argument('--quantization',
- '-q',
- choices=[*QUANTIZATION_METHODS, None],
- default=None)
- parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--n",
type=int,
default=1,
@@ -206,81 +158,13 @@ def main(args: argparse.Namespace):
type=int,
default=200,
help="Number of prompts to process.")
- parser.add_argument("--seed", type=int, default=0)
- parser.add_argument('--trust-remote-code',
- action='store_true',
- help='trust remote code from huggingface')
- parser.add_argument(
- '--max-model-len',
- type=int,
- default=None,
- help='Maximum length of a sequence (including prompt and output). '
- 'If None, will be derived from the model.')
- parser.add_argument(
- '--dtype',
- type=str,
- default='auto',
- choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
- help='data type for model weights and activations. '
- 'The "auto" option will use FP16 precision '
- 'for FP32 and FP16 models, and BF16 precision '
- 'for BF16 models.')
- parser.add_argument('--gpu-memory-utilization',
- type=float,
- default=0.9,
- help='the fraction of GPU memory to be used for '
- 'the model executor, which can range from 0 to 1.'
- 'If unspecified, will use the default value of 0.9.')
- parser.add_argument("--enforce-eager",
- action="store_true",
- help="enforce eager execution")
- parser.add_argument(
- '--kv-cache-dtype',
- type=str,
- choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
- default="auto",
- help='Data type for kv cache storage. If "auto", will use model '
- 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
- 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
- parser.add_argument(
- '--quantization-param-path',
- type=str,
- default=None,
- help='Path to the JSON file containing the KV cache scaling factors. '
- 'This should generally be supplied, when KV cache dtype is FP8. '
- 'Otherwise, KV cache scaling factors default to 1.0, which may cause '
- 'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
- 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
- 'instead supported for common inference criteria.')
- parser.add_argument(
- "--device",
- type=str,
- default="cuda",
- choices=["cuda", "cpu"],
- help='device type for vLLM execution, supporting CUDA and CPU.')
- parser.add_argument(
- "--enable-prefix-caching",
- action='store_true',
- help="enable automatic prefix caching for vLLM backend.")
- parser.add_argument("--enable-chunked-prefill",
- action='store_true',
- help="enable chunked prefill for vLLM backend.")
- parser.add_argument('--max-num-batched-tokens',
- type=int,
- default=None,
- help='maximum number of batched tokens per '
- 'iteration')
- parser.add_argument('--download-dir',
- type=str,
- default=None,
- help='directory to download and load the weights, '
- 'default to the default cache dir of huggingface')
parser.add_argument(
'--output-json',
type=str,
default=None,
help='Path to save the throughput results in JSON format.')
+ parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 04999518b7138..ff06622628219 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -53,6 +53,8 @@
except ImportError:
from argparse import ArgumentParser as FlexibleArgumentParser
+MILLISECONDS_TO_SECONDS_CONVERSION = 1000
+
@dataclass
class BenchmarkMetrics:
@@ -60,6 +62,7 @@ class BenchmarkMetrics:
total_input: int
total_output: int
request_throughput: float
+ request_goodput: float
output_throughput: float
total_token_throughput: float
mean_ttft_ms: float
@@ -202,6 +205,7 @@ def sample_hf_requests(
dataset_split: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
+ random_seed: int,
fixed_output_len: Optional[int] = None,
) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
dataset = load_dataset(dataset_path,
@@ -210,8 +214,8 @@ def sample_hf_requests(
streaming=True)
assert "conversations" in dataset.features, (
"HF Dataset must have 'conversations' column.")
- filtered_dataset = dataset.shuffle().filter(
- lambda x: len(x["conversations"]) >= 2)
+ filter_func = lambda x: len(x["conversations"]) >= 2
+ filtered_dataset = dataset.shuffle(seed=random_seed).filter(filter_func)
sampled_requests: List[Tuple[str, int, int, Dict[str,
Collection[str]]]] = []
for data in filtered_dataset:
@@ -315,12 +319,15 @@ def calculate_metrics(
tokenizer: PreTrainedTokenizerBase,
selected_percentile_metrics: List[str],
selected_percentiles: List[float],
+ gootput_config_dict: Dict[str, float],
) -> Tuple[BenchmarkMetrics, List[int]]:
actual_output_lens: List[int] = []
total_input = 0
completed = 0
+ good_completed = 0
itls: List[float] = []
tpots: List[float] = []
+ all_tpots: List[float] = []
ttfts: List[float] = []
e2els: List[float] = []
for i in range(len(outputs)):
@@ -334,9 +341,13 @@ def calculate_metrics(
add_special_tokens=False).input_ids)
actual_output_lens.append(output_len)
total_input += input_requests[i][1]
+ tpot = 0
if output_len > 1:
- tpots.append(
- (outputs[i].latency - outputs[i].ttft) / (output_len - 1))
+ tpot = (outputs[i].latency - outputs[i].ttft) / (output_len -
+ 1)
+ tpots.append(tpot)
+ # Note: if output_len <= 1, we regard tpot as 0 for goodput
+ all_tpots.append(tpot)
itls += outputs[i].itl
ttfts.append(outputs[i].ttft)
e2els.append(outputs[i].latency)
@@ -344,6 +355,28 @@ def calculate_metrics(
else:
actual_output_lens.append(0)
+ if gootput_config_dict:
+ valid_metrics = []
+ slo_values = []
+
+ if "ttft" in gootput_config_dict:
+ valid_metrics.append(ttfts)
+ slo_values.append(gootput_config_dict["ttft"] /
+ MILLISECONDS_TO_SECONDS_CONVERSION)
+ if "tpot" in gootput_config_dict:
+ valid_metrics.append(all_tpots)
+ slo_values.append(gootput_config_dict["tpot"] /
+ MILLISECONDS_TO_SECONDS_CONVERSION)
+ if "e2el" in gootput_config_dict:
+ valid_metrics.append(e2els)
+ slo_values.append(gootput_config_dict["e2el"] /
+ MILLISECONDS_TO_SECONDS_CONVERSION)
+
+ for req_metric in zip(*valid_metrics):
+ is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
+ if is_good_req:
+ good_completed += 1
+
if completed == 0:
warnings.warn(
"All requests failed. This is likely due to a misconfiguration "
@@ -354,6 +387,7 @@ def calculate_metrics(
total_input=total_input,
total_output=sum(actual_output_lens),
request_throughput=completed / dur_s,
+ request_goodput=good_completed / dur_s,
output_throughput=sum(actual_output_lens) / dur_s,
total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
mean_ttft_ms=np.mean(ttfts or 0) *
@@ -372,9 +406,9 @@ def calculate_metrics(
median_itl_ms=np.median(itls or 0) * 1000,
percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
for p in selected_percentiles],
- mean_e2el_ms=np.median(e2els or 0) * 1000,
+ mean_e2el_ms=np.mean(e2els or 0) * 1000,
std_e2el_ms=np.std(e2els or 0) * 1000,
- median_e2el_ms=np.mean(e2els or 0) * 1000,
+ median_e2el_ms=np.median(e2els or 0) * 1000,
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
for p in selected_percentiles],
)
@@ -397,6 +431,8 @@ async def benchmark(
selected_percentile_metrics: List[str],
selected_percentiles: List[str],
ignore_eos: bool,
+ gootput_config_dict: Dict[str, float],
+ max_concurrency: Optional[int],
):
if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -431,42 +467,56 @@ async def benchmark(
if profile:
print("Starting profiler...")
- profile_input = RequestFuncInput(
- model=model_id,
- prompt=test_prompt,
- api_url=base_url + "/start_profile",
- prompt_len=test_prompt_len,
- output_len=test_output_len,
- logprobs=logprobs,
- best_of=best_of,
- multi_modal_content=test_mm_content,
- )
+ profile_input = RequestFuncInput(model=model_id,
+ prompt=test_prompt,
+ api_url=base_url + "/start_profile",
+ prompt_len=test_prompt_len,
+ output_len=test_output_len,
+ logprobs=logprobs,
+ best_of=best_of,
+ multi_modal_content=test_mm_content,
+ ignore_eos=ignore_eos)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
print("Profiler started")
print(f"Traffic request rate: {request_rate}")
+ print(f"Maximum request concurrency: {max_concurrency}")
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
+ # This can be used once the minimum Python version is 3.10 or higher,
+ # and it will simplify the code in limited_request_func.
+ # semaphore = (asyncio.Semaphore(max_concurrency)
+ # if max_concurrency else contextlib.nullcontext())
+ semaphore = (asyncio.Semaphore(max_concurrency)
+ if max_concurrency else None)
+
+ async def limited_request_func(request_func_input, pbar):
+ if semaphore is None:
+ return await request_func(request_func_input=request_func_input,
+ pbar=pbar)
+ async with semaphore:
+ return await request_func(request_func_input=request_func_input,
+ pbar=pbar)
+
benchmark_start_time = time.perf_counter()
tasks: List[asyncio.Task] = []
async for request in get_request(input_requests, request_rate):
prompt, prompt_len, output_len, mm_content = request
- request_func_input = RequestFuncInput(
- model=model_id,
- prompt=prompt,
- api_url=api_url,
- prompt_len=prompt_len,
- output_len=output_len,
- logprobs=logprobs,
- best_of=best_of,
- multi_modal_content=mm_content,
- )
+ request_func_input = RequestFuncInput(model=model_id,
+ prompt=prompt,
+ api_url=api_url,
+ prompt_len=prompt_len,
+ output_len=output_len,
+ logprobs=logprobs,
+ best_of=best_of,
+ multi_modal_content=mm_content,
+ ignore_eos=ignore_eos)
tasks.append(
asyncio.create_task(
- request_func(request_func_input=request_func_input,
- pbar=pbar)))
+ limited_request_func(request_func_input=request_func_input,
+ pbar=pbar)))
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
if profile:
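
The new --max-concurrency path above wraps every request in limited_request_func, which takes an asyncio.Semaphore when a limit is set and falls back to an unguarded call otherwise; the commented-out lines note that contextlib.nullcontext() would collapse the two branches once Python 3.10 is the minimum version. A self-contained sketch of that simplified form, assuming Python 3.10+ (fake_request is a stand-in for a real benchmark request, not part of the script):

    import asyncio
    import contextlib
    from typing import Optional

    async def fake_request(i: int) -> int:
        await asyncio.sleep(0.01)  # stand-in for one HTTP benchmark request
        return i

    async def run(num_requests: int, max_concurrency: Optional[int]) -> None:
        # On Python 3.10+, nullcontext() can be used with "async with", so no
        # separate "semaphore is None" branch is needed.
        limiter = (asyncio.Semaphore(max_concurrency)
                   if max_concurrency else contextlib.nullcontext())

        async def limited(i: int) -> int:
            async with limiter:
                return await fake_request(i)

        results = await asyncio.gather(
            *(limited(i) for i in range(num_requests)))
        print(f"{len(results)} requests finished")

    asyncio.run(run(100, max_concurrency=8))

The request rate is still set by get_request; the semaphore only bounds how many requests are in flight at once, which is why the new help text warns that the observed rate can fall below --request-rate.
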
@@ -496,6 +546,7 @@ async def benchmark(
tokenizer=tokenizer,
selected_percentile_metrics=selected_percentile_metrics,
selected_percentiles=selected_percentiles,
+ gootput_config_dict=gootput_config_dict,
)
print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
@@ -507,6 +558,9 @@ async def benchmark(
metrics.total_output))
print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
metrics.request_throughput))
+ if gootput_config_dict:
+ print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
+ metrics.request_goodput))
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
metrics.output_throughput))
print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
@@ -518,6 +572,8 @@ async def benchmark(
"total_input_tokens": metrics.total_input,
"total_output_tokens": metrics.total_output,
"request_throughput": metrics.request_throughput,
+ "request_goodput:":
+ metrics.request_goodput if gootput_config_dict else None,
"output_throughput": metrics.output_throughput,
"total_token_throughput": metrics.total_token_throughput,
"input_lens": [output.prompt_len for output in outputs],
@@ -571,6 +627,41 @@ def process_one_metric(
return result
+def check_goodput_args(args):
+ # Check and parse goodput arguments
+ gootput_config_dict = {}
+ VALID_NAMES = ["ttft", "tpot", "e2el"]
+ if args.goodput:
+ gootput_config_dict = parse_goodput(args.goodput)
+ for slo_name, slo_val in gootput_config_dict.items():
+ if slo_name not in VALID_NAMES:
+ raise ValueError(
+ f"Invalid metric name found, {slo_name}: {slo_val}. "
+ "The service level objective name should be one of "
+ f"{str(VALID_NAMES)}. ")
+ if slo_val < 0:
+ raise ValueError(
+ f"Invalid value found, {slo_name}: {slo_val}. "
+ "The service level objective value should be "
+ "non-negative.")
+ return gootput_config_dict
+
+
+def parse_goodput(slo_pairs):
+ gootput_config_dict = {}
+ try:
+ for slo_pair in slo_pairs:
+ slo_name, slo_val = slo_pair.split(":")
+ gootput_config_dict[slo_name] = float(slo_val)
+ except ValueError as err:
+ raise argparse.ArgumentTypeError(
+ "Invalid format found for service level objectives. "
+ "Specify service level objectives for goodput as \"KEY:VALUE\" "
+ "pairs, where the key is a metric name, and the value is a "
+ "number in milliseconds.") from err
+ return gootput_config_dict
+
+
def main(args: argparse.Namespace):
print(args)
random.seed(args.seed)
@@ -648,6 +739,7 @@ def main(args: argparse.Namespace):
dataset_split=args.hf_split,
num_requests=args.num_prompts,
tokenizer=tokenizer,
+ random_seed=args.seed,
fixed_output_len=args.hf_output_len,
)
@@ -664,6 +756,8 @@ def main(args: argparse.Namespace):
else:
raise ValueError(f"Unknown dataset: {args.dataset_name}")
+ gootput_config_dict = check_goodput_args(args)
+
benchmark_result = asyncio.run(
benchmark(
backend=backend,
@@ -682,6 +776,8 @@ def main(args: argparse.Namespace):
float(p) for p in args.metric_percentiles.split(",")
],
ignore_eos=args.ignore_eos,
+ gootput_config_dict=gootput_config_dict,
+ max_concurrency=args.max_concurrency,
))
# Save config and results to json
@@ -711,13 +807,16 @@ def main(args: argparse.Namespace):
# Traffic
result_json["request_rate"] = (
args.request_rate if args.request_rate < float("inf") else "inf")
+ result_json["max_concurrency"] = args.max_concurrency
# Merge with benchmark result
result_json = {**result_json, **benchmark_result}
# Save to file
base_model_id = model_id.split("/")[-1]
- file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa
+ max_concurrency_str = (f"-concurrency{args.max_concurrency}"
+ if args.max_concurrency is not None else "")
+ file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" #noqa
if args.result_filename:
file_name = args.result_filename
if args.result_dir:
@@ -768,6 +867,19 @@ def main(args: argparse.Namespace):
default=None,
help="Path to the sharegpt/sonnet dataset. "
"Or the huggingface dataset ID if using HF dataset.")
+ parser.add_argument(
+ "--max-concurrency",
+ type=int,
+ default=None,
+ help="Maximum number of concurrent requests. This can be used "
+ "to help simulate an environment where a higher level component "
+ "is enforcing a maximum number of concurrent requests. While the "
+ "--request-rate argument controls the rate at which requests are "
+ "initiated, this argument will control how many are actually allowed "
+ "to execute at a time. This means that when used in combination, the "
+ "actual request rate may be lower than specified with --request-rate, "
+ "if the server is not processing requests fast enough to keep up.")
+
parser.add_argument(
"--model",
type=str,
@@ -881,6 +993,17 @@ def main(args: argparse.Namespace):
"Default value is \"99\". "
"Use \"--percentile-metrics\" to select metrics.",
)
+ parser.add_argument(
+ "--goodput",
+ nargs="+",
+ required=False,
+ help="Specify service level objectives for goodput as \"KEY:VALUE\" "
+ "pairs, where the key is a metric name, and the value is in "
+ "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
+ "separated by spaces. Allowed request level metric names are "
+ "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
+ "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
+ "and the blog: https://hao-ai-lab.github.io/blogs/distserve")
# group for dataset specific arguments
sonnet_group = parser.add_argument_group("sonnet dataset options")
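
--goodput takes space-separated KEY:VALUE pairs (keys ttft, tpot, e2el; values in milliseconds), and calculate_metrics counts a request toward goodput only when every selected metric meets its SLO. A small sketch of that counting rule with made-up measurements (the real script reads them from the request outputs):

    MS_TO_S = 1000

    # Per-request measurements in seconds: (ttft, e2el) for three requests.
    ttfts = [0.08, 0.35, 0.12]
    e2els = [1.2, 2.9, 4.1]

    # SLOs as they would arrive from: --goodput ttft:200 e2el:3000
    slos = {"ttft": 200, "e2el": 3000}

    valid_metrics = [ttfts, e2els]
    slo_values = [slos["ttft"] / MS_TO_S, slos["e2el"] / MS_TO_S]

    good_completed = 0
    for req_metric in zip(*valid_metrics):
        # A request is "good" only if every tracked metric is within its SLO.
        if all(measured <= limit
               for measured, limit in zip(req_metric, slo_values)):
            good_completed += 1

    print(good_completed)  # 1: only the first request meets both SLOs

Request goodput is then good_completed divided by the benchmark duration, reported next to request throughput.
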
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index e1a359b871e71..a49f37c7d797a 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -1,9 +1,10 @@
"""Benchmark offline inference throughput."""
import argparse
+import dataclasses
import json
import random
import time
-from typing import List, Optional, Tuple
+from typing import List, Optional
import torch
import uvloop
@@ -11,20 +12,38 @@
from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase)
-from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs
+from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.inputs import TextPrompt
+from vllm.multimodal import MultiModalDataDict
from vllm.sampling_params import BeamSearchParams
from vllm.utils import FlexibleArgumentParser, merge_async_iterators
+@dataclasses.dataclass
+class SampleRequest:
+ """A class representing a single inference request for benchmarking.
+
+ Attributes:
+ prompt: The input text prompt for the model.
+ multi_modal_data: Optional dictionary containing multi-modal data (e.g.
+ images).
+ prompt_len: The length of the prompt in tokens.
+ expected_output_len: The expected length of the output in tokens.
+ """
+ prompt: str
+ prompt_len: int
+ expected_output_len: int
+ multi_modal_data: Optional[MultiModalDataDict] = None
+
+
def sample_requests(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int],
-) -> List[Tuple[str, int, int]]:
+) -> List[SampleRequest]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
@@ -41,7 +60,7 @@ def sample_requests(
random.shuffle(dataset)
# Filter out sequences that are too long or too short
- filtered_dataset: List[Tuple[str, int, int]] = []
+ filtered_dataset: List[SampleRequest] = []
for i in range(len(dataset)):
if len(filtered_dataset) == num_requests:
break
@@ -60,83 +79,34 @@ def sample_requests(
if prompt_len > 1024 or prompt_len + output_len > 2048:
# Prune too long sequences.
continue
- filtered_dataset.append((prompt, prompt_len, output_len))
+ filtered_dataset.append(
+ SampleRequest(prompt=prompt,
+ prompt_len=prompt_len,
+ expected_output_len=output_len))
return filtered_dataset
def run_vllm(
- requests: List[Tuple[str, int, int]],
- model: str,
- tokenizer: str,
- quantization: Optional[str],
- tensor_parallel_size: int,
- seed: int,
+ requests: List[SampleRequest],
n: int,
- trust_remote_code: bool,
- dtype: str,
- max_model_len: Optional[int],
- enforce_eager: bool,
- kv_cache_dtype: str,
- quantization_param_path: Optional[str],
- device: str,
- enable_prefix_caching: bool,
- enable_chunked_prefill: bool,
- max_num_batched_tokens: int,
- distributed_executor_backend: Optional[str],
- gpu_memory_utilization: float = 0.9,
- num_scheduler_steps: int = 1,
- use_v2_block_manager: bool = False,
- download_dir: Optional[str] = None,
- load_format: str = EngineArgs.load_format,
- disable_async_output_proc: bool = False,
- weights_load_device: str = None,
- use_padding_aware_scheduling: bool = False,
- max_num_seqs: int = 256,
- max_num_prefill_seqs: int = None,
+ engine_args: EngineArgs,
) -> float:
from vllm import LLM, SamplingParams
- llm = LLM(
- model=model,
- tokenizer=tokenizer,
- quantization=quantization,
- tensor_parallel_size=tensor_parallel_size,
- seed=seed,
- trust_remote_code=trust_remote_code,
- dtype=dtype,
- max_model_len=max_model_len,
- gpu_memory_utilization=gpu_memory_utilization,
- enforce_eager=enforce_eager,
- kv_cache_dtype=kv_cache_dtype,
- quantization_param_path=quantization_param_path,
- device=device,
- enable_prefix_caching=enable_prefix_caching,
- download_dir=download_dir,
- enable_chunked_prefill=enable_chunked_prefill,
- max_num_batched_tokens=max_num_batched_tokens,
- distributed_executor_backend=distributed_executor_backend,
- load_format=load_format,
- num_scheduler_steps=num_scheduler_steps,
- use_v2_block_manager=use_v2_block_manager,
- disable_async_output_proc=disable_async_output_proc,
- weights_load_device=weights_load_device,
- use_padding_aware_scheduling=use_padding_aware_scheduling,
- max_num_seqs=max_num_seqs,
- max_num_prefill_seqs=max_num_prefill_seqs,
- )
+ llm = LLM(**dataclasses.asdict(engine_args))
# Add the requests to the engine.
- prompts: List[str] = []
+ prompts: List[TextPrompt] = []
sampling_params: List[SamplingParams] = []
- for prompt, _, output_len in requests:
- prompts.append(prompt)
+ for request in requests:
+ prompts.append(TextPrompt(prompt=request.prompt))
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
- max_tokens=output_len,
+ max_tokens=request.expected_output_len,
))
use_beam_search = False
@@ -146,11 +116,11 @@ def run_vllm(
llm.generate(prompts, sampling_params, use_tqdm=True)
end = time.perf_counter()
else:
- prompts = [prompt for prompt, _, _ in requests]
+ prompts = [request.prompt for request in requests]
# output_len should be the same for all requests.
output_len = requests[0][2]
- for prompt, input_len, _output_len in requests:
- assert _output_len == output_len
+ for request in requests:
+ assert request.expected_output_len == output_len
start = time.perf_counter()
llm.beam_search(
prompts,
@@ -164,30 +134,9 @@ def run_vllm(
async def run_vllm_async(
- requests: List[Tuple[str, int, int]],
- model: str,
- tokenizer: str,
- quantization: Optional[str],
- tensor_parallel_size: int,
- seed: int,
+ requests: List[SampleRequest],
n: int,
- trust_remote_code: bool,
- dtype: str,
- max_model_len: Optional[int],
- enforce_eager: bool,
- kv_cache_dtype: str,
- quantization_param_path: Optional[str],
- device: str,
- enable_prefix_caching: bool,
- enable_chunked_prefill: bool,
- max_num_batched_tokens: int,
- distributed_executor_backend: Optional[str],
- gpu_memory_utilization: float = 0.9,
- num_scheduler_steps: int = 1,
- use_v2_block_manager: bool = False,
- download_dir: Optional[str] = None,
- load_format: str = EngineArgs.load_format,
- disable_async_output_proc: bool = False,
+ engine_args: AsyncEngineArgs,
disable_frontend_multiprocessing: bool = False,
weights_load_device: str = None,
use_padding_aware_scheduling: bool = False,
@@ -195,51 +144,22 @@ async def run_vllm_async(
max_num_prefill_seqs: int = None,
) -> float:
from vllm import SamplingParams
- engine_args = AsyncEngineArgs(
- model=model,
- tokenizer=tokenizer,
- quantization=quantization,
- tensor_parallel_size=tensor_parallel_size,
- seed=seed,
- trust_remote_code=trust_remote_code,
- dtype=dtype,
- max_model_len=max_model_len,
- gpu_memory_utilization=gpu_memory_utilization,
- enforce_eager=enforce_eager,
- kv_cache_dtype=kv_cache_dtype,
- quantization_param_path=quantization_param_path,
- device=device,
- enable_prefix_caching=enable_prefix_caching,
- download_dir=download_dir,
- enable_chunked_prefill=enable_chunked_prefill,
- max_num_batched_tokens=max_num_batched_tokens,
- distributed_executor_backend=distributed_executor_backend,
- load_format=load_format,
- num_scheduler_steps=num_scheduler_steps,
- use_v2_block_manager=use_v2_block_manager,
- disable_async_output_proc=disable_async_output_proc,
- worker_use_ray=False,
- disable_log_requests=True,
- weights_load_device=weights_load_device,
- use_padding_aware_scheduling=use_padding_aware_scheduling,
- max_num_prefill_seqs=max_num_prefill_seqs,
- )
async with build_async_engine_client_from_engine_args(
engine_args, disable_frontend_multiprocessing) as llm:
# Add the requests to the engine.
- prompts: List[str] = []
+ prompts: List[TextPrompt] = []
sampling_params: List[SamplingParams] = []
- for prompt, _, output_len in requests:
- prompts.append(prompt)
+ for request in requests:
+ prompts.append(TextPrompt(prompt=request.prompt))
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
- max_tokens=output_len,
+ max_tokens=request.expected_output_len,
))
generators = []
@@ -255,7 +175,7 @@ async def run_vllm_async(
def run_hf(
- requests: List[Tuple[str, int, int]],
+ requests: List[SampleRequest],
model: str,
tokenizer: PreTrainedTokenizerBase,
n: int,
@@ -313,14 +233,14 @@ def run_hf(
def run_mii(
- requests: List[Tuple[str, int, int]],
+ requests: List[SampleRequest],
model: str,
tensor_parallel_size: int,
output_len: int,
) -> float:
from mii import client, serve
llm = serve(model, tensor_parallel=tensor_parallel_size)
- prompts = [prompt for prompt, _, _ in requests]
+ prompts = [request.prompt for request in requests]
start = time.perf_counter()
llm.generate(prompts, max_new_tokens=output_len)
@@ -339,34 +259,38 @@ def main(args: argparse.Namespace):
args.tokenizer, trust_remote_code=args.trust_remote_code)
if args.dataset is None:
# Synthesize a prompt with the given input length.
- prompt = "hi" * (args.input_len - 1)
- requests = [(prompt, args.input_len, args.output_len)
- for _ in range(args.num_prompts)]
+ # As tokenizer may add additional tokens like BOS, we need to try
+ # different lengths to get the desired input length.
+ for i in range(-10, 10):
+ prompt = "hi " * (args.input_len + i)
+ tokenized_prompt = tokenizer(prompt).input_ids
+ if len(tokenized_prompt) == args.input_len:
+ break
+ else:
+ raise ValueError(
+ f"Failed to synthesize a prompt with {args.input_len} tokens.")
+ requests = [
+ SampleRequest(prompt=prompt,
+ prompt_len=args.input_len,
+ expected_output_len=args.output_len)
+ for _ in range(args.num_prompts)
+ ]
else:
requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
args.output_len)
if args.backend == "vllm":
- run_args = [
- requests, args.model, args.tokenizer, args.quantization,
- args.tensor_parallel_size, args.seed, args.n,
- args.trust_remote_code, args.dtype, args.max_model_len,
- args.enforce_eager, args.kv_cache_dtype,
- args.quantization_param_path, args.device,
- args.enable_prefix_caching, args.enable_chunked_prefill,
- args.max_num_batched_tokens, args.distributed_executor_backend,
- args.gpu_memory_utilization, args.num_scheduler_steps,
- args.use_v2_block_manager, args.download_dir, args.load_format,
- args.disable_async_output_proc, args.weights_load_device,
- args.use_padding_aware_scheduling, args.max_num_seqs,
- args.max_num_prefill_seqs
- ]
-
if args.async_engine:
- run_args.append(args.disable_frontend_multiprocessing)
- elapsed_time = uvloop.run(run_vllm_async(*run_args))
+ elapsed_time = uvloop.run(
+ run_vllm_async(
+ requests,
+ args.n,
+ AsyncEngineArgs.from_cli_args(args),
+ args.disable_frontend_multiprocessing,
+ ))
else:
- elapsed_time = run_vllm(*run_args)
+ elapsed_time = run_vllm(requests, args.n,
+ EngineArgs.from_cli_args(args))
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -376,10 +300,13 @@ def main(args: argparse.Namespace):
args.output_len)
else:
raise ValueError(f"Unknown backend: {args.backend}")
- total_num_tokens = sum(prompt_len + output_len
- for _, prompt_len, output_len in requests)
+ total_num_tokens = sum(request.prompt_len + request.expected_output_len
+ for request in requests)
+ total_output_tokens = sum(request.expected_output_len
+ for request in requests)
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
- f"{total_num_tokens / elapsed_time:.2f} tokens/s")
+ f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
+ f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
# Output JSON results if specified
if args.output_json:
@@ -403,7 +330,9 @@ def main(args: argparse.Namespace):
parser.add_argument("--dataset",
type=str,
default=None,
- help="Path to the dataset.")
+ help="Path to the dataset. The dataset is expected to "
+ "be a json in form of List[Dict[..., conversations: "
+ "List[Dict[..., value: ]]]]")
parser.add_argument("--input-len",
type=int,
default=None,
@@ -413,13 +342,6 @@ def main(args: argparse.Namespace):
default=None,
help="Output length for each request. Overrides the "
"output length from the dataset.")
- parser.add_argument("--model", type=str, default="facebook/opt-125m")
- parser.add_argument("--tokenizer", type=str, default=None)
- parser.add_argument('--quantization',
- '-q',
- choices=[*QUANTIZATION_METHODS, None],
- default=None)
- parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--n",
type=int,
default=1,
@@ -428,127 +350,15 @@ def main(args: argparse.Namespace):
type=int,
default=1000,
help="Number of prompts to process.")
- parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--hf-max-batch-size",
type=int,
default=None,
help="Maximum batch size for HF backend.")
- parser.add_argument('--trust-remote-code',
- action='store_true',
- help='trust remote code from huggingface')
- parser.add_argument(
- '--max-model-len',
- type=int,
- default=None,
- help='Maximum length of a sequence (including prompt and output). '
- 'If None, will be derived from the model.')
- parser.add_argument(
- '--dtype',
- type=str,
- default='auto',
- choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
- help='data type for model weights and activations. '
- 'The "auto" option will use FP16 precision '
- 'for FP32 and FP16 models, and BF16 precision '
- 'for BF16 models.')
- parser.add_argument('--gpu-memory-utilization',
- type=float,
- default=0.9,
- help='the fraction of GPU memory to be used for '
- 'the model executor, which can range from 0 to 1.'
- 'If unspecified, will use the default value of 0.9.')
- parser.add_argument("--enforce-eager",
- action="store_true",
- help="enforce eager execution")
- parser.add_argument(
- '--kv-cache-dtype',
- type=str,
- choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3', 'fp8_inc'],
- default="auto",
- help='Data type for kv cache storage. If "auto", will use model '
- 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
- 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
- parser.add_argument(
- '--quantization-param-path',
- type=str,
- default=None,
- help='Path to the JSON file containing the KV cache scaling factors. '
- 'This should generally be supplied, when KV cache dtype is FP8. '
- 'Otherwise, KV cache scaling factors default to 1.0, which may cause '
- 'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
- 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
- 'instead supported for common inference criteria.')
- parser.add_argument("--device",
- type=str,
- default="auto",
- choices=DEVICE_OPTIONS,
- help='device type for vLLM execution')
- parser.add_argument(
- "--num-scheduler-steps",
- type=int,
- default=1,
- help="Maximum number of forward steps per scheduler call.")
- parser.add_argument("--use-v2-block-manager",
- action='store_true',
- default=EngineArgs.use_v2_block_manager,
- help="Enable block manager v2.")
- parser.add_argument(
- "--enable-prefix-caching",
- action='store_true',
- help="Enable automatic prefix caching for vLLM backend.")
- parser.add_argument("--enable-chunked-prefill",
- action='store_true',
- help="enable chunked prefill for vLLM backend.")
- parser.add_argument('--max-num-batched-tokens',
- type=int,
- default=None,
- help='maximum number of batched tokens per '
- 'iteration')
- parser.add_argument('--download-dir',
- type=str,
- default=None,
- help='directory to download and load the weights, '
- 'default to the default cache dir of huggingface')
parser.add_argument(
'--output-json',
type=str,
default=None,
help='Path to save the throughput results in JSON format.')
- parser.add_argument(
- '--distributed-executor-backend',
- choices=['ray', 'mp'],
- default=None,
- help='Backend to use for distributed serving. When more than 1 GPU '
- 'is used, will be automatically set to "ray" if installed '
- 'or "mp" (multiprocessing) otherwise.')
- parser.add_argument(
- '--load-format',
- type=str,
- default=EngineArgs.load_format,
- choices=[
- 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
- 'bitsandbytes'
- ],
- help='The format of the model weights to load.\n\n'
- '* "auto" will try to load the weights in the safetensors format '
- 'and fall back to the pytorch bin format if safetensors format '
- 'is not available.\n'
- '* "pt" will load the weights in the pytorch bin format.\n'
- '* "safetensors" will load the weights in the safetensors format.\n'
- '* "npcache" will load the weights in pytorch format and store '
- 'a numpy cache to speed up the loading.\n'
- '* "dummy" will initialize the weights with random values, '
- 'which is mainly for profiling.\n'
- '* "tensorizer" will load the weights using tensorizer from '
- 'CoreWeave. See the Tensorize vLLM Model script in the Examples'
- 'section for more information.\n'
- '* "bitsandbytes" will load the weights using bitsandbytes '
- 'quantization.\n')
- parser.add_argument(
- "--disable-async-output-proc",
- action='store_true',
- default=False,
- help="Disable async output processor for vLLM backend.")
parser.add_argument("--async-engine",
action='store_true',
default=False,
@@ -557,23 +367,7 @@ def main(args: argparse.Namespace):
action='store_true',
default=False,
help="Disable decoupled async engine frontend.")
- parser.add_argument("--weights-load-device",
- type=str,
- default=None,
- choices=DEVICE_OPTIONS,
- help='Device on which weights are loaded.')
- parser.add_argument("--use-padding-aware-scheduling",
- action='store_true',
- default=False,
- help="Enable padding-aware scheduling.")
- parser.add_argument("--max-num-seqs",
- type=int,
- default=256,
- help="Maximum number of requests for single decode.")
- parser.add_argument("--max-num-prefill-seqs",
- type=int,
- default=None,
- help="Maximum number of requests for single prefill.")
+ parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
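
One behavioral fix above is in the synthetic-prompt path: "hi" * (input_len - 1) does not reliably tokenize to input_len tokens because tokenizers add special tokens such as BOS, so the script now probes a small window of candidate lengths and raises if none matches. A sketch of the same probing idea against a Hugging Face tokenizer (the model name here is only an example):

    from transformers import AutoTokenizer

    def synthesize_prompt(tokenizer, input_len: int, window: int = 10) -> str:
        # Special tokens shift the tokenized length by a model-dependent
        # amount, so try a few repetition counts around the target.
        for delta in range(-window, window):
            prompt = "hi " * (input_len + delta)
            if len(tokenizer(prompt).input_ids) == input_len:
                return prompt
        raise ValueError(
            f"Failed to synthesize a prompt with {input_len} tokens.")

    tok = AutoTokenizer.from_pretrained("facebook/opt-125m")  # example model
    prompt = synthesize_prompt(tok, input_len=128)
    print(len(tok(prompt).input_ids))  # 128 if a match was found

The throughput report also changes: besides total tokens/s it now prints output tokens/s, computed from expected_output_len on each SampleRequest.
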
diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py
index 92f6053cc6d7e..7acea6087fdfd 100644
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@@ -3,8 +3,8 @@
import torch
from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
- seed_everything)
+from vllm.platforms import current_platform
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
@torch.inference_mode()
@@ -16,7 +16,7 @@ def main(num_tokens: int,
do_profile: bool = False,
num_warmup_iters: int = 5,
num_iters: int = 100) -> None:
- seed_everything(seed)
+ current_platform.seed_everything(seed)
torch.set_default_device("cuda")
layer = RMSNorm(hidden_size).to(dtype=dtype)
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index c2ad98b7e2656..8f538c21f7f7e 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -10,7 +10,8 @@
from transformers import AutoConfig
from vllm.model_executor.layers.fused_moe.fused_moe import *
-from vllm.utils import FlexibleArgumentParser, seed_everything
+from vllm.platforms import current_platform
+from vllm.utils import FlexibleArgumentParser
class BenchmarkConfig(TypedDict):
@@ -88,22 +89,23 @@ def prepare(i: int):
input_gating.copy_(gating_output[i])
def run():
- fused_moe(
- x,
- w1,
- w2,
- input_gating,
- topk,
- renormalize=True,
- inplace=True,
- override_config=config,
- use_fp8_w8a8=use_fp8_w8a8,
- use_int8_w8a16=use_int8_w8a16,
- w1_scale=w1_scale,
- w2_scale=w2_scale,
- a1_scale=a1_scale,
- a2_scale=a2_scale,
- )
+ from vllm.model_executor.layers.fused_moe import override_config
+ with override_config(config):
+ fused_moe(
+ x,
+ w1,
+ w2,
+ input_gating,
+ topk,
+ renormalize=True,
+ inplace=True,
+ use_fp8_w8a8=use_fp8_w8a8,
+ use_int8_w8a16=use_int8_w8a16,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a1_scale,
+ a2_scale=a2_scale,
+ )
# JIT compilation & warmup
run()
@@ -166,7 +168,7 @@ class BenchmarkWorker:
def __init__(self, seed: int) -> None:
torch.set_default_device("cuda")
- seed_everything(seed)
+ current_platform.seed_everything(seed)
self.seed = seed
def benchmark(
@@ -180,7 +182,7 @@ def benchmark(
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
) -> Tuple[Dict[str, int], float]:
- seed_everything(self.seed)
+ current_platform.seed_everything(self.seed)
dtype_str = get_config_dtype_str(dtype,
use_int8_w8a16=use_int8_w8a16,
use_fp8_w8a8=use_fp8_w8a8)
diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
index 87864d038d593..14eef00b855ac 100644
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -5,8 +5,9 @@
import torch
from vllm import _custom_ops as ops
+from vllm.platforms import current_platform
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
- create_kv_caches_with_random, seed_everything)
+ create_kv_caches_with_random)
NUM_BLOCKS = 1024
PARTITION_SIZE = 512
@@ -28,7 +29,7 @@ def main(
device: str = "cuda",
kv_cache_dtype: Optional[str] = None,
) -> None:
- seed_everything(seed)
+ current_platform.seed_everything(seed)
scale = float(1.0 / (head_size**0.5))
query = torch.empty(num_seqs,
diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py
index 743a5744e8614..1d62483448946 100644
--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@@ -3,8 +3,8 @@
import torch
from vllm import _custom_ops as ops
-from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
- seed_everything)
+from vllm.platforms import current_platform
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
@torch.inference_mode()
@@ -17,7 +17,7 @@ def main(num_tokens: int,
do_profile: bool = False,
num_warmup_iters: int = 5,
num_iters: int = 100) -> None:
- seed_everything(seed)
+ current_platform.seed_everything(seed)
torch.set_default_device("cuda")
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py
index 73fc9e9dbf461..250d505168d09 100644
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -6,7 +6,8 @@
from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
get_rope)
-from vllm.utils import FlexibleArgumentParser, seed_everything
+from vllm.platforms import current_platform
+from vllm.utils import FlexibleArgumentParser
def benchmark_rope_kernels_multi_lora(
@@ -22,7 +23,7 @@ def benchmark_rope_kernels_multi_lora(
max_position: int = 8192,
base: int = 10000,
) -> None:
- seed_everything(seed)
+ current_platform.seed_everything(seed)
torch.set_default_device(device)
if rotary_dim is None:
rotary_dim = head_size
@@ -31,7 +32,7 @@ def benchmark_rope_kernels_multi_lora(
# batched RoPE can take multiple scaling factors
batched_rope = get_rope(head_size, rotary_dim, max_position, base,
is_neox_style, {
- "type": "linear",
+ "rope_type": "linear",
"factor": tuple(scaling_factors)
})
# non-batched RoPE takes only one scaling factor, we create multiple
@@ -41,7 +42,7 @@ def benchmark_rope_kernels_multi_lora(
non_batched_ropes.append(
get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
{
- "type": "linear",
+ "rope_type": "linear",
"factor": (scaling_factor, )
}))
diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py
index 203699e9a8d06..d16d6f9fba442 100644
--- a/benchmarks/overheads/benchmark_hashing.py
+++ b/benchmarks/overheads/benchmark_hashing.py
@@ -16,7 +16,6 @@ def main(args):
enforce_eager=True,
enable_prefix_caching=True,
tensor_parallel_size=args.tensor_parallel_size,
- use_v2_block_manager=args.use_v2_block_manager,
)
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
@@ -56,8 +55,5 @@ def main(args):
parser.add_argument('--enable-prefix-caching',
action='store_true',
help='enable prefix caching')
- parser.add_argument('--use-v2-block-manager',
- action='store_true',
- help='Use BlockSpaceMangerV2')
args = parser.parse_args()
main(args)
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index bc5f24d3f591c..7237d246ddf55 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -1,5 +1,8 @@
+include(FetchContent)
+
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
-set(CMAKE_CXX_STANDARD 17)
#
# Define environment variables for special configurations
@@ -82,15 +85,40 @@ else()
message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.")
endif()
+#
+# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 platforms)
+#
+if (AVX512_FOUND AND NOT AVX512_DISABLED)
+ FetchContent_Declare(
+ oneDNN
+ GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
+ GIT_TAG v3.5.3
+ GIT_PROGRESS TRUE
+ GIT_SHALLOW TRUE
+ )
+
+ set(ONEDNN_LIBRARY_TYPE "STATIC")
+ set(ONEDNN_BUILD_DOC "OFF")
+ set(ONEDNN_BUILD_EXAMPLES "OFF")
+ set(ONEDNN_BUILD_TESTS "OFF")
+ set(ONEDNN_ENABLE_WORKLOAD "INFERENCE")
+ set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
+ set(ONEDNN_BUILD_GRAPH "OFF")
+ set(ONEDNN_ENABLE_JIT_PROFILING "OFF")
+ set(ONEDNN_ENABLE_ITT_TASKS "OFF")
+ set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
+ set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
+ set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
+
+ FetchContent_MakeAvailable(oneDNN)
+
+ list(APPEND LIBS dnnl)
+endif()
+
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
list(APPEND LIBS numa)
-# Appending the dnnl library for the AVX2 and AVX512, as it is not utilized by Power architecture.
-if (AVX2_FOUND OR AVX512_FOUND)
- list(APPEND LIBS dnnl)
-endif()
-
#
# _C extension
#
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 24bb7299338ac..40430dae10c5b 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -424,11 +424,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
# Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
# dependencies that are not necessary and may not be installed.
if (GPU_LANGUAGE STREQUAL "CUDA")
- if ("${CUDA_CUDA_LIB}" STREQUAL "")
- set(CUDA_CUDA_LIB "${CUDA_CUDA_LIBRARY}")
- endif()
- target_link_libraries(${GPU_MOD_NAME} PRIVATE ${CUDA_CUDA_LIB}
- ${CUDA_LIBRARIES})
+ target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver)
else()
target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
endif()
diff --git a/collect_env.py b/collect_env.py
index ae7f97f355253..80403d576d78f 100644
--- a/collect_env.py
+++ b/collect_env.py
@@ -267,23 +267,16 @@ def get_neuron_sdk_version(run_lambda):
def get_vllm_version():
- version = ""
- try:
- import vllm
- version = vllm.__version__
- except Exception:
- pass
- commit = ""
- try:
- import vllm
- commit = vllm.__commit__
- except Exception:
- pass
- if version != "" and commit != "":
- return f"{version}@{commit}"
- if version == "" and commit == "":
- return "N/A"
- return version or commit
+ from vllm import __version__, __version_tuple__
+
+ if __version__ == "dev":
+ return "N/A (dev)"
+
+ if len(__version_tuple__) == 4: # dev build
+ git_sha = __version_tuple__[-1][1:] # type: ignore
+ return f"{__version__} (git sha: {git_sha})"
+
+ return __version__
def summarize_vllm_build_flags():
# This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc.
diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu
index 5ed1dc3b8f792..839dc36ba4e29 100644
--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@@ -89,6 +89,48 @@ void gelu_tanh_and_mul(torch::Tensor& out, // [..., d]
namespace vllm {
+template <typename T>
+__device__ __forceinline__ T fatrelu_kernel(const T& x, const float threshold) {
+ const float f = (float)x;
+ return (T)(f > threshold ? f : 0.0f);
+}
+
+template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&, const float)>
+__global__ void act_and_mul_kernel_with_param(
+ scalar_t* __restrict__ out, const scalar_t* __restrict__ input, const int d,
+ const float param) {
+ const int64_t token_idx = blockIdx.x;
+ for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+ const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
+ const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
+ out[token_idx * d + idx] = ACT_FN(x, param) * y;
+ }
+}
+
+} // namespace vllm
+
+#define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PARAM) \
+ int d = input.size(-1) / 2; \
+ int64_t num_tokens = input.numel() / input.size(-1); \
+ dim3 grid(num_tokens); \
+ dim3 block(std::min(d, 1024)); \
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
+ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
+ VLLM_DISPATCH_FLOATING_TYPES( \
+ input.scalar_type(), "act_and_mul_kernel_with_param", [&] { \
+        vllm::act_and_mul_kernel_with_param<scalar_t, KERNEL<scalar_t>> \
+            <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
+                                         input.data_ptr<scalar_t>(), d, \
+ PARAM); \
+ });
+
+void fatrelu_and_mul(torch::Tensor& out, // [..., d],
+ torch::Tensor& input, // [..., 2 * d]
+ double threshold) {
+ LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(vllm::fatrelu_kernel, threshold);
+}
+namespace vllm {
+
// Element-wise activation kernel template.
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
__global__ void activation_kernel(
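
The new fatrelu_and_mul kernel is a gated activation: the first half of the last dimension goes through FATReLU (keep x unchanged when x > threshold, otherwise 0) and is multiplied elementwise with the second half. A NumPy sketch of the reference semantics only, not of the CUDA launch itself:

    import numpy as np

    def fatrelu_and_mul(x: np.ndarray, threshold: float) -> np.ndarray:
        # x has shape [..., 2 * d]: activation half followed by gate half.
        d = x.shape[-1] // 2
        a, gate = x[..., :d], x[..., d:]
        fatrelu = np.where(a > threshold, a, 0.0)
        return (fatrelu * gate).astype(x.dtype)

    x = np.random.randn(4, 16).astype(np.float32)
    out = fatrelu_and_mul(x, threshold=0.05)
    print(out.shape)  # (4, 8)

The LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM macro follows the same launch shape as the other gated activations in this file, forwarding the extra scalar threshold to the device function.
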
diff --git a/csrc/core/scalar_type.hpp b/csrc/core/scalar_type.hpp
index 0e1f360d74bd5..408e736d5bc0f 100644
--- a/csrc/core/scalar_type.hpp
+++ b/csrc/core/scalar_type.hpp
@@ -1,6 +1,7 @@
#pragma once
-#include <torch/custom_class.h>
+// For TORCH_CHECK
+#include <torch/library.h>
namespace vllm {
@@ -9,12 +10,7 @@ namespace vllm {
// in particular it can be used to represent sub-byte data types (something
// that torch.dtype currently does not support).
//
-// ScalarTypeTorch is a subclass of ScalarType that is compatible with
-// TORCH_LIBRARY, making it accessible from Python as well meaning this class
-// can be used as a argument for custom operators, helping to simplify these
-// interfaces.
-//
-// The type definitions on the Python side can be found in: vllm/_core_ext.pyi
+// The type definitions on the Python side can be found in: vllm/scalar_type.py
// these type definitions should be kept up to date with any Python API changes
// here.
//
@@ -308,204 +304,7 @@ class ScalarType {
}
};
-// Create a TORCH_LIBRARY compatible version of ScalarType (i.e. inherit from
-// torch::CustomClassHolder), we use multiple inheritance here since we cannot
-// have ScalarType inherit from torch::CustomClassHolder and have a constexpr
-// constructor at the same time (torch::CustomClassHolder does not have a
-// constexpr destructor)
-// See also:
-// https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA
-class ScalarTypeTorch : public torch::CustomClassHolder, public ScalarType {
- public:
- ScalarTypeTorch(int64_t exponent, int64_t mantissa, int64_t bias,
- bool _signed)
- : ScalarType(exponent, mantissa, bias, _signed){};
-
- ScalarTypeTorch(ScalarType type) : ScalarType(type){};
-
- using Base = ScalarType;
- using Self = ScalarTypeTorch;
- using SelfPtr = c10::intrusive_ptr<Self>;
-
- static void check_size_bits(int64_t size_bits, bool signed_) {
- TORCH_CHECK(
- size_bits <=
- std::numeric_limits<decltype(std::declval<Self>().mantissa)>::max(),
- "size_bits bit width is too large to be represented");
- }
-
- static void check_bias(int64_t bias) {
- using Bias = decltype(std::declval<Self>().bias);
- TORCH_CHECK(bias <= std::numeric_limits<Bias>::max() &&
- bias >= std::numeric_limits<Bias>::min(),
- "bias too large or small to be represented");
- }
-
- static void check_exponent(int64_t exponent) {
- TORCH_CHECK(
- exponent <=
- std::numeric_limits<decltype(std::declval<Self>().exponent)>::max(),
- "exponent bit width is too large to be represented");
- }
-
- static void check_mantissa(int64_t mantissa) {
- TORCH_CHECK(
- mantissa <=
- std::numeric_limits<decltype(std::declval<Self>().mantissa)>::max(),
- "mantissa bit width is too large to be represented");
- }
-
- static SelfPtr int_(int64_t size_bits, c10::optional<int64_t> bias) {
- check_size_bits(size_bits, true);
- check_bias(bias.value_or(0));
- return c10::make_intrusive<Self>(
- ScalarType::int_(size_bits, bias.value_or(0)));
- }
-
- static SelfPtr uint(int64_t size_bits, c10::optional<int64_t> bias) {
- check_size_bits(size_bits, true);
- check_bias(bias.value_or(0));
- return c10::make_intrusive<Self>(
- ScalarType::uint(size_bits, bias.value_or(0)));
- }
-
- static SelfPtr float_IEEE754(int64_t exponent, int64_t mantissa) {
- check_mantissa(mantissa);
- check_exponent(exponent);
- return c10::make_intrusive<Self>(
- ScalarType::float_IEEE754(exponent, mantissa));
- }
-
- static SelfPtr float_(int64_t exponent, int64_t mantissa,
- bool finite_values_only, int64_t nan_repr) {
- check_mantissa(mantissa);
- check_exponent(exponent);
- return c10::make_intrusive<Self>(ScalarType::float_(
- exponent, mantissa, finite_values_only, NanRepr(nan_repr)));
- }
-
- // This needs to be implemented and throw a TypeError in order for
- // PyTorch's opcheck to work on ops that use ScalarTypes.
- int64_t len() const {
- throw c10::TypeError({__func__, __FILE__, static_cast<uint32_t>(__LINE__)},
- "__len__ not implemented");
- return 0;
- }
-
- // Serialize a ScalarType into a tuple of pairs. Where each pair
- // is a (fieldname, value).
- // For simplicity, we are just going to convert to a ScalarTypeId.
- std::tuple<std::tuple<std::string, ScalarTypeId>> obj_flatten() const {
- return {{"ScalarType", id()}};
- }
-
- // Deserialize a scalar type that has been serialized by obj_flatten,
- // ostensibly from a tuple of (member name, value) pairs, but in reality
- // just a ScalarTypeId.
- static SelfPtr obj_unflatten(
- std::tuple<std::tuple<std::string, ScalarTypeId>> const& flat_type) {
- return c10::make_intrusive<Self>(
- from_id(std::get<1>(std::get<0>(flat_type))));
- }
-
- template <typename T>
- static void bind_readonly_property(torch::class_<Self>& cls,
- std::string const& name, T Base::*field) {
- auto getter_func_helper = [field = std::move(field)](SelfPtr const& self) {
- if constexpr (std::is_member_function_pointer_v<decltype(field)>) {
- return (self.get()->*field)();
- } else {
- return self.get()->*field;
- }
- };
-
- auto getter_func = [field = std::move(field),
- getter_func_helper = std::move(getter_func_helper)](
- SelfPtr const& self) {
- auto val = getter_func_helper(self);
- // upconvert uint8_t, int32_t etc. to int64_t for python
- if constexpr (std::is_integral_v<decltype(val)>) {
- return static_cast<int64_t>(val);
- } else {
- return val;
- }
- };
-
- cls.def_property(name, getter_func);
- }
-
- template <typename MemberFunc, typename Cls>
- static void bind_function(torch::class_<Self>& cls, const std::string& name,
- MemberFunc Cls::*member) {
- cls.def(name, [member = std::move(member)](SelfPtr const& self) {
- return (self.get()->*member)();
- });
- }
-
- template <typename Func>
- static void bind_function(torch::class_<Self>& cls, const std::string& name,
- Func func) {
- cls.def(name, func);
- }
-
- template <typename Func>
- static void bind_static_function(torch::class_<Self>& cls,
- const std::string& name, Func func) {
- cls.def_static(name, func);
- }
-
- static void bind_class(torch::Library& lib) {
- auto cls = lib.class_<ScalarTypeTorch>("ScalarType")
- .def(torch::init<int64_t, int64_t, int64_t, bool>());
-
- // Bind Properties
- bind_readonly_property(cls, "mantissa", &Base::mantissa);
- bind_readonly_property(cls, "exponent", &Base::exponent);
- bind_readonly_property(cls, "bias", &Base::bias);
- bind_readonly_property(cls, "signed", &Base::is_signed);
- bind_readonly_property(cls, "size_bits", &Base::size_bits);
-
- // Bind member functions
- bind_function(cls, "is_signed", &Base::is_signed);
- bind_function(cls, "is_integer", &Base::is_integer);
- bind_function(cls, "is_floating_point", &Base::is_floating_point);
- bind_function(cls, "is_ieee_754", &Base::is_ieee_754);
- bind_function(cls, "has_nans", &Base::has_nans);
- bind_function(cls, "has_infs", &Base::has_infs);
- bind_function(cls, "has_bias", &Base::has_bias);
-
- bind_function(cls, "max", [](SelfPtr const& self) {
- return std::visit([](auto arg) { return c10::IValue(arg); },
- self.get()->max());
- });
- bind_function(cls, "min", [](SelfPtr const& self) {
- return std::visit([](auto arg) { return c10::IValue(arg); },
- self.get()->min());
- });
-
- bind_function(cls, "__len__", &ScalarTypeTorch::len);
- bind_function(cls, "__str__", &Base::str);
- bind_function(cls, "__eq__", [](SelfPtr const& self, SelfPtr const& other) {
- return *self == *other;
- });
- bind_function(cls, "__repr__", [](SelfPtr const& self) {
- return "ScalarType." + self.get()->str();
- });
-
- bind_function(cls, "__obj_flatten__", &ScalarTypeTorch::obj_flatten);
- bind_static_function(cls, "__obj_unflatten__",
- &ScalarTypeTorch::obj_unflatten);
-
- // Bind static functions (convenience constructors)
- bind_static_function(cls, "int_", &ScalarTypeTorch::int_);
- bind_static_function(cls, "uint", &ScalarTypeTorch::uint);
- bind_static_function(cls, "float_IEEE754", &ScalarTypeTorch::float_IEEE754);
- bind_static_function(cls, "float_", &ScalarTypeTorch::float_);
- }
-};
-
-using ScalarTypeId = int64_t;
-using ScalarTypeTorchPtr = c10::intrusive_ptr;
+using ScalarTypeId = ScalarType::Id;
// "rust style" names generally following:
// https://github.com/pytorch/pytorch/blob/6d9f74f0af54751311f0dd71f7e5c01a93260ab3/torch/csrc/api/include/torch/types.h#L60-L70
diff --git a/csrc/core/torch_bindings.cpp b/csrc/core/torch_bindings.cpp
deleted file mode 100644
index f60254189a2f7..0000000000000
--- a/csrc/core/torch_bindings.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-#include
-
-#include "scalar_type.hpp"
-#include "registration.h"
-
-// Note the CORE exstension will be built for (almost) all hardware targets so
-// new additions must account for this. (currently not built for TPU and Neuron)
-
-TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, lib) {
- // ScalarType, a custom class for representing data types that supports
- // quantized types, declared here so it can be used when creating interfaces
- // for custom ops.
- vllm::ScalarTypeTorch::bind_class(lib);
-}
-
-REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp
index 5b1d3d6442b2b..a325153b470cc 100644
--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@@ -265,6 +265,30 @@ struct FP32Vec8 : public Vec {
void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); }
};
+#ifdef __AVX512F__
+struct INT32Vec16: public Vec<INT32Vec16> {
+ constexpr static int VEC_ELEM_NUM = 16;
+ union AliasReg {
+ __m512i reg;
+ int32_t values[VEC_ELEM_NUM];
+ };
+
+ __m512i reg;
+
+ explicit INT32Vec16(const void* data_ptr) : reg(_mm512_loadu_epi32(data_ptr)) {}
+
+ void save(int32_t* ptr) const {
+ _mm512_storeu_epi32(ptr, reg);
+ }
+
+ void save(int32_t* ptr, const int elem_num) const {
+ constexpr uint32_t M = 0xFFFFFFFF;
+ __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
+ _mm512_mask_storeu_epi32(ptr, mask, reg);
+ }
+};
+#endif
+
#ifdef __AVX512F__
struct FP32Vec16 : public Vec<FP32Vec16> {
constexpr static int VEC_ELEM_NUM = 16;
@@ -283,8 +307,6 @@ struct FP32Vec16 : public Vec {
explicit FP32Vec16(__m512 data) : reg(data) {}
- explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {}
-
explicit FP32Vec16(const FP32Vec4 &data)
: reg((__m512)_mm512_inserti32x4(
_mm512_inserti32x4(
@@ -303,6 +325,9 @@ struct FP32Vec16 : public Vec {
explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
+ explicit FP32Vec16(const INT32Vec16 &v)
+ : reg(_mm512_cvt_roundepi32_ps(v.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {}
+
FP32Vec16 operator*(const FP32Vec16 &b) const {
return FP32Vec16(_mm512_mul_ps(reg, b.reg));
}
@@ -333,6 +358,16 @@ struct FP32Vec16 : public Vec {
return FP32Vec16(_mm512_mask_max_ps(reg, mask, reg, b.reg));
}
+ FP32Vec16 min(const FP32Vec16& b) const {
+ return FP32Vec16(_mm512_min_ps(reg, b.reg));
+ }
+
+ FP32Vec16 min(const FP32Vec16& b, const int elem_num) const {
+ constexpr uint32_t M = 0xFFFFFFFF;
+ __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
+ return FP32Vec16(_mm512_mask_min_ps(reg, mask, reg, b.reg));
+ }
+
FP32Vec16 abs() const {
return FP32Vec16(_mm512_abs_ps(reg));
}
@@ -341,6 +376,8 @@ struct FP32Vec16 : public Vec {
float reduce_max() const { return _mm512_reduce_max_ps(reg); }
+ float reduce_min() const { return _mm512_reduce_min_ps(reg); }
+
template float reduce_sub_sum(int idx) {
static_assert(VEC_ELEM_NUM % group_size == 0);
constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
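// --- Hedged sketch (not part of the patch): the masked tail store used by the new
// INT32Vec16::save(ptr, elem_num) and FP32Vec16::min(b, elem_num) above.
// 0xFFFFFFFF >> (32 - elem_num) leaves the low `elem_num` bits set, so only lanes
// [0, elem_num) are touched. Assumes an AVX-512 build (-mavx512f) and
// elem_num in [1, 16]; all names below are illustrative only.
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

static void masked_store_demo(const int32_t* src, int32_t* dst, int elem_num) {
  __m512i reg = _mm512_loadu_epi32(src);
  constexpr uint32_t M = 0xFFFFFFFF;
  __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));  // low elem_num bits set
  _mm512_mask_storeu_epi32(dst, mask, reg);               // lanes >= elem_num untouched
}

int main() {
  int32_t src[16], dst[16] = {0};
  for (int i = 0; i < 16; ++i) src[i] = i + 1;
  masked_store_demo(src, dst, 5);  // only dst[0..4] are written
  for (int i = 0; i < 16; ++i) std::printf("%d ", dst[i]);
  std::printf("\n");
  return 0;
}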
diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp
index 2d7abe6145fee..b493fd793818a 100644
--- a/csrc/cpu/quant.cpp
+++ b/csrc/cpu/quant.cpp
@@ -5,25 +5,29 @@ namespace {
template <typename scalar_t>
struct KernelVecType {
using load_vec_type = void;
+ using azp_adj_load_vec_type = void;
using cvt_vec_type = void;
};
template <>
struct KernelVecType<float> {
using load_vec_type = vec_op::FP32Vec16;
+ using azp_adj_load_vec_type = vec_op::INT32Vec16;
using cvt_vec_type = vec_op::FP32Vec16;
};
template <>
struct KernelVecType<c10::BFloat16> {
using load_vec_type = vec_op::BF16Vec16;
+ using azp_adj_load_vec_type = vec_op::INT32Vec16;
using cvt_vec_type = vec_op::FP32Vec16;
};
#ifdef __AVX512F__
-template
+template
void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
- const float* scale, const int num_tokens,
+ const float* scale, const int32_t* azp,
+ const int num_tokens,
const int hidden_size) {
using load_vec_t = typename KernelVecType::load_vec_type;
using cvt_vec_t = typename KernelVecType::cvt_vec_type;
@@ -37,62 +41,110 @@ void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
const cvt_vec_t i8_min_vec(i8_min);
const cvt_vec_t i8_max_vec(i8_max);
+ cvt_vec_t zp_vec;
+ if constexpr (AZP) {
+ zp_vec = cvt_vec_t(static_cast<float>(*azp));
+ }
+
#pragma omp parallel for
for (int i = 0; i < num_tokens; ++i) {
int j = 0;
for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
load_vec_t elems(input + i * hidden_size + j);
cvt_vec_t elems_fp32(elems);
- elems_fp32 = (elems_fp32 * inv_scale).clamp(i8_min_vec, i8_max_vec);
+ elems_fp32 = elems_fp32 * inv_scale;
+
+ if constexpr (AZP) {
+ elems_fp32 = elems_fp32 + zp_vec;
+ }
+
+ elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
vec_op::INT8Vec16 elems_int8(elems_fp32);
elems_int8.save(output + i * hidden_size + j);
}
load_vec_t elems(input + i * hidden_size + j);
cvt_vec_t elems_fp32(elems);
- elems_fp32 = (elems_fp32 * inv_scale).clamp(i8_min_vec, i8_max_vec);
- vec_op::INT8Vec16 elems_int8(elems_fp32);
+ elems_fp32 = elems_fp32 * inv_scale;
- if (j + vec_elem_num == hidden_size) {
- elems_int8.save(output + i * hidden_size + j);
- } else {
- elems_int8.save(output + i * hidden_size + j, hidden_size - j);
+ if constexpr (AZP) {
+ elems_fp32 = elems_fp32 + zp_vec;
}
+
+ elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
+ vec_op::INT8Vec16 elems_int8(elems_fp32);
+ elems_int8.save(output + i * hidden_size + j, hidden_size - j);
}
}
-template
+template
void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
- float* scale, const int num_tokens,
+ float* scale, int32_t* azp,
+ const int num_tokens,
const int hidden_size) {
using load_vec_t = typename KernelVecType::load_vec_type;
using cvt_vec_t = typename KernelVecType::cvt_vec_type;
constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
+ constexpr float i8_min =
+ static_cast<float>(std::numeric_limits<int8_t>::min());
+ constexpr float i8_max =
+ static_cast<float>(std::numeric_limits<int8_t>::max());
+ const cvt_vec_t i8_min_vec(i8_min);
+ const cvt_vec_t i8_max_vec(i8_max);
+
#pragma omp parallel for
for (int i = 0; i < num_tokens; ++i) {
- cvt_vec_t max_abs(0.0);
+ cvt_vec_t max_value(std::numeric_limits<float>::lowest());
+ cvt_vec_t min_value(std::numeric_limits<float>::max());
{
int j = 0;
for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
load_vec_t elems(input + i * hidden_size + j);
cvt_vec_t elems_fp32(elems);
- max_abs = max_abs.max(elems_fp32.abs());
+ if constexpr (AZP) {
+ max_value = max_value.max(elems_fp32);
+ min_value = min_value.min(elems_fp32);
+ } else {
+ max_value = max_value.max(elems_fp32.abs());
+ }
}
load_vec_t elems(input + i * hidden_size + j);
cvt_vec_t elems_fp32(elems);
if (j + vec_elem_num == hidden_size) {
- max_abs = max_abs.max(elems_fp32.abs());
+ if constexpr (AZP) {
+ max_value = max_value.max(elems_fp32);
+ min_value = min_value.min(elems_fp32);
+ } else {
+ max_value = max_value.max(elems_fp32.abs());
+ }
} else {
- max_abs = max_abs.max(elems_fp32.abs(), hidden_size - j);
+ if constexpr (AZP) {
+ max_value = max_value.max(elems_fp32, hidden_size - j);
+ min_value = min_value.min(elems_fp32, hidden_size - j);
+ } else {
+ max_value = max_value.max(elems_fp32.abs(), hidden_size - j);
+ }
}
}
- float scale_val = max_abs.reduce_max() / 127.0f;
- scale[i] = scale_val;
+ float scale_val, azp_val;
+ if constexpr (AZP) {
+ float max_scalar = max_value.reduce_max();
+ float min_scalar = min_value.reduce_min();
+ scale_val = (max_scalar - min_scalar) / 255.0f;
+ azp_val = std::nearbyint(-128.0f - min_scalar / scale_val);
+ azp[i] = static_cast<int32_t>(azp_val);
+ scale[i] = scale_val;
+ } else {
+ scale_val = max_value.reduce_max() / 127.0f;
+ scale[i] = scale_val;
+ }
+
const cvt_vec_t inv_scale(1.0 / scale_val);
+ const cvt_vec_t azp_vec(azp_val);
{
int j = 0;
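// --- Hedged scalar sketch (not part of the patch) of the per-token asymmetric
// quantization math above: the scale covers the (min, max) range in 255 steps and
// the zero point maps `min` to -128; each element is then scaled, shifted by azp,
// clamped and rounded to int8. Assumes a non-degenerate range (max > min); the
// helper name is illustrative only.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

static void quant_row_azp_ref(const std::vector<float>& x, std::vector<int8_t>& q,
                              float& scale, int32_t& azp) {
  const float max_v = *std::max_element(x.begin(), x.end());
  const float min_v = *std::min_element(x.begin(), x.end());
  scale = (max_v - min_v) / 255.0f;
  azp = static_cast<int32_t>(std::nearbyint(-128.0f - min_v / scale));
  q.resize(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    float v = x[i] / scale + static_cast<float>(azp);  // scale, then shift by azp
    v = std::clamp(v, -128.0f, 127.0f);                // keep within int8 range
    q[i] = static_cast<int8_t>(std::nearbyint(v));     // round like the FP32->INT8 convert
  }
}

int main() {
  std::vector<float> x = {-1.0f, 0.25f, 2.0f};
  std::vector<int8_t> q;
  float scale;
  int32_t azp;
  quant_row_azp_ref(x, q, scale, azp);
  return 0;
}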
@@ -100,6 +152,11 @@ void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
load_vec_t elems(input + i * hidden_size + j);
cvt_vec_t elems_fp32(elems);
elems_fp32 = (elems_fp32 * inv_scale);
+
+ if constexpr (AZP) {
+ elems_fp32 = elems_fp32 + azp_vec;
+ }
+ elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
vec_op::INT8Vec16 elems_int8(elems_fp32);
elems_int8.save(output + i * hidden_size + j);
}
@@ -107,34 +164,111 @@ void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
load_vec_t elems(input + i * hidden_size + j);
cvt_vec_t elems_fp32(elems);
elems_fp32 = (elems_fp32 * inv_scale);
- vec_op::INT8Vec16 elems_int8(elems_fp32);
- if (j + vec_elem_num == hidden_size) {
- elems_int8.save(output + i * hidden_size + j);
- } else {
- elems_int8.save(output + i * hidden_size + j, hidden_size - j);
+ if constexpr (AZP) {
+ elems_fp32 = elems_fp32 + azp_vec;
}
+ elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
+ vec_op::INT8Vec16 elems_int8(elems_fp32);
+ elems_int8.save(output + i * hidden_size + j, hidden_size - j);
}
}
}
-template
-void dynamic_output_scale_impl(const float* input, scalar_t* output,
- const float* scale, const scalar_t* bias,
- const int num_tokens, const int hidden_size) {
+template
+void static_quant_epilogue(const float* input, scalar_t* output,
+ const float a_scale, const float* b_scale,
+ const int32_t* azp_with_adj, const int num_tokens,
+ const int hidden_size) {
CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl)
using load_vec_t = typename KernelVecType::load_vec_type;
+ using azp_adj_load_vec_t =
+ typename KernelVecType<scalar_t>::azp_adj_load_vec_type;
+ using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
+ constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
+
+ #pragma omp parallel for
+ for (int i = 0; i < num_tokens; ++i) {
+ cvt_vec_t a_scale_vec(a_scale);
+ cvt_vec_t b_scale_vec(*b_scale);
+ cvt_vec_t scale_vec = a_scale_vec * b_scale_vec;
+
+ int j = 0;
+ for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
+ cvt_vec_t elems_fp32(input + i * hidden_size + j);
+ azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j);
+ cvt_vec_t azp_adj_fp32(azp_adj_vec);
+
+ if constexpr (PerChannel) {
+ b_scale_vec = cvt_vec_t(b_scale + j);
+ scale_vec = b_scale_vec * a_scale_vec;
+ }
+
+ elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32;
+
+ load_vec_t elems_out(elems_fp32);
+ elems_out.save(output + i * hidden_size + j);
+ }
+
+ cvt_vec_t elems_fp32(input + i * hidden_size + j);
+ azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j);
+ cvt_vec_t azp_adj_fp32(azp_adj_vec);
+
+ if constexpr (PerChannel) {
+ b_scale_vec = cvt_vec_t(b_scale + j);
+ scale_vec = b_scale_vec * a_scale_vec;
+ }
+
+ elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32;
+
+ load_vec_t elems_out(elems_fp32);
+ elems_out.save(output + i * hidden_size + j, hidden_size - j);
+ }
+}
+
+template
+void dynamic_quant_epilogue(const float* input, scalar_t* output,
+ const float* a_scale, const float* b_scale,
+ const int32_t* azp, const int32_t* azp_adj,
+ const scalar_t* bias, const int num_tokens,
+ const int hidden_size) {
+ CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue)
+ using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
+ using azp_adj_load_vec_t =
+ typename KernelVecType<scalar_t>::azp_adj_load_vec_type;
using cvt_vec_t = typename KernelVecType::cvt_vec_type;
constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
#pragma omp parallel for
for (int i = 0; i < num_tokens; ++i) {
int j = 0;
- cvt_vec_t token_scale_vec(scale[i]);
+ cvt_vec_t token_scale_vec(a_scale[i]);
+ cvt_vec_t token_zp_scale_vec;
+ if constexpr (AZP) {
+ float zp_scale_val = a_scale[i] * static_cast<float>(azp[i]);
+ if constexpr (!PerChannel) {
+ zp_scale_val *= *b_scale;
+ }
+ token_zp_scale_vec = cvt_vec_t(zp_scale_val);
+ }
+
for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
cvt_vec_t elems_fp32(input + i * hidden_size + j);
elems_fp32 = elems_fp32 * token_scale_vec;
+ if constexpr (AZP) {
+ azp_adj_load_vec_t azp_adj_vec(azp_adj + j);
+ cvt_vec_t azp_adj_fp32(azp_adj_vec);
+ azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec;
+
+ if constexpr (PerChannel) {
+ cvt_vec_t b_scale_vec(b_scale + j);
+ azp_adj_fp32 = azp_adj_fp32 * b_scale_vec;
+ }
+
+ elems_fp32 = elems_fp32 - azp_adj_fp32;
+ }
+
if constexpr (Bias) {
load_vec_t bias_vec(bias + j);
cvt_vec_t bias_vec_fp32(bias_vec);
@@ -148,6 +282,19 @@ void dynamic_output_scale_impl(const float* input, scalar_t* output,
cvt_vec_t elems_fp32(input + i * hidden_size + j);
elems_fp32 = elems_fp32 * token_scale_vec;
+ if constexpr (AZP) {
+ azp_adj_load_vec_t azp_adj_vec(azp_adj + j);
+ cvt_vec_t azp_adj_fp32(azp_adj_vec);
+ azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec;
+
+ if constexpr (PerChannel) {
+ cvt_vec_t b_scale_vec(b_scale + j);
+ azp_adj_fp32 = azp_adj_fp32 * b_scale_vec;
+ }
+
+ elems_fp32 = elems_fp32 - azp_adj_fp32;
+ }
+
if constexpr (Bias) {
load_vec_t bias_vec(bias + j);
cvt_vec_t bias_vec_fp32(bias_vec);
@@ -155,32 +302,41 @@ void dynamic_output_scale_impl(const float* input, scalar_t* output,
}
load_vec_t elems_out(elems_fp32);
-
- if (j + vec_elem_num == hidden_size) {
- elems_out.save(output + i * hidden_size + j);
- } else {
- elems_out.save(output + i * hidden_size + j, hidden_size - j);
- }
+ elems_out.save(output + i * hidden_size + j, hidden_size - j);
}
}
#else
template
void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
- const float* scale, const int num_tokens,
+ const float* scale, const int32_t* azp,
+ const int num_tokens,
const int hidden_size) {
TORCH_CHECK(false, "static_scaled_int8_quant_impl requires AVX512 support.")
}
template
void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
- float* scale, const int num_tokens,
+ float* scale, int32_t* azp,
+ const int num_tokens,
const int hidden_size) {
TORCH_CHECK(false, "dynamic_scaled_int8_quant_impl requires AVX512 support.")
}
+template
+void static_quant_epilogue(const float* input, scalar_t* output,
+ const float a_scale, const float* b_scale,
+ const int32_t* azp_with_adj, const int num_tokens,
+ const int hidden_size) {
+ TORCH_CHECK(false, "static_quant_epilogue requires AVX512 support.")
+}
+
template
-void dynamic_output_scale_impl() {
- TORCH_CHECK(false, "dynamic_output_scale_impl requires AVX512 support.")
+void dynamic_quant_epilogue(const float* input, scalar_t* output,
+ const float* a_scale, const float* b_scale,
+ const int32_t* azp, const int32_t* azp_with_adj,
+ const scalar_t* bias, const int num_tokens,
+ const int hidden_size) {
+ TORCH_CHECK(false, "dynamic_quant_epilogue requires AVX512 support.")
}
#endif
} // namespace
@@ -214,39 +370,52 @@ void int8_scaled_mm(torch::Tensor& c, // [M, OC], row-major
bias->dim() == 1);
}
- VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "cutlass_scaled_mm", [&] {
+ VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm", [&] {
if (a_scales.numel() != 1) {
// per-token
// Note: oneDNN doesn't support per-token activation quantization
+ // Ideally we want to fuse the GEMM and the scale procedure with oneDNN
+ // JIT so the intermediate data stays in registers or L1. But for now the
+ // oneDNN GEMM code generation only supports two quantization patterns:
+ // per-tensor or per-output-channel of weight.
+ // So we have to apply the per-token scale with an 'epilogue'. In
+ // C = s_a * s_b * (A@B) + bias, the intermediate C_inter = s_b * (A@B) is
+ // computed by the oneDNN GEMM, then the per-token scale (and bias) is
+ // applied with the epilogue C = s_a * C_inter + bias.
torch::Tensor tmp_fp32_out =
torch::empty_like(c, ::at::ScalarType::Float);
- DNNLPrimitiveHelper::gemm_s8s8_jit(
+ // Compute C_inter=s_b * (A@B)
+ DNNLPrimitiveHelper::gemm_s8s8_jit(
a.data_ptr(), b.data_ptr(),
- tmp_fp32_out.data_ptr(), (void*)(0), a.size(0), b.size(1),
- a.size(1), (float*)(0), b_scales.data_ptr(), 0,
- b_scales.numel());
+ tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1),
+ a.size(1), nullptr, b_scales.data_ptr(), 0, b_scales.numel());
if (bias.has_value()) {
- dynamic_output_scale_impl(
+ // Compute C=s_a * C_inter + bias
+ dynamic_quant_epilogue(
tmp_fp32_out.data_ptr(), c.data_ptr(),
- a_scales.data_ptr(), bias->data_ptr(), c.size(0),
- c.size(1));
+ a_scales.data_ptr(), nullptr, nullptr, nullptr,
+ bias->data_ptr(), c.size(0), c.size(1));
} else {
- dynamic_output_scale_impl(
+ // Compute C=s_a * C_inter
+ dynamic_quant_epilogue(
tmp_fp32_out.data_ptr(), c.data_ptr(),
- a_scales.data_ptr(), (scalar_t*)(0), c.size(0), c.size(1));
+ a_scales.data_ptr(), nullptr, nullptr, nullptr, nullptr,
+ c.size(0), c.size(1));
}
} else {
// per-tensor
if (bias.has_value()) {
+ // Compute C=s_a * s_b * (A@B) + bias
DNNLPrimitiveHelper::gemm_s8s8_jit(
a.data_ptr(), b.data_ptr(), c.data_ptr(),
bias->data_ptr(), a.size(0), b.size(1), a.size(1),
a_scales.data_ptr(), b_scales.data_ptr(),
a_scales.numel(), b_scales.numel());
} else {
- DNNLPrimitiveHelper::gemm_s8s8_jit(
+ // Compute C=s_a * s_b * (A@B)
+ DNNLPrimitiveHelper::gemm_s8s8_jit(
a.data_ptr(), b.data_ptr(), c.data_ptr(),
- (void*)(0), a.size(0), b.size(1), a.size(1),
+ nullptr, a.size(0), b.size(1), a.size(1),
a_scales.data_ptr(), b_scales.data_ptr(),
a_scales.numel(), b_scales.numel());
}
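// --- Hedged reference sketch (not part of the patch) of the two-step per-token
// scaling described in the comment above: an integer "GEMM" producing
// C_inter = s_b * (A @ B), followed by the epilogue C = s_a[m] * C_inter + bias.
// All names are illustrative; this is not the oneDNN path itself.
#include <cstddef>
#include <cstdint>
#include <vector>

static void scaled_mm_per_token_ref(const std::vector<int8_t>& A,   // [M, K]
                                    const std::vector<int8_t>& B,   // [K, N]
                                    const std::vector<float>& s_a,  // per-token, [M]
                                    float s_b,                      // per-tensor
                                    const std::vector<float>& bias, // [N]
                                    std::vector<float>& C,          // [M, N]
                                    int M, int N, int K) {
  std::vector<float> C_inter(static_cast<std::size_t>(M) * N);
  for (int m = 0; m < M; ++m)        // "GEMM" step: C_inter = s_b * (A @ B)
    for (int n = 0; n < N; ++n) {
      int32_t acc = 0;
      for (int k = 0; k < K; ++k)
        acc += static_cast<int32_t>(A[m * K + k]) * B[k * N + n];
      C_inter[m * N + n] = s_b * static_cast<float>(acc);
    }
  C.resize(C_inter.size());
  for (int m = 0; m < M; ++m)        // epilogue: C = s_a[m] * C_inter + bias
    for (int n = 0; n < N; ++n)
      C[m * N + n] = s_a[m] * C_inter[m * N + n] + bias[n];
}

int main() {
  std::vector<int8_t> A = {1, 2, 3, 4};  // M=2, K=2
  std::vector<int8_t> B = {5, 6, 7, 8};  // K=2, N=2
  std::vector<float> C;
  scaled_mm_per_token_ref(A, B, {0.5f, 0.25f}, 0.1f, {0.0f, 1.0f}, C, 2, 2, 2);
  return 0;
}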
@@ -254,6 +423,127 @@ void int8_scaled_mm(torch::Tensor& c, // [M, OC], row-major
});
}
+void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major
+ const torch::Tensor& a, // [M, IC], row-major
+ const torch::Tensor& b, // [IC, OC], column-major
+ const torch::Tensor& a_scales, // [1] or [M]
+ const torch::Tensor& b_scales, // [1] or [OC]
+ const torch::Tensor& azp_adj, // [OC]
+ const c10::optional& azp, // [1] or [M]
+ const c10::optional& bias // [OC]
+) {
+ CPU_KERNEL_GUARD_IN(cutlass_scaled_mm_azp)
+ // Checks for conformality
+ TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8,
+ "int8_scaled_mm_azp only supports INT8 inputs.")
+ TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
+ TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
+ b.size(1) == c.size(1));
+ TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0));
+ TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1));
+
+ // Check for strides and alignment
+ TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major
+ TORCH_CHECK(b.stride(0) == 1); // Column-major
+ TORCH_CHECK(c.stride(0) % 16 == 0 &&
+ b.stride(1) % 16 == 0); // 16 Byte Alignment
+ TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
+
+ if (bias) {
+ TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous());
+ }
+ if (azp) {
+ TORCH_CHECK(azp->numel() == a.size(0) && azp->is_contiguous());
+ }
+ TORCH_CHECK(azp_adj.numel() == b.size(1) && azp_adj.is_contiguous());
+
+ // azp & bias types
+ TORCH_CHECK(azp_adj.dtype() == torch::kInt32);
+ TORCH_CHECK(!azp || azp->dtype() == torch::kInt32);
+ TORCH_CHECK(!bias || bias->dtype() == c.dtype(),
+ "currently bias dtype must match output dtype ", c.dtype());
+
+ VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm_azp", [&] {
+ torch::Tensor tmp_fp32_out = torch::empty_like(c, ::at::ScalarType::Float);
+ if (a_scales.numel() != 1) {
+ // per-token
+ // Note: oneDNN doesn't support per-token activation quantization
+ // Compute C_inter=s_b * (A@B)
+ DNNLPrimitiveHelper::gemm_s8s8_jit(
+ a.data_ptr(), b.data_ptr(),
+ tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1),
+ a.size(1), nullptr, b_scales.data_ptr(), 0, b_scales.numel());
+ if (bias.has_value()) {
+ // Compute C=s_a * C_inter - s_a * s_b * azp * azp_adj + bias
+ if (b_scales.numel() != 1) {
+ // Per-Channel
+ dynamic_quant_epilogue(
+ tmp_fp32_out.data_ptr(), c.data_ptr(),
+ a_scales.data_ptr(), b_scales.data_ptr(),
+ azp->data_ptr(), azp_adj.data_ptr(),
+ bias->data_ptr(), c.size(0), c.size(1));
+ } else {
+ // Per-Tensor
+ dynamic_quant_epilogue(
+ tmp_fp32_out.data_ptr(), c.data_ptr(),
+ a_scales.data_ptr(), b_scales.data_ptr(),
+ azp->data_ptr(), azp_adj.data_ptr(),
+ bias->data_ptr(), c.size(0), c.size(1));
+ }
+ } else {
+ // Compute C=s_a * C_inter - s_a * s_b * azp * azp_adj
+ if (b_scales.numel() != 1) {
+ // Per-Channel
+ dynamic_quant_epilogue(
+ tmp_fp32_out.data_ptr(), c.data_ptr(),
+ a_scales.data_ptr(), b_scales.data_ptr(),
+ azp->data_ptr(), azp_adj.data_ptr(), nullptr,
+ c.size(0), c.size(1));
+ } else {
+ // Per-Tensor
+ dynamic_quant_epilogue(
+ tmp_fp32_out.data_ptr(), c.data_ptr(),
+ a_scales.data_ptr(), b_scales.data_ptr(),
+ azp->data_ptr(), azp_adj.data_ptr(), nullptr,
+ c.size(0), c.size(1));
+ }
+ }
+ } else {
+ // per-tensor
+ if (bias.has_value()) {
+ // Compute C_inter=s_a * s_b * (A@B) + bias
+ DNNLPrimitiveHelper::gemm_s8s8_jit(
+ a.data_ptr(), b.data_ptr(),
+ tmp_fp32_out.data_ptr(), bias->data_ptr(),
+ a.size(0), b.size(1), a.size(1), a_scales.data_ptr(),
+ b_scales.data_ptr(), a_scales.numel(), b_scales.numel());
+ } else {
+ // Compute C_inter=s_a * s_b * (A@B)
+ DNNLPrimitiveHelper::gemm_s8s8_jit(
+ a.data_ptr(), b.data_ptr(),
+ tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1),
+ a.size(1), a_scales.data_ptr(), b_scales.data_ptr(),
+ a_scales.numel(), b_scales.numel());
+ }
+
+ // Compute C=C_inter - s_a * s_b * azp_adj
+ if (b_scales.numel() != 1) {
+ // Per-Channel
+ static_quant_epilogue(
+ tmp_fp32_out.data_ptr(), c.data_ptr(),
+ *a_scales.data_ptr(), b_scales.data_ptr(),
+ azp_adj.data_ptr(), a.size(0), b.size(1));
+ } else {
+ // Per-Tensor
+ static_quant_epilogue(
+ tmp_fp32_out.data_ptr(), c.data_ptr(),
+ *a_scales.data_ptr(), b_scales.data_ptr(),
+ azp_adj.data_ptr(), a.size(0), b.size(1));
+ }
+ }
+ });
+}
+
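// --- Hedged sketch (not part of the patch): why the epilogues above subtract
// s_a * s_b * azp * azp_adj. With the asymmetric quantization A ~= s_a * (A_q - azp)
// and B = s_b * B_q, expanding A @ B gives
//   A @ B ~= s_a * s_b * (A_q @ B_q) - s_a * s_b * azp * azp_adj,
// where azp_adj[n] = sum_k B_q[k][n]. Values below are illustrative only.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int K = 4, N = 2;
  std::vector<int8_t> a_q = {12, -3, 7, 5};               // one token, [K]
  std::vector<int8_t> b_q = {1, -2, 3, 4, -5, 6, 7, -8};  // [K, N], row-major
  const float s_a = 0.05f, s_b = 0.1f;
  const int32_t azp = 4;

  for (int n = 0; n < N; ++n) {
    int32_t dot = 0, azp_adj = 0;
    for (int k = 0; k < K; ++k) {
      dot += static_cast<int32_t>(a_q[k]) * b_q[k * N + n];
      azp_adj += b_q[k * N + n];  // column sum of B_q
    }
    const float corrected = s_a * s_b * dot - s_a * s_b * azp * azp_adj;
    float reference = 0.0f;       // dequantize first, then multiply
    for (int k = 0; k < K; ++k)
      reference += s_a * (a_q[k] - azp) * (s_b * b_q[k * N + n]);
    std::printf("n=%d corrected=%f reference=%f\n", n, corrected, reference);
  }
  return 0;
}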
// static-per-tensor quantization.
void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
const torch::Tensor& input, // [..., hidden_size]
@@ -263,15 +553,22 @@ void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
TORCH_CHECK(input.is_contiguous());
TORCH_CHECK(out.is_contiguous());
TORCH_CHECK(scale.numel() == 1);
- TORCH_CHECK(!azp.has_value(), "Zero point is not supported on CPU.");
+ TORCH_CHECK(!azp.has_value() || azp->numel() == 1);
const int hidden_size = input.size(-1);
const int num_tokens = input.numel() / hidden_size;
VLLM_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "static_scaled_int8_quant_impl", [&] {
- static_scaled_int8_quant_impl(
- input.data_ptr(), out.data_ptr(),
- scale.data_ptr(), num_tokens, hidden_size);
+ if (azp.has_value()) {
+ static_scaled_int8_quant_impl(
+ input.data_ptr(), out.data_ptr(),
+ scale.data_ptr(), azp->data_ptr(), num_tokens,
+ hidden_size);
+ } else {
+ static_scaled_int8_quant_impl(
+ input.data_ptr(), out.data_ptr(),
+ scale.data_ptr(), nullptr, num_tokens, hidden_size);
+ }
});
}
@@ -284,14 +581,20 @@ void dynamic_scaled_int8_quant(
CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant)
TORCH_CHECK(input.is_contiguous());
TORCH_CHECK(out.is_contiguous());
- TORCH_CHECK(!azp.has_value(), "Zero point is not supported on CPU.");
int const hidden_size = input.size(-1);
int const num_tokens = input.numel() / hidden_size;
VLLM_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "dynamic_scaled_int8_quant_impl", [&] {
- dynamic_scaled_int8_quant_impl(
- input.data_ptr(), out.data_ptr(),
- scale.data_ptr(), num_tokens, hidden_size);
+ if (azp.has_value()) {
+ dynamic_scaled_int8_quant_impl(
+ input.data_ptr(), out.data_ptr(),
+ scale.data_ptr(), azp->data_ptr(), num_tokens,
+ hidden_size);
+ } else {
+ dynamic_scaled_int8_quant_impl(
+ input.data_ptr(), out.data_ptr(),
+ scale.data_ptr(), nullptr, num_tokens, hidden_size);
+ }
});
}
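// --- Hedged sketch (not part of the patch): the compile-time dispatch pattern the
// kernels above rely on. A bool non-type template parameter plus `if constexpr`
// makes the azp path disappear entirely from the non-azp instantiation instead of
// adding a per-element runtime branch. Names here are illustrative only.
#include <cstdint>
#include <cstdio>

template <bool AZP>
float quantize_one(float x, float inv_scale, int32_t azp) {
  float v = x * inv_scale;
  if constexpr (AZP) {
    v += static_cast<float>(azp);  // only compiled into the AZP=true instantiation
  }
  return v;
}

int main() {
  std::printf("%f %f\n", quantize_one<false>(2.0f, 0.5f, 0),
              quantize_one<true>(2.0f, 0.5f, 8));
  return 0;
}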
diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp
index ab697e3e6aef7..03beefbc6de7d 100644
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -11,6 +11,13 @@ void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
const torch::Tensor& b_scales,
const c10::optional& bias);
+void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a,
+ const torch::Tensor& b, const torch::Tensor& a_scales,
+ const torch::Tensor& b_scales,
+ const torch::Tensor& azp_adj,
+ const c10::optional& azp,
+ const c10::optional& bias);
+
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// vLLM custom ops
@@ -111,6 +118,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor b, Tensor a_scales,"
" Tensor b_scales, Tensor? bias) -> ()");
ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm);
+ // w8a8 GEMM, supporting asymmetric per-tensor or per-row/column
+ // quantization.
+ ops.def(
+ "cutlass_scaled_mm_azp(Tensor! out, Tensor a,"
+ " Tensor b, Tensor a_scales,"
+ " Tensor b_scales, Tensor azp_adj,"
+ " Tensor? azp, Tensor? bias) -> ()");
+ ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp);
#endif
}
diff --git a/csrc/mamba/causal_conv1d/causal_conv1d.cu b/csrc/mamba/causal_conv1d/causal_conv1d.cu
index 30831efdfa1a2..498d069c05f0d 100644
--- a/csrc/mamba/causal_conv1d/causal_conv1d.cu
+++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu
@@ -55,6 +55,7 @@ void set_conv_params_fwd(ConvParamsBase ¶ms,
const at::Tensor out,
const c10::optional& bias,
bool silu_activation,
+ int64_t pad_slot_id,
const c10::optional& query_start_loc = std::nullopt,
const c10::optional& cache_indices = std::nullopt,
const c10::optional& has_initial_state = std::nullopt) {
@@ -66,6 +67,7 @@ void set_conv_params_fwd(ConvParamsBase ¶ms,
params.dim = dim;
params.seqlen = seqlen;
params.width = width;
+ params.pad_slot_id = pad_slot_id;
params.silu_activation = silu_activation;
@@ -90,14 +92,16 @@ void set_conv_params_fwd(ConvParamsBase ¶ms,
}
-at::Tensor
-causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
+void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
const c10::optional &bias_,
const c10::optional &conv_states,
const c10::optional &query_start_loc,
const c10::optional &cache_indices,
const c10::optional &has_initial_state,
- bool silu_activation) {
+ bool silu_activation,
+ // used to identify padding entries if cache_indices is provided
+ // in case of padding, the kernel will return early
+ int64_t pad_slot_id) {
auto input_type = x.scalar_type();
auto weight_type = weight.scalar_type();
TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16);
@@ -153,12 +157,13 @@ causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
CHECK_SHAPE(cache_indices_, batch_size);
}
- at::Tensor out = torch::empty_like(x);
+ at::Tensor out = x;
ConvParamsBase params;
set_conv_params_fwd(params, batch_size, dim, seqlen, width, x, weight, out,
bias_,
silu_activation,
+ pad_slot_id,
query_start_loc,
cache_indices,
has_initial_state
@@ -183,18 +188,19 @@ causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_fwd", [&] {
causal_conv1d_fwd_cuda(params, stream);
});
- return out;
}
-at::Tensor
-causal_conv1d_update(const at::Tensor &x,
+void causal_conv1d_update(const at::Tensor &x,
const at::Tensor &conv_state,
const at::Tensor &weight,
const c10::optional &bias_,
bool silu_activation,
const c10::optional &cache_seqlens_,
- const c10::optional &conv_state_indices_) {
+ const c10::optional &conv_state_indices_,
+ // used to identify padding entries if cache_indices is provided
+ // in case of padding, the kernel will return early
+ int64_t pad_slot_id) {
auto input_type = x.scalar_type();
auto weight_type = weight.scalar_type();
TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16);
@@ -227,12 +233,13 @@ causal_conv1d_update(const at::Tensor &x,
CHECK_SHAPE(bias, dim);
}
- at::Tensor out = torch::empty_like(x);
+ at::Tensor out = x;
ConvParamsBase params;
set_conv_params_fwd(params, batch_size, dim, seqlen, width, x, weight, out,
bias_,
- silu_activation);
+ silu_activation,
+ pad_slot_id);
params.conv_state_ptr = conv_state.data_ptr();
params.conv_state_len = conv_state_len;
// All stride are in elements, not bytes.
@@ -274,7 +281,6 @@ causal_conv1d_update(const at::Tensor &x,
DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_update", [&] {
causal_conv1d_update_cuda(params, stream);
});
- return out;
}
template
@@ -340,7 +346,10 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
int* cache_indices = params.cache_indices_ptr == nullptr ? nullptr
: reinterpret_cast(params.cache_indices_ptr);
int cache_index = cache_indices == nullptr ? batch_id : cache_indices[batch_id];
-
+ // cache_index == params.pad_slot_id is defined as padding, so we exit early
+ if (cache_index == params.pad_slot_id){
+ return;
+ }
input_t *conv_states = params.conv_states_ptr == nullptr ? nullptr
: reinterpret_cast(params.conv_states_ptr) + cache_index * params.conv_states_batch_stride + channel_id * params.conv_states_c_stride;
@@ -409,6 +418,31 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
typename Ktraits::BlockStoreT(smem_store).Store(out, out_vals_store, seqlen - chunk * kChunkSize);
}
out += kChunkSize;
+
+ int final_state_position = ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize);
+ // in case the final state is split between the last "smem_exchange" and
+ // the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2),
+ // which occurs when `final_state_position` is a negative index,
+ // we load the correct data from smem_exchange of both chunks: the last chunk iteration and the one before it
+ if (final_state_position < 0 && seqlen > kWidth){
+ input_t vals_load[kNElts] = {0};
+ if ((chunk == n_chunks - 2) && (tidx == kNThreads - 1)){
+ // chunk = n_chunks - 2, a segment of the final state sits in the last index
+ reinterpret_cast(vals_load)[0] = smem_exchange[kNThreads - 1];
+ #pragma unroll
+ for (int w = 0; w < -final_state_position; ++w){
+ conv_states[w] = vals_load[kNElts + final_state_position + w];
+ }
+ }
+ if ((chunk == n_chunks - 1) && tidx == 0){
+ // chunk = n_chunks - 1, the second segment of the final state sits in the first positions
+ reinterpret_cast(vals_load)[0] = smem_exchange[0];
+ for (int w = -final_state_position; w < kWidth - 1; ++w){
+ conv_states[w] = vals_load[w + final_state_position];
+ }
+ return;
+ }
+ }
}
// Final state is stored in the smem_exchange last token slot,
// in case seqlen < kWidth, we would need to take the final state from the
@@ -437,9 +471,14 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
}
else {
// in case the final state is in between the threads data
- reinterpret_cast(x_vals_load)[1] = smem_exchange[last_thread + 1];
- reinterpret_cast(x_vals_load)[0] = smem_exchange[last_thread];
const int offset = ((seqlen - (kWidth - 1)) % (kNElts));
+ if ((offset + kWidth - 2) >= kNElts && (last_thread + 1 < kNThreads)){
+ // In case last_thread == kNThreads - 1, accessing last_thread + 1 would result in an
+ // illegal access error on H100.
+ // Therefore, we access last_thread + 1 only if the final state data sits there
+ reinterpret_cast(x_vals_load)[1] = smem_exchange[last_thread + 1];
+ }
+ reinterpret_cast(x_vals_load)[0] = smem_exchange[last_thread];
#pragma unroll
for (int w = 0; w < kWidth - 1; ++w){
conv_states[w] = x_vals_load[offset + w ];
@@ -528,6 +567,10 @@ void causal_conv1d_update_kernel(ConvParamsBase params) {
const int conv_state_batch_coord = params.conv_state_indices_ptr == nullptr
? batch_id
: params.conv_state_indices_ptr[batch_id];
+ // conv_state_batch_coord == params.pad_slot_id is defined as padding, so we exit early
+ if (conv_state_batch_coord == params.pad_slot_id){
+ return;
+ }
input_t *conv_state = reinterpret_cast(params.conv_state_ptr)
+ conv_state_batch_coord * params.conv_state_batch_stride
+ channel_id * params.conv_state_c_stride;
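// --- Hedged sketch (not part of the patch): the pad_slot_id sentinel pattern the
// kernels above now use. Batch entries whose cache index equals pad_slot_id are
// padding, so the kernel returns early and leaves their state untouched. The
// sentinel value (-1) and the indices below are illustrative only.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int64_t pad_slot_id = -1;
  std::vector<int64_t> cache_indices = {3, 0, pad_slot_id, 7};
  for (std::size_t batch_id = 0; batch_id < cache_indices.size(); ++batch_id) {
    if (cache_indices[batch_id] == pad_slot_id) continue;  // "return early"
    std::printf("process batch %zu -> cache slot %lld\n", batch_id,
                static_cast<long long>(cache_indices[batch_id]));
  }
  return 0;
}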
diff --git a/csrc/mamba/causal_conv1d/causal_conv1d.h b/csrc/mamba/causal_conv1d/causal_conv1d.h
index 49e37ee4528be..e26684a2b98b8 100644
--- a/csrc/mamba/causal_conv1d/causal_conv1d.h
+++ b/csrc/mamba/causal_conv1d/causal_conv1d.h
@@ -13,6 +13,7 @@ struct ConvParamsBase {
using index_t = uint32_t;
int batch, dim, seqlen, width;
+ int64_t pad_slot_id;
bool silu_activation;
index_t x_batch_stride;
diff --git a/csrc/mamba/mamba_ssm/selective_scan.h b/csrc/mamba/mamba_ssm/selective_scan.h
index 580d0b2e17e74..563d2fe4ef65b 100644
--- a/csrc/mamba/mamba_ssm/selective_scan.h
+++ b/csrc/mamba/mamba_ssm/selective_scan.h
@@ -21,6 +21,7 @@ struct SSMParamsBase {
int dim_ngroups_ratio;
bool is_variable_B;
bool is_variable_C;
+ int64_t pad_slot_id;
bool delta_softplus;
diff --git a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
index 6b225b41d295d..71624696338d0 100644
--- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
+++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
@@ -115,6 +115,10 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
const int* cache_indices = params.cache_indices_ptr == nullptr ? nullptr
: reinterpret_cast(params.cache_indices_ptr);
const int cache_index = cache_indices == nullptr ? batch_id : cache_indices[batch_id];
+ // cache_index == params.pad_slot_id is defined as padding, so we exit early
+ if (cache_index == params.pad_slot_id){
+ return;
+ }
input_t *u = reinterpret_cast(params.u_ptr) + sequence_start_index * params.u_batch_stride
+ dim_id * kNRows * params.u_d_stride;
input_t *delta = reinterpret_cast(params.delta_ptr) + sequence_start_index * params.delta_batch_stride
@@ -387,7 +391,6 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms,
const size_t seqlen,
const size_t dstate,
const size_t n_groups,
- const size_t n_chunks,
const bool is_variable_B,
const bool is_variable_C,
// device pointers
@@ -407,7 +410,8 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms,
const c10::optional& query_start_loc,
const c10::optional& cache_indices,
const c10::optional& has_initial_state,
- bool varlen) {
+ bool varlen,
+ int64_t pad_slot_id) {
// Reset the parameters
memset(¶ms, 0, sizeof(params));
@@ -417,8 +421,8 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms,
params.seqlen = seqlen;
params.dstate = dstate;
params.n_groups = n_groups;
- params.n_chunks = n_chunks;
params.dim_ngroups_ratio = dim / n_groups;
+ params.pad_slot_id = pad_slot_id;
params.delta_softplus = delta_softplus;
@@ -507,7 +511,10 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
const c10::optional &query_start_loc,
const c10::optional &cache_indices,
const c10::optional &has_initial_state,
- const torch::Tensor &ssm_states) {
+ const torch::Tensor &ssm_states,
+ // used to identify padding entries if cache_indices is provided
+ // in case of padding, the kernel will return early
+ int64_t pad_slot_id) {
auto input_type = u.scalar_type();
auto weight_type = A.scalar_type();
TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16);
@@ -618,18 +625,14 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
out_z = z;
- const int n_chunks = (seqlen + 2048 - 1) / 2048;
- // const int n_chunks = (seqlen + 1024 - 1) / 1024;
- // at::Tensor out = torch::empty_like(u);
// Right now u has BHL layout and delta has HBL layout, and we want out to have HBL layout
at::Tensor out = delta;
TORCH_CHECK(ssm_states.scalar_type() == input_type);
TORCH_CHECK(ssm_states.is_cuda());
TORCH_CHECK(ssm_states.stride(-1) == 1);
- CHECK_SHAPE(ssm_states, batch_size, dim, dstate);
SSMParamsBase params;
- set_ssm_params_fwd(params, batch_size, dim, seqlen, dstate, n_groups, n_chunks, is_variable_B, is_variable_C,
+ set_ssm_params_fwd(params, batch_size, dim, seqlen, dstate, n_groups, is_variable_B, is_variable_C,
u, delta, A, B, C, out, z, out_z,
D_,
delta_bias_,
@@ -639,7 +642,8 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
query_start_loc,
cache_indices,
has_initial_state,
- varlen
+ varlen,
+ pad_slot_id
);
diff --git a/csrc/moe/marlin_moe_ops.cu b/csrc/moe/marlin_moe_ops.cu
index e2db4e4196b6f..5f12483e951e8 100644
--- a/csrc/moe/marlin_moe_ops.cu
+++ b/csrc/moe/marlin_moe_ops.cu
@@ -484,21 +484,22 @@ torch::Tensor marlin_gemm_moe(
const torch::Tensor& topk_ids, const torch::Tensor& b_scales,
torch::Tensor& b_zeros, const torch::Tensor& g_idx,
const torch::Tensor& perm, torch::Tensor& workspace,
- vllm::ScalarTypeTorchPtr const& b_q_type, int64_t size_m, int64_t size_n,
+ vllm::ScalarTypeId const b_q_type_id, int64_t size_m, int64_t size_n,
int64_t size_k, bool is_k_full, int64_t num_experts, int64_t topk,
int64_t moe_block_size, bool replicate_input, bool apply_weights) {
+ vllm::ScalarType const b_q_type = vllm::ScalarType::from_id(b_q_type_id);
bool has_zp = b_zeros.size(1) != 0;
if (has_zp) {
TORCH_CHECK(
- *b_q_type == vllm::kU4,
- "b_q_type must be u4 when has_zp = True. Got = ", b_q_type->str());
+ b_q_type == vllm::kU4,
+ "b_q_type must be u4 when has_zp = True. Got = ", b_q_type.str());
} else {
TORCH_CHECK(
- *b_q_type == vllm::kU4B8 || *b_q_type == vllm::kU8B128,
- "b_q_type must be uint4b8 or uint8b128. Got = ", b_q_type->str());
+ b_q_type == vllm::kU4B8 || b_q_type == vllm::kU8B128,
+ "b_q_type must be uint4b8 or uint8b128. Got = ", b_q_type.str());
}
- int pack_factor = 32 / b_q_type->size_bits();
+ int pack_factor = 32 / b_q_type.size_bits();
int max_par = 4;
@@ -575,7 +576,7 @@ torch::Tensor marlin_gemm_moe(
topk_weights.data_ptr(), topk_ids.data_ptr(), b_scales.data_ptr(),
b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(),
expert_offsets.data_ptr(), size_m, size_n, size_k, workspace.data_ptr(),
- *b_q_type, has_act_order, is_k_full, has_zp, num_groups, group_size,
+ b_q_type, has_act_order, is_k_full, has_zp, num_groups, group_size,
num_experts, topk, moe_block_size, dev,
at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, max_par,
replicate_input, apply_weights);
diff --git a/csrc/moe_align_block_size_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
similarity index 59%
rename from csrc/moe_align_block_size_kernels.cu
rename to csrc/moe/moe_align_sum_kernels.cu
index 1f8d75da83bb8..fff7ce34c838a 100644
--- a/csrc/moe_align_block_size_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -1,15 +1,17 @@
#include
#include
+#include
#include
#include
-#include "cuda_compat.h"
-#include "dispatch_utils.h"
+#include "../cuda_compat.h"
+#include "../dispatch_utils.h"
#define CEILDIV(x, y) (((x) + (y) - 1) / (y))
namespace vllm {
+namespace moe {
namespace {
__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
@@ -32,10 +34,10 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
extern __shared__ int32_t shared_mem[];
int32_t* tokens_cnts =
- shared_mem; // 2d tensor with shape (num_experts + 1, num_experts)
+ shared_mem; // 2d tensor with shape (blockDim.x + 1, num_experts)
int32_t* cumsum =
- shared_mem + (num_experts + 1) *
- num_experts; // 1d tensor with shape (num_experts + 1)
+ shared_mem +
+ (blockDim.x + 1) * num_experts; // 1d tensor with shape (num_experts + 1)
for (int i = 0; i < num_experts; ++i) {
tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
@@ -53,10 +55,12 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
__syncthreads();
// For each expert we accumulate the token counts from the different threads.
- tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
- for (int i = 1; i <= blockDim.x; ++i) {
- tokens_cnts[index(num_experts, i, threadIdx.x)] +=
- tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
+ if (threadIdx.x < num_experts) {
+ tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
+ for (int i = 1; i <= blockDim.x; ++i) {
+ tokens_cnts[index(num_experts, i, threadIdx.x)] +=
+ tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
+ }
}
__syncthreads();
@@ -79,9 +83,11 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
* For each expert, each thread processes the tokens of the corresponding
* blocks and stores the corresponding expert_id for each block.
*/
- for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
- i += block_size) {
- expert_ids[i / block_size] = threadIdx.x;
+ if (threadIdx.x < num_experts) {
+ for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
+ i += block_size) {
+ expert_ids[i / block_size] = threadIdx.x;
+ }
}
/**
@@ -106,6 +112,24 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
++tokens_cnts[index(num_experts, threadIdx.x, expert_id)];
}
}
+
+template
+__global__ void moe_sum_kernel(
+ scalar_t* __restrict__ out, // [..., d]
+ const scalar_t* __restrict__ input, // [..., topk, d]
+ const int d) {
+ const int64_t token_idx = blockIdx.x;
+ for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+ scalar_t x = 0.0;
+#pragma unroll
+ for (int k = 0; k < TOPK; ++k) {
+ x += VLLM_LDG(&input[token_idx * TOPK * d + k * d + idx]);
+ }
+ out[token_idx * d + idx] = x;
+ }
+}
+
+} // namespace moe
} // namespace vllm
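// --- Hedged scalar sketch (not part of the patch) of what moe_sum_kernel computes:
// for each token, the top-k expert outputs are reduced into a single vector,
// out[token][i] = sum_k input[token][k][i]. Names and shapes are illustrative only.
#include <cstddef>
#include <cstdio>
#include <vector>

static void moe_sum_ref(const std::vector<float>& input,  // [tokens, topk, d]
                        std::vector<float>& out,          // [tokens, d]
                        int tokens, int topk, int d) {
  out.assign(static_cast<std::size_t>(tokens) * d, 0.0f);
  for (int t = 0; t < tokens; ++t)
    for (int k = 0; k < topk; ++k)
      for (int i = 0; i < d; ++i)
        out[t * d + i] += input[(static_cast<std::size_t>(t) * topk + k) * d + i];
}

int main() {
  std::vector<float> input = {1, 2, 3, 4, 5, 6,      // token 0: two experts of dim 3
                              7, 8, 9, 10, 11, 12};  // token 1: two experts of dim 3
  std::vector<float> out;
  moe_sum_ref(input, out, /*tokens=*/2, /*topk=*/2, /*d=*/3);
  for (float v : out) std::printf("%g ", v);  // 5 7 9 17 19 21
  std::printf("\n");
  return 0;
}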
void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
@@ -117,18 +141,62 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
// calc needed amount of shared mem for `tokens_cnts` and `cumsum`
// tensors
+ const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
const int32_t shared_mem =
- ((num_experts + 1) * num_experts + (num_experts + 1)) *
+ ((num_thread + 1) * num_experts + (num_experts + 1)) *
sizeof(int32_t);
// set dynamic shared mem
- auto kernel = vllm::moe_align_block_size_kernel;
+ auto kernel = vllm::moe::moe_align_block_size_kernel