From daa769ef4692e0976efd454c36a75e614bb4ee67 Mon Sep 17 00:00:00 2001 From: Stella Laurenzo Date: Sat, 20 Apr 2024 15:21:37 -0700 Subject: [PATCH] Rename/move from original prototype structure. --- .gitignore | 31 +++ LICENSE | 219 ++++++++++++++++++ README.md | 56 +++++ llm/pyproject.toml | 3 - pytorch-cpu-requirements.txt | 3 + pytorch-rocm-requirements.txt | 5 + requirements.txt | 10 + serving/pyproject.toml | 3 - serving/requirements.txt | 3 - {llm => sharktank}/README.md | 14 +- {llm => sharktank}/mypy.ini | 2 +- sharktank/pyproject.toml | 12 + {llm => sharktank}/requirements.txt | 0 {llm => sharktank}/setup.cfg | 0 {llm => sharktank}/setup.py | 18 +- .../sharktank}/__init__.py | 0 .../examples/export_paged_llm_v1.py | 4 +- .../sharktank}/examples/paged_llm_v1.py | 0 .../examples/validate_llama_ref_model.py | 6 +- .../examples/validate_paged_llama_model.py | 8 +- .../sharktank}/layers/__init__.py | 0 .../sharktank}/layers/base.py | 0 .../sharktank}/layers/causal_llm.py | 0 .../sharktank}/layers/configs/__init__.py | 0 .../sharktank}/layers/configs/llm_configs.py | 0 .../sharktank}/layers/kv_cache.py | 0 .../sharktank}/layers/linear.py | 0 .../sharktank}/layers/norm.py | 0 .../sharktank}/layers/rotary_embedding.py | 0 .../sharktank}/layers/token_embedding.py | 0 .../sharktank}/models/llama/llama.py | 0 .../sharktank}/models/llama/llama_ref.py | 0 .../sharktank}/ops/__init__.py | 0 .../sharktank}/ops/base.py | 4 +- .../sharktank}/ops/custom_inference_ops.py | 0 .../ops/mmt_block_scaled_offset_q4.py | 2 +- .../sharktank}/ops/mmt_block_scaled_q8.py | 2 +- .../ops/mmt_super_block_scaled_offset_q4.py | 0 .../sharktank}/ops/mmtfp.py | 4 +- .../mmt_block_scaled_offset_q4_unsigned.mlir | 2 +- .../ops/templates/mmt_block_scaled_q8_3d.mlir | 2 +- ...er_block_scaled_offset_q4_unsigned_3d.mlir | 0 .../sharktank}/ops/templates/mmtfp_2d.mlir | 2 +- .../sharktank}/ops/templates/mmtfp_3d.mlir | 2 +- .../sharktank}/py.typed | 0 .../sharktank}/tools/dump_gguf.py | 0 
.../sharktank}/types/__init__.py | 0 .../sharktank}/types/gguf_interop/__init__.py | 0 .../sharktank}/types/gguf_interop/base.py | 0 .../sharktank}/types/gguf_interop/layouts.py | 0 .../sharktank}/types/layout_utils.py | 0 .../sharktank}/types/layouts.py | 0 .../sharktank}/types/tensors.py | 0 .../sharktank}/types/theta.py | 0 .../sharktank}/utils/cli.py | 0 .../sharktank}/utils/debugging.py | 2 +- .../sharktank}/utils/hf_datasets.py | 0 .../sharktank}/utils/logging.py | 0 .../sharktank}/utils/tokenizer.py | 0 .../ops/mmt_block_scaled_offset_q4_test.py | 8 +- .../tests/ops/mmt_block_scaled_q8_test.py | 6 +- .../mmt_super_block_scaled_offset_q4_test.py | 4 +- {llm => sharktank}/tests/ops/mmtfp_test.py | 6 +- .../tests/types/dataset_test.py | 2 +- .../tests/types/layout_utils_test.py | 2 +- .../tests/types/layouts_test.py | 4 +- {serving => shortfin}/README.md | 7 +- {serving => shortfin}/mypy.ini | 2 +- shortfin/pyproject.toml | 12 + shortfin/requirements.txt | 3 + {serving => shortfin}/setup.cfg | 0 {serving => shortfin}/setup.py | 20 +- .../shortfin}/__init__.py | 0 .../shortfin}/framework/logging.py | 2 +- .../shortfin}/framework/session.py | 0 .../shortfin}/llm/__init__.py | 0 .../shortfin}/llm/api/rest_server.py | 2 +- .../shortfin}/llm/attn_block_cache.py | 2 +- .../shortfin}/llm/config.py | 0 .../shortfin}/llm/impl/service_v1.py | 2 +- .../shortfin}/llm/service.py | 0 .../shortfin}/llm/testing/fake_v1_module.py | 0 .../shortfin}/py.typed | 0 .../tests/framework/device_session_test.py | 2 +- .../tests/llm/api_server_test.py | 7 +- .../tests/llm/service_v1_test.py | 12 +- version_info.json | 1 + 87 files changed, 437 insertions(+), 88 deletions(-) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md delete mode 100644 llm/pyproject.toml create mode 100644 pytorch-cpu-requirements.txt create mode 100644 pytorch-rocm-requirements.txt create mode 100644 requirements.txt delete mode 100644 serving/pyproject.toml delete mode 100644 
serving/requirements.txt rename {llm => sharktank}/README.md (64%) rename {llm => sharktank}/mypy.ini (75%) create mode 100644 sharktank/pyproject.toml rename {llm => sharktank}/requirements.txt (100%) rename {llm => sharktank}/setup.cfg (100%) rename {llm => sharktank}/setup.py (84%) rename {llm/turbine_llm => sharktank/sharktank}/__init__.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/examples/export_paged_llm_v1.py (98%) rename {llm/turbine_llm => sharktank/sharktank}/examples/paged_llm_v1.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/examples/validate_llama_ref_model.py (92%) rename {llm/turbine_llm => sharktank/sharktank}/examples/validate_paged_llama_model.py (97%) rename {llm/turbine_llm => sharktank/sharktank}/layers/__init__.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/layers/base.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/layers/causal_llm.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/layers/configs/__init__.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/layers/configs/llm_configs.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/layers/kv_cache.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/layers/linear.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/layers/norm.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/layers/rotary_embedding.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/layers/token_embedding.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/models/llama/llama.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/models/llama/llama_ref.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/ops/__init__.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/ops/base.py (97%) rename {llm/turbine_llm => sharktank/sharktank}/ops/custom_inference_ops.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/ops/mmt_block_scaled_offset_q4.py (97%) rename {llm/turbine_llm => 
sharktank/sharktank}/ops/mmt_block_scaled_q8.py (97%) rename {llm/turbine_llm => sharktank/sharktank}/ops/mmt_super_block_scaled_offset_q4.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/ops/mmtfp.py (94%) rename {llm/turbine_llm => sharktank/sharktank}/ops/templates/mmt_block_scaled_offset_q4_unsigned.mlir (97%) rename {llm/turbine_llm => sharktank/sharktank}/ops/templates/mmt_block_scaled_q8_3d.mlir (97%) rename {llm/turbine_llm => sharktank/sharktank}/ops/templates/mmt_super_block_scaled_offset_q4_unsigned_3d.mlir (100%) rename {llm/turbine_llm => sharktank/sharktank}/ops/templates/mmtfp_2d.mlir (93%) rename {llm/turbine_llm => sharktank/sharktank}/ops/templates/mmtfp_3d.mlir (96%) rename {llm/turbine_llm => sharktank/sharktank}/py.typed (100%) rename {llm/turbine_llm => sharktank/sharktank}/tools/dump_gguf.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/types/__init__.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/types/gguf_interop/__init__.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/types/gguf_interop/base.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/types/gguf_interop/layouts.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/types/layout_utils.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/types/layouts.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/types/tensors.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/types/theta.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/utils/cli.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/utils/debugging.py (97%) rename {llm/turbine_llm => sharktank/sharktank}/utils/hf_datasets.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/utils/logging.py (100%) rename {llm/turbine_llm => sharktank/sharktank}/utils/tokenizer.py (100%) rename {llm => sharktank}/tests/ops/mmt_block_scaled_offset_q4_test.py (92%) rename {llm => sharktank}/tests/ops/mmt_block_scaled_q8_test.py (93%) rename {llm => 
sharktank}/tests/ops/mmt_super_block_scaled_offset_q4_test.py (98%) rename {llm => sharktank}/tests/ops/mmtfp_test.py (94%) rename {llm => sharktank}/tests/types/dataset_test.py (99%) rename {llm => sharktank}/tests/types/layout_utils_test.py (98%) rename {llm => sharktank}/tests/types/layouts_test.py (97%) rename {serving => shortfin}/README.md (72%) rename {serving => shortfin}/mypy.ini (69%) create mode 100644 shortfin/pyproject.toml create mode 100644 shortfin/requirements.txt rename {serving => shortfin}/setup.cfg (100%) rename {serving => shortfin}/setup.py (83%) rename {serving/turbine_serving => shortfin/shortfin}/__init__.py (100%) rename {serving/turbine_serving => shortfin/shortfin}/framework/logging.py (95%) rename {serving/turbine_serving => shortfin/shortfin}/framework/session.py (100%) rename {serving/turbine_serving => shortfin/shortfin}/llm/__init__.py (100%) rename {serving/turbine_serving => shortfin/shortfin}/llm/api/rest_server.py (98%) rename {serving/turbine_serving => shortfin/shortfin}/llm/attn_block_cache.py (98%) rename {serving/turbine_serving => shortfin/shortfin}/llm/config.py (100%) rename {serving/turbine_serving => shortfin/shortfin}/llm/impl/service_v1.py (99%) rename {serving/turbine_serving => shortfin/shortfin}/llm/service.py (100%) rename {serving/turbine_serving => shortfin/shortfin}/llm/testing/fake_v1_module.py (100%) rename {serving/turbine_serving => shortfin/shortfin}/py.typed (100%) rename {serving => shortfin}/tests/framework/device_session_test.py (97%) rename {serving => shortfin}/tests/llm/api_server_test.py (90%) rename {serving => shortfin}/tests/llm/service_v1_test.py (91%) create mode 100644 version_info.json diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..ea43d0c69 --- /dev/null +++ b/.gitignore @@ -0,0 +1,31 @@ +# Visual Studio files +.env +.vs/ +.vscode/ +*.sdf +*.opensdf +*.VC.opendb +*.suo +*.user + +# macOS files +.DS_Store + +# CMake artifacts +build/ +build-*/ + +# Python 
+__pycache__ +_python_build/ +dist/ +wheelhouse +*.egg-info +*.whl +*.venv + +#Model artifacts +*.pt +*.safetensors +*.gguf +*.vmfb diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..f9dc50615 --- /dev/null +++ b/LICENSE @@ -0,0 +1,219 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +--- LLVM Exceptions to the Apache 2.0 License ---- + +As an exception, if, as a result of your compiling your source code, portions +of this Software are embedded into an Object form of such source code, you +may redistribute such embedded portions in such Object form without complying +with the conditions of Sections 4(a), 4(b) and 4(d) of the License. 
+ +In addition, if you combine or link compiled forms of this Software with +software that is licensed under the GPLv2 ("Combined Software") and if a +court of competent jurisdiction determines that the patent provision (Section +3), the indemnity provision (Section 9) or other Section of the License +conflicts with the conditions of the GPLv2, you may retroactively and +prospectively choose to deem waived or otherwise exclude such Section(s) of +the License, but only in their entirety and only with respect to the Combined +Software. + diff --git a/README.md b/README.md new file mode 100644 index 000000000..cba74c091 --- /dev/null +++ b/README.md @@ -0,0 +1,56 @@ +# SHARK Modeling and Serving Libraries + +**WARNING: This is an early preview that is in progress. It is not ready for +general use.** + +## Development Getting Started + +Use this as a guide to get started developing the project using pinned, +pre-release dependencies. You are welcome to deviate as you see fit, but +these canonical directions mirror what the CI does. + +### Set up a venv + +We recommend setting up a virtual environment (venv). The project is configured +to ignore `.venv` directories, and editors like VSCode pick them up by default. + +``` +python -m venv --prompt sharktank .venv +source .venv/bin/activate +``` + +### Install PyTorch for Your System + +If no explicit action is taken, the default PyTorch version will be installed. +This will give you a current CUDA-based version. Install a different variant +by doing so explicitly first: + +*CPU:* + +``` +pip install -r pytorch-cpu-requirements.txt +``` + +*ROCm:* + +``` +pip install -r pytorch-rocm-requirements.txt +``` + +### Install Development Packages + +This assumes you have `SHARK-Turbine` checked out adjacent (note that for the +moment we rely on pre-release versions, so installation is a bit harder).
+ +``` +pip install -f https://iree.dev/pip-release-links.html -e ../SHARK-Turbine/core/ +pip install -e sharktank +pip install -e shortfin +``` + +### Running Tests + +``` +pytest sharktank +pytest shortfin +``` diff --git a/llm/pyproject.toml b/llm/pyproject.toml deleted file mode 100644 index 9787c3bdf..000000000 --- a/llm/pyproject.toml +++ /dev/null @@ -1,3 +0,0 @@ -[build-system] -requires = ["setuptools", "wheel"] -build-backend = "setuptools.build_meta" diff --git a/pytorch-cpu-requirements.txt b/pytorch-cpu-requirements.txt new file mode 100644 index 000000000..aae0297db --- /dev/null +++ b/pytorch-cpu-requirements.txt @@ -0,0 +1,3 @@ +--pre +--index-url https://download.pytorch.org/whl/test/cpu +torch==2.3.0 diff --git a/pytorch-rocm-requirements.txt b/pytorch-rocm-requirements.txt new file mode 100644 index 000000000..f78a21070 --- /dev/null +++ b/pytorch-rocm-requirements.txt @@ -0,0 +1,5 @@ +--pre +--index-url https://download.pytorch.org/whl/nightly/rocm6.0 +# TODO: PyTorch ROCM doesn't seem to have a 2.3 RC published, so we just +# get a nightly. +torch>=2.3.0.dev1,<2.4 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 000000000..8cc668099 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +numpy>=1.26.3 +onnx>=1.15.0 +pytest>=8.0.0 +pytest-xdist>=3.5.0 +mypy==1.8.0 +types-requests==2.31.0.20240125 + +# It is expected that you have installed a PyTorch version/variant specific +# to your needs, so we only include a minimum version spec. 
+torch>=2.3.0 diff --git a/serving/pyproject.toml b/serving/pyproject.toml deleted file mode 100644 index 9787c3bdf..000000000 --- a/serving/pyproject.toml +++ /dev/null @@ -1,3 +0,0 @@ -[build-system] -requires = ["setuptools", "wheel"] -build-backend = "setuptools.build_meta" diff --git a/serving/requirements.txt b/serving/requirements.txt deleted file mode 100644 index 3cb469b2c..000000000 --- a/serving/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -fastapi>=0.109.2 -uvicorn>=0.27.0 -requests diff --git a/llm/README.md b/sharktank/README.md similarity index 64% rename from llm/README.md rename to sharktank/README.md index 3c4a586a0..6c33caa41 100644 --- a/llm/README.md +++ b/sharktank/README.md @@ -1,6 +1,10 @@ -# Turbine-LLM +# SHARK Tank -Light weight inference optimized layers and models for popular LLMs. +**WARNING: This is an early preview that is in progress. It is not ready for +general use.** + +Lightweight, inference-optimized layers and models for popular GenAI +applications. This sub-project is a work in progress. It is intended to be a repository of layers, model recipes, and conversion tools from popular LLM quantization @@ -16,7 +20,7 @@ These are all under active development and should not yet be expected to work. ### Perform batched inference in PyTorch on a paged llama derived LLM: ```shell -python -m turbine_llm.examples.paged_llm_v1 \ +python -m sharktank.examples.paged_llm_v1 \ --hf-dataset=open_llama_3b_v2_f16_gguf \ "Prompt 1" \ "Prompt 2" ...
@@ -25,11 +29,11 @@ python -m turbine_llm.examples.paged_llm_v1 \ ### Export an IREE compilable batched LLM for serving: ```shell -python -m turbine_llm.examples.export_paged_llm_v1 --hf-dataset=open_llama_3b_v2_f16_gguf +python -m sharktank.examples.export_paged_llm_v1 --hf-dataset=open_llama_3b_v2_f16_gguf ``` ### Dump parsed information about a model from a gguf file: ```shell -python -m turbine_llm.tools.dump_gguf --hf-dataset=open_llama_3b_v2_f16_gguf +python -m sharktank.tools.dump_gguf --hf-dataset=open_llama_3b_v2_f16_gguf ``` diff --git a/llm/mypy.ini b/sharktank/mypy.ini similarity index 75% rename from llm/mypy.ini rename to sharktank/mypy.ini index 910fdbba9..8afa09fa1 100644 --- a/llm/mypy.ini +++ b/sharktank/mypy.ini @@ -2,4 +2,4 @@ explicit_package_bases = True mypy_path = $MYPY_CONFIG_FILE_DIR -packages = turbine_llm +packages = sharktank diff --git a/sharktank/pyproject.toml b/sharktank/pyproject.toml new file mode 100644 index 000000000..d347ed631 --- /dev/null +++ b/sharktank/pyproject.toml @@ -0,0 +1,12 @@ +[build-system] +requires = ["setuptools", "wheel"] +build-backend = "setuptools.build_meta" + +[tool.pytest.ini_options] +addopts = "-ra" +testpaths = [ + "tests", +] +pythonpath = [ + ".", +] diff --git a/llm/requirements.txt b/sharktank/requirements.txt similarity index 100% rename from llm/requirements.txt rename to sharktank/requirements.txt diff --git a/llm/setup.cfg b/sharktank/setup.cfg similarity index 100% rename from llm/setup.cfg rename to sharktank/setup.cfg diff --git a/llm/setup.py b/sharktank/setup.py similarity index 84% rename from llm/setup.py rename to sharktank/setup.py index 27e6d758d..911b4769c 100644 --- a/llm/setup.py +++ b/sharktank/setup.py @@ -18,7 +18,7 @@ with open( os.path.join( - REPO_DIR, + THIS_DIR, "README.md", ), "rt", @@ -36,8 +36,8 @@ def load_version_info(): packages = find_namespace_packages( include=[ - "turbine_llm", - "turbine_llm.*", + "sharktank", + "sharktank.*", ], ) @@ -54,9 +54,7 @@ def 
load_requirement_pins(requirements_file: Path): requirement_pins.update(dict(pin_pairs)) -load_requirement_pins(THIS_DIR / "requirements.txt") -load_requirement_pins(REPO_DIR / "core" / "iree-requirements.txt") -load_requirement_pins(REPO_DIR / "core" / "misc-requirements.txt") +load_requirement_pins(REPO_DIR / "requirements.txt") def get_version_spec(dep: str): @@ -77,14 +75,14 @@ def initialize_options(self): setup( - name=f"turbine-llm", + name=f"sharktank", version=f"{PACKAGE_VERSION}", author="SHARK Authors", author_email="stella@nod.ai", - description="SHARK layers and inference models for LLMs", + description="SHARK layers and inference models for genai", long_description=README, long_description_content_type="text/markdown", - url="https://github.com/nod-ai/SHARK-Turbine", + url="https://github.com/nod-ai/sharktank", license="Apache-2.0", classifiers=[ "Development Status :: 3 - Alpha", @@ -92,7 +90,7 @@ def initialize_options(self): "Programming Language :: Python :: 3", ], packages=packages, - package_data={"turbine_llm": ["py.typed"]}, + package_data={"sharktank": ["py.typed"]}, install_requires=[ "shark-turbine", ], diff --git a/llm/turbine_llm/__init__.py b/sharktank/sharktank/__init__.py similarity index 100% rename from llm/turbine_llm/__init__.py rename to sharktank/sharktank/__init__.py diff --git a/llm/turbine_llm/examples/export_paged_llm_v1.py b/sharktank/sharktank/examples/export_paged_llm_v1.py similarity index 98% rename from llm/turbine_llm/examples/export_paged_llm_v1.py rename to sharktank/sharktank/examples/export_paged_llm_v1.py index d0c36205b..3a8251247 100644 --- a/llm/turbine_llm/examples/export_paged_llm_v1.py +++ b/sharktank/sharktank/examples/export_paged_llm_v1.py @@ -10,8 +10,8 @@ from shark_turbine.aot import * -from turbine_llm.layers import * -from turbine_llm.types import * +from sharktank.layers import * +from sharktank.types import * # TODO: Should be using a base class with the protocol supported. 
from ..models.llama.llama import LlamaModelConfig, PagedLlamaModelV1 diff --git a/llm/turbine_llm/examples/paged_llm_v1.py b/sharktank/sharktank/examples/paged_llm_v1.py similarity index 100% rename from llm/turbine_llm/examples/paged_llm_v1.py rename to sharktank/sharktank/examples/paged_llm_v1.py diff --git a/llm/turbine_llm/examples/validate_llama_ref_model.py b/sharktank/sharktank/examples/validate_llama_ref_model.py similarity index 92% rename from llm/turbine_llm/examples/validate_llama_ref_model.py rename to sharktank/sharktank/examples/validate_llama_ref_model.py index d87718899..881d962a2 100644 --- a/llm/turbine_llm/examples/validate_llama_ref_model.py +++ b/sharktank/sharktank/examples/validate_llama_ref_model.py @@ -14,9 +14,9 @@ import torch -from turbine_llm.layers import * -from turbine_llm.types import * -from turbine_llm.models.llama.llama_ref import * +from sharktank.layers import * +from sharktank.types import * +from sharktank.models.llama.llama_ref import * def main(args: list[str]): diff --git a/llm/turbine_llm/examples/validate_paged_llama_model.py b/sharktank/sharktank/examples/validate_paged_llama_model.py similarity index 97% rename from llm/turbine_llm/examples/validate_paged_llama_model.py rename to sharktank/sharktank/examples/validate_paged_llama_model.py index d8168b8bc..4db3c7b5d 100644 --- a/llm/turbine_llm/examples/validate_paged_llama_model.py +++ b/sharktank/sharktank/examples/validate_paged_llama_model.py @@ -8,9 +8,9 @@ import torch -from turbine_llm.layers import * -from turbine_llm.types import * -from turbine_llm.models.llama.llama import * +from sharktank.layers import * +from sharktank.types import * +from sharktank.models.llama.llama import * def main(args: list[str]): @@ -130,7 +130,7 @@ def main(args: list[str]): print(f" : cache[0] = {cache_state[0][0]}") print(f" : cache[1] = {cache_state[0][1]}") - # from turbine_llm.models import llama + # from sharktank.models import llama # print(f"+++PREFILL XK = 
{llama.DEBUG_PREFILL_XK.shape}\n{llama.DEBUG_PREFILL_XK}") # print(f"+++DECODE XK = {llama.DEBUG_DECODE_XK.shape}\n{llama.DEBUG_DECODE_XK}") # torch.testing.assert_close(llama.DEBUG_PREFILL_XK, llama.DEBUG_DECODE_XK) diff --git a/llm/turbine_llm/layers/__init__.py b/sharktank/sharktank/layers/__init__.py similarity index 100% rename from llm/turbine_llm/layers/__init__.py rename to sharktank/sharktank/layers/__init__.py diff --git a/llm/turbine_llm/layers/base.py b/sharktank/sharktank/layers/base.py similarity index 100% rename from llm/turbine_llm/layers/base.py rename to sharktank/sharktank/layers/base.py diff --git a/llm/turbine_llm/layers/causal_llm.py b/sharktank/sharktank/layers/causal_llm.py similarity index 100% rename from llm/turbine_llm/layers/causal_llm.py rename to sharktank/sharktank/layers/causal_llm.py diff --git a/llm/turbine_llm/layers/configs/__init__.py b/sharktank/sharktank/layers/configs/__init__.py similarity index 100% rename from llm/turbine_llm/layers/configs/__init__.py rename to sharktank/sharktank/layers/configs/__init__.py diff --git a/llm/turbine_llm/layers/configs/llm_configs.py b/sharktank/sharktank/layers/configs/llm_configs.py similarity index 100% rename from llm/turbine_llm/layers/configs/llm_configs.py rename to sharktank/sharktank/layers/configs/llm_configs.py diff --git a/llm/turbine_llm/layers/kv_cache.py b/sharktank/sharktank/layers/kv_cache.py similarity index 100% rename from llm/turbine_llm/layers/kv_cache.py rename to sharktank/sharktank/layers/kv_cache.py diff --git a/llm/turbine_llm/layers/linear.py b/sharktank/sharktank/layers/linear.py similarity index 100% rename from llm/turbine_llm/layers/linear.py rename to sharktank/sharktank/layers/linear.py diff --git a/llm/turbine_llm/layers/norm.py b/sharktank/sharktank/layers/norm.py similarity index 100% rename from llm/turbine_llm/layers/norm.py rename to sharktank/sharktank/layers/norm.py diff --git a/llm/turbine_llm/layers/rotary_embedding.py 
b/sharktank/sharktank/layers/rotary_embedding.py similarity index 100% rename from llm/turbine_llm/layers/rotary_embedding.py rename to sharktank/sharktank/layers/rotary_embedding.py diff --git a/llm/turbine_llm/layers/token_embedding.py b/sharktank/sharktank/layers/token_embedding.py similarity index 100% rename from llm/turbine_llm/layers/token_embedding.py rename to sharktank/sharktank/layers/token_embedding.py diff --git a/llm/turbine_llm/models/llama/llama.py b/sharktank/sharktank/models/llama/llama.py similarity index 100% rename from llm/turbine_llm/models/llama/llama.py rename to sharktank/sharktank/models/llama/llama.py diff --git a/llm/turbine_llm/models/llama/llama_ref.py b/sharktank/sharktank/models/llama/llama_ref.py similarity index 100% rename from llm/turbine_llm/models/llama/llama_ref.py rename to sharktank/sharktank/models/llama/llama_ref.py diff --git a/llm/turbine_llm/ops/__init__.py b/sharktank/sharktank/ops/__init__.py similarity index 100% rename from llm/turbine_llm/ops/__init__.py rename to sharktank/sharktank/ops/__init__.py diff --git a/llm/turbine_llm/ops/base.py b/sharktank/sharktank/ops/base.py similarity index 97% rename from llm/turbine_llm/ops/base.py rename to sharktank/sharktank/ops/base.py index b85be7851..53377c979 100644 --- a/llm/turbine_llm/ops/base.py +++ b/sharktank/sharktank/ops/base.py @@ -31,9 +31,9 @@ from ..utils.logging import get_logger -LIBRARY = def_library("turbine_llm") +LIBRARY = def_library("sharktank") TEMPLATES_DIR = Path(__file__).parent / "templates" -logger = get_logger("turbine_llm.ops") +logger = get_logger("sharktank.ops") def call_function(target_function: Operation, *operands: Value) -> Sequence[Value]: diff --git a/llm/turbine_llm/ops/custom_inference_ops.py b/sharktank/sharktank/ops/custom_inference_ops.py similarity index 100% rename from llm/turbine_llm/ops/custom_inference_ops.py rename to sharktank/sharktank/ops/custom_inference_ops.py diff --git 
a/llm/turbine_llm/ops/mmt_block_scaled_offset_q4.py b/sharktank/sharktank/ops/mmt_block_scaled_offset_q4.py similarity index 97% rename from llm/turbine_llm/ops/mmt_block_scaled_offset_q4.py rename to sharktank/sharktank/ops/mmt_block_scaled_offset_q4.py index 8df659f04..972194c83 100644 --- a/llm/turbine_llm/ops/mmt_block_scaled_offset_q4.py +++ b/sharktank/sharktank/ops/mmt_block_scaled_offset_q4.py @@ -105,7 +105,7 @@ def generate(self, ksel: KernelSelection, kb: KernelBuilder): scale_type_str = str(d_tensor_type.element_type) template_file = "mmt_block_scaled_offset_q4_unsigned.mlir" - target_function_name = f"turbine_llm_mmt_block_scaled_offset_q4_unsigned_3d_{n}_{k}_{bs}_{a_type_str}" + target_function_name = f"sharktank_mmt_block_scaled_offset_q4_unsigned_3d_{n}_{k}_{bs}_{a_type_str}" target_function = inline_template_function( kb, diff --git a/llm/turbine_llm/ops/mmt_block_scaled_q8.py b/sharktank/sharktank/ops/mmt_block_scaled_q8.py similarity index 97% rename from llm/turbine_llm/ops/mmt_block_scaled_q8.py rename to sharktank/sharktank/ops/mmt_block_scaled_q8.py index 566eae067..1ea2cb467 100644 --- a/llm/turbine_llm/ops/mmt_block_scaled_q8.py +++ b/sharktank/sharktank/ops/mmt_block_scaled_q8.py @@ -84,7 +84,7 @@ def generate(self, ksel: KernelSelection, kb: KernelBuilder): template_file = "mmt_block_scaled_q8_3d.mlir" target_function_name = ( - f"turbine_llm_mmt_block_scaled_q8_3d_{n}_{k}_{bs}_{a_type_str}" + f"sharktank_mmt_block_scaled_q8_3d_{n}_{k}_{bs}_{a_type_str}" ) target_function = inline_template_function( diff --git a/llm/turbine_llm/ops/mmt_super_block_scaled_offset_q4.py b/sharktank/sharktank/ops/mmt_super_block_scaled_offset_q4.py similarity index 100% rename from llm/turbine_llm/ops/mmt_super_block_scaled_offset_q4.py rename to sharktank/sharktank/ops/mmt_super_block_scaled_offset_q4.py diff --git a/llm/turbine_llm/ops/mmtfp.py b/sharktank/sharktank/ops/mmtfp.py similarity index 94% rename from llm/turbine_llm/ops/mmtfp.py rename to 
sharktank/sharktank/ops/mmtfp.py index 726746fc1..272a1f4af 100644 --- a/llm/turbine_llm/ops/mmtfp.py +++ b/sharktank/sharktank/ops/mmtfp.py @@ -71,12 +71,12 @@ def generate(self, ksel: KernelSelection, kb: KernelBuilder): if rank == 2: template_file = "mmtfp_2d.mlir" target_function_name = ( - f"turbine_llm_mmtfp_2d_{n}_{k}_{a_type_str}{bT_type_str}" + f"sharktank_mmtfp_2d_{n}_{k}_{a_type_str}{bT_type_str}" ) elif rank == 3: template_file = "mmtfp_3d.mlir" target_function_name = ( - f"turbine_llm_mmtfp_3d_{n}_{k}_{a_type_str}{bT_type_str}" + f"sharktank_mmtfp_3d_{n}_{k}_{a_type_str}{bT_type_str}" ) target_function = inline_template_function( diff --git a/llm/turbine_llm/ops/templates/mmt_block_scaled_offset_q4_unsigned.mlir b/sharktank/sharktank/ops/templates/mmt_block_scaled_offset_q4_unsigned.mlir similarity index 97% rename from llm/turbine_llm/ops/templates/mmt_block_scaled_offset_q4_unsigned.mlir rename to sharktank/sharktank/ops/templates/mmt_block_scaled_offset_q4_unsigned.mlir index 06ff8d923..e27d41bf1 100644 --- a/llm/turbine_llm/ops/templates/mmt_block_scaled_offset_q4_unsigned.mlir +++ b/sharktank/sharktank/ops/templates/mmt_block_scaled_offset_q4_unsigned.mlir @@ -20,7 +20,7 @@ module {{ -util.func private @turbine_llm_mmt_block_scaled_offset_q4_unsigned_3d_{n}_{k}_{bs}_{a_type}( +util.func private @sharktank_mmt_block_scaled_offset_q4_unsigned_3d_{n}_{k}_{bs}_{a_type}( %a: !a_tensor_type, %d: !d_tensor_type, %qs_raw: !qs_raw_tensor_type, %m: !m_tensor_type) -> !c_tensor_type {{ %zero = arith.constant 0.0: !a_type diff --git a/llm/turbine_llm/ops/templates/mmt_block_scaled_q8_3d.mlir b/sharktank/sharktank/ops/templates/mmt_block_scaled_q8_3d.mlir similarity index 97% rename from llm/turbine_llm/ops/templates/mmt_block_scaled_q8_3d.mlir rename to sharktank/sharktank/ops/templates/mmt_block_scaled_q8_3d.mlir index 8c05b8f99..6ea1467ce 100644 --- a/llm/turbine_llm/ops/templates/mmt_block_scaled_q8_3d.mlir +++ 
b/sharktank/sharktank/ops/templates/mmt_block_scaled_q8_3d.mlir @@ -18,7 +18,7 @@ module {{ -util.func private @turbine_llm_mmt_block_scaled_q8_3d_{n}_{k}_{bs}_{a_type}( +util.func private @sharktank_mmt_block_scaled_q8_3d_{n}_{k}_{bs}_{a_type}( %a: !a_tensor_type, %d: !d_tensor_type, %qs: !qs_tensor_type) -> !c_tensor_type {{ %zero = arith.constant 0.0: !a_type diff --git a/llm/turbine_llm/ops/templates/mmt_super_block_scaled_offset_q4_unsigned_3d.mlir b/sharktank/sharktank/ops/templates/mmt_super_block_scaled_offset_q4_unsigned_3d.mlir similarity index 100% rename from llm/turbine_llm/ops/templates/mmt_super_block_scaled_offset_q4_unsigned_3d.mlir rename to sharktank/sharktank/ops/templates/mmt_super_block_scaled_offset_q4_unsigned_3d.mlir diff --git a/llm/turbine_llm/ops/templates/mmtfp_2d.mlir b/sharktank/sharktank/ops/templates/mmtfp_2d.mlir similarity index 93% rename from llm/turbine_llm/ops/templates/mmtfp_2d.mlir rename to sharktank/sharktank/ops/templates/mmtfp_2d.mlir index 1a48b0e0e..51bedda28 100644 --- a/llm/turbine_llm/ops/templates/mmtfp_2d.mlir +++ b/sharktank/sharktank/ops/templates/mmtfp_2d.mlir @@ -12,7 +12,7 @@ module {{ -util.func private @turbine_llm_mmtfp_2d_{n}_{k}_{a_type}{bT_type}( +util.func private @sharktank_mmtfp_2d_{n}_{k}_{a_type}{bT_type}( %a: !a_tensor_type, %bT: !bT_tensor_type) -> !c_tensor_type {{ %zero = arith.constant 0.000000e+00 : {a_type} diff --git a/llm/turbine_llm/ops/templates/mmtfp_3d.mlir b/sharktank/sharktank/ops/templates/mmtfp_3d.mlir similarity index 96% rename from llm/turbine_llm/ops/templates/mmtfp_3d.mlir rename to sharktank/sharktank/ops/templates/mmtfp_3d.mlir index c9b5f0103..e91d5abcc 100644 --- a/llm/turbine_llm/ops/templates/mmtfp_3d.mlir +++ b/sharktank/sharktank/ops/templates/mmtfp_3d.mlir @@ -13,7 +13,7 @@ module {{ -util.func private @turbine_llm_mmtfp_3d_{n}_{k}_{a_type}{bT_type}( +util.func private @sharktank_mmtfp_3d_{n}_{k}_{a_type}{bT_type}( %a: !a_tensor_type, %bT: !bT_tensor_type) -> 
!c_tensor_type {{ %zero = arith.constant 0.000000e+00 : !a_type diff --git a/llm/turbine_llm/py.typed b/sharktank/sharktank/py.typed similarity index 100% rename from llm/turbine_llm/py.typed rename to sharktank/sharktank/py.typed diff --git a/llm/turbine_llm/tools/dump_gguf.py b/sharktank/sharktank/tools/dump_gguf.py similarity index 100% rename from llm/turbine_llm/tools/dump_gguf.py rename to sharktank/sharktank/tools/dump_gguf.py diff --git a/llm/turbine_llm/types/__init__.py b/sharktank/sharktank/types/__init__.py similarity index 100% rename from llm/turbine_llm/types/__init__.py rename to sharktank/sharktank/types/__init__.py diff --git a/llm/turbine_llm/types/gguf_interop/__init__.py b/sharktank/sharktank/types/gguf_interop/__init__.py similarity index 100% rename from llm/turbine_llm/types/gguf_interop/__init__.py rename to sharktank/sharktank/types/gguf_interop/__init__.py diff --git a/llm/turbine_llm/types/gguf_interop/base.py b/sharktank/sharktank/types/gguf_interop/base.py similarity index 100% rename from llm/turbine_llm/types/gguf_interop/base.py rename to sharktank/sharktank/types/gguf_interop/base.py diff --git a/llm/turbine_llm/types/gguf_interop/layouts.py b/sharktank/sharktank/types/gguf_interop/layouts.py similarity index 100% rename from llm/turbine_llm/types/gguf_interop/layouts.py rename to sharktank/sharktank/types/gguf_interop/layouts.py diff --git a/llm/turbine_llm/types/layout_utils.py b/sharktank/sharktank/types/layout_utils.py similarity index 100% rename from llm/turbine_llm/types/layout_utils.py rename to sharktank/sharktank/types/layout_utils.py diff --git a/llm/turbine_llm/types/layouts.py b/sharktank/sharktank/types/layouts.py similarity index 100% rename from llm/turbine_llm/types/layouts.py rename to sharktank/sharktank/types/layouts.py diff --git a/llm/turbine_llm/types/tensors.py b/sharktank/sharktank/types/tensors.py similarity index 100% rename from llm/turbine_llm/types/tensors.py rename to 
sharktank/sharktank/types/tensors.py diff --git a/llm/turbine_llm/types/theta.py b/sharktank/sharktank/types/theta.py similarity index 100% rename from llm/turbine_llm/types/theta.py rename to sharktank/sharktank/types/theta.py diff --git a/llm/turbine_llm/utils/cli.py b/sharktank/sharktank/utils/cli.py similarity index 100% rename from llm/turbine_llm/utils/cli.py rename to sharktank/sharktank/utils/cli.py diff --git a/llm/turbine_llm/utils/debugging.py b/sharktank/sharktank/utils/debugging.py similarity index 97% rename from llm/turbine_llm/utils/debugging.py rename to sharktank/sharktank/utils/debugging.py index 1c14145b7..8e348d90b 100644 --- a/llm/turbine_llm/utils/debugging.py +++ b/sharktank/sharktank/utils/debugging.py @@ -17,7 +17,7 @@ __all__ = [] -logger = get_logger("turbine_llm.debugging") +logger = get_logger("sharktank.debugging") FLAGS_ENV_NAME = "TURBINE_LLM_DEBUG" SETTING_PART_PATTERN = re.compile(r"""^([\\+\\-])?([^=]+)(=(.*))?$""") diff --git a/llm/turbine_llm/utils/hf_datasets.py b/sharktank/sharktank/utils/hf_datasets.py similarity index 100% rename from llm/turbine_llm/utils/hf_datasets.py rename to sharktank/sharktank/utils/hf_datasets.py diff --git a/llm/turbine_llm/utils/logging.py b/sharktank/sharktank/utils/logging.py similarity index 100% rename from llm/turbine_llm/utils/logging.py rename to sharktank/sharktank/utils/logging.py diff --git a/llm/turbine_llm/utils/tokenizer.py b/sharktank/sharktank/utils/tokenizer.py similarity index 100% rename from llm/turbine_llm/utils/tokenizer.py rename to sharktank/sharktank/utils/tokenizer.py diff --git a/llm/tests/ops/mmt_block_scaled_offset_q4_test.py b/sharktank/tests/ops/mmt_block_scaled_offset_q4_test.py similarity index 92% rename from llm/tests/ops/mmt_block_scaled_offset_q4_test.py rename to sharktank/tests/ops/mmt_block_scaled_offset_q4_test.py index efebaf6f3..6c8961f83 100644 --- a/llm/tests/ops/mmt_block_scaled_offset_q4_test.py +++ 
b/sharktank/tests/ops/mmt_block_scaled_offset_q4_test.py @@ -13,8 +13,8 @@ import torch from shark_turbine import aot -from turbine_llm import ops -from turbine_llm.types import layout_utils +from sharktank import ops +from sharktank.types import layout_utils class mmt_block_scaled_offset_q4_unsigned_test(unittest.TestCase): @@ -61,7 +61,7 @@ def forward(self, a, d, qs, m): output.verify() asm = str(output.mlir_module) self.assertIn( - "@turbine_llm_mmt_block_scaled_offset_q4_unsigned_3d_3200_3200_32_f32", asm + "@sharktank_mmt_block_scaled_offset_q4_unsigned_3d_3200_3200_32_f32", asm ) def testExportStaticDims(self): @@ -83,7 +83,7 @@ def forward(self, a, d, qs, m): output.verify() asm = str(output.mlir_module) self.assertIn( - "@turbine_llm_mmt_block_scaled_offset_q4_unsigned_3d_3200_3200_32_f32", asm + "@sharktank_mmt_block_scaled_offset_q4_unsigned_3d_3200_3200_32_f32", asm ) diff --git a/llm/tests/ops/mmt_block_scaled_q8_test.py b/sharktank/tests/ops/mmt_block_scaled_q8_test.py similarity index 93% rename from llm/tests/ops/mmt_block_scaled_q8_test.py rename to sharktank/tests/ops/mmt_block_scaled_q8_test.py index 72f2d44a2..02e6a113b 100644 --- a/llm/tests/ops/mmt_block_scaled_q8_test.py +++ b/sharktank/tests/ops/mmt_block_scaled_q8_test.py @@ -13,7 +13,7 @@ import torch from shark_turbine import aot -from turbine_llm import ops +from sharktank import ops class mmt_block_scaled_q8_test(unittest.TestCase): @@ -57,7 +57,7 @@ def forward(self, a, b, qs): output = aot.export(ep) output.verify() asm = str(output.mlir_module) - self.assertIn("@turbine_llm_mmt_block_scaled_q8_3d_3200_3200_32_f32", asm) + self.assertIn("@sharktank_mmt_block_scaled_q8_3d_3200_3200_32_f32", asm) def testExportStaticDims(self): class MyModule(torch.nn.Module): @@ -78,7 +78,7 @@ def forward(self, a, b, qs): output = aot.export(ep) output.verify() asm = str(output.mlir_module) - self.assertIn("@turbine_llm_mmt_block_scaled_q8_3d_3200_3200_32_f32", asm) + 
self.assertIn("@sharktank_mmt_block_scaled_q8_3d_3200_3200_32_f32", asm) if __name__ == "__main__": diff --git a/llm/tests/ops/mmt_super_block_scaled_offset_q4_test.py b/sharktank/tests/ops/mmt_super_block_scaled_offset_q4_test.py similarity index 98% rename from llm/tests/ops/mmt_super_block_scaled_offset_q4_test.py rename to sharktank/tests/ops/mmt_super_block_scaled_offset_q4_test.py index c762309da..193df277d 100644 --- a/llm/tests/ops/mmt_super_block_scaled_offset_q4_test.py +++ b/sharktank/tests/ops/mmt_super_block_scaled_offset_q4_test.py @@ -13,8 +13,8 @@ import torch from shark_turbine import aot -from turbine_llm import ops -from turbine_llm.types import layout_utils +from sharktank import ops +from sharktank.types import layout_utils class mmt_super_block_scaled_offset_q4_unsigned(unittest.TestCase): diff --git a/llm/tests/ops/mmtfp_test.py b/sharktank/tests/ops/mmtfp_test.py similarity index 94% rename from llm/tests/ops/mmtfp_test.py rename to sharktank/tests/ops/mmtfp_test.py index 441593153..419af622a 100644 --- a/llm/tests/ops/mmtfp_test.py +++ b/sharktank/tests/ops/mmtfp_test.py @@ -13,7 +13,7 @@ import torch from shark_turbine import aot -from turbine_llm import ops +from sharktank import ops class mmtfp_test(unittest.TestCase): @@ -54,7 +54,7 @@ def forward(self, a, b): output = aot.export(ep) output.verify() asm = str(output.mlir_module) - self.assertIn("@turbine_llm_mmtfp_3d_256_32_f32f32", asm) + self.assertIn("@sharktank_mmtfp_3d_256_32_f32f32", asm) def testExportStaticDims(self): class MyModule(torch.nn.Module): @@ -72,7 +72,7 @@ def forward(self, a, b): output = aot.export(ep) output.verify() asm = str(output.mlir_module) - self.assertIn("@turbine_llm_mmtfp_3d_256_32_f32f32", asm) + self.assertIn("@sharktank_mmtfp_3d_256_32_f32f32", asm) def testExportTooDynamic(self): class MyModule(torch.nn.Module): diff --git a/llm/tests/types/dataset_test.py b/sharktank/tests/types/dataset_test.py similarity index 99% rename from 
llm/tests/types/dataset_test.py rename to sharktank/tests/types/dataset_test.py index 08c780db2..331a59644 100644 --- a/llm/tests/types/dataset_test.py +++ b/sharktank/tests/types/dataset_test.py @@ -12,7 +12,7 @@ import torch from shark_turbine.aot import ExternalTensorTrait -from turbine_llm.types import * +from sharktank.types import * def _t(name: str, *dims: int): diff --git a/llm/tests/types/layout_utils_test.py b/sharktank/tests/types/layout_utils_test.py similarity index 98% rename from llm/tests/types/layout_utils_test.py rename to sharktank/tests/types/layout_utils_test.py index 9ef26f5fe..d468165e7 100644 --- a/llm/tests/types/layout_utils_test.py +++ b/sharktank/tests/types/layout_utils_test.py @@ -8,7 +8,7 @@ import torch -from turbine_llm.types.layout_utils import * +from sharktank.types.layout_utils import * class I4Shuffle(unittest.TestCase): diff --git a/llm/tests/types/layouts_test.py b/sharktank/tests/types/layouts_test.py similarity index 97% rename from llm/tests/types/layouts_test.py rename to sharktank/tests/types/layouts_test.py index 4af8d259c..6aea8b634 100644 --- a/llm/tests/types/layouts_test.py +++ b/sharktank/tests/types/layouts_test.py @@ -8,8 +8,8 @@ import torch -from turbine_llm.types import * -from turbine_llm.types.tensors import REGISTERED_LAYOUT_CLASSES +from sharktank.types import * +from sharktank.types.tensors import REGISTERED_LAYOUT_CLASSES class BlockScaledLayoutTest(unittest.TestCase): diff --git a/serving/README.md b/shortfin/README.md similarity index 72% rename from serving/README.md rename to shortfin/README.md index e1ed3f8a9..a76fad8c7 100644 --- a/serving/README.md +++ b/shortfin/README.md @@ -1,7 +1,10 @@ -# Turbine Serving Infrastructure +# SHARK Shortfin Serving Infrastructure + +**WARNING: This is an early preview that is in progress. It is not ready for +general use.** This sub-project contains components and infrastructure for serving various -forms of Turbine compiled models. 
Instead of coming with models, it defines +forms of sharktank compiled models. Instead of coming with models, it defines ABIs that compiled models should adhere to in order to be served. It then allows them to be delivered as web endpoints via popular APIs. diff --git a/serving/mypy.ini b/shortfin/mypy.ini similarity index 69% rename from serving/mypy.ini rename to shortfin/mypy.ini index fdba402eb..d10567407 100644 --- a/serving/mypy.ini +++ b/shortfin/mypy.ini @@ -2,4 +2,4 @@ explicit_package_bases = True mypy_path = $MYPY_CONFIG_FILE_DIR -packages = turbine_serving.llm +packages = shortfin.llm diff --git a/shortfin/pyproject.toml b/shortfin/pyproject.toml new file mode 100644 index 000000000..d347ed631 --- /dev/null +++ b/shortfin/pyproject.toml @@ -0,0 +1,12 @@ +[build-system] +requires = ["setuptools", "wheel"] +build-backend = "setuptools.build_meta" + +[tool.pytest.ini_options] +addopts = "-ra" +testpaths = [ + "tests", +] +pythonpath = [ + ".", +] diff --git a/shortfin/requirements.txt b/shortfin/requirements.txt new file mode 100644 index 000000000..4dbe08577 --- /dev/null +++ b/shortfin/requirements.txt @@ -0,0 +1,3 @@ +fastapi==0.109.2 +uvicorn==0.27.0 +requests==2.31.0 diff --git a/serving/setup.cfg b/shortfin/setup.cfg similarity index 100% rename from serving/setup.cfg rename to shortfin/setup.cfg diff --git a/serving/setup.py b/shortfin/setup.py similarity index 83% rename from serving/setup.py rename to shortfin/setup.py index 53c9fc4f9..3dfb8aa8a 100644 --- a/serving/setup.py +++ b/shortfin/setup.py @@ -18,7 +18,7 @@ with open( os.path.join( - REPO_DIR, + THIS_DIR, "README.md", ), "rt", @@ -36,8 +36,8 @@ def load_version_info(): packages = find_namespace_packages( include=[ - "turbine_serving", - "turbine_serving.*", + "shortfin", + "shortfin.*", ], ) @@ -54,9 +54,7 @@ def load_requirement_pins(requirements_file: Path): requirement_pins.update(dict(pin_pairs)) -load_requirement_pins(THIS_DIR / "requirements.txt") -load_requirement_pins(REPO_DIR / 
"core" / "iree-requirements.txt") -load_requirement_pins(REPO_DIR / "core" / "misc-requirements.txt") +load_requirement_pins(REPO_DIR / "requirements.txt") def get_version_spec(dep: str): @@ -77,14 +75,14 @@ def initialize_options(self): setup( - name=f"turbine-serving", + name=f"shortfin", version=f"{PACKAGE_VERSION}", author="SHARK Authors", author_email="stella@nod.ai", - description="SHARK Turbine Machine Learning Deployment Tools", + description="SHARK Shortfin Machine Learning Deployment Tools", long_description=README, long_description_content_type="text/markdown", - url="https://github.com/nod-ai/SHARK-Turbine", + url="https://github.com/nod-ai/sharktank", license="Apache-2.0", classifiers=[ "Development Status :: 3 - Alpha", @@ -92,10 +90,10 @@ def initialize_options(self): "Programming Language :: Python :: 3", ], packages=packages, - package_data={"turbine_serving": ["py.typed"]}, + package_data={"shortfin": ["py.typed"]}, install_requires=[ + f"sharktank=={PACKAGE_VERSION}", f"fastapi{get_version_spec('fastapi')}", - f"iree-compiler{get_version_spec('iree-compiler')}", f"iree-runtime{get_version_spec('iree-runtime')}", f"uvicorn{get_version_spec('uvicorn')}", f"requests{get_version_spec('requests')}", diff --git a/serving/turbine_serving/__init__.py b/shortfin/shortfin/__init__.py similarity index 100% rename from serving/turbine_serving/__init__.py rename to shortfin/shortfin/__init__.py diff --git a/serving/turbine_serving/framework/logging.py b/shortfin/shortfin/framework/logging.py similarity index 95% rename from serving/turbine_serving/framework/logging.py rename to shortfin/shortfin/framework/logging.py index c2963d577..5fae8aef3 100644 --- a/serving/turbine_serving/framework/logging.py +++ b/shortfin/shortfin/framework/logging.py @@ -24,7 +24,7 @@ def __init__(self): def _setup_logger(): - root_logger = logging.getLogger("turbine_serving") + root_logger = logging.getLogger("shortfin") root_logger.setLevel(logging.DEBUG) default_handler = 
logging.StreamHandler(sys.stderr) default_handler.flush = sys.stderr.flush diff --git a/serving/turbine_serving/framework/session.py b/shortfin/shortfin/framework/session.py similarity index 100% rename from serving/turbine_serving/framework/session.py rename to shortfin/shortfin/framework/session.py diff --git a/serving/turbine_serving/llm/__init__.py b/shortfin/shortfin/llm/__init__.py similarity index 100% rename from serving/turbine_serving/llm/__init__.py rename to shortfin/shortfin/llm/__init__.py diff --git a/serving/turbine_serving/llm/api/rest_server.py b/shortfin/shortfin/llm/api/rest_server.py similarity index 98% rename from serving/turbine_serving/llm/api/rest_server.py rename to shortfin/shortfin/llm/api/rest_server.py index 9f2757515..a6fd28b57 100644 --- a/serving/turbine_serving/llm/api/rest_server.py +++ b/shortfin/shortfin/llm/api/rest_server.py @@ -27,7 +27,7 @@ GenerateRequest, ) -logger = get_logger("turbine_serving.llm.api_server") +logger = get_logger("shortfin.llm.api_server") app = FastAPI() service: Optional[GenerateService] = None diff --git a/serving/turbine_serving/llm/attn_block_cache.py b/shortfin/shortfin/llm/attn_block_cache.py similarity index 98% rename from serving/turbine_serving/llm/attn_block_cache.py rename to shortfin/shortfin/llm/attn_block_cache.py index 0a73b02c7..87222bb85 100644 --- a/serving/turbine_serving/llm/attn_block_cache.py +++ b/shortfin/shortfin/llm/attn_block_cache.py @@ -21,7 +21,7 @@ from .config import human_size, CacheParams -logger = get_logger("turbine_serving.llm.cache") +logger = get_logger("shortfin.llm.cache") class AttnBlockCacheEntry: diff --git a/serving/turbine_serving/llm/config.py b/shortfin/shortfin/llm/config.py similarity index 100% rename from serving/turbine_serving/llm/config.py rename to shortfin/shortfin/llm/config.py diff --git a/serving/turbine_serving/llm/impl/service_v1.py b/shortfin/shortfin/llm/impl/service_v1.py similarity index 99% rename from 
serving/turbine_serving/llm/impl/service_v1.py rename to shortfin/shortfin/llm/impl/service_v1.py index c6dff7b16..5e96ee9c1 100644 --- a/serving/turbine_serving/llm/impl/service_v1.py +++ b/shortfin/shortfin/llm/impl/service_v1.py @@ -42,7 +42,7 @@ ) -logger = get_logger("turbine_serving.llm.impl.service_v1") +logger = get_logger("shortfin.llm.impl.service_v1") EXPECTED_CONCURRENCY = 10 diff --git a/serving/turbine_serving/llm/service.py b/shortfin/shortfin/llm/service.py similarity index 100% rename from serving/turbine_serving/llm/service.py rename to shortfin/shortfin/llm/service.py diff --git a/serving/turbine_serving/llm/testing/fake_v1_module.py b/shortfin/shortfin/llm/testing/fake_v1_module.py similarity index 100% rename from serving/turbine_serving/llm/testing/fake_v1_module.py rename to shortfin/shortfin/llm/testing/fake_v1_module.py diff --git a/serving/turbine_serving/py.typed b/shortfin/shortfin/py.typed similarity index 100% rename from serving/turbine_serving/py.typed rename to shortfin/shortfin/py.typed diff --git a/serving/tests/framework/device_session_test.py b/shortfin/tests/framework/device_session_test.py similarity index 97% rename from serving/tests/framework/device_session_test.py rename to shortfin/tests/framework/device_session_test.py index 73643663c..35b82fb37 100644 --- a/serving/tests/framework/device_session_test.py +++ b/shortfin/tests/framework/device_session_test.py @@ -6,7 +6,7 @@ import pytest -from turbine_serving.framework.session import ( +from shortfin.framework.session import ( DeviceSession, ) diff --git a/serving/tests/llm/api_server_test.py b/shortfin/tests/llm/api_server_test.py similarity index 90% rename from serving/tests/llm/api_server_test.py rename to shortfin/tests/llm/api_server_test.py index 74e29ffe1..197999df9 100644 --- a/serving/tests/llm/api_server_test.py +++ b/shortfin/tests/llm/api_server_test.py @@ -5,6 +5,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import os +from pathlib import 
Path import pytest import requests import subprocess @@ -21,11 +22,13 @@ def __init__(self, args): [ sys.executable, "-m", - "turbine_serving.llm.api.rest_server", + "shortfin.llm.api.rest_server", "--testing-mock-service", ] + args, env=env, + # TODO: Have a more robust way of forking a subprocess. + cwd=str(Path(__file__).resolve().parent.parent.parent), stdout=sys.stdout, stderr=sys.stderr, ) @@ -41,7 +44,7 @@ def _wait_for_ready(self): if self.process.poll() is not None: raise RuntimeError("API server processs terminated") from e time.sleep(1.0) - if time.time() - start > 30: + if (time.time() - start) > 30: raise RuntimeError("Timeout waiting for server start") def __del__(self): diff --git a/serving/tests/llm/service_v1_test.py b/shortfin/tests/llm/service_v1_test.py similarity index 91% rename from serving/tests/llm/service_v1_test.py rename to shortfin/tests/llm/service_v1_test.py index 8e85bf370..1d1243e9a 100644 --- a/serving/tests/llm/service_v1_test.py +++ b/shortfin/tests/llm/service_v1_test.py @@ -10,28 +10,28 @@ HalElementType, ) -from turbine_serving.framework.session import DeviceSession -from turbine_serving.llm.config import ( +from shortfin.framework.session import DeviceSession +from shortfin.llm.config import ( CacheParams, ModelParams, ServiceParams, ) -from turbine_serving.llm.service import ( +from shortfin.llm.service import ( GenerateRequest, GenerateResponsePart, ) -from turbine_serving.llm.attn_block_cache import ( +from shortfin.llm.attn_block_cache import ( create_attn_block_cache_module, AttnBlockCache, ) -from turbine_serving.llm.impl.service_v1 import ( +from shortfin.llm.impl.service_v1 import ( GenerateServiceV1, ) -from turbine_serving.llm.testing.fake_v1_module import ( +from shortfin.llm.testing.fake_v1_module import ( create_fake_module, ) diff --git a/version_info.json b/version_info.json new file mode 100644 index 000000000..3678f441d --- /dev/null +++ b/version_info.json @@ -0,0 +1 @@ +{"package-version": "0.1.dev2"}