diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 14f41d2ace5..22830670e92 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ ci: autoupdate_schedule: monthly autofix_commit_msg: "ci: auto fixes from pre-commit.ci\n\nFor more information, see https://pre-commit.ci" autoupdate_commit_msg: 'ci: pre-commit autoupdate [skip ci]' - skip: # exceeds tier max size + skip: # exceeds tier max size - buf-format - buf-lint exclude: '(.*\.(css|js|svg))|(.*/(snippets|grpc|proto)/.*)$' @@ -13,9 +13,7 @@ repos: - id: ruff args: [--fix, --exit-non-zero-on-fix, --show-fixes] types_or: [python, pyi] - exclude: ^src/_bentoml_impl/frameworks/unsloth/train\.py$ - id: ruff-format - exclude: ^src/_bentoml_impl/frameworks/unsloth/train\.py$ types_or: [python, pyi] files: '(src|tests|docs|examples|typings)/' - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/pyproject.toml b/pyproject.toml index 7bcd0eb97ed..aeed80b759e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,73 +3,81 @@ name = "bentoml" description = "BentoML: The easiest way to serve AI apps and models" readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.9" -keywords = ["BentoML", "Compound AI Systems", "MLOps", "LLMOps", "Model Serving", "Model Inference", "Model Deployment"] +keywords = [ + "BentoML", + "Compound AI Systems", + "MLOps", + "LLMOps", + "Model Serving", + "Model Inference", + "Model Deployment", +] license = { text = "Apache-2.0" } authors = [{ name = "BentoML Team", email = "contact@bentoml.com" }] classifiers = [ - "Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: Apache Software License", - "Operating System :: OS Independent", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: Implementation :: CPython", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - "Topic :: Software Development :: Libraries", + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: Implementation :: CPython", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Libraries", ] dependencies = [ - "Jinja2>=3.0.1", - "PyYAML>=5.0", - "aiohttp", - "attrs>=22.2.0", - "cattrs>=22.1.0,<23.2.0", - "circus>=0.17.0,!=0.17.2", - "click>=7.0", - "click-option-group", - "cloudpickle>=2.0.0", - "deepmerge", - "fs", - "httpx", - "inflection", - "numpy", - "nvidia-ml-py<12", - # OpenTelemetry is the server dependencies, rather than SDK - # Since there are discrepancies among API and instrumentation packages, - # we should always pin the set version of Opentelemetry suite - "opentelemetry-api==1.20.0", - "opentelemetry-sdk==1.20.0", - "opentelemetry-instrumentation==0.41b0", - 
"opentelemetry-instrumentation-aiohttp-client==0.41b0", - "opentelemetry-instrumentation-asgi==0.41b0", - "opentelemetry-semantic-conventions==0.41b0", - "opentelemetry-util-http==0.41b0", - "packaging>=22.0", - "pathspec", - "pip-requirements-parser>=31.2.0", - "prometheus-client>=0.10.0", - "psutil", - "pydantic<3", - "python-dateutil", - "python-multipart", - "python-json-logger", - "rich>=11.2.0", - "schema", - "simple-di>=0.1.4", - "starlette>=0.24.0", - "uvicorn>=0.22.0", - "watchfiles>=0.15.0", - # for manipulating pyproject.toml - "tomli>=1.1.0; python_version < \"3.11\"", - "tomli-w", - "httpx-ws>=0.6.0", - "aiosqlite>=0.20.0", - "uv", - "questionary>=2.0.1", + "Jinja2>=3.0.1", + "PyYAML>=5.0", + "aiohttp", + "attrs>=22.2.0", + "cattrs>=22.1.0,<23.2.0", + "circus>=0.17.0,!=0.17.2", + "click>=7.0", + "click-option-group", + "cloudpickle>=2.0.0", + "deepmerge", + "fs", + "httpx", + "inflection", + "numpy", + "nvidia-ml-py<12", + # OpenTelemetry is the server dependencies, rather than SDK + # Since there are discrepancies among API and instrumentation packages, + # we should always pin the set version of Opentelemetry suite + "opentelemetry-api==1.20.0", + "opentelemetry-sdk==1.20.0", + "opentelemetry-instrumentation==0.41b0", + "opentelemetry-instrumentation-aiohttp-client==0.41b0", + "opentelemetry-instrumentation-asgi==0.41b0", + "opentelemetry-semantic-conventions==0.41b0", + "opentelemetry-util-http==0.41b0", + "packaging>=22.0", + "pathspec", + "pip-requirements-parser>=31.2.0", + "prometheus-client>=0.10.0", + "psutil", + "pydantic<3", + "python-dateutil", + "python-multipart", + "python-json-logger", + "rich>=11.2.0", + "schema", + "simple-di>=0.1.4", + "starlette>=0.24.0", + "uvicorn>=0.22.0", + "watchfiles>=0.15.0", + # for manipulating pyproject.toml + "tomli>=1.1.0; python_version < \"3.11\"", + "tomli-w", + "httpx-ws>=0.6.0", + "aiosqlite>=0.20.0", + "uv", + "questionary>=2.0.1", ] dynamic = ["version"] [project.urls] @@ -84,40 +92,32 @@ Blog = "https://bentoml.com/blog" bentoml = "bentoml_cli.cli:cli" [project.optional-dependencies] all = [ - "bentoml[aws,io,grpc,grpc-reflection,grpc-channelz,tracing,monitor-otlp]", + "bentoml[aws,io,grpc,grpc-reflection,grpc-channelz,tracing,monitor-otlp]", ] aws = ["fs-s3fs"] -io = [ - "bentoml[io-image,io-pandas]", -] +io = ["bentoml[io-image,io-pandas]"] io-image = ["Pillow"] io-pandas = ["pandas>=1", "pyarrow"] triton = ["tritonclient>=2.29.0", "tritonclient[all]; sys_platform != 'darwin'"] -unsloth = [ - "unsloth[huggingface] @ git+https://github.com/bentoml/unsloth.git@main", - "vllm>=0.5.5", - "fastapi" -] +unsloth = ["bentoml-unsloth"] grpc = [ - "protobuf", - "grpcio", - "grpcio-health-checking", - "opentelemetry-instrumentation-grpc==0.41b0", + "protobuf", + "grpcio", + "grpcio-health-checking", + "opentelemetry-instrumentation-grpc==0.41b0", ] grpc-reflection = ["bentoml[grpc]", "grpcio-reflection"] grpc-channelz = ["bentoml[grpc]", "grpcio-channelz"] # We kept for compatibility with previous # versions of BentoML. It is discouraged to use this, instead use any # of the above tracing.* extras. 
-tracing = [ - "bentoml[tracing-jaeger,tracing-zipkin,tracing-otlp]", -] +tracing = ["bentoml[tracing-jaeger,tracing-zipkin,tracing-otlp]"] tracing-jaeger = ["opentelemetry-exporter-jaeger==1.20.0"] tracing-zipkin = ["opentelemetry-exporter-zipkin==1.20.0"] tracing-otlp = ["opentelemetry-exporter-otlp==1.20.0"] monitor-otlp = [ - "opentelemetry-exporter-otlp-proto-http==1.20.0", - "opentelemetry-exporter-otlp-proto-grpc==1.20.0", + "opentelemetry-exporter-otlp-proto-http==1.20.0", + "opentelemetry-exporter-otlp-proto-grpc==1.20.0", ] [build-system] @@ -131,21 +131,31 @@ fallback-version = "0.0.0" version-file = "src/bentoml/_version.py" [tool.hatch.version.raw-options] git_describe_command = [ - "git", - "describe", - "--dirty", - "--tags", - "--long", - "--first-parent", + "git", + "describe", + "--dirty", + "--tags", + "--long", + "--first-parent", ] version_scheme = "post-release" fallback_version = "0.0.0" [tool.hatch.metadata] allow-direct-references = true [tool.hatch.build.targets.sdist] -only-include = ["src/bentoml", "src/bentoml_cli", "src/_bentoml_sdk", "src/_bentoml_impl"] +only-include = [ + "src/bentoml", + "src/bentoml_cli", + "src/_bentoml_sdk", + "src/_bentoml_impl", +] [tool.hatch.build.targets.wheel] -packages = ["src/bentoml", "src/bentoml_cli", "src/_bentoml_sdk", "src/_bentoml_impl"] +packages = [ + "src/bentoml", + "src/bentoml_cli", + "src/_bentoml_sdk", + "src/_bentoml_impl", +] [[tool.pdm.source]] url = "https://download.pytorch.org/whl/cpu" @@ -156,37 +166,37 @@ respect-source-order = true [tool.pdm.dev-dependencies] docs = [ - "sphinx>=5", - "myst-parser", - "sphinx-click>=3.0.2", - "furo", - "sphinx-inline-tabs", - "sphinxext-opengraph", - "sphinxcontrib-fulltoc", - "sphinxcontrib-spelling", - "sphinx-copybutton", - "sphinx-issues", - "sphinx-design", - "pyenchant", - "Jinja2>=3.1", - "sphinx-autobuild", - "sphinx-hoverxref", - "pyspark", - "ray[serve]; python_version < \"3.12\"", + "sphinx>=5", + "myst-parser", + "sphinx-click>=3.0.2", + "furo", + "sphinx-inline-tabs", + "sphinxext-opengraph", + "sphinxcontrib-fulltoc", + "sphinxcontrib-spelling", + "sphinx-copybutton", + "sphinx-issues", + "sphinx-design", + "pyenchant", + "Jinja2>=3.1", + "sphinx-autobuild", + "sphinx-hoverxref", + "pyspark", + "ray[serve]; python_version < \"3.12\"", ] tooling = ["pre-commit", "setuptools-scm", "pandas-stubs", "nox"] testing = [ - "pandas>=1", - "scikit-learn", - "yamllint==1.32.0", - "coverage[toml]==7.2.6", - "fastapi", - "lxml", - "orjson", - "pytest-cov==4.1.0", - "pytest==7.4.0", - "pytest-xdist[psutil]==3.3.1", - "pytest-asyncio==0.21.1", + "pandas>=1", + "scikit-learn", + "yamllint==1.32.0", + "coverage[toml]==7.2.6", + "fastapi", + "lxml", + "orjson", + "pytest-cov==4.1.0", + "pytest==7.4.0", + "pytest-xdist[psutil]==3.3.1", + "pytest-asyncio==0.21.1", ] [tool.coverage.paths] @@ -194,43 +204,43 @@ bentoml = ["src/bentoml", "*/bentoml/src/bentoml"] [tool.coverage.run] branch = true omit = [ - "__pypackages__/*", - "src/bentoml/__main__.py", - "src/bentoml/io.py", - "src/bentoml/serve.py", - "src/bentoml/start.py", - "src/bentoml/_internal/types.py", - "src/bentoml/testing/*", - "src/bentoml/grpc/v1alpha1/*", - "src/bentoml/grpc/v1/*", - "src/bentoml/_internal/external_typing/*", + "__pypackages__/*", + "src/bentoml/__main__.py", + "src/bentoml/io.py", + "src/bentoml/serve.py", + "src/bentoml/start.py", + "src/bentoml/_internal/types.py", + "src/bentoml/testing/*", + "src/bentoml/grpc/v1alpha1/*", + "src/bentoml/grpc/v1/*", + "src/bentoml/_internal/external_typing/*", 
] [tool.coverage.report] show_missing = true precision = 2 omit = [ - "__pypackages__/*", - 'src/bentoml/__main__.py', - "src/bentoml/io.py", - "src/bentoml/serve.py", - "src/bentoml/start.py", - "src/bentoml/_internal/types.py", - "src/bentoml/testing/*", - "src/bentoml/grpc/v1alpha1/*", - "src/bentoml/grpc/v1/*", - "src/bentoml/_internal/external_typing/*", + "__pypackages__/*", + 'src/bentoml/__main__.py', + "src/bentoml/io.py", + "src/bentoml/serve.py", + "src/bentoml/start.py", + "src/bentoml/_internal/types.py", + "src/bentoml/testing/*", + "src/bentoml/grpc/v1alpha1/*", + "src/bentoml/grpc/v1/*", + "src/bentoml/_internal/external_typing/*", ] exclude_lines = [ - "\\#\\s*pragma: no cover", - "^\\s*def __repr__", - "^\\s*raise AssertionError", - "^\\s*raise NotImplementedError", - "^\\s*raise MissingDependencyException", - "^\\s*except ImportError", - "if __name__ == .__main__.:", - "^\\s*if TYPE_CHECKING:", - "^\\s*@(t\\.)?overload( |$)", - "@(abc\\.)?abstractmethod", + "\\#\\s*pragma: no cover", + "^\\s*def __repr__", + "^\\s*raise AssertionError", + "^\\s*raise NotImplementedError", + "^\\s*raise MissingDependencyException", + "^\\s*except ImportError", + "if __name__ == .__main__.:", + "^\\s*if TYPE_CHECKING:", + "^\\s*@(t\\.)?overload( |$)", + "@(abc\\.)?abstractmethod", ] [tool.pytest.ini_options] @@ -243,25 +253,19 @@ testpaths = ["tests"] line-length = 88 target-version = "py310" -[tool.ruff.format] -exclude = [ - "src/_bentoml_impl/frameworks/unsloth/train.py", -] - [tool.ruff.lint] # We ignore E501 (line too long) here because we keep user-visible strings on one line. ignore = ["E501"] extend-select = ["I"] exclude = [ - "bazel-*/", - "venv", - "typings", - "docs/source", - "src/bentoml/_internal/external_typing", - "src/bentoml/grpc/v1alpha1", - "src/bentoml/grpc/v1", - "src/_bentoml_impl/frameworks/unsloth", - "tests/proto", + "bazel-*/", + "venv", + "typings", + "docs/source", + "src/bentoml/_internal/external_typing", + "src/bentoml/grpc/v1alpha1", + "src/bentoml/grpc/v1", + "tests/proto", ] [tool.ruff.lint.per-file-ignores] @@ -279,14 +283,14 @@ known-first-party = ["bentoml", "bentoml_cli", "_bentoml_sdk", "_bentoml_impl"] pythonVersion = "3.12" include = ["src/", "examples/", "tests/"] exclude = [ - 'src/bentoml/_version.py', - 'src/bentoml/__main__.py', - 'src/bentoml/_internal/external_typing/', - 'src/**/*_pb2.py*', - "src/**/*_pb2_grpc.py*", - "grpc-client/thirdparty", - "tests/proto", - "bazel-*", + 'src/bentoml/_version.py', + 'src/bentoml/__main__.py', + 'src/bentoml/_internal/external_typing/', + 'src/**/*_pb2.py*', + "src/**/*_pb2_grpc.py*", + "grpc-client/thirdparty", + "tests/proto", + "bazel-*", ] analysis.useLibraryCodeForTypes = true strictListInference = true diff --git a/src/_bentoml_impl/frameworks/unsloth/__init__.py b/src/_bentoml_impl/frameworks/unsloth/__init__.py deleted file mode 100644 index 810faaf390a..00000000000 --- a/src/_bentoml_impl/frameworks/unsloth/__init__.py +++ /dev/null @@ -1,182 +0,0 @@ -from __future__ import annotations - -import logging -import math -import os -import pathlib -import shutil -import subprocess -import sys -import tempfile -import typing as t - -import yaml -from deepmerge.merger import Merger - -import bentoml -from _bentoml_sdk.service.config import ServiceConfig as Config -from bentoml._internal.bento.build_config import BentoBuildConfig -from bentoml._internal.bento.build_config import DockerOptions -from bentoml._internal.bento.build_config import ModelSpec -from bentoml._internal.utils import pkg -from 
bentoml.exceptions import BentoMLException -from bentoml.exceptions import MissingDependencyException - -from .mapping import RUNTIME_MAPPING as MAPPINGS -from .mapping import get_extras - -logger = logging.getLogger(__name__) - -if t.TYPE_CHECKING: - from transformers import PreTrainedModel - from transformers import PreTrainedTokenizerFast - -if pkg.find_spec("unsloth") is None: - raise MissingDependencyException( - "'unsloth' is required in order to use module 'bentoml.unsloth', install unsloth with 'pip install bentoml[unsloth]'." - ) - -merger = Merger( - # merge dicts, append list - [(dict, "merge"), (list, "append")], - # override all other types - ["override"], - # override conflicting types - ["override"], -) - - -def replace_tag(tag: str) -> str: - return tag.lower().replace("/", "--") - - -ModelType = t.Literal["llama", "mistral", "gemma", "gemma2", "qwen2"] - -SPEC = { - "nvidia-tesla-t4": 16.0, - "nvidia-tesla-v100": 16.0, - "nvidia-l4": 24.0, - "nvidia-tesla-l4": 24.0, - "nvidia-tesla-a10g": 24.0, - "nvidia-a100-80gb": 80.0, - "nvidia-tesla-a100": 40.0, -} - - -def calculate_recommended_gpu_type(model) -> str: - # ceiling the number of parameters to the nearest billion - num_params = math.ceil(sum(p.numel() for p in model.parameters()) / 1e9) - for gpu_type, memory in SPEC.items(): - if num_params <= memory / 2: - return gpu_type - # If no suitable GPU is found, return the one with the highest memory - return max(SPEC, key=SPEC.get) - - -def build_bento( - model: PreTrainedModel, - tokenizer: PreTrainedTokenizerFast, - /, - model_name: str | None = None, - *, - quantization_method: t.Literal["bitsandbytes"] | None = None, - save_method: t.Literal["merged_16bit", "merged_4bit"] = "merged_16bit", - service_config: Config | None = None, - engine_config: dict[str, t.Any] - | None = None, # arguments to pass to AsyncEngineArgs -) -> bentoml.Model: - # this model is local then model_name must specified, otherwise derived from model_id - is_local = getattr(model.config, "_commit_hash", None) is None - if is_local is True and model_name is None: - raise BentoMLException( - 'Fine-tune from a local checkpoint requires specifying "model_name".' 
- ) - else: - model_name = model_name or replace_tag(model.config._name_or_path) - - model_type = t.cast(ModelType, model.config.model_type) - - if service_config is None: - service_config = {} - if engine_config is None: - engine_config = {} - - local_items = MAPPINGS[model_type]["service_config"] - merger.merge( - local_items, - {"resources": {"gpu": 1, "gpu_type": calculate_recommended_gpu_type(model)}}, - ) - - service_config.update({**local_items}) - - engine_config.update(MAPPINGS[model.config.model_type]["engine_config"]) - if quantization_method is not None: - if engine_config.get("quantization") is None: - engine_config.update( - { - "quantization": quantization_method, - "load_format": quantization_method, - } - ) - - with bentoml.models.create(model_name) as bentomodel: - model.save_pretrained_merged( - bentomodel.path, tokenizer, save_method=save_method - ) - - build_opts = dict( - python=dict( - packages=[ - "pyyaml", - "vllm==0.5.5", - "fastapi==0.111.0", - "unsloth[huggingface] @ git+https://github.com/bentoml/unsloth.git@main", - ], - lock_packages=True, - ), - envs=[{"name": "HF_TOKEN"}], - ) - merger.merge(build_opts, get_extras().get(model_type, {})) - - logger.info( - "Building bentos for %s, model_id=%s", model_type, model.config._name_or_path - ) - - with tempfile.TemporaryDirectory() as tempdir: - tempdir = pathlib.Path(tempdir) - shutil.copytree( - pathlib.Path(__file__).parent / "template", tempdir, dirs_exist_ok=True - ) - with (tempdir / "service_config.yaml").open("w") as f: - f.write( - yaml.safe_dump( - dict( - model_tag=str(bentomodel.tag), - engine_config=engine_config, - service_config=service_config, - ) - ) - ) - with (tempdir / "bentofile.yaml").open("w") as f: - BentoBuildConfig( - service="service:VLLM", - name=f"{model_name.replace('.', '-')}-service", - include=["*.py", "*.yaml"], - docker=DockerOptions(python_version="3.11", system_packages=["git"]), - models=[ModelSpec.from_item(str(bentomodel.tag))], - description="API Service for running Unsloth models, powered with BentoML and vLLM.", - **build_opts, - ).with_defaults().to_yaml(f) - - subprocess.run( - [ - sys.executable, - "-m", - "bentoml", - "build", - str(tempdir), - ], - check=True, - cwd=tempdir, - env=os.environ, - ) diff --git a/src/_bentoml_impl/frameworks/unsloth/mapping.py b/src/_bentoml_impl/frameworks/unsloth/mapping.py deleted file mode 100644 index e8420113144..00000000000 --- a/src/_bentoml_impl/frameworks/unsloth/mapping.py +++ /dev/null @@ -1,49 +0,0 @@ -RUNTIME_MAPPING = { - "llama": { - "service_config": { - "traffic": {"timeout": 300}, - "resources": {"gpu": 1, "gpu_type": "nvidia-l4"}, - }, - "engine_config": {"max_model_len": 2048}, - }, - "mistral": { - "service_config": { - "traffic": {"timeout": 300}, - "resources": {"gpu": 1, "gpu_type": "nvidia-l4"}, - }, - "engine_config": {"max_model_len": 2048}, - }, - "gemma": { - "service_config": { - "traffic": {"timeout": 300}, - "resources": {"gpu": 1, "gpu_type": "nvidia-l4"}, - }, - "engine_config": {"max_model_len": 2048}, - }, - "gemma2": { - "service_config": { - "traffic": {"timeout": 300}, - "resources": {"gpu": 1, "gpu_type": "nvidia-l4"}, - }, - "engine_config": {"max_model_len": 2048}, - }, - "qwen2": { - "service_config": { - "traffic": {"timeout": 300}, - "resources": {"gpu": 1, "gpu_type": "nvidia-l4"}, - }, - "engine_config": {"max_model_len": 2048}, - }, -} - - -def get_extras(): - return { - "gemma2": { - "envs": [{"name": "VLLM_ATTENTION_BACKEND", "value": "FLASHINFER"}], - "python": { - "extra_index_url": 
["https://flashinfer.ai/whl/cu121/torch2.3"], - "packages": ["flashinfer==0.1.2+cu121torch2.3"], - }, - } - } diff --git a/src/_bentoml_impl/frameworks/unsloth/template/.bentoignore b/src/_bentoml_impl/frameworks/unsloth/template/.bentoignore deleted file mode 100644 index 6e34eb8cb21..00000000000 --- a/src/_bentoml_impl/frameworks/unsloth/template/.bentoignore +++ /dev/null @@ -1,6 +0,0 @@ -__pycache__/ -*.py[cod] -*$py.class -.ipynb_checkpoints -venv/ -.venv/ diff --git a/src/_bentoml_impl/frameworks/unsloth/template/.gitignore b/src/_bentoml_impl/frameworks/unsloth/template/.gitignore deleted file mode 100644 index 8f462f40473..00000000000 --- a/src/_bentoml_impl/frameworks/unsloth/template/.gitignore +++ /dev/null @@ -1,9 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class -.ipynb_checkpoints - -# Environments -venv/ -.venv/ diff --git a/src/_bentoml_impl/frameworks/unsloth/template/service.py b/src/_bentoml_impl/frameworks/unsloth/template/service.py deleted file mode 100644 index 84cab1afe00..00000000000 --- a/src/_bentoml_impl/frameworks/unsloth/template/service.py +++ /dev/null @@ -1,81 +0,0 @@ -from __future__ import annotations - -import logging -import pathlib -from typing import Literal - -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml - -import bentoml - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -# Load the constants from the yaml file -CONSTANT_YAML = pathlib.Path(__file__).parent / "service_config.yaml" -if not CONSTANT_YAML.exists(): - raise FileNotFoundError(f"service_config.yaml not found in {CONSTANT_YAML.parent}") -with CONSTANT_YAML.open("r") as f: - CONSTANTS = yaml.safe_load(f) -ENGINE_CONFIG = CONSTANTS["engine_config"] -STATIC_DIR = pathlib.Path(__file__).parent / "ui" - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -openai_api_app = fastapi.FastAPI() -for route, endpoint, methods in [ - ("/chat/completions", vllm_api_server.create_chat_completion, ["POST"]), - ("/completions", vllm_api_server.create_completion, ["POST"]), - ("/models", vllm_api_server.show_available_models, ["GET"]), -]: - openai_api_app.add_api_route( - path=route, endpoint=endpoint, methods=methods, include_in_schema=True - ) - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.service(**CONSTANTS["service_config"]) -class VLLM: - bentomodel = bentoml.models.get(CONSTANTS["model_tag"]) - - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs - from vllm import AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - self.engine = AsyncLLMEngine.from_engine_args( - AsyncEngineArgs( - model=self.bentomodel.path, enable_prefix_caching=True, **ENGINE_CONFIG - ) - ) - self.tokenizer = AutoTokenizer.from_pretrained(self.bentomodel.path) - model_config = self.engine.engine.get_model_config() - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - async_engine_client=self.engine, - served_model_names=[self.bentomodel.path], - response_role="assistant", - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - 
async_engine_client=self.engine, - served_model_names=[self.bentomodel.path], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) diff --git a/src/_bentoml_impl/frameworks/unsloth/train.py b/src/_bentoml_impl/frameworks/unsloth/train.py deleted file mode 100644 index a97c8be2a07..00000000000 --- a/src/_bentoml_impl/frameworks/unsloth/train.py +++ /dev/null @@ -1,69 +0,0 @@ -# ruff: ignore - -def prep_dataset(tokenizer): - alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. - -### Instruction: -{} - -### Input: -{} - -### Response: -{}""" - - EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN - - def formatting_prompts_func(examples): - instructions = examples["instruction"] - inputs = examples["input"] - outputs = examples["output"] - texts = [] - for instruction, input, output in zip(instructions, inputs, outputs): - # Must add EOS_TOKEN, otherwise your generation will go on forever! - text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN - texts.append(text) - return {"text": texts} - - from datasets import load_dataset - - dataset = load_dataset("yahma/alpaca-cleaned", split="train") - dataset = dataset.map( - formatting_prompts_func, - batched=True, - ) - return dataset - - -def main(max_seq_length: int = 8196) -> int: - import unsloth, bentoml, trl, transformers - - model, tokenizer = unsloth.FastLanguageModel.from_pretrained("unsloth/Meta-Llama-3.1-8B-bnb-4bit", max_seq_length=max_seq_length, load_in_4bit=True) - # alpaca chat templates - tokenizer.chat_template="{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() + '\n\n' %}{% else %}{% set loop_messages = messages %}{% set system_message = '' %}{% endif %}{{ bos_token + system_message }}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### Instruction:\n' + message['content'].strip() + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ '### Response:\n' + message['content'].strip() + eos_token + '\n\n' }}{% endif %}{% if loop.last and message['role'] == 'user' and add_generation_prompt %}{{ '### Instruction:\n' }}{% endif %}{% endfor %}" - model = unsloth.FastLanguageModel.get_peft_model( - model, target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], - r=16, lora_alpha=16, lora_dropout=0, bias="none", random_state=3407, - use_gradient_checkpointing="unsloth", # True or "unsloth" for very long context - ) - trl.SFTTrainer( - model=model, tokenizer=tokenizer, - train_dataset=prep_dataset(tokenizer), - dataset_text_field="text", max_seq_length=max_seq_length, - dataset_num_proc=2, packing=False, # Can make training 5x faster for short sequences. 
- args=transformers.TrainingArguments( - per_device_train_batch_size=2, gradient_accumulation_steps=4, - warmup_steps=5, num_train_epochs = 1, max_steps=60, learning_rate=2e-4, - weight_decay=0.01, seed=3407, - optim="adamw_8bit", - fp16=not unsloth.is_bfloat16_supported(), bf16=unsloth.is_bfloat16_supported(), - logging_steps=1, - lr_scheduler_type="linear", - output_dir="outputs", - ), - ).train() - - bentoml.unsloth.build_bento(model, tokenizer, quantization_method="bitsandbytes", engine_config={"quantization": "bitsandbytes"}) - return 0 - -if __name__ == "__main__": raise SystemExit(main())
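
Context for the removal above: the `unsloth` extra in pyproject.toml now resolves to the external `bentoml-unsloth` distribution instead of the in-tree `_bentoml_impl/frameworks/unsloth` module deleted by this patch. A minimal sketch of how the workflow from the deleted `train.py` would look afterwards, assuming the external package keeps a `build_bento(model, tokenizer, ...)` entry point with the signature shown in the deleted module — the `bentoml_unsloth` import name below is an assumption, not something this patch confirms:

    # pip install "bentoml[unsloth]"  # now pulls bentoml-unsloth per the new extra
    import unsloth
    import bentoml_unsloth  # assumed module name for the externalized integration

    # Load a 4-bit base model exactly as the deleted train.py example did.
    model, tokenizer = unsloth.FastLanguageModel.from_pretrained(
        "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
        max_seq_length=8196,
        load_in_4bit=True,
    )

    # ... fine-tune with trl.SFTTrainer as in the removed example ...

    # Package the merged checkpoint into a Bento backed by a vLLM
    # OpenAI-compatible service, mirroring the call train.py previously
    # made through the deleted bentoml.unsloth.build_bento helper.
    bentoml_unsloth.build_bento(
        model,
        tokenizer,
        quantization_method="bitsandbytes",
        engine_config={"quantization": "bitsandbytes"},
    )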