From 9d05b6e52bcf227a7f9215154c0ebd487b4a9df1 Mon Sep 17 00:00:00 2001 From: Aaron Pham Date: Thu, 29 Aug 2024 07:03:03 +0000 Subject: [PATCH] feat: unsloth integrations Signed-off-by: Aaron Pham --- .gitignore | 4 - .pre-commit-config.yaml | 4 +- pyproject.toml | 12 +- .../frameworks/unsloth/__init__.py | 182 ------------------ .../frameworks/unsloth/mapping.py | 49 ----- .../frameworks/unsloth/template/.bentoignore | 6 - .../frameworks/unsloth/template/.gitignore | 9 - .../frameworks/unsloth/template/service.py | 81 -------- src/_bentoml_impl/frameworks/unsloth/train.py | 69 ------- 9 files changed, 2 insertions(+), 414 deletions(-) delete mode 100644 src/_bentoml_impl/frameworks/unsloth/__init__.py delete mode 100644 src/_bentoml_impl/frameworks/unsloth/mapping.py delete mode 100644 src/_bentoml_impl/frameworks/unsloth/template/.bentoignore delete mode 100644 src/_bentoml_impl/frameworks/unsloth/template/.gitignore delete mode 100644 src/_bentoml_impl/frameworks/unsloth/template/service.py delete mode 100644 src/_bentoml_impl/frameworks/unsloth/train.py diff --git a/.gitignore b/.gitignore index 051738c651a..822c97afccd 100644 --- a/.gitignore +++ b/.gitignore @@ -167,7 +167,3 @@ mlruns/ .pdm-python .python-version .pdm-build/ - -# from training scripts -model -outputs diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 14f41d2ace5..22830670e92 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ ci: autoupdate_schedule: monthly autofix_commit_msg: "ci: auto fixes from pre-commit.ci\n\nFor more information, see https://pre-commit.ci" autoupdate_commit_msg: 'ci: pre-commit autoupdate [skip ci]' - skip: # exceeds tier max size + skip: # exceeds tier max size - buf-format - buf-lint exclude: '(.*\.(css|js|svg))|(.*/(snippets|grpc|proto)/.*)$' @@ -13,9 +13,7 @@ repos: - id: ruff args: [--fix, --exit-non-zero-on-fix, --show-fixes] types_or: [python, pyi] - exclude: ^src/_bentoml_impl/frameworks/unsloth/train\.py$ - id: ruff-format - exclude: ^src/_bentoml_impl/frameworks/unsloth/train\.py$ types_or: [python, pyi] files: '(src|tests|docs|examples|typings)/' - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/pyproject.toml b/pyproject.toml index 7bcd0eb97ed..319e3411923 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,11 +93,7 @@ io = [ io-image = ["Pillow"] io-pandas = ["pandas>=1", "pyarrow"] triton = ["tritonclient>=2.29.0", "tritonclient[all]; sys_platform != 'darwin'"] -unsloth = [ - "unsloth[huggingface] @ git+https://github.com/bentoml/unsloth.git@main", - "vllm>=0.5.5", - "fastapi" -] +unsloth = ["bentoml-unsloth"] grpc = [ "protobuf", "grpcio", @@ -243,11 +239,6 @@ testpaths = ["tests"] line-length = 88 target-version = "py310" -[tool.ruff.format] -exclude = [ - "src/_bentoml_impl/frameworks/unsloth/train.py", -] - [tool.ruff.lint] # We ignore E501 (line too long) here because we keep user-visible strings on one line. ignore = ["E501"] @@ -260,7 +251,6 @@ exclude = [ "src/bentoml/_internal/external_typing", "src/bentoml/grpc/v1alpha1", "src/bentoml/grpc/v1", - "src/_bentoml_impl/frameworks/unsloth", "tests/proto", ] diff --git a/src/_bentoml_impl/frameworks/unsloth/__init__.py b/src/_bentoml_impl/frameworks/unsloth/__init__.py deleted file mode 100644 index 810faaf390a..00000000000 --- a/src/_bentoml_impl/frameworks/unsloth/__init__.py +++ /dev/null @@ -1,182 +0,0 @@ -from __future__ import annotations - -import logging -import math -import os -import pathlib -import shutil -import subprocess -import sys -import tempfile -import typing as t - -import yaml -from deepmerge.merger import Merger - -import bentoml -from _bentoml_sdk.service.config import ServiceConfig as Config -from bentoml._internal.bento.build_config import BentoBuildConfig -from bentoml._internal.bento.build_config import DockerOptions -from bentoml._internal.bento.build_config import ModelSpec -from bentoml._internal.utils import pkg -from bentoml.exceptions import BentoMLException -from bentoml.exceptions import MissingDependencyException - -from .mapping import RUNTIME_MAPPING as MAPPINGS -from .mapping import get_extras - -logger = logging.getLogger(__name__) - -if t.TYPE_CHECKING: - from transformers import PreTrainedModel - from transformers import PreTrainedTokenizerFast - -if pkg.find_spec("unsloth") is None: - raise MissingDependencyException( - "'unsloth' is required in order to use module 'bentoml.unsloth', install unsloth with 'pip install bentoml[unsloth]'." - ) - -merger = Merger( - # merge dicts, append list - [(dict, "merge"), (list, "append")], - # override all other types - ["override"], - # override conflicting types - ["override"], -) - - -def replace_tag(tag: str) -> str: - return tag.lower().replace("/", "--") - - -ModelType = t.Literal["llama", "mistral", "gemma", "gemma2", "qwen2"] - -SPEC = { - "nvidia-tesla-t4": 16.0, - "nvidia-tesla-v100": 16.0, - "nvidia-l4": 24.0, - "nvidia-tesla-l4": 24.0, - "nvidia-tesla-a10g": 24.0, - "nvidia-a100-80gb": 80.0, - "nvidia-tesla-a100": 40.0, -} - - -def calculate_recommended_gpu_type(model) -> str: - # ceiling the number of parameters to the nearest billion - num_params = math.ceil(sum(p.numel() for p in model.parameters()) / 1e9) - for gpu_type, memory in SPEC.items(): - if num_params <= memory / 2: - return gpu_type - # If no suitable GPU is found, return the one with the highest memory - return max(SPEC, key=SPEC.get) - - -def build_bento( - model: PreTrainedModel, - tokenizer: PreTrainedTokenizerFast, - /, - model_name: str | None = None, - *, - quantization_method: t.Literal["bitsandbytes"] | None = None, - save_method: t.Literal["merged_16bit", "merged_4bit"] = "merged_16bit", - service_config: Config | None = None, - engine_config: dict[str, t.Any] - | None = None, # arguments to pass to AsyncEngineArgs -) -> bentoml.Model: - # this model is local then model_name must specified, otherwise derived from model_id - is_local = getattr(model.config, "_commit_hash", None) is None - if is_local is True and model_name is None: - raise BentoMLException( - 'Fine-tune from a local checkpoint requires specifying "model_name".' - ) - else: - model_name = model_name or replace_tag(model.config._name_or_path) - - model_type = t.cast(ModelType, model.config.model_type) - - if service_config is None: - service_config = {} - if engine_config is None: - engine_config = {} - - local_items = MAPPINGS[model_type]["service_config"] - merger.merge( - local_items, - {"resources": {"gpu": 1, "gpu_type": calculate_recommended_gpu_type(model)}}, - ) - - service_config.update({**local_items}) - - engine_config.update(MAPPINGS[model.config.model_type]["engine_config"]) - if quantization_method is not None: - if engine_config.get("quantization") is None: - engine_config.update( - { - "quantization": quantization_method, - "load_format": quantization_method, - } - ) - - with bentoml.models.create(model_name) as bentomodel: - model.save_pretrained_merged( - bentomodel.path, tokenizer, save_method=save_method - ) - - build_opts = dict( - python=dict( - packages=[ - "pyyaml", - "vllm==0.5.5", - "fastapi==0.111.0", - "unsloth[huggingface] @ git+https://github.com/bentoml/unsloth.git@main", - ], - lock_packages=True, - ), - envs=[{"name": "HF_TOKEN"}], - ) - merger.merge(build_opts, get_extras().get(model_type, {})) - - logger.info( - "Building bentos for %s, model_id=%s", model_type, model.config._name_or_path - ) - - with tempfile.TemporaryDirectory() as tempdir: - tempdir = pathlib.Path(tempdir) - shutil.copytree( - pathlib.Path(__file__).parent / "template", tempdir, dirs_exist_ok=True - ) - with (tempdir / "service_config.yaml").open("w") as f: - f.write( - yaml.safe_dump( - dict( - model_tag=str(bentomodel.tag), - engine_config=engine_config, - service_config=service_config, - ) - ) - ) - with (tempdir / "bentofile.yaml").open("w") as f: - BentoBuildConfig( - service="service:VLLM", - name=f"{model_name.replace('.', '-')}-service", - include=["*.py", "*.yaml"], - docker=DockerOptions(python_version="3.11", system_packages=["git"]), - models=[ModelSpec.from_item(str(bentomodel.tag))], - description="API Service for running Unsloth models, powered with BentoML and vLLM.", - **build_opts, - ).with_defaults().to_yaml(f) - - subprocess.run( - [ - sys.executable, - "-m", - "bentoml", - "build", - str(tempdir), - ], - check=True, - cwd=tempdir, - env=os.environ, - ) diff --git a/src/_bentoml_impl/frameworks/unsloth/mapping.py b/src/_bentoml_impl/frameworks/unsloth/mapping.py deleted file mode 100644 index e8420113144..00000000000 --- a/src/_bentoml_impl/frameworks/unsloth/mapping.py +++ /dev/null @@ -1,49 +0,0 @@ -RUNTIME_MAPPING = { - "llama": { - "service_config": { - "traffic": {"timeout": 300}, - "resources": {"gpu": 1, "gpu_type": "nvidia-l4"}, - }, - "engine_config": {"max_model_len": 2048}, - }, - "mistral": { - "service_config": { - "traffic": {"timeout": 300}, - "resources": {"gpu": 1, "gpu_type": "nvidia-l4"}, - }, - "engine_config": {"max_model_len": 2048}, - }, - "gemma": { - "service_config": { - "traffic": {"timeout": 300}, - "resources": {"gpu": 1, "gpu_type": "nvidia-l4"}, - }, - "engine_config": {"max_model_len": 2048}, - }, - "gemma2": { - "service_config": { - "traffic": {"timeout": 300}, - "resources": {"gpu": 1, "gpu_type": "nvidia-l4"}, - }, - "engine_config": {"max_model_len": 2048}, - }, - "qwen2": { - "service_config": { - "traffic": {"timeout": 300}, - "resources": {"gpu": 1, "gpu_type": "nvidia-l4"}, - }, - "engine_config": {"max_model_len": 2048}, - }, -} - - -def get_extras(): - return { - "gemma2": { - "envs": [{"name": "VLLM_ATTENTION_BACKEND", "value": "FLASHINFER"}], - "python": { - "extra_index_url": ["https://flashinfer.ai/whl/cu121/torch2.3"], - "packages": ["flashinfer==0.1.2+cu121torch2.3"], - }, - } - } diff --git a/src/_bentoml_impl/frameworks/unsloth/template/.bentoignore b/src/_bentoml_impl/frameworks/unsloth/template/.bentoignore deleted file mode 100644 index 6e34eb8cb21..00000000000 --- a/src/_bentoml_impl/frameworks/unsloth/template/.bentoignore +++ /dev/null @@ -1,6 +0,0 @@ -__pycache__/ -*.py[cod] -*$py.class -.ipynb_checkpoints -venv/ -.venv/ diff --git a/src/_bentoml_impl/frameworks/unsloth/template/.gitignore b/src/_bentoml_impl/frameworks/unsloth/template/.gitignore deleted file mode 100644 index 8f462f40473..00000000000 --- a/src/_bentoml_impl/frameworks/unsloth/template/.gitignore +++ /dev/null @@ -1,9 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class -.ipynb_checkpoints - -# Environments -venv/ -.venv/ diff --git a/src/_bentoml_impl/frameworks/unsloth/template/service.py b/src/_bentoml_impl/frameworks/unsloth/template/service.py deleted file mode 100644 index 84cab1afe00..00000000000 --- a/src/_bentoml_impl/frameworks/unsloth/template/service.py +++ /dev/null @@ -1,81 +0,0 @@ -from __future__ import annotations - -import logging -import pathlib -from typing import Literal - -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml - -import bentoml - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -# Load the constants from the yaml file -CONSTANT_YAML = pathlib.Path(__file__).parent / "service_config.yaml" -if not CONSTANT_YAML.exists(): - raise FileNotFoundError(f"service_config.yaml not found in {CONSTANT_YAML.parent}") -with CONSTANT_YAML.open("r") as f: - CONSTANTS = yaml.safe_load(f) -ENGINE_CONFIG = CONSTANTS["engine_config"] -STATIC_DIR = pathlib.Path(__file__).parent / "ui" - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -openai_api_app = fastapi.FastAPI() -for route, endpoint, methods in [ - ("/chat/completions", vllm_api_server.create_chat_completion, ["POST"]), - ("/completions", vllm_api_server.create_completion, ["POST"]), - ("/models", vllm_api_server.show_available_models, ["GET"]), -]: - openai_api_app.add_api_route( - path=route, endpoint=endpoint, methods=methods, include_in_schema=True - ) - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.service(**CONSTANTS["service_config"]) -class VLLM: - bentomodel = bentoml.models.get(CONSTANTS["model_tag"]) - - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs - from vllm import AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - self.engine = AsyncLLMEngine.from_engine_args( - AsyncEngineArgs( - model=self.bentomodel.path, enable_prefix_caching=True, **ENGINE_CONFIG - ) - ) - self.tokenizer = AutoTokenizer.from_pretrained(self.bentomodel.path) - model_config = self.engine.engine.get_model_config() - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - async_engine_client=self.engine, - served_model_names=[self.bentomodel.path], - response_role="assistant", - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - async_engine_client=self.engine, - served_model_names=[self.bentomodel.path], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) diff --git a/src/_bentoml_impl/frameworks/unsloth/train.py b/src/_bentoml_impl/frameworks/unsloth/train.py deleted file mode 100644 index a97c8be2a07..00000000000 --- a/src/_bentoml_impl/frameworks/unsloth/train.py +++ /dev/null @@ -1,69 +0,0 @@ -# ruff: ignore - -def prep_dataset(tokenizer): - alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. - -### Instruction: -{} - -### Input: -{} - -### Response: -{}""" - - EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN - - def formatting_prompts_func(examples): - instructions = examples["instruction"] - inputs = examples["input"] - outputs = examples["output"] - texts = [] - for instruction, input, output in zip(instructions, inputs, outputs): - # Must add EOS_TOKEN, otherwise your generation will go on forever! - text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN - texts.append(text) - return {"text": texts} - - from datasets import load_dataset - - dataset = load_dataset("yahma/alpaca-cleaned", split="train") - dataset = dataset.map( - formatting_prompts_func, - batched=True, - ) - return dataset - - -def main(max_seq_length: int = 8196) -> int: - import unsloth, bentoml, trl, transformers - - model, tokenizer = unsloth.FastLanguageModel.from_pretrained("unsloth/Meta-Llama-3.1-8B-bnb-4bit", max_seq_length=max_seq_length, load_in_4bit=True) - # alpaca chat templates - tokenizer.chat_template="{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() + '\n\n' %}{% else %}{% set loop_messages = messages %}{% set system_message = '' %}{% endif %}{{ bos_token + system_message }}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### Instruction:\n' + message['content'].strip() + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ '### Response:\n' + message['content'].strip() + eos_token + '\n\n' }}{% endif %}{% if loop.last and message['role'] == 'user' and add_generation_prompt %}{{ '### Instruction:\n' }}{% endif %}{% endfor %}" - model = unsloth.FastLanguageModel.get_peft_model( - model, target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], - r=16, lora_alpha=16, lora_dropout=0, bias="none", random_state=3407, - use_gradient_checkpointing="unsloth", # True or "unsloth" for very long context - ) - trl.SFTTrainer( - model=model, tokenizer=tokenizer, - train_dataset=prep_dataset(tokenizer), - dataset_text_field="text", max_seq_length=max_seq_length, - dataset_num_proc=2, packing=False, # Can make training 5x faster for short sequences. - args=transformers.TrainingArguments( - per_device_train_batch_size=2, gradient_accumulation_steps=4, - warmup_steps=5, num_train_epochs = 1, max_steps=60, learning_rate=2e-4, - weight_decay=0.01, seed=3407, - optim="adamw_8bit", - fp16=not unsloth.is_bfloat16_supported(), bf16=unsloth.is_bfloat16_supported(), - logging_steps=1, - lr_scheduler_type="linear", - output_dir="outputs", - ), - ).train() - - bentoml.unsloth.build_bento(model, tokenizer, quantization_method="bitsandbytes", engine_config={"quantization": "bitsandbytes"}) - return 0 - -if __name__ == "__main__": raise SystemExit(main())