From 9d05b6e52bcf227a7f9215154c0ebd487b4a9df1 Mon Sep 17 00:00:00 2001
From: Aaron Pham <contact@aarnphm.xyz>
Date: Thu, 29 Aug 2024 07:03:03 +0000
Subject: [PATCH] chore: remove in-tree unsloth integration in favor of the bentoml-unsloth extra

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
---
 .gitignore                                    |   4 -
 .pre-commit-config.yaml                       |   4 +-
 pyproject.toml                                |  12 +-
 .../frameworks/unsloth/__init__.py            | 182 ------------------
 .../frameworks/unsloth/mapping.py             |  49 -----
 .../frameworks/unsloth/template/.bentoignore  |   6 -
 .../frameworks/unsloth/template/.gitignore    |   9 -
 .../frameworks/unsloth/template/service.py    |  81 --------
 src/_bentoml_impl/frameworks/unsloth/train.py |  69 -------
 9 files changed, 2 insertions(+), 414 deletions(-)
 delete mode 100644 src/_bentoml_impl/frameworks/unsloth/__init__.py
 delete mode 100644 src/_bentoml_impl/frameworks/unsloth/mapping.py
 delete mode 100644 src/_bentoml_impl/frameworks/unsloth/template/.bentoignore
 delete mode 100644 src/_bentoml_impl/frameworks/unsloth/template/.gitignore
 delete mode 100644 src/_bentoml_impl/frameworks/unsloth/template/service.py
 delete mode 100644 src/_bentoml_impl/frameworks/unsloth/train.py

diff --git a/.gitignore b/.gitignore
index 051738c651a..822c97afccd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -167,7 +167,3 @@ mlruns/
 .pdm-python
 .python-version
 .pdm-build/
-
-# from training scripts
-model
-outputs
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 14f41d2ace5..22830670e92 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,7 +2,7 @@ ci:
   autoupdate_schedule: monthly
   autofix_commit_msg: "ci: auto fixes from pre-commit.ci\n\nFor more information, see https://pre-commit.ci"
   autoupdate_commit_msg: 'ci: pre-commit autoupdate [skip ci]'
-  skip: # exceeds tier max size
+  skip:  # exceeds tier max size
     - buf-format
     - buf-lint
 exclude: '(.*\.(css|js|svg))|(.*/(snippets|grpc|proto)/.*)$'
@@ -13,9 +13,7 @@ repos:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix, --show-fixes]
         types_or: [python, pyi]
-        exclude: ^src/_bentoml_impl/frameworks/unsloth/train\.py$
       - id: ruff-format
-        exclude: ^src/_bentoml_impl/frameworks/unsloth/train\.py$
         types_or: [python, pyi]
         files: '(src|tests|docs|examples|typings)/'
   - repo: https://github.com/pre-commit/pre-commit-hooks
diff --git a/pyproject.toml b/pyproject.toml
index 7bcd0eb97ed..319e3411923 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -93,11 +93,7 @@ io = [
 io-image = ["Pillow"]
 io-pandas = ["pandas>=1", "pyarrow"]
 triton = ["tritonclient>=2.29.0", "tritonclient[all]; sys_platform != 'darwin'"]
-unsloth = [
-    "unsloth[huggingface] @ git+https://github.com/bentoml/unsloth.git@main",
-    "vllm>=0.5.5",
-    "fastapi"
-]
+unsloth = ["bentoml-unsloth"]
 grpc = [
     "protobuf",
     "grpcio",
@@ -243,11 +239,6 @@ testpaths = ["tests"]
 line-length = 88
 target-version = "py310"
 
-[tool.ruff.format]
-exclude = [
-    "src/_bentoml_impl/frameworks/unsloth/train.py",
-]
-
 [tool.ruff.lint]
 # We ignore E501 (line too long) here because we keep user-visible strings on one line.
 ignore = ["E501"]
@@ -260,7 +251,6 @@ exclude = [
     "src/bentoml/_internal/external_typing",
     "src/bentoml/grpc/v1alpha1",
     "src/bentoml/grpc/v1",
-    "src/_bentoml_impl/frameworks/unsloth",
     "tests/proto",
 ]
 
diff --git a/src/_bentoml_impl/frameworks/unsloth/__init__.py b/src/_bentoml_impl/frameworks/unsloth/__init__.py
deleted file mode 100644
index 810faaf390a..00000000000
--- a/src/_bentoml_impl/frameworks/unsloth/__init__.py
+++ /dev/null
@@ -1,182 +0,0 @@
-from __future__ import annotations
-
-import logging
-import math
-import os
-import pathlib
-import shutil
-import subprocess
-import sys
-import tempfile
-import typing as t
-
-import yaml
-from deepmerge.merger import Merger
-
-import bentoml
-from _bentoml_sdk.service.config import ServiceConfig as Config
-from bentoml._internal.bento.build_config import BentoBuildConfig
-from bentoml._internal.bento.build_config import DockerOptions
-from bentoml._internal.bento.build_config import ModelSpec
-from bentoml._internal.utils import pkg
-from bentoml.exceptions import BentoMLException
-from bentoml.exceptions import MissingDependencyException
-
-from .mapping import RUNTIME_MAPPING as MAPPINGS
-from .mapping import get_extras
-
-logger = logging.getLogger(__name__)
-
-if t.TYPE_CHECKING:
-    from transformers import PreTrainedModel
-    from transformers import PreTrainedTokenizerFast
-
-if pkg.find_spec("unsloth") is None:
-    raise MissingDependencyException(
-        "'unsloth' is required in order to use module 'bentoml.unsloth', install unsloth with 'pip install bentoml[unsloth]'."
-    )
-
-merger = Merger(
-    # merge dicts, append list
-    [(dict, "merge"), (list, "append")],
-    # override all other types
-    ["override"],
-    # override conflicting types
-    ["override"],
-)
-
-
-def replace_tag(tag: str) -> str:
-    return tag.lower().replace("/", "--")
-
-
-ModelType = t.Literal["llama", "mistral", "gemma", "gemma2", "qwen2"]
-
-SPEC = {
-    "nvidia-tesla-t4": 16.0,
-    "nvidia-tesla-v100": 16.0,
-    "nvidia-l4": 24.0,
-    "nvidia-tesla-l4": 24.0,
-    "nvidia-tesla-a10g": 24.0,
-    "nvidia-a100-80gb": 80.0,
-    "nvidia-tesla-a100": 40.0,
-}
-
-
-def calculate_recommended_gpu_type(model) -> str:
-    # ceiling the number of parameters to the nearest billion
-    num_params = math.ceil(sum(p.numel() for p in model.parameters()) / 1e9)
-    for gpu_type, memory in SPEC.items():
-        if num_params <= memory / 2:
-            return gpu_type
-    # If no suitable GPU is found, return the one with the highest memory
-    return max(SPEC, key=SPEC.get)
-
-
-def build_bento(
-    model: PreTrainedModel,
-    tokenizer: PreTrainedTokenizerFast,
-    /,
-    model_name: str | None = None,
-    *,
-    quantization_method: t.Literal["bitsandbytes"] | None = None,
-    save_method: t.Literal["merged_16bit", "merged_4bit"] = "merged_16bit",
-    service_config: Config | None = None,
-    engine_config: dict[str, t.Any]
-    | None = None,  # arguments to pass to AsyncEngineArgs
-) -> bentoml.Model:
-    # this model is local then model_name must specified, otherwise derived from model_id
-    is_local = getattr(model.config, "_commit_hash", None) is None
-    if is_local is True and model_name is None:
-        raise BentoMLException(
-            'Fine-tune from a local checkpoint requires specifying "model_name".'
-        )
-    else:
-        model_name = model_name or replace_tag(model.config._name_or_path)
-
-    model_type = t.cast(ModelType, model.config.model_type)
-
-    if service_config is None:
-        service_config = {}
-    if engine_config is None:
-        engine_config = {}
-
-    local_items = MAPPINGS[model_type]["service_config"]
-    merger.merge(
-        local_items,
-        {"resources": {"gpu": 1, "gpu_type": calculate_recommended_gpu_type(model)}},
-    )
-
-    service_config.update({**local_items})
-
-    engine_config.update(MAPPINGS[model.config.model_type]["engine_config"])
-    if quantization_method is not None:
-        if engine_config.get("quantization") is None:
-            engine_config.update(
-                {
-                    "quantization": quantization_method,
-                    "load_format": quantization_method,
-                }
-            )
-
-    with bentoml.models.create(model_name) as bentomodel:
-        model.save_pretrained_merged(
-            bentomodel.path, tokenizer, save_method=save_method
-        )
-
-    build_opts = dict(
-        python=dict(
-            packages=[
-                "pyyaml",
-                "vllm==0.5.5",
-                "fastapi==0.111.0",
-                "unsloth[huggingface] @ git+https://github.com/bentoml/unsloth.git@main",
-            ],
-            lock_packages=True,
-        ),
-        envs=[{"name": "HF_TOKEN"}],
-    )
-    merger.merge(build_opts, get_extras().get(model_type, {}))
-
-    logger.info(
-        "Building bentos for %s, model_id=%s", model_type, model.config._name_or_path
-    )
-
-    with tempfile.TemporaryDirectory() as tempdir:
-        tempdir = pathlib.Path(tempdir)
-        shutil.copytree(
-            pathlib.Path(__file__).parent / "template", tempdir, dirs_exist_ok=True
-        )
-        with (tempdir / "service_config.yaml").open("w") as f:
-            f.write(
-                yaml.safe_dump(
-                    dict(
-                        model_tag=str(bentomodel.tag),
-                        engine_config=engine_config,
-                        service_config=service_config,
-                    )
-                )
-            )
-        with (tempdir / "bentofile.yaml").open("w") as f:
-            BentoBuildConfig(
-                service="service:VLLM",
-                name=f"{model_name.replace('.', '-')}-service",
-                include=["*.py", "*.yaml"],
-                docker=DockerOptions(python_version="3.11", system_packages=["git"]),
-                models=[ModelSpec.from_item(str(bentomodel.tag))],
-                description="API Service for running Unsloth models, powered with BentoML and vLLM.",
-                **build_opts,
-            ).with_defaults().to_yaml(f)
-
-        subprocess.run(
-            [
-                sys.executable,
-                "-m",
-                "bentoml",
-                "build",
-                str(tempdir),
-            ],
-            check=True,
-            cwd=tempdir,
-            env=os.environ,
-        )
diff --git a/src/_bentoml_impl/frameworks/unsloth/mapping.py b/src/_bentoml_impl/frameworks/unsloth/mapping.py
deleted file mode 100644
index e8420113144..00000000000
--- a/src/_bentoml_impl/frameworks/unsloth/mapping.py
+++ /dev/null
@@ -1,49 +0,0 @@
-RUNTIME_MAPPING = {
-    "llama": {
-        "service_config": {
-            "traffic": {"timeout": 300},
-            "resources": {"gpu": 1, "gpu_type": "nvidia-l4"},
-        },
-        "engine_config": {"max_model_len": 2048},
-    },
-    "mistral": {
-        "service_config": {
-            "traffic": {"timeout": 300},
-            "resources": {"gpu": 1, "gpu_type": "nvidia-l4"},
-        },
-        "engine_config": {"max_model_len": 2048},
-    },
-    "gemma": {
-        "service_config": {
-            "traffic": {"timeout": 300},
-            "resources": {"gpu": 1, "gpu_type": "nvidia-l4"},
-        },
-        "engine_config": {"max_model_len": 2048},
-    },
-    "gemma2": {
-        "service_config": {
-            "traffic": {"timeout": 300},
-            "resources": {"gpu": 1, "gpu_type": "nvidia-l4"},
-        },
-        "engine_config": {"max_model_len": 2048},
-    },
-    "qwen2": {
-        "service_config": {
-            "traffic": {"timeout": 300},
-            "resources": {"gpu": 1, "gpu_type": "nvidia-l4"},
-        },
-        "engine_config": {"max_model_len": 2048},
-    },
-}
-
-
-def get_extras():
-    return {
-        "gemma2": {
-            "envs": [{"name": "VLLM_ATTENTION_BACKEND", "value": "FLASHINFER"}],
-            "python": {
-                "extra_index_url": ["https://flashinfer.ai/whl/cu121/torch2.3"],
-                "packages": ["flashinfer==0.1.2+cu121torch2.3"],
-            },
-        }
-    }
diff --git a/src/_bentoml_impl/frameworks/unsloth/template/.bentoignore b/src/_bentoml_impl/frameworks/unsloth/template/.bentoignore
deleted file mode 100644
index 6e34eb8cb21..00000000000
--- a/src/_bentoml_impl/frameworks/unsloth/template/.bentoignore
+++ /dev/null
@@ -1,6 +0,0 @@
-__pycache__/
-*.py[cod]
-*$py.class
-.ipynb_checkpoints
-venv/
-.venv/
diff --git a/src/_bentoml_impl/frameworks/unsloth/template/.gitignore b/src/_bentoml_impl/frameworks/unsloth/template/.gitignore
deleted file mode 100644
index 8f462f40473..00000000000
--- a/src/_bentoml_impl/frameworks/unsloth/template/.gitignore
+++ /dev/null
@@ -1,9 +0,0 @@
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-.ipynb_checkpoints
-
-# Environments
-venv/
-.venv/
diff --git a/src/_bentoml_impl/frameworks/unsloth/template/service.py b/src/_bentoml_impl/frameworks/unsloth/template/service.py
deleted file mode 100644
index 84cab1afe00..00000000000
--- a/src/_bentoml_impl/frameworks/unsloth/template/service.py
+++ /dev/null
@@ -1,81 +0,0 @@
-from __future__ import annotations
-
-import logging
-import pathlib
-from typing import Literal
-
-import fastapi
-import fastapi.staticfiles
-import pydantic
-import vllm.entrypoints.openai.api_server as vllm_api_server
-import yaml
-
-import bentoml
-
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
-
-
-# Load the constants from the yaml file
-CONSTANT_YAML = pathlib.Path(__file__).parent / "service_config.yaml"
-if not CONSTANT_YAML.exists():
-    raise FileNotFoundError(f"service_config.yaml not found in {CONSTANT_YAML.parent}")
-with CONSTANT_YAML.open("r") as f:
-    CONSTANTS = yaml.safe_load(f)
-ENGINE_CONFIG = CONSTANTS["engine_config"]
-STATIC_DIR = pathlib.Path(__file__).parent / "ui"
-
-
-class Message(pydantic.BaseModel):
-    role: Literal["system", "user", "assistant"]
-    content: str
-
-
-openai_api_app = fastapi.FastAPI()
-for route, endpoint, methods in [
-    ("/chat/completions", vllm_api_server.create_chat_completion, ["POST"]),
-    ("/completions", vllm_api_server.create_completion, ["POST"]),
-    ("/models", vllm_api_server.show_available_models, ["GET"]),
-]:
-    openai_api_app.add_api_route(
-        path=route, endpoint=endpoint, methods=methods, include_in_schema=True
-    )
-
-
-@bentoml.mount_asgi_app(openai_api_app, path="/v1")
-@bentoml.service(**CONSTANTS["service_config"])
-class VLLM:
-    bentomodel = bentoml.models.get(CONSTANTS["model_tag"])
-
-    def __init__(self) -> None:
-        from transformers import AutoTokenizer
-        from vllm import AsyncEngineArgs
-        from vllm import AsyncLLMEngine
-        from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
-        from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
-
-        self.engine = AsyncLLMEngine.from_engine_args(
-            AsyncEngineArgs(
-                model=self.bentomodel.path, enable_prefix_caching=True, **ENGINE_CONFIG
-            )
-        )
-        self.tokenizer = AutoTokenizer.from_pretrained(self.bentomodel.path)
-        model_config = self.engine.engine.get_model_config()
-        # inject the engine into the openai serving chat and completion
-        vllm_api_server.openai_serving_chat = OpenAIServingChat(
-            async_engine_client=self.engine,
-            served_model_names=[self.bentomodel.path],
-            response_role="assistant",
-            model_config=model_config,
-            lora_modules=None,
-            prompt_adapters=None,
-            request_logger=None,
-        )
-        vllm_api_server.openai_serving_completion = OpenAIServingCompletion(
-            async_engine_client=self.engine,
-            served_model_names=[self.bentomodel.path],
-            model_config=model_config,
-            lora_modules=None,
-            prompt_adapters=None,
-            request_logger=None,
-        )
diff --git a/src/_bentoml_impl/frameworks/unsloth/train.py b/src/_bentoml_impl/frameworks/unsloth/train.py
deleted file mode 100644
index a97c8be2a07..00000000000
--- a/src/_bentoml_impl/frameworks/unsloth/train.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# ruff: ignore
-
-def prep_dataset(tokenizer):
-  alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
-
-### Instruction:
-{}
-
-### Input:
-{}
-
-### Response:
-{}"""
-
-  EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN
-
-  def formatting_prompts_func(examples):
-    instructions = examples["instruction"]
-    inputs = examples["input"]
-    outputs = examples["output"]
-    texts = []
-    for instruction, input, output in zip(instructions, inputs, outputs):
-      # Must add EOS_TOKEN, otherwise your generation will go on forever!
-      text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
-      texts.append(text)
-    return {"text": texts}
-
-  from datasets import load_dataset
-
-  dataset = load_dataset("yahma/alpaca-cleaned", split="train")
-  dataset = dataset.map(
-    formatting_prompts_func,
-    batched=True,
-  )
-  return dataset
-
-
-def main(max_seq_length: int = 8196) -> int:
-  import unsloth, bentoml, trl, transformers
-
-  model, tokenizer = unsloth.FastLanguageModel.from_pretrained("unsloth/Meta-Llama-3.1-8B-bnb-4bit", max_seq_length=max_seq_length, load_in_4bit=True)
-  # alpaca chat templates
-  tokenizer.chat_template="{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() + '\n\n' %}{% else %}{% set loop_messages = messages %}{% set system_message = '' %}{% endif %}{{ bos_token + system_message }}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### Instruction:\n' + message['content'].strip() + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ '### Response:\n' + message['content'].strip() + eos_token + '\n\n' }}{% endif %}{% if loop.last and message['role'] == 'user' and add_generation_prompt %}{{ '### Instruction:\n' }}{% endif %}{% endfor %}"
-  model = unsloth.FastLanguageModel.get_peft_model(
-    model, target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
-    r=16, lora_alpha=16, lora_dropout=0, bias="none", random_state=3407,
-    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
-  )
-  trl.SFTTrainer(
-    model=model, tokenizer=tokenizer,
-    train_dataset=prep_dataset(tokenizer),
-    dataset_text_field="text", max_seq_length=max_seq_length,
-    dataset_num_proc=2, packing=False,  # Can make training 5x faster for short sequences.
-    args=transformers.TrainingArguments(
-      per_device_train_batch_size=2, gradient_accumulation_steps=4,
-      warmup_steps=5, num_train_epochs = 1, max_steps=60, learning_rate=2e-4,
-      weight_decay=0.01, seed=3407,
-      optim="adamw_8bit",
-      fp16=not unsloth.is_bfloat16_supported(), bf16=unsloth.is_bfloat16_supported(),
-      logging_steps=1,
-      lr_scheduler_type="linear",
-      output_dir="outputs",
-    ),
-  ).train()
-
-  bentoml.unsloth.build_bento(model, tokenizer, quantization_method="bitsandbytes", engine_config={"quantization": "bitsandbytes"})
-  return 0
-
-if __name__ == "__main__": raise SystemExit(main())