diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 14f41d2ace5..22830670e92 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ ci: autoupdate_schedule: monthly autofix_commit_msg: "ci: auto fixes from pre-commit.ci\n\nFor more information, see https://pre-commit.ci" autoupdate_commit_msg: 'ci: pre-commit autoupdate [skip ci]' - skip: # exceeds tier max size + skip: # exceeds tier max size - buf-format - buf-lint exclude: '(.*\.(css|js|svg))|(.*/(snippets|grpc|proto)/.*)$' @@ -13,9 +13,7 @@ repos: - id: ruff args: [--fix, --exit-non-zero-on-fix, --show-fixes] types_or: [python, pyi] - exclude: ^src/_bentoml_impl/frameworks/unsloth/train\.py$ - id: ruff-format - exclude: ^src/_bentoml_impl/frameworks/unsloth/train\.py$ types_or: [python, pyi] files: '(src|tests|docs|examples|typings)/' - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/pyproject.toml b/pyproject.toml index 7bcd0eb97ed..aeed80b759e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,73 +3,81 @@ name = "bentoml" description = "BentoML: The easiest way to serve AI apps and models" readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.9" -keywords = ["BentoML", "Compound AI Systems", "MLOps", "LLMOps", "Model Serving", "Model Inference", "Model Deployment"] +keywords = [ + "BentoML", + "Compound AI Systems", + "MLOps", + "LLMOps", + "Model Serving", + "Model Inference", + "Model Deployment", +] license = { text = "Apache-2.0" } authors = [{ name = "BentoML Team", email = "contact@bentoml.com" }] classifiers = [ - "Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: Apache Software License", - "Operating System :: OS Independent", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: Implementation :: CPython", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - "Topic :: Software Development :: Libraries", + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: Implementation :: CPython", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Libraries", ] dependencies = [ - "Jinja2>=3.0.1", - "PyYAML>=5.0", - "aiohttp", - "attrs>=22.2.0", - "cattrs>=22.1.0,<23.2.0", - "circus>=0.17.0,!=0.17.2", - "click>=7.0", - "click-option-group", - "cloudpickle>=2.0.0", - "deepmerge", - "fs", - "httpx", - "inflection", - "numpy", - "nvidia-ml-py<12", - # OpenTelemetry is the server dependencies, rather than SDK - # Since there are discrepancies among API and instrumentation packages, - # we should always pin the set version of Opentelemetry suite - "opentelemetry-api==1.20.0", - "opentelemetry-sdk==1.20.0", - "opentelemetry-instrumentation==0.41b0", - 
"opentelemetry-instrumentation-aiohttp-client==0.41b0", - "opentelemetry-instrumentation-asgi==0.41b0", - "opentelemetry-semantic-conventions==0.41b0", - "opentelemetry-util-http==0.41b0", - "packaging>=22.0", - "pathspec", - "pip-requirements-parser>=31.2.0", - "prometheus-client>=0.10.0", - "psutil", - "pydantic<3", - "python-dateutil", - "python-multipart", - "python-json-logger", - "rich>=11.2.0", - "schema", - "simple-di>=0.1.4", - "starlette>=0.24.0", - "uvicorn>=0.22.0", - "watchfiles>=0.15.0", - # for manipulating pyproject.toml - "tomli>=1.1.0; python_version < \"3.11\"", - "tomli-w", - "httpx-ws>=0.6.0", - "aiosqlite>=0.20.0", - "uv", - "questionary>=2.0.1", + "Jinja2>=3.0.1", + "PyYAML>=5.0", + "aiohttp", + "attrs>=22.2.0", + "cattrs>=22.1.0,<23.2.0", + "circus>=0.17.0,!=0.17.2", + "click>=7.0", + "click-option-group", + "cloudpickle>=2.0.0", + "deepmerge", + "fs", + "httpx", + "inflection", + "numpy", + "nvidia-ml-py<12", + # OpenTelemetry is the server dependencies, rather than SDK + # Since there are discrepancies among API and instrumentation packages, + # we should always pin the set version of Opentelemetry suite + "opentelemetry-api==1.20.0", + "opentelemetry-sdk==1.20.0", + "opentelemetry-instrumentation==0.41b0", + "opentelemetry-instrumentation-aiohttp-client==0.41b0", + "opentelemetry-instrumentation-asgi==0.41b0", + "opentelemetry-semantic-conventions==0.41b0", + "opentelemetry-util-http==0.41b0", + "packaging>=22.0", + "pathspec", + "pip-requirements-parser>=31.2.0", + "prometheus-client>=0.10.0", + "psutil", + "pydantic<3", + "python-dateutil", + "python-multipart", + "python-json-logger", + "rich>=11.2.0", + "schema", + "simple-di>=0.1.4", + "starlette>=0.24.0", + "uvicorn>=0.22.0", + "watchfiles>=0.15.0", + # for manipulating pyproject.toml + "tomli>=1.1.0; python_version < \"3.11\"", + "tomli-w", + "httpx-ws>=0.6.0", + "aiosqlite>=0.20.0", + "uv", + "questionary>=2.0.1", ] dynamic = ["version"] [project.urls] @@ -84,40 +92,32 @@ Blog = "https://bentoml.com/blog" bentoml = "bentoml_cli.cli:cli" [project.optional-dependencies] all = [ - "bentoml[aws,io,grpc,grpc-reflection,grpc-channelz,tracing,monitor-otlp]", + "bentoml[aws,io,grpc,grpc-reflection,grpc-channelz,tracing,monitor-otlp]", ] aws = ["fs-s3fs"] -io = [ - "bentoml[io-image,io-pandas]", -] +io = ["bentoml[io-image,io-pandas]"] io-image = ["Pillow"] io-pandas = ["pandas>=1", "pyarrow"] triton = ["tritonclient>=2.29.0", "tritonclient[all]; sys_platform != 'darwin'"] -unsloth = [ - "unsloth[huggingface] @ git+https://github.com/bentoml/unsloth.git@main", - "vllm>=0.5.5", - "fastapi" -] +unsloth = ["bentoml-unsloth"] grpc = [ - "protobuf", - "grpcio", - "grpcio-health-checking", - "opentelemetry-instrumentation-grpc==0.41b0", + "protobuf", + "grpcio", + "grpcio-health-checking", + "opentelemetry-instrumentation-grpc==0.41b0", ] grpc-reflection = ["bentoml[grpc]", "grpcio-reflection"] grpc-channelz = ["bentoml[grpc]", "grpcio-channelz"] # We kept for compatibility with previous # versions of BentoML. It is discouraged to use this, instead use any # of the above tracing.* extras. 
-tracing = [ - "bentoml[tracing-jaeger,tracing-zipkin,tracing-otlp]", -] +tracing = ["bentoml[tracing-jaeger,tracing-zipkin,tracing-otlp]"] tracing-jaeger = ["opentelemetry-exporter-jaeger==1.20.0"] tracing-zipkin = ["opentelemetry-exporter-zipkin==1.20.0"] tracing-otlp = ["opentelemetry-exporter-otlp==1.20.0"] monitor-otlp = [ - "opentelemetry-exporter-otlp-proto-http==1.20.0", - "opentelemetry-exporter-otlp-proto-grpc==1.20.0", + "opentelemetry-exporter-otlp-proto-http==1.20.0", + "opentelemetry-exporter-otlp-proto-grpc==1.20.0", ] [build-system] @@ -131,21 +131,31 @@ fallback-version = "0.0.0" version-file = "src/bentoml/_version.py" [tool.hatch.version.raw-options] git_describe_command = [ - "git", - "describe", - "--dirty", - "--tags", - "--long", - "--first-parent", + "git", + "describe", + "--dirty", + "--tags", + "--long", + "--first-parent", ] version_scheme = "post-release" fallback_version = "0.0.0" [tool.hatch.metadata] allow-direct-references = true [tool.hatch.build.targets.sdist] -only-include = ["src/bentoml", "src/bentoml_cli", "src/_bentoml_sdk", "src/_bentoml_impl"] +only-include = [ + "src/bentoml", + "src/bentoml_cli", + "src/_bentoml_sdk", + "src/_bentoml_impl", +] [tool.hatch.build.targets.wheel] -packages = ["src/bentoml", "src/bentoml_cli", "src/_bentoml_sdk", "src/_bentoml_impl"] +packages = [ + "src/bentoml", + "src/bentoml_cli", + "src/_bentoml_sdk", + "src/_bentoml_impl", +] [[tool.pdm.source]] url = "https://download.pytorch.org/whl/cpu" @@ -156,37 +166,37 @@ respect-source-order = true [tool.pdm.dev-dependencies] docs = [ - "sphinx>=5", - "myst-parser", - "sphinx-click>=3.0.2", - "furo", - "sphinx-inline-tabs", - "sphinxext-opengraph", - "sphinxcontrib-fulltoc", - "sphinxcontrib-spelling", - "sphinx-copybutton", - "sphinx-issues", - "sphinx-design", - "pyenchant", - "Jinja2>=3.1", - "sphinx-autobuild", - "sphinx-hoverxref", - "pyspark", - "ray[serve]; python_version < \"3.12\"", + "sphinx>=5", + "myst-parser", + "sphinx-click>=3.0.2", + "furo", + "sphinx-inline-tabs", + "sphinxext-opengraph", + "sphinxcontrib-fulltoc", + "sphinxcontrib-spelling", + "sphinx-copybutton", + "sphinx-issues", + "sphinx-design", + "pyenchant", + "Jinja2>=3.1", + "sphinx-autobuild", + "sphinx-hoverxref", + "pyspark", + "ray[serve]; python_version < \"3.12\"", ] tooling = ["pre-commit", "setuptools-scm", "pandas-stubs", "nox"] testing = [ - "pandas>=1", - "scikit-learn", - "yamllint==1.32.0", - "coverage[toml]==7.2.6", - "fastapi", - "lxml", - "orjson", - "pytest-cov==4.1.0", - "pytest==7.4.0", - "pytest-xdist[psutil]==3.3.1", - "pytest-asyncio==0.21.1", + "pandas>=1", + "scikit-learn", + "yamllint==1.32.0", + "coverage[toml]==7.2.6", + "fastapi", + "lxml", + "orjson", + "pytest-cov==4.1.0", + "pytest==7.4.0", + "pytest-xdist[psutil]==3.3.1", + "pytest-asyncio==0.21.1", ] [tool.coverage.paths] @@ -194,43 +204,43 @@ bentoml = ["src/bentoml", "*/bentoml/src/bentoml"] [tool.coverage.run] branch = true omit = [ - "__pypackages__/*", - "src/bentoml/__main__.py", - "src/bentoml/io.py", - "src/bentoml/serve.py", - "src/bentoml/start.py", - "src/bentoml/_internal/types.py", - "src/bentoml/testing/*", - "src/bentoml/grpc/v1alpha1/*", - "src/bentoml/grpc/v1/*", - "src/bentoml/_internal/external_typing/*", + "__pypackages__/*", + "src/bentoml/__main__.py", + "src/bentoml/io.py", + "src/bentoml/serve.py", + "src/bentoml/start.py", + "src/bentoml/_internal/types.py", + "src/bentoml/testing/*", + "src/bentoml/grpc/v1alpha1/*", + "src/bentoml/grpc/v1/*", + "src/bentoml/_internal/external_typing/*", 
] [tool.coverage.report] show_missing = true precision = 2 omit = [ - "__pypackages__/*", - 'src/bentoml/__main__.py', - "src/bentoml/io.py", - "src/bentoml/serve.py", - "src/bentoml/start.py", - "src/bentoml/_internal/types.py", - "src/bentoml/testing/*", - "src/bentoml/grpc/v1alpha1/*", - "src/bentoml/grpc/v1/*", - "src/bentoml/_internal/external_typing/*", + "__pypackages__/*", + 'src/bentoml/__main__.py', + "src/bentoml/io.py", + "src/bentoml/serve.py", + "src/bentoml/start.py", + "src/bentoml/_internal/types.py", + "src/bentoml/testing/*", + "src/bentoml/grpc/v1alpha1/*", + "src/bentoml/grpc/v1/*", + "src/bentoml/_internal/external_typing/*", ] exclude_lines = [ - "\\#\\s*pragma: no cover", - "^\\s*def __repr__", - "^\\s*raise AssertionError", - "^\\s*raise NotImplementedError", - "^\\s*raise MissingDependencyException", - "^\\s*except ImportError", - "if __name__ == .__main__.:", - "^\\s*if TYPE_CHECKING:", - "^\\s*@(t\\.)?overload( |$)", - "@(abc\\.)?abstractmethod", + "\\#\\s*pragma: no cover", + "^\\s*def __repr__", + "^\\s*raise AssertionError", + "^\\s*raise NotImplementedError", + "^\\s*raise MissingDependencyException", + "^\\s*except ImportError", + "if __name__ == .__main__.:", + "^\\s*if TYPE_CHECKING:", + "^\\s*@(t\\.)?overload( |$)", + "@(abc\\.)?abstractmethod", ] [tool.pytest.ini_options] @@ -243,25 +253,19 @@ testpaths = ["tests"] line-length = 88 target-version = "py310" -[tool.ruff.format] -exclude = [ - "src/_bentoml_impl/frameworks/unsloth/train.py", -] - [tool.ruff.lint] # We ignore E501 (line too long) here because we keep user-visible strings on one line. ignore = ["E501"] extend-select = ["I"] exclude = [ - "bazel-*/", - "venv", - "typings", - "docs/source", - "src/bentoml/_internal/external_typing", - "src/bentoml/grpc/v1alpha1", - "src/bentoml/grpc/v1", - "src/_bentoml_impl/frameworks/unsloth", - "tests/proto", + "bazel-*/", + "venv", + "typings", + "docs/source", + "src/bentoml/_internal/external_typing", + "src/bentoml/grpc/v1alpha1", + "src/bentoml/grpc/v1", + "tests/proto", ] [tool.ruff.lint.per-file-ignores] @@ -279,14 +283,14 @@ known-first-party = ["bentoml", "bentoml_cli", "_bentoml_sdk", "_bentoml_impl"] pythonVersion = "3.12" include = ["src/", "examples/", "tests/"] exclude = [ - 'src/bentoml/_version.py', - 'src/bentoml/__main__.py', - 'src/bentoml/_internal/external_typing/', - 'src/**/*_pb2.py*', - "src/**/*_pb2_grpc.py*", - "grpc-client/thirdparty", - "tests/proto", - "bazel-*", + 'src/bentoml/_version.py', + 'src/bentoml/__main__.py', + 'src/bentoml/_internal/external_typing/', + 'src/**/*_pb2.py*', + "src/**/*_pb2_grpc.py*", + "grpc-client/thirdparty", + "tests/proto", + "bazel-*", ] analysis.useLibraryCodeForTypes = true strictListInference = true diff --git a/src/_bentoml_impl/frameworks/unsloth/__init__.py b/src/_bentoml_impl/frameworks/unsloth/__init__.py deleted file mode 100644 index 810faaf390a..00000000000 --- a/src/_bentoml_impl/frameworks/unsloth/__init__.py +++ /dev/null @@ -1,182 +0,0 @@ -from __future__ import annotations - -import logging -import math -import os -import pathlib -import shutil -import subprocess -import sys -import tempfile -import typing as t - -import yaml -from deepmerge.merger import Merger - -import bentoml -from _bentoml_sdk.service.config import ServiceConfig as Config -from bentoml._internal.bento.build_config import BentoBuildConfig -from bentoml._internal.bento.build_config import DockerOptions -from bentoml._internal.bento.build_config import ModelSpec -from bentoml._internal.utils import pkg -from 
bentoml.exceptions import BentoMLException -from bentoml.exceptions import MissingDependencyException - -from .mapping import RUNTIME_MAPPING as MAPPINGS -from .mapping import get_extras - -logger = logging.getLogger(__name__) - -if t.TYPE_CHECKING: - from transformers import PreTrainedModel - from transformers import PreTrainedTokenizerFast - -if pkg.find_spec("unsloth") is None: - raise MissingDependencyException( - "'unsloth' is required in order to use module 'bentoml.unsloth', install unsloth with 'pip install bentoml[unsloth]'." - ) - -merger = Merger( - # merge dicts, append list - [(dict, "merge"), (list, "append")], - # override all other types - ["override"], - # override conflicting types - ["override"], -) - - -def replace_tag(tag: str) -> str: - return tag.lower().replace("/", "--") - - -ModelType = t.Literal["llama", "mistral", "gemma", "gemma2", "qwen2"] - -SPEC = { - "nvidia-tesla-t4": 16.0, - "nvidia-tesla-v100": 16.0, - "nvidia-l4": 24.0, - "nvidia-tesla-l4": 24.0, - "nvidia-tesla-a10g": 24.0, - "nvidia-a100-80gb": 80.0, - "nvidia-tesla-a100": 40.0, -} - - -def calculate_recommended_gpu_type(model) -> str: - # ceiling the number of parameters to the nearest billion - num_params = math.ceil(sum(p.numel() for p in model.parameters()) / 1e9) - for gpu_type, memory in SPEC.items(): - if num_params <= memory / 2: - return gpu_type - # If no suitable GPU is found, return the one with the highest memory - return max(SPEC, key=SPEC.get) - - -def build_bento( - model: PreTrainedModel, - tokenizer: PreTrainedTokenizerFast, - /, - model_name: str | None = None, - *, - quantization_method: t.Literal["bitsandbytes"] | None = None, - save_method: t.Literal["merged_16bit", "merged_4bit"] = "merged_16bit", - service_config: Config | None = None, - engine_config: dict[str, t.Any] - | None = None, # arguments to pass to AsyncEngineArgs -) -> bentoml.Model: - # this model is local then model_name must specified, otherwise derived from model_id - is_local = getattr(model.config, "_commit_hash", None) is None - if is_local is True and model_name is None: - raise BentoMLException( - 'Fine-tune from a local checkpoint requires specifying "model_name".' 
- ) - else: - model_name = model_name or replace_tag(model.config._name_or_path) - - model_type = t.cast(ModelType, model.config.model_type) - - if service_config is None: - service_config = {} - if engine_config is None: - engine_config = {} - - local_items = MAPPINGS[model_type]["service_config"] - merger.merge( - local_items, - {"resources": {"gpu": 1, "gpu_type": calculate_recommended_gpu_type(model)}}, - ) - - service_config.update({**local_items}) - - engine_config.update(MAPPINGS[model.config.model_type]["engine_config"]) - if quantization_method is not None: - if engine_config.get("quantization") is None: - engine_config.update( - { - "quantization": quantization_method, - "load_format": quantization_method, - } - ) - - with bentoml.models.create(model_name) as bentomodel: - model.save_pretrained_merged( - bentomodel.path, tokenizer, save_method=save_method - ) - - build_opts = dict( - python=dict( - packages=[ - "pyyaml", - "vllm==0.5.5", - "fastapi==0.111.0", - "unsloth[huggingface] @ git+https://github.com/bentoml/unsloth.git@main", - ], - lock_packages=True, - ), - envs=[{"name": "HF_TOKEN"}], - ) - merger.merge(build_opts, get_extras().get(model_type, {})) - - logger.info( - "Building bentos for %s, model_id=%s", model_type, model.config._name_or_path - ) - - with tempfile.TemporaryDirectory() as tempdir: - tempdir = pathlib.Path(tempdir) - shutil.copytree( - pathlib.Path(__file__).parent / "template", tempdir, dirs_exist_ok=True - ) - with (tempdir / "service_config.yaml").open("w") as f: - f.write( - yaml.safe_dump( - dict( - model_tag=str(bentomodel.tag), - engine_config=engine_config, - service_config=service_config, - ) - ) - ) - with (tempdir / "bentofile.yaml").open("w") as f: - BentoBuildConfig( - service="service:VLLM", - name=f"{model_name.replace('.', '-')}-service", - include=["*.py", "*.yaml"], - docker=DockerOptions(python_version="3.11", system_packages=["git"]), - models=[ModelSpec.from_item(str(bentomodel.tag))], - description="API Service for running Unsloth models, powered with BentoML and vLLM.", - **build_opts, - ).with_defaults().to_yaml(f) - - subprocess.run( - [ - sys.executable, - "-m", - "bentoml", - "build", - str(tempdir), - ], - check=True, - cwd=tempdir, - env=os.environ, - ) diff --git a/src/_bentoml_impl/frameworks/unsloth/mapping.py b/src/_bentoml_impl/frameworks/unsloth/mapping.py deleted file mode 100644 index e8420113144..00000000000 --- a/src/_bentoml_impl/frameworks/unsloth/mapping.py +++ /dev/null @@ -1,49 +0,0 @@ -RUNTIME_MAPPING = { - "llama": { - "service_config": { - "traffic": {"timeout": 300}, - "resources": {"gpu": 1, "gpu_type": "nvidia-l4"}, - }, - "engine_config": {"max_model_len": 2048}, - }, - "mistral": { - "service_config": { - "traffic": {"timeout": 300}, - "resources": {"gpu": 1, "gpu_type": "nvidia-l4"}, - }, - "engine_config": {"max_model_len": 2048}, - }, - "gemma": { - "service_config": { - "traffic": {"timeout": 300}, - "resources": {"gpu": 1, "gpu_type": "nvidia-l4"}, - }, - "engine_config": {"max_model_len": 2048}, - }, - "gemma2": { - "service_config": { - "traffic": {"timeout": 300}, - "resources": {"gpu": 1, "gpu_type": "nvidia-l4"}, - }, - "engine_config": {"max_model_len": 2048}, - }, - "qwen2": { - "service_config": { - "traffic": {"timeout": 300}, - "resources": {"gpu": 1, "gpu_type": "nvidia-l4"}, - }, - "engine_config": {"max_model_len": 2048}, - }, -} - - -def get_extras(): - return { - "gemma2": { - "envs": [{"name": "VLLM_ATTENTION_BACKEND", "value": "FLASHINFER"}], - "python": { - "extra_index_url": 
["https://flashinfer.ai/whl/cu121/torch2.3"], - "packages": ["flashinfer==0.1.2+cu121torch2.3"], - }, - } - } diff --git a/src/_bentoml_impl/frameworks/unsloth/template/.bentoignore b/src/_bentoml_impl/frameworks/unsloth/template/.bentoignore deleted file mode 100644 index 6e34eb8cb21..00000000000 --- a/src/_bentoml_impl/frameworks/unsloth/template/.bentoignore +++ /dev/null @@ -1,6 +0,0 @@ -__pycache__/ -*.py[cod] -*$py.class -.ipynb_checkpoints -venv/ -.venv/ diff --git a/src/_bentoml_impl/frameworks/unsloth/template/.gitignore b/src/_bentoml_impl/frameworks/unsloth/template/.gitignore deleted file mode 100644 index 8f462f40473..00000000000 --- a/src/_bentoml_impl/frameworks/unsloth/template/.gitignore +++ /dev/null @@ -1,9 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class -.ipynb_checkpoints - -# Environments -venv/ -.venv/ diff --git a/src/_bentoml_impl/frameworks/unsloth/template/service.py b/src/_bentoml_impl/frameworks/unsloth/template/service.py deleted file mode 100644 index 84cab1afe00..00000000000 --- a/src/_bentoml_impl/frameworks/unsloth/template/service.py +++ /dev/null @@ -1,81 +0,0 @@ -from __future__ import annotations - -import logging -import pathlib -from typing import Literal - -import fastapi -import fastapi.staticfiles -import pydantic -import vllm.entrypoints.openai.api_server as vllm_api_server -import yaml - -import bentoml - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -# Load the constants from the yaml file -CONSTANT_YAML = pathlib.Path(__file__).parent / "service_config.yaml" -if not CONSTANT_YAML.exists(): - raise FileNotFoundError(f"service_config.yaml not found in {CONSTANT_YAML.parent}") -with CONSTANT_YAML.open("r") as f: - CONSTANTS = yaml.safe_load(f) -ENGINE_CONFIG = CONSTANTS["engine_config"] -STATIC_DIR = pathlib.Path(__file__).parent / "ui" - - -class Message(pydantic.BaseModel): - role: Literal["system", "user", "assistant"] - content: str - - -openai_api_app = fastapi.FastAPI() -for route, endpoint, methods in [ - ("/chat/completions", vllm_api_server.create_chat_completion, ["POST"]), - ("/completions", vllm_api_server.create_completion, ["POST"]), - ("/models", vllm_api_server.show_available_models, ["GET"]), -]: - openai_api_app.add_api_route( - path=route, endpoint=endpoint, methods=methods, include_in_schema=True - ) - - -@bentoml.mount_asgi_app(openai_api_app, path="/v1") -@bentoml.service(**CONSTANTS["service_config"]) -class VLLM: - bentomodel = bentoml.models.get(CONSTANTS["model_tag"]) - - def __init__(self) -> None: - from transformers import AutoTokenizer - from vllm import AsyncEngineArgs - from vllm import AsyncLLMEngine - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat - from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - - self.engine = AsyncLLMEngine.from_engine_args( - AsyncEngineArgs( - model=self.bentomodel.path, enable_prefix_caching=True, **ENGINE_CONFIG - ) - ) - self.tokenizer = AutoTokenizer.from_pretrained(self.bentomodel.path) - model_config = self.engine.engine.get_model_config() - # inject the engine into the openai serving chat and completion - vllm_api_server.openai_serving_chat = OpenAIServingChat( - async_engine_client=self.engine, - served_model_names=[self.bentomodel.path], - response_role="assistant", - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) - vllm_api_server.openai_serving_completion = OpenAIServingCompletion( - 
async_engine_client=self.engine, - served_model_names=[self.bentomodel.path], - model_config=model_config, - lora_modules=None, - prompt_adapters=None, - request_logger=None, - ) diff --git a/src/_bentoml_impl/frameworks/unsloth/train.py b/src/_bentoml_impl/frameworks/unsloth/train.py deleted file mode 100644 index a97c8be2a07..00000000000 --- a/src/_bentoml_impl/frameworks/unsloth/train.py +++ /dev/null @@ -1,69 +0,0 @@ -# ruff: ignore - -def prep_dataset(tokenizer): - alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. - -### Instruction: -{} - -### Input: -{} - -### Response: -{}""" - - EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN - - def formatting_prompts_func(examples): - instructions = examples["instruction"] - inputs = examples["input"] - outputs = examples["output"] - texts = [] - for instruction, input, output in zip(instructions, inputs, outputs): - # Must add EOS_TOKEN, otherwise your generation will go on forever! - text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN - texts.append(text) - return {"text": texts} - - from datasets import load_dataset - - dataset = load_dataset("yahma/alpaca-cleaned", split="train") - dataset = dataset.map( - formatting_prompts_func, - batched=True, - ) - return dataset - - -def main(max_seq_length: int = 8196) -> int: - import unsloth, bentoml, trl, transformers - - model, tokenizer = unsloth.FastLanguageModel.from_pretrained("unsloth/Meta-Llama-3.1-8B-bnb-4bit", max_seq_length=max_seq_length, load_in_4bit=True) - # alpaca chat templates - tokenizer.chat_template="{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() + '\n\n' %}{% else %}{% set loop_messages = messages %}{% set system_message = '' %}{% endif %}{{ bos_token + system_message }}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### Instruction:\n' + message['content'].strip() + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ '### Response:\n' + message['content'].strip() + eos_token + '\n\n' }}{% endif %}{% if loop.last and message['role'] == 'user' and add_generation_prompt %}{{ '### Instruction:\n' }}{% endif %}{% endfor %}" - model = unsloth.FastLanguageModel.get_peft_model( - model, target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], - r=16, lora_alpha=16, lora_dropout=0, bias="none", random_state=3407, - use_gradient_checkpointing="unsloth", # True or "unsloth" for very long context - ) - trl.SFTTrainer( - model=model, tokenizer=tokenizer, - train_dataset=prep_dataset(tokenizer), - dataset_text_field="text", max_seq_length=max_seq_length, - dataset_num_proc=2, packing=False, # Can make training 5x faster for short sequences. 
- args=transformers.TrainingArguments( - per_device_train_batch_size=2, gradient_accumulation_steps=4, - warmup_steps=5, num_train_epochs = 1, max_steps=60, learning_rate=2e-4, - weight_decay=0.01, seed=3407, - optim="adamw_8bit", - fp16=not unsloth.is_bfloat16_supported(), bf16=unsloth.is_bfloat16_supported(), - logging_steps=1, - lr_scheduler_type="linear", - output_dir="outputs", - ), - ).train() - - bentoml.unsloth.build_bento(model, tokenizer, quantization_method="bitsandbytes", engine_config={"quantization": "bitsandbytes"}) - return 0 - -if __name__ == "__main__": raise SystemExit(main())
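
Context for the removal above: the `unsloth` extra in pyproject.toml now resolves to the external `bentoml-unsloth` distribution instead of the in-tree `_bentoml_impl/frameworks/unsloth` module deleted by this patch. A minimal sketch of how the workflow from the deleted `train.py` would look afterwards, assuming the external package keeps a `build_bento(model, tokenizer, ...)` entry point with the signature shown in the deleted module — the `bentoml_unsloth` import name below is an assumption, not something this patch confirms:

    # pip install "bentoml[unsloth]"  # now pulls bentoml-unsloth per the new extra
    import unsloth
    import bentoml_unsloth  # assumed module name for the externalized integration

    # Load a 4-bit base model exactly as the deleted train.py example did.
    model, tokenizer = unsloth.FastLanguageModel.from_pretrained(
        "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
        max_seq_length=8196,
        load_in_4bit=True,
    )

    # ... fine-tune with trl.SFTTrainer as in the removed example ...

    # Package the merged checkpoint into a Bento backed by a vLLM
    # OpenAI-compatible service, mirroring the call train.py previously
    # made through the deleted bentoml.unsloth.build_bento helper.
    bentoml_unsloth.build_bento(
        model,
        tokenizer,
        quantization_method="bitsandbytes",
        engine_config={"quantization": "bitsandbytes"},
    )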