From eb759c9a28e0219ed210b71e73252910ae52a646 Mon Sep 17 00:00:00 2001 From: Aaron Pham Date: Thu, 29 Aug 2024 07:03:03 +0000 Subject: [PATCH] feat: unsloth integrations Signed-off-by: Aaron Pham --- .gitignore | 4 + .pre-commit-config.yaml | 4 +- pyproject.toml | 11 + .../frameworks/unsloth/__init__.py | 176 ++++++++++++++ .../frameworks/unsloth/mapping.py | 49 ++++ .../frameworks/unsloth/template/.bentoignore | 6 + .../unsloth/template/.gitattributes | 2 + .../frameworks/unsloth/template/.gitignore | 12 + .../frameworks/unsloth/template/README.md | 225 ++++++++++++++++++ .../template/chat_templates/alpaca.jinja | 24 ++ .../template/chat_templates/amberchat.jinja | 27 +++ .../template/chat_templates/chatml.jinja | 18 ++ .../template/chat_templates/chatqa.jinja | 36 +++ .../chat_templates/falcon-instruct.jinja | 22 ++ .../template/chat_templates/gemma-it.jinja | 31 +++ .../chat_templates/llama-2-chat.jinja | 25 ++ .../chat_templates/llama-3-chat.jinja | 24 ++ .../chat_templates/mistral-instruct.jinja | 26 ++ .../template/chat_templates/openchat.jinja | 20 ++ .../template/chat_templates/phi-3.jinja | 17 ++ .../template/chat_templates/saiga.jinja | 23 ++ .../chat_templates/solar-instruct.jinja | 18 ++ .../template/chat_templates/vicuna.jinja | 24 ++ .../template/chat_templates/zephyr.jinja | 17 ++ .../template/generation_configs/alpaca.json | 6 + .../generation_configs/amberchat.json | 6 + .../template/generation_configs/chatqa.json | 6 + .../template/generation_configs/gemma-it.json | 6 + .../generation_configs/llama-2-chat.json | 6 + .../generation_configs/llama-3-chat.json | 6 + .../generation_configs/mistral-instruct.json | 6 + .../template/generation_configs/openchat.json | 6 + .../template/generation_configs/orca-2.json | 6 + .../template/generation_configs/phi-3.json | 6 + .../generation_configs/qwen2-chat.json | 6 + .../template/generation_configs/saiga.json | 6 + .../generation_configs/solar-instruct.json | 6 + .../template/generation_configs/vicuna.json | 6 + .../template/generation_configs/yi-chat.json | 6 + .../template/generation_configs/zephyr.json | 6 + .../frameworks/unsloth/template/service.py | 179 ++++++++++++++ src/_bentoml_impl/frameworks/unsloth/train.py | 67 ++++++ src/bentoml/__init__.py | 5 + 43 files changed, 1187 insertions(+), 1 deletion(-) create mode 100644 src/_bentoml_impl/frameworks/unsloth/__init__.py create mode 100644 src/_bentoml_impl/frameworks/unsloth/mapping.py create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/.bentoignore create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/.gitattributes create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/.gitignore create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/README.md create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/chat_templates/alpaca.jinja create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/chat_templates/amberchat.jinja create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/chat_templates/chatml.jinja create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/chat_templates/chatqa.jinja create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/chat_templates/falcon-instruct.jinja create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/chat_templates/gemma-it.jinja create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/chat_templates/llama-2-chat.jinja create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/chat_templates/llama-3-chat.jinja create mode 100644 
src/_bentoml_impl/frameworks/unsloth/template/chat_templates/mistral-instruct.jinja create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/chat_templates/openchat.jinja create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/chat_templates/phi-3.jinja create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/chat_templates/saiga.jinja create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/chat_templates/solar-instruct.jinja create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/chat_templates/vicuna.jinja create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/chat_templates/zephyr.jinja create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/generation_configs/alpaca.json create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/generation_configs/amberchat.json create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/generation_configs/chatqa.json create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/generation_configs/gemma-it.json create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/generation_configs/llama-2-chat.json create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/generation_configs/llama-3-chat.json create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/generation_configs/mistral-instruct.json create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/generation_configs/openchat.json create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/generation_configs/orca-2.json create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/generation_configs/phi-3.json create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/generation_configs/qwen2-chat.json create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/generation_configs/saiga.json create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/generation_configs/solar-instruct.json create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/generation_configs/vicuna.json create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/generation_configs/yi-chat.json create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/generation_configs/zephyr.json create mode 100644 src/_bentoml_impl/frameworks/unsloth/template/service.py create mode 100644 src/_bentoml_impl/frameworks/unsloth/train.py diff --git a/.gitignore b/.gitignore index 822c97afccd..051738c651a 100644 --- a/.gitignore +++ b/.gitignore @@ -167,3 +167,7 @@ mlruns/ .pdm-python .python-version .pdm-build/ + +# from training scripts +model +outputs diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 22830670e92..14f41d2ace5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ ci: autoupdate_schedule: monthly autofix_commit_msg: "ci: auto fixes from pre-commit.ci\n\nFor more information, see https://pre-commit.ci" autoupdate_commit_msg: 'ci: pre-commit autoupdate [skip ci]' - skip: # exceeds tier max size + skip: # exceeds tier max size - buf-format - buf-lint exclude: '(.*\.(css|js|svg))|(.*/(snippets|grpc|proto)/.*)$' @@ -13,7 +13,9 @@ repos: - id: ruff args: [--fix, --exit-non-zero-on-fix, --show-fixes] types_or: [python, pyi] + exclude: ^src/_bentoml_impl/frameworks/unsloth/train\.py$ - id: ruff-format + exclude: ^src/_bentoml_impl/frameworks/unsloth/train\.py$ types_or: [python, pyi] files: '(src|tests|docs|examples|typings)/' - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/pyproject.toml b/pyproject.toml index 
dcb47ba0b41..7bcd0eb97ed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,6 +93,11 @@ io = [ io-image = ["Pillow"] io-pandas = ["pandas>=1", "pyarrow"] triton = ["tritonclient>=2.29.0", "tritonclient[all]; sys_platform != 'darwin'"] +unsloth = [ + "unsloth[huggingface] @ git+https://github.com/bentoml/unsloth.git@main", + "vllm>=0.5.5", + "fastapi" +] grpc = [ "protobuf", "grpcio", @@ -238,6 +243,11 @@ testpaths = ["tests"] line-length = 88 target-version = "py310" +[tool.ruff.format] +exclude = [ + "src/_bentoml_impl/frameworks/unsloth/train.py", +] + [tool.ruff.lint] # We ignore E501 (line too long) here because we keep user-visible strings on one line. ignore = ["E501"] @@ -250,6 +260,7 @@ exclude = [ "src/bentoml/_internal/external_typing", "src/bentoml/grpc/v1alpha1", "src/bentoml/grpc/v1", + "src/_bentoml_impl/frameworks/unsloth", "tests/proto", ] diff --git a/src/_bentoml_impl/frameworks/unsloth/__init__.py b/src/_bentoml_impl/frameworks/unsloth/__init__.py new file mode 100644 index 00000000000..3ec2e9a3cd5 --- /dev/null +++ b/src/_bentoml_impl/frameworks/unsloth/__init__.py @@ -0,0 +1,176 @@ +from __future__ import annotations + +import logging +import os +import pathlib +import shutil +import subprocess +import sys +import tempfile +import typing as t + +import yaml +from deepmerge.merger import Merger + +import bentoml +from _bentoml_sdk.service.config import ServiceConfig as Config +from bentoml._internal.bento.build_config import BentoBuildConfig +from bentoml._internal.bento.build_config import DockerOptions +from bentoml._internal.bento.build_config import ModelSpec +from bentoml._internal.utils import pkg +from bentoml.exceptions import BentoMLException +from bentoml.exceptions import MissingDependencyException + +from .mapping import RUNTIME_MAPPING as MAPPINGS +from .mapping import get_extras + +logger = logging.getLogger(__name__) + +if t.TYPE_CHECKING: + from transformers import PreTrainedModel + from transformers import PreTrainedTokenizerFast + +if pkg.find_spec("unsloth") is None: + raise MissingDependencyException( + "'unsloth' is required in order to use module 'bentoml.unsloth', install unsloth with 'pip install bentoml[unsloth]'." + ) + +merger = Merger( + # merge dicts, append list + [(dict, "merge"), (list, "append")], + # override all other types + ["override"], + # override conflicting types + ["override"], +) + + +def replace_tag(tag: str) -> str: + return tag.lower().replace("/", "--") + + +ChatTemplate = t.Literal[ + "alpaca", + "amberchat", + "chatml", + "chatqa", + "falcon-instruct", + "gemma-it", + "llama-2-chat", + "llama-3-chat", + "mistral-instruct", + "openchat", + "phi-3", + "saiga", + "solar-instruct", + "vicuna", + "zephyr", +] + +ModelType = t.Literal["llama", "mistral", "gemma", "gemma2", "qwen2"] + + +def build_bento( + model: PreTrainedModel, + tokenizer: PreTrainedTokenizerFast, + /, + model_name: str | None = None, + *, + chat_template: ChatTemplate, + quantization_method: t.Literal["bitsandbytes"] | None = None, + save_method: t.Literal["merged_16bit", "merged_4bit"] = "merged_16bit", + service_config: Config | None = None, + engine_config: dict[str, t.Any] + | None = None, # arguments to pass to AsyncEngineArgs +) -> bentoml.Model: + # this model is local then model_name must specified, otherwise derived from model_id + is_local = getattr(model.config, "_commit_hash", None) is None + if is_local is True and model_name is None: + raise BentoMLException( + 'Fine-tune from a local checkpoint requires specifying "model_name".' 
+ ) + else: + model_name = model_name or replace_tag(model.config._name_or_path) + + model_type = t.cast(ModelType, model.config.model_type) + + if service_config is None: + service_config = {} + if engine_config is None: + engine_config = {} + + service_config.update({**MAPPINGS[model_type]["service_config"]}) + + engine_config.update(MAPPINGS[model.config.model_type]["engine_config"]) + if quantization_method is not None: + engine_config.update( + {"quantization": quantization_method, "load_format": quantization_method} + ) + + with bentoml.models.create(model_name) as bentomodel: + model.save_pretrained_merged( + bentomodel.path, tokenizer, save_method=save_method + ) + + build_opts = dict( + python=dict( + packages=[ + "pyyaml", + "vllm==0.5.5", + "fastapi==0.111.0", + "unsloth[huggingface] @ git+https://github.com/bentoml/unsloth.git@main", + ], + lock_packages=True, + ), + envs=[{"name": "HF_TOKEN"}], + ) + merger.merge(build_opts, get_extras().get(model_type, {})) + + logger.info( + "Building bentos for %s, model_id=%s", model_type, model.config._name_or_path + ) + + with tempfile.TemporaryDirectory() as tempdir: + tempdir = pathlib.Path(tempdir) + shutil.copytree( + pathlib.Path(__file__).parent / "template", tempdir, dirs_exist_ok=True + ) + with (tempdir / "service_config.yaml").open("w") as f: + f.write( + yaml.safe_dump( + dict( + chat_template=chat_template, + model_tag=str(bentomodel.tag), + engine_config=engine_config, + service_config=service_config, + ) + ) + ) + with (tempdir / "bentofile.yaml").open("w") as f: + BentoBuildConfig( + service="service:VLLM", + name=f"{model_name.replace('.', '-')}-service", + include=[ + "*.py", + "*.yaml", + "chat_templates/*.jinja", + "generation_configs/*.json", + ], + docker=DockerOptions(python_version="3.11", system_packages=["git"]), + models=[ModelSpec.from_item(str(bentomodel.tag))], + description="API Service for running Unsloth models, powered with BentoML and vLLM.", + **build_opts, + ).with_defaults().to_yaml(f) + + subprocess.run( + [ + sys.executable, + "-m", + "bentoml", + "build", + str(tempdir), + ], + check=True, + cwd=tempdir, + env=os.environ, + ) diff --git a/src/_bentoml_impl/frameworks/unsloth/mapping.py b/src/_bentoml_impl/frameworks/unsloth/mapping.py new file mode 100644 index 00000000000..e8420113144 --- /dev/null +++ b/src/_bentoml_impl/frameworks/unsloth/mapping.py @@ -0,0 +1,49 @@ +RUNTIME_MAPPING = { + "llama": { + "service_config": { + "traffic": {"timeout": 300}, + "resources": {"gpu": 1, "gpu_type": "nvidia-l4"}, + }, + "engine_config": {"max_model_len": 2048}, + }, + "mistral": { + "service_config": { + "traffic": {"timeout": 300}, + "resources": {"gpu": 1, "gpu_type": "nvidia-l4"}, + }, + "engine_config": {"max_model_len": 2048}, + }, + "gemma": { + "service_config": { + "traffic": {"timeout": 300}, + "resources": {"gpu": 1, "gpu_type": "nvidia-l4"}, + }, + "engine_config": {"max_model_len": 2048}, + }, + "gemma2": { + "service_config": { + "traffic": {"timeout": 300}, + "resources": {"gpu": 1, "gpu_type": "nvidia-l4"}, + }, + "engine_config": {"max_model_len": 2048}, + }, + "qwen2": { + "service_config": { + "traffic": {"timeout": 300}, + "resources": {"gpu": 1, "gpu_type": "nvidia-l4"}, + }, + "engine_config": {"max_model_len": 2048}, + }, +} + + +def get_extras(): + return { + "gemma2": { + "envs": [{"name": "VLLM_ATTENTION_BACKEND", "value": "FLASHINFER"}], + "python": { + "extra_index_url": ["https://flashinfer.ai/whl/cu121/torch2.3"], + "packages": ["flashinfer==0.1.2+cu121torch2.3"], + }, + } + } 
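For reviewers, a brief note on how the pieces above fit together: the `Merger` configured in `unsloth/__init__.py` merges nested dicts and appends lists, which is how the per-model extras returned by `get_extras()` are layered onto the default `build_opts` before `bentofile.yaml` is written. A minimal, self-contained sketch of that behaviour — the option values are abbreviated from this patch, and this block is illustrative only, not part of the diff:

```python
# Sketch of the deepmerge strategy used in unsloth/__init__.py:
# dicts are merged recursively, lists are appended, other conflicting types are overridden.
from deepmerge.merger import Merger

merger = Merger([(dict, "merge"), (list, "append")], ["override"], ["override"])

# Abbreviated defaults from build_bento() in this patch.
build_opts = {
    "python": {"packages": ["pyyaml", "vllm==0.5.5"]},
    "envs": [{"name": "HF_TOKEN"}],
}
# Abbreviated extras for gemma2 from get_extras() in mapping.py.
gemma2_extras = {
    "envs": [{"name": "VLLM_ATTENTION_BACKEND", "value": "FLASHINFER"}],
    "python": {
        "extra_index_url": ["https://flashinfer.ai/whl/cu121/torch2.3"],
        "packages": ["flashinfer==0.1.2+cu121torch2.3"],
    },
}

merger.merge(build_opts, gemma2_extras)
# build_opts["python"]["packages"] now lists pyyaml, vllm==0.5.5 and flashinfer,
# and build_opts["envs"] contains both HF_TOKEN and VLLM_ATTENTION_BACKEND.
```

For model types without extras (`llama`, `mistral`, `gemma`, `qwen2`), `get_extras().get(model_type, {})` is empty and the default build options pass through unchanged.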
diff --git a/src/_bentoml_impl/frameworks/unsloth/template/.bentoignore b/src/_bentoml_impl/frameworks/unsloth/template/.bentoignore
new file mode 100644
index 00000000000..6e34eb8cb21
--- /dev/null
+++ b/src/_bentoml_impl/frameworks/unsloth/template/.bentoignore
@@ -0,0 +1,6 @@
+__pycache__/
+*.py[cod]
+*$py.class
+.ipynb_checkpoints
+venv/
+.venv/
diff --git a/src/_bentoml_impl/frameworks/unsloth/template/.gitattributes b/src/_bentoml_impl/frameworks/unsloth/template/.gitattributes
new file mode 100644
index 00000000000..4ae0312f0b8
--- /dev/null
+++ b/src/_bentoml_impl/frameworks/unsloth/template/.gitattributes
@@ -0,0 +1,2 @@
+* text=auto eol=lf
+**/ui/** linguist-vendored=true
diff --git a/src/_bentoml_impl/frameworks/unsloth/template/.gitignore b/src/_bentoml_impl/frameworks/unsloth/template/.gitignore
new file mode 100644
index 00000000000..d5629589c1c
--- /dev/null
+++ b/src/_bentoml_impl/frameworks/unsloth/template/.gitignore
@@ -0,0 +1,12 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Environments
+venv/
+
+# BentoML
+bentoml/client_id
+
+chattts/ChatTTS/
diff --git a/src/_bentoml_impl/frameworks/unsloth/template/README.md b/src/_bentoml_impl/frameworks/unsloth/template/README.md
new file mode 100644
index 00000000000..56d992593f9
--- /dev/null
+++ b/src/_bentoml_impl/frameworks/unsloth/template/README.md
@@ -0,0 +1,225 @@
+# chat_templates
+
+This is a repository that includes proper chat templates (or input formats) for large language models (LLMs), to support `transformers`'s `chat_template` [feature](https://huggingface.co/docs/transformers/chat_templating).
+
+Different models are trained with different input formats, especially instruction-tuned or chat models. This is especially relevant for `transformers`'s new `chat_template` feature. However, popular models (e.g., `vicuna`, `falcon`) on HuggingFace often do not include this parameter in their `tokenizer_config.json` files, which can make it troublesome to run these models properly. Also, the `chat_template` feature requires implementing a Jinja template, which may not be intuitive to write directly in the JSON files.
+
+So I have collected proper chat templates for several popular models from their official references or implementations, and put them under `chat_templates`. If you would like to add more chat templates, feel free to open a pull request.
+
+If you find this repo useful, please kindly cite it:
+```tex
+@misc{zheng-2024-chat-templates,
+    author = {Zheng, Chujie},
+    title = {Chat Templates for HuggingFace Large Language Models},
+    year = {2024},
+    howpublished = {\url{https://github.com/chujiezheng/chat_templates}}
+}
+```
+
+## Updates
+
+* **[05/2024]** Added support for Nvidia's **ChatQA** models
+* **[04/2024]** Added support for Microsoft's **Phi-3** models
+* **[04/2024]** Added support for Meta's **Llama-3** models
+* **[02/2024]** Added support for Google's **Gemma** models
+* **[02/2024]** Added usage explanation for **generation_configs**
+* **[01/2024]** Added support for Alibaba's **Qwen2** models
+
+## What Is Contained in This Repo?
+
+- [`chat_templates`](/chat_templates/) contains the jinja files of the collected chat templates, which can directly replace the `chat_template` of Huggingface tokenizers.
+
+- [`generation_configs`](/generation_configs/) contains the corresponding json configs used for controlling the end of response generation.
Specifically, **the `stop_token_ids` should be passed directly to the `generate` method via the `eos_token_id` argument.**
+
+## Supported Models
+
+| Model (Family) | Template File | Reference | Comment |
+|----------------|---------------|-----------|---------|
+| `llama-3-chat` **New** | `llama-3-chat.jinja` | [link](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/main/tokenizer_config.json#L2053) | Official template<br>`Meta-Llama-3-8B/70B-Instruct` |
+| `phi-3` **New** | `phi-3.jinja` | [link](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/blob/main/tokenizer_config.json#L338) | Official template<br>`Phi-3-mini-4k/128k-instruct` |
+| `qwen2-chat` **New** | `chatml.jinja` | [link](https://huggingface.co/Qwen/Qwen1.5-72B-Chat/blob/main/tokenizer_config.json#L31) | ChatML format<br>`Qwen1.5-0.5B/1.8B/4B/7B/14B/72B-Chat` |
+| `gemma-it` **New** | `gemma-it.jinja` | [link](https://huggingface.co/google/gemma-7b-it/blob/main/tokenizer_config.json#L1507) | `gemma-2b/7b-it`<br>**System message allowed** |
+| `chatqa` **New** | `chatqa.jinja` | [link](https://huggingface.co/nvidia/Llama3-ChatQA-1.5-8B#when-context-is-available) | `Llama3-ChatQA-1.5-8B/70B`<br>**Context message allowed** |
+| `llama-2-chat` | `llama-2-chat.jinja` | [link](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/main/tokenizer_config.json#L12) | Official template<br>`Llama-2-7b/13b/70b-chat-hf` |
+| `mistral-instruct` | `mistral-instruct.jinja` | [link](https://docs.mistral.ai/usage/guardrailing) | `Mistral-7B-Instruct-v0.1/0.2`<br>**System message allowed** |
+| `openchat` | `openchat.jinja` | [link](https://huggingface.co/openchat/openchat_3.5/blob/main/tokenizer_config.json#L51) | `openchat-3.5` |
+| `zephyr` | `zephyr.jinja` | [link](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta/blob/main/tokenizer_config.json#L34) | `zephyr-7b-alpha/beta` |
+| `yi-chat` | `chatml.jinja` | [link](https://huggingface.co/01-ai/Yi-6B-Chat/blob/main/tokenizer_config.json#L60) | ChatML format<br>`Yi-6B/34B-Chat` |
+| `orca-2` | `chatml.jinja` | [link](https://huggingface.co/microsoft/Orca-2-7b) | ChatML format<br>`Orca-2-7b/13b` |
+| `vicuna` | `vicuna.jinja` | [link](https://github.com/lm-sys/FastChat/blob/main/docs/vicuna_weights_version.md#prompt-template) | `vicuna-7b/13b-v1.5` |
+| `falcon-instruct` | `falcon-instruct.jinja` | [link](https://github.com/lm-sys/FastChat/blob/d578599c69d060e6d40943f1b5b72af98956092a/fastchat/conversation.py#L675) | `falcon-7b/40b-instruct` |
+| `starling-lm` | `openchat.jinja` | [link](https://huggingface.co/berkeley-nest/Starling-LM-7B-alpha/blob/main/tokenizer_config.json#L49) | `Starling-LM-7B-alpha/beta` |
+| `solar-instruct` | `solar-instruct.jinja` | [link](https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0/blob/main/tokenizer_config.json#L31) | `SOLAR-10.7B-Instruct-v1.0` |
+| `alpaca` | `alpaca.jinja` | [link](https://github.com/tatsu-lab/stanford_alpaca) | `alpaca`-style models, like `Platypus2-13B` |
+| `amberchat` | `amberchat.jinja` | [link](https://huggingface.co/LLM360/AmberChat) | `AmberChat`, `AmberSafe` |
+| `saiga` | `saiga.jinja` | [link](https://huggingface.co/IlyaGusev/saiga_mistral_7b_lora#saigamistral-7b-russian-mistral-based-chatbot) | `saiga`, a series of Russian models |
+
+## Examples of Setting `chat_template`
+
+### Example 1: `llama-3-chat`
+
+This example checks whether the jinja file is correctly implemented.
+
+```python
+from transformers import AutoTokenizer
+
+toker = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B", token="YOUR_OWN_TOKEN")
+messages = [
+    {'role': 'system', 'content': 'This is a system prompt.'},
+    {'role': 'user', 'content': 'This is the first user input.'},
+    {'role': 'assistant', 'content': 'This is the first assistant response.'},
+    {'role': 'user', 'content': 'This is the second user input.'},
+]
+print('###### Default (yet Correct) Chat Template ######')
+print(toker.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
+print('###### Corrected Chat Template ######')
+chat_template = open('./chat_templates/llama-3-chat.jinja').read()
+chat_template = chat_template.replace('    ', '').replace('\n', '')
+toker.chat_template = chat_template
+print(toker.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
+```
+
+Expected output:
+
+```
+###### Default (yet Correct) Chat Template ######
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+
+This is a system prompt.<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+This is the first user input.<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+This is the first assistant response.<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+This is the second user input.<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+
+###### Corrected Chat Template ######
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+
+This is a system prompt.<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+This is the first user input.<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+This is the first assistant response.<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+This is the second user input.<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+```
+
+### Example 2: `llama-2-chat`
+
+This example checks whether the jinja file is correctly implemented.
+
+```python
+from transformers import AutoTokenizer
+
+toker = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", token="YOUR_OWN_TOKEN")
+messages = [
+    {'role': 'system', 'content': 'This is a system prompt.'},
+    {'role': 'user', 'content': 'This is the first user input.'},
+    {'role': 'assistant', 'content': 'This is the first assistant response.'},
+    {'role': 'user', 'content': 'This is the second user input.'},
+]
+print('###### Default (yet Correct) Chat Template ######')
+print(toker.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
+print('###### Corrected Chat Template ######')
+chat_template = open('./chat_templates/llama-2-chat.jinja').read()
+chat_template = chat_template.replace('    ', '').replace('\n', '')
+toker.chat_template = chat_template
+print(toker.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
+```
+
+Expected output:
+
+```
+###### Default (yet Correct) Chat Template ######
+[INST] <<SYS>>
+This is a system prompt.
+<</SYS>>
+
+This is the first user input. [/INST] This is the first assistant response. [INST] This is the second user input. [/INST]
+###### Corrected Chat Template ######
+[INST] <<SYS>>
+This is a system prompt.
+<</SYS>>
+
+This is the first user input. [/INST] This is the first assistant response. [INST] This is the second user input. [/INST]
+```
+
+### Example 3: `mistral-instruct`
+
+For `mistral-instruct` (and also `gemma-it`), the template does not natively support the `system` message, so passing a `system` message would raise an error.
+
+```python
+from transformers import AutoTokenizer
+
+toker = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
+messages = [
+    {'role': 'system', 'content': 'This is a system prompt.'},
+    {'role': 'user', 'content': 'This is the first user input.'},
+    {'role': 'assistant', 'content': 'This is the first assistant response.'},
+    {'role': 'user', 'content': 'This is the second user input.'},
+]
+print('###### Default (but Improper) Chat Template ######')
+# this raises an error:
+#print(toker.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
+print('###### Corrected Chat Template ######')
+chat_template = open('./chat_templates/mistral-instruct.jinja').read()
+chat_template = chat_template.replace('    ', '').replace('\n', '')
+toker.chat_template = chat_template
+print(toker.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
+```
+
+Expected output:
+
+```
+###### Default (but Error-Raising) Chat Template ######
+jinja2.exceptions.TemplateError: Conversation roles must alternate user/assistant/user/assistant/...
+###### Corrected Chat Template ######
+[INST] This is a system prompt.
+
+This is the first user input. [/INST] This is the first assistant response. [INST] This is the second user input. [/INST]
+```
+
+### Example 4: `vicuna`
+
+NOTE: In [fast-chat](https://github.com/lm-sys/FastChat/blob/d578599c69d060e6d40943f1b5b72af98956092a/fastchat/conversation.py#L287C3-L287C3), `vicuna` does not add linebreaks between roles' messages. But I found that adding linebreaks leads to a bit better performance (especially for the v1.5 version).
+
+Also, I found `vicuna-7/13/33b-v1.3` may not work well when given a system message different from its default one. So I would recommend using `vicuna-7/13b-v1.5` instead.
+
+```python
+from transformers import AutoTokenizer
+
+toker = AutoTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
+messages = [
+    {'role': 'system', 'content': 'This is a system prompt.'},
+    {'role': 'user', 'content': 'This is the first user input.'},
+    {'role': 'assistant', 'content': 'This is the first assistant response.'},
+    {'role': 'user', 'content': 'This is the second user input.'},
+]
+print('###### Default (but Improper) Chat Template ######')
+print(toker.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
+print('###### Corrected Chat Template ######')
+chat_template = open('./chat_templates/vicuna.jinja').read()
+chat_template = chat_template.replace('    ', '').replace('\n', '')
+toker.chat_template = chat_template
+print(toker.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
+```
+
+Expected output:
+
+```
+###### Default (but Improper) Chat Template ######
+[INST] <<SYS>>
+This is a system prompt.
+<</SYS>>
+
+This is the first user input. [/INST] This is the first assistant response. [INST] This is the second user input. [/INST]
+###### Corrected Chat Template ######
+This is a system prompt.
+
+USER: This is the first user input.
+ASSISTANT: This is the first assistant response.
+USER: This is the second user input.
+ASSISTANT:
+```
diff --git a/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/alpaca.jinja b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/alpaca.jinja
new file mode 100644
index 00000000000..8d5023fd809
--- /dev/null
+++ b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/alpaca.jinja
@@ -0,0 +1,24 @@
+{% if messages[0]['role'] == 'system' %}
+    {% set loop_messages = messages[1:] %}
+    {% set system_message = messages[0]['content'].strip() + '\n\n' %}
+{% else %}
+    {% set loop_messages = messages %}
+    {% set system_message = '' %}
+{% endif %}
+
+{{ bos_token + system_message }}
+{% for message in loop_messages %}
+    {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
+        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
+    {% endif %}
+
+    {% if message['role'] == 'user' %}
+        {{ '### Instruction:\n' + message['content'].strip() + '\n\n' }}
+    {% elif message['role'] == 'assistant' %}
+        {{ '### Response:\n' + message['content'].strip() + eos_token + '\n\n' }}
+    {% endif %}
+
+    {% if loop.last and message['role'] == 'user' and add_generation_prompt %}
+        {{ '### Response:\n' }}
+    {% endif %}
+{% endfor %}
diff --git a/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/amberchat.jinja b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/amberchat.jinja
new file mode 100644
index 00000000000..4a213d2752c
--- /dev/null
+++ b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/amberchat.jinja
@@ -0,0 +1,27 @@
+{% if messages[0]['role'] == 'system' %}
+    {% set loop_messages = messages[1:] %}
+    {% set system_message = messages[0]['content'].strip() + '\n' %}
+{% else %}
+    {% set loop_messages = messages %}
+    {% set system_message = '' %}
+{% endif %}
+
+{% for message in loop_messages %}
+    {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
+        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
+    {% endif %}
+
+    {% if loop.index0 == 0 %}
+        {{ bos_token + system_message }}
+    {% endif %}
+
+    {% if message['role'] == 'user' %}
+        {{ '###Human: ' + message['content'].strip() + '\n' }}
+    {% elif message['role'] == 'assistant' %}
+        {{ '###Assistant: ' +
message['content'].strip() + '\n' }}
+    {% endif %}
+
+    {% if loop.last and message['role'] == 'user' and add_generation_prompt %}
+        {{ '###Assistant:' }}
+    {% endif %}
+{% endfor %}
diff --git a/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/chatml.jinja b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/chatml.jinja
new file mode 100644
index 00000000000..c8ef5929d08
--- /dev/null
+++ b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/chatml.jinja
@@ -0,0 +1,18 @@
+{% if messages[0]['role'] == 'system' %}
+    {% set offset = 1 %}
+{% else %}
+    {% set offset = 0 %}
+{% endif %}
+
+{{ bos_token }}
+{% for message in messages %}
+    {% if (message['role'] == 'user') != (loop.index0 % 2 == offset) %}
+        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
+    {% endif %}
+
+    {{ '<|im_start|>' + message['role'] + '\n' + message['content'].strip() + '<|im_end|>\n' }}
+
+    {% if loop.last and message['role'] == 'user' and add_generation_prompt %}
+        {{ '<|im_start|>assistant\n' }}
+    {% endif %}
+{% endfor %}
diff --git a/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/chatqa.jinja b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/chatqa.jinja
new file mode 100644
index 00000000000..d66dd408fcd
--- /dev/null
+++ b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/chatqa.jinja
@@ -0,0 +1,36 @@
+{{ bos_token }}
+{% if messages[0]['role'] == 'system' %}
+    {% set loop_messages = messages[1:] %}
+    {% set system_message = 'System: ' + messages[0]['content'].strip() %}
+{% else %}
+    {% set loop_messages = messages %}
+    {% set system_message = '' %}
+{% endif %}
+
+{% if messages[0]['role'] == 'context' %}
+    {% set loop_messages = messages[1:] %}
+    {% set context_message = '\n\n' + messages[0]['content'].strip() %}
+{% else %}
+    {% set loop_messages = messages %}
+    {% set context_message = '' %}
+{% endif %}
+
+{% for message in loop_messages %}
+    {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
+        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
+    {% endif %}
+
+    {% if loop.index0 == 0 %}
+        {{ system_message + context_message }}
+    {% endif %}
+
+    {% if message['role'] == 'user' %}
+        {{ '\n\nUser: ' + message['content'].strip() }}
+    {% elif message['role'] == 'assistant' %}
+        {{ '\n\nAssistant: ' + message['content'].strip() }}
+    {% endif %}
+
+    {% if loop.last and message['role'] == 'user' and add_generation_prompt %}
+        {{ '\n\nAssistant:' }}
+    {% endif %}
+{% endfor %}
diff --git a/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/falcon-instruct.jinja b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/falcon-instruct.jinja
new file mode 100644
index 00000000000..0e6593c4fdf
--- /dev/null
+++ b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/falcon-instruct.jinja
@@ -0,0 +1,22 @@
+{% if messages[0]['role'] == 'system' %}
+    {% set loop_messages = messages[1:] %}
+    {% set system_message = messages[0]['content'] %}
+{% else %}
+    {% set loop_messages = messages %}
+    {% set system_message = '' %}
+{% endif %}
+
+{% for message in loop_messages %}
+    {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
+        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
+    {% endif %}
+
+    {% if loop.index0 == 0 %}
+        {{ system_message.strip() }}
+    {% endif %}
+    {{ '\n\n' + message['role'].title() + ': ' + message['content'].strip().replace('\r\n', '\n').replace('\n\n', '\n') }}
+
+    {% if loop.last and
message['role'] == 'user' and add_generation_prompt %}
+        {{ '\n\nAssistant:' }}
+    {% endif %}
+{% endfor %}
diff --git a/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/gemma-it.jinja b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/gemma-it.jinja
new file mode 100644
index 00000000000..d89320bc3ec
--- /dev/null
+++ b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/gemma-it.jinja
@@ -0,0 +1,31 @@
+{% if messages[0]['role'] == 'system' %}
+    {% set loop_messages = messages[1:] %}
+    {% set system_message = messages[0]['content'].strip() + '\n\n' %}
+{% else %}
+    {% set loop_messages = messages %}
+    {% set system_message = '' %}
+{% endif %}
+
+{% for message in loop_messages %}
+    {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
+        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
+    {% endif %}
+
+    {% if loop.index0 == 0 %}
+        {% set content = system_message + message['content'] %}
+    {% else %}
+        {% set content = message['content'] %}
+    {% endif %}
+
+    {% if (message['role'] == 'assistant') %}
+        {% set role = 'model' %}
+    {% else %}
+        {% set role = message['role'] %}
+    {% endif %}
+
+    {{ '<start_of_turn>' + role + '\n' + content.strip() + '<end_of_turn>\n' }}
+
+    {% if loop.last and message['role'] == 'user' and add_generation_prompt %}
+        {{'<start_of_turn>model\n'}}
+    {% endif %}
+{% endfor %}
diff --git a/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/llama-2-chat.jinja b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/llama-2-chat.jinja
new file mode 100644
index 00000000000..88962343b00
--- /dev/null
+++ b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/llama-2-chat.jinja
@@ -0,0 +1,25 @@
+{% if messages[0]['role'] == 'system' %}
+    {% set loop_messages = messages[1:] %}
+    {% set system_message = '<<SYS>>\n' + messages[0]['content'].strip() + '\n<</SYS>>\n\n' %}
+{% else %}
+    {% set loop_messages = messages %}
+    {% set system_message = '' %}
+{% endif %}
+
+{% for message in loop_messages %}
+    {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
+        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
+    {% endif %}
+
+    {% if loop.index0 == 0 %}
+        {% set content = system_message + message['content'] %}
+    {% else %}
+        {% set content = message['content'] %}
+    {% endif %}
+
+    {% if message['role'] == 'user' %}
+        {{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}
+    {% elif message['role'] == 'assistant' %}
+        {{ ' ' + content.strip() + ' ' + eos_token }}
+    {% endif %}
+{% endfor %}
diff --git a/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/llama-3-chat.jinja b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/llama-3-chat.jinja
new file mode 100644
index 00000000000..23af01d96a9
--- /dev/null
+++ b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/llama-3-chat.jinja
@@ -0,0 +1,24 @@
+{{ bos_token }}
+{% if messages[0]['role'] == 'system' %}
+    {% set loop_messages = messages[1:] %}
+    {% set system_message = '<|start_header_id|>' + 'system' + '<|end_header_id|>\n\n' + messages[0]['content'].strip() + '<|eot_id|>' %}
+{% else %}
+    {% set loop_messages = messages %}
+    {% set system_message = '' %}
+{% endif %}
+
+{% for message in loop_messages %}
+    {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
+        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
+    {% endif %}
+
+    {% if loop.index0 == 0 %}
+        {{ system_message }}
+    {% endif %}
+
+    {{ '<|start_header_id|>' + message['role']
+ '<|end_header_id|>\n\n' + message['content'].strip() + '<|eot_id|>' }} + + {% if loop.last and message['role'] == 'user' and add_generation_prompt %} + {{ '<|start_header_id|>' + 'assistant' + '<|end_header_id|>\n\n' }} + {% endif %} +{% endfor %} diff --git a/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/mistral-instruct.jinja b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/mistral-instruct.jinja new file mode 100644 index 00000000000..23a8faa709f --- /dev/null +++ b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/mistral-instruct.jinja @@ -0,0 +1,26 @@ +{% if messages[0]['role'] == 'system' %} + {% set loop_messages = messages[1:] %} + {% set system_message = messages[0]['content'].strip() + '\n\n' %} +{% else %} + {% set loop_messages = messages %} + {% set system_message = '' %} +{% endif %} + +{{ bos_token }} +{% for message in loop_messages %} + {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} + {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} + {% endif %} + + {% if loop.index0 == 0 %} + {% set content = system_message + message['content'] %} + {% else %} + {% set content = message['content'] %} + {% endif %} + + {% if message['role'] == 'user' %} + {{ '[INST] ' + content.strip() + ' [/INST]' }} + {% elif message['role'] == 'assistant' %} + {{ ' ' + content.strip() + eos_token }} + {% endif %} +{% endfor %} diff --git a/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/openchat.jinja b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/openchat.jinja new file mode 100644 index 00000000000..0dd84905798 --- /dev/null +++ b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/openchat.jinja @@ -0,0 +1,20 @@ +{% if messages[0]['role'] == 'system' %} + {% set loop_messages = messages[1:] %} + {% set system_message = messages[0]['content'].strip() + '<|end_of_turn|>' %} +{% else %} + {% set loop_messages = messages %} + {% set system_message = '' %} +{% endif %} + +{{ bos_token + system_message }} +{% for message in loop_messages %} + {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} + {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} + {% endif %} + + {{ 'GPT4 Correct ' + message['role'].title() + ': ' + message['content'] + '<|end_of_turn|>' }} + + {% if loop.last and message['role'] == 'user' and add_generation_prompt %} + {{ 'GPT4 Correct Assistant:' }} + {% endif %} +{% endfor %} diff --git a/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/phi-3.jinja b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/phi-3.jinja new file mode 100644 index 00000000000..36d88d6958c --- /dev/null +++ b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/phi-3.jinja @@ -0,0 +1,17 @@ +{% if messages[0]['role'] == 'system' %} + {% set offset = 1 %} +{% else %} + {% set offset = 0 %} +{% endif %} + +{% for message in messages %} + {% if (message['role'] == 'user') != (loop.index0 % 2 == offset) %} + {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} + {% endif %} + + {{ '<|' + message['role'] + '|>\n' + message['content'].strip() + '<|end|>' + '\n' }} + + {% if loop.last and message['role'] == 'user' and add_generation_prompt %} + {{ '<|assistant|>\n' }} + {% endif %} +{% endfor %} diff --git a/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/saiga.jinja b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/saiga.jinja new 
file mode 100644 index 00000000000..79367646795 --- /dev/null +++ b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/saiga.jinja @@ -0,0 +1,23 @@ +{% if messages[0]['role'] == 'system' %} + {% set loop_messages = messages[1:] %} + {% set system_message = bos_token + 'system' + '\n' + messages[0]['content'].strip() + eos_token %} +{% else %} + {% set loop_messages = messages %} + {% set system_message = '' %} +{% endif %} + +{% for message in loop_messages %} + {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} + {{ raise_exception('Conversation roles must alternate user/bot/user/bot/...') }} + {% endif %} + + {% if loop.index0 == 0 %} + {{ system_message }} + {% endif %} + + {{ bos_token + message['role'] + '\n' + message['content'].strip() + eos_token }} + + {% if loop.last and message['role'] == 'user' and add_generation_prompt %} + {{ bos_token + 'bot' }} + {% endif %} +{% endfor %} diff --git a/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/solar-instruct.jinja b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/solar-instruct.jinja new file mode 100644 index 00000000000..57efad0ab08 --- /dev/null +++ b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/solar-instruct.jinja @@ -0,0 +1,18 @@ +{% if messages[0]['role'] == 'system' %} + {% set offset = 1 %} +{% else %} + {% set offset = 0 %} +{% endif %} + +{{ bos_token }} +{% for message in messages %} + {% if (message['role'] == 'user') != (loop.index0 % 2 == offset) %} + {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} + {% endif %} + + {{ '### ' + message['role'].title() + ':\n' + message['content'].strip() + '\n\n' }} + + {% if loop.last and message['role'] == 'user' and add_generation_prompt %} + {{ '### Assistant:\n' }} + {% endif %} +{% endfor %} diff --git a/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/vicuna.jinja b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/vicuna.jinja new file mode 100644 index 00000000000..3845ba1b3f0 --- /dev/null +++ b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/vicuna.jinja @@ -0,0 +1,24 @@ +{% if messages[0]['role'] == 'system' %} + {% set loop_messages = messages[1:] %} + {% set system_message = messages[0]['content'].strip() + '\n\n' %} +{% else %} + {% set loop_messages = messages %} + {% set system_message = '' %} +{% endif %} + +{{ bos_token + system_message }} +{% for message in loop_messages %} + {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} + {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} + {% endif %} + + {% if message['role'] == 'user' %} + {{ 'USER: ' + message['content'].strip() + '\n' }} + {% elif message['role'] == 'assistant' %} + {{ 'ASSISTANT: ' + message['content'].strip() + eos_token + '\n' }} + {% endif %} + + {% if loop.last and message['role'] == 'user' and add_generation_prompt %} + {{ 'ASSISTANT:' }} + {% endif %} +{% endfor %} diff --git a/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/zephyr.jinja b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/zephyr.jinja new file mode 100644 index 00000000000..35d92f96d59 --- /dev/null +++ b/src/_bentoml_impl/frameworks/unsloth/template/chat_templates/zephyr.jinja @@ -0,0 +1,17 @@ +{% if messages[0]['role'] == 'system' %} + {% set offset = 1 %} +{% else %} + {% set offset = 0 %} +{% endif %} + +{% for message in messages %} + {% if (message['role'] == 'user') != (loop.index0 % 2 == offset) %} + {{ 
raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} + {% endif %} + + {{ '<|' + message['role'] + '|>\n' + message['content'].strip() + eos_token + '\n' }} + + {% if loop.last and message['role'] == 'user' and add_generation_prompt %} + {{ '<|assistant|>\n' }} + {% endif %} +{% endfor %} diff --git a/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/alpaca.json b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/alpaca.json new file mode 100644 index 00000000000..336a7feb005 --- /dev/null +++ b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/alpaca.json @@ -0,0 +1,6 @@ +{ + "chat_template": "chat_templates/alpaca.jinja", + "stop_str": null, + "stop_token_ids": [2], + "system_prompt": "Below is an instruction that describes a task. Write a response that appropriately completes the request." +} diff --git a/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/amberchat.json b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/amberchat.json new file mode 100644 index 00000000000..e18dca791b5 --- /dev/null +++ b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/amberchat.json @@ -0,0 +1,6 @@ +{ + "chat_template": "chat_templates/amberchat.jinja", + "stop_str": "\n###Human", + "stop_token_ids": [2], + "system_prompt": null +} \ No newline at end of file diff --git a/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/chatqa.json b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/chatqa.json new file mode 100644 index 00000000000..597e6c5a455 --- /dev/null +++ b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/chatqa.json @@ -0,0 +1,6 @@ +{ + "chat_template": "chat_templates/chatqa.jinja", + "stop_str": null, + "stop_token_ids": [128001, 128009], + "system_prompt": "This is a chat between a user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions based on the context. The assistant should also indicate when the answer cannot be found in the context." +} \ No newline at end of file diff --git a/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/gemma-it.json b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/gemma-it.json new file mode 100644 index 00000000000..3deb31c787d --- /dev/null +++ b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/gemma-it.json @@ -0,0 +1,6 @@ +{ + "chat_template": "chat_templates/gemma-it.jinja", + "stop_str": null, + "stop_token_ids": [1, 107], + "system_prompt": null +} \ No newline at end of file diff --git a/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/llama-2-chat.json b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/llama-2-chat.json new file mode 100644 index 00000000000..47aaeb3b260 --- /dev/null +++ b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/llama-2-chat.json @@ -0,0 +1,6 @@ +{ + "chat_template": "chat_templates/llama-2-chat.jinja", + "stop_str": null, + "stop_token_ids": [2], + "system_prompt": "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. 
Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information." +} \ No newline at end of file diff --git a/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/llama-3-chat.json b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/llama-3-chat.json new file mode 100644 index 00000000000..6736275329f --- /dev/null +++ b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/llama-3-chat.json @@ -0,0 +1,6 @@ +{ + "chat_template": "chat_templates/llama-3-chat.jinja", + "stop_str": null, + "stop_token_ids": [128001, 128009], + "system_prompt": null +} \ No newline at end of file diff --git a/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/mistral-instruct.json b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/mistral-instruct.json new file mode 100644 index 00000000000..c3f12e59e83 --- /dev/null +++ b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/mistral-instruct.json @@ -0,0 +1,6 @@ +{ + "chat_template": "chat_templates/mistral-instruct.jinja", + "stop_str": null, + "stop_token_ids": [2], + "system_prompt": "Always assist with care, respect, and truth. Respond with utmost utility yet securely. Avoid harmful, unethical, prejudiced, or negative content. Ensure replies promote fairness and positivity." +} \ No newline at end of file diff --git a/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/openchat.json b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/openchat.json new file mode 100644 index 00000000000..8e6584263ef --- /dev/null +++ b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/openchat.json @@ -0,0 +1,6 @@ +{ + "chat_template": "chat_templates/openchat.jinja", + "stop_str": null, + "stop_token_ids": [2, 32000], + "system_prompt": null +} \ No newline at end of file diff --git a/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/orca-2.json b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/orca-2.json new file mode 100644 index 00000000000..1e854baa5a1 --- /dev/null +++ b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/orca-2.json @@ -0,0 +1,6 @@ +{ + "chat_template": "chat_templates/chatml.jinja", + "stop_str": null, + "stop_token_ids": [2, 32002], + "system_prompt": "You are Orca, an AI language model created by Microsoft. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior." 
+} \ No newline at end of file diff --git a/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/phi-3.json b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/phi-3.json new file mode 100644 index 00000000000..221e42b62cf --- /dev/null +++ b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/phi-3.json @@ -0,0 +1,6 @@ +{ + "chat_template": "chat_templates/phi-3.jinja", + "stop_str": null, + "stop_token_ids": [2, 32000, 32007], + "system_prompt": null +} \ No newline at end of file diff --git a/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/qwen2-chat.json b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/qwen2-chat.json new file mode 100644 index 00000000000..2c2f7708bf5 --- /dev/null +++ b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/qwen2-chat.json @@ -0,0 +1,6 @@ +{ + "chat_template": "chat_templates/openchat.jinja", + "stop_str": null, + "stop_token_ids": [151643, 151645], + "system_prompt": null +} \ No newline at end of file diff --git a/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/saiga.json b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/saiga.json new file mode 100644 index 00000000000..839f5b96939 --- /dev/null +++ b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/saiga.json @@ -0,0 +1,6 @@ +{ + "chat_template": "chat_templates/saiga.jinja", + "stop_str": null, + "stop_token_ids": [2], + "system_prompt": "Ты — Сайга, русскоязычный автоматический ассистент. Ты разговариваешь с людьми и помогаешь им." +} diff --git a/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/solar-instruct.json b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/solar-instruct.json new file mode 100644 index 00000000000..63acfff63be --- /dev/null +++ b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/solar-instruct.json @@ -0,0 +1,6 @@ +{ + "chat_template": "chat_templates/solar-instruct.jinja", + "stop_str": null, + "stop_token_ids": [2], + "system_prompt": null +} \ No newline at end of file diff --git a/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/vicuna.json b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/vicuna.json new file mode 100644 index 00000000000..c55d91c9621 --- /dev/null +++ b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/vicuna.json @@ -0,0 +1,6 @@ +{ + "chat_template": "chat_templates/vicuna.jinja", + "stop_str": null, + "stop_token_ids": [2], + "system_prompt": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions." 
+} \ No newline at end of file diff --git a/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/yi-chat.json b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/yi-chat.json new file mode 100644 index 00000000000..f53b41bbae5 --- /dev/null +++ b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/yi-chat.json @@ -0,0 +1,6 @@ +{ + "chat_template": "chat_templates/chatml.jinja", + "stop_str": null, + "stop_token_ids": [2, 7], + "system_prompt": null +} \ No newline at end of file diff --git a/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/zephyr.json b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/zephyr.json new file mode 100644 index 00000000000..dd95ac13e25 --- /dev/null +++ b/src/_bentoml_impl/frameworks/unsloth/template/generation_configs/zephyr.json @@ -0,0 +1,6 @@ +{ + "chat_template": "chat_templates/zephyr.jinja", + "stop_str": null, + "stop_token_ids": [2], + "system_prompt": null +} \ No newline at end of file diff --git a/src/_bentoml_impl/frameworks/unsloth/template/service.py b/src/_bentoml_impl/frameworks/unsloth/template/service.py new file mode 100644 index 00000000000..8addc035488 --- /dev/null +++ b/src/_bentoml_impl/frameworks/unsloth/template/service.py @@ -0,0 +1,179 @@ +from __future__ import annotations + +import functools +import json +import logging +import pathlib +import sys +import uuid +from typing import Any +from typing import AsyncGenerator +from typing import Literal +from typing import Optional + +import fastapi +import fastapi.staticfiles +import pydantic +import vllm.entrypoints.openai.api_server as vllm_api_server +import yaml +from annotated_types import Ge +from annotated_types import Le +from typing_extensions import Annotated + +import bentoml + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +@functools.lru_cache(maxsize=1) +def _get_gen_config(community_chat_template: str) -> dict[str, Any]: + logger.info(f"Load community_chat_template:{community_chat_template}") + chat_template_path = pathlib.Path(__file__).parent / "chat_templates" + config_path = pathlib.Path(__file__).parent / "generation_configs" + with (config_path / f"{community_chat_template}.json").open("r") as f: + gen_config = json.load(f) + chat_template_file = gen_config["chat_template"].split("/")[-1] + with (chat_template_path / chat_template_file).open("r") as f: + chat_template = f.read() + gen_config["template"] = chat_template.replace(" ", "").replace("\n", "") + return gen_config + + +# Load the constants from the yaml file +CONSTANT_YAML = pathlib.Path(__file__).parent / "service_config.yaml" +if not CONSTANT_YAML.exists(): + raise FileNotFoundError(f"service_config.yaml not found in {CONSTANT_YAML.parent}") +with CONSTANT_YAML.open("r") as f: + CONSTANTS = yaml.safe_load(f) +ENGINE_CONFIG = CONSTANTS["engine_config"] +STATIC_DIR = pathlib.Path(__file__).parent / "ui" + + +class Message(pydantic.BaseModel): + role: Literal["system", "user", "assistant"] + content: str + + +openai_api_app = fastapi.FastAPI() +for route, endpoint, methods in [ + ("/chat/completions", vllm_api_server.create_chat_completion, ["POST"]), + ("/completions", vllm_api_server.create_completion, ["POST"]), + ("/models", vllm_api_server.show_available_models, ["GET"]), +]: + openai_api_app.add_api_route( + path=route, endpoint=endpoint, methods=methods, include_in_schema=True + ) + +# special handling for prometheus_client of bentoml +if "prometheus_client" in sys.modules: + 
sys.modules.pop("prometheus_client") + + +@bentoml.mount_asgi_app(openai_api_app, path="/v1") +@bentoml.service(**CONSTANTS["service_config"]) +class VLLM: + bentomodel = bentoml.models.get(CONSTANTS["model_tag"]) + + def __init__(self) -> None: + from transformers import AutoTokenizer + from vllm import AsyncEngineArgs + from vllm import AsyncLLMEngine + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion + + self.engine = AsyncLLMEngine.from_engine_args( + AsyncEngineArgs( + model=self.bentomodel.path, enable_prefix_caching=True, **ENGINE_CONFIG + ) + ) + self.tokenizer = AutoTokenizer.from_pretrained(self.bentomodel.path) + model_config = self.engine.engine.get_model_config() + # inject the engine into the openai serving chat and completion + vllm_api_server.openai_serving_chat = OpenAIServingChat( + chat_template=_get_gen_config(CONSTANTS["chat_template"])["template"], + async_engine_client=self.engine, + served_model_names=[self.bentomodel.path], + response_role="assistant", + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + vllm_api_server.openai_serving_completion = OpenAIServingCompletion( + async_engine_client=self.engine, + served_model_names=[self.bentomodel.path], + model_config=model_config, + lora_modules=None, + prompt_adapters=None, + request_logger=None, + ) + + @bentoml.api(route="/api/generate") + async def generate( + self, + prompt: str = "Explain superconductors like I'm five years old", + stop: Optional[list[str]] = None, + max_tokens: Annotated[ + int, Ge(128), Le(ENGINE_CONFIG["max_model_len"]) + ] = ENGINE_CONFIG["max_model_len"], + ) -> AsyncGenerator[str, None]: + from vllm import SamplingParams + + if stop is None: + stop = [] + stream = await self.engine.add_request( + uuid.uuid4().hex, prompt, SamplingParams(max_tokens=max_tokens, stop=stop) + ) + cursor = 0 + async for request_output in stream: + text = request_output.outputs[0].text + yield text[cursor:] + cursor = len(text) + + @bentoml.api(route="/api/chat") + async def chat( + self, + messages: list[Message] = [ + Message(content="what is the meaning of life?", role="user") + ], + stop: Optional[list[str]] = None, + max_tokens: Annotated[ + int, Ge(128), Le(ENGINE_CONFIG["max_model_len"]) + ] = ENGINE_CONFIG["max_model_len"], + ) -> AsyncGenerator[str, None]: + from vllm import SamplingParams + + try: + gen_config = _get_gen_config(CONSTANTS["chat_template"]) + if not stop: + if gen_config["stop_str"]: + stop = [gen_config["stop_str"]] + else: + stop = [] + system_prompt = gen_config["system_prompt"] + self.tokenizer.chat_template = gen_config["template"] + + if system_prompt and messages[0].role != "system": + messages = [dict(role="system", content=system_prompt)] + messages + sampling_params = SamplingParams(max_tokens=max_tokens, stop=stop) + stream = await self.engine.add_request( + uuid.uuid4().hex, + self.tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ), + sampling_params, + ) + cursor, strip_flag = 0, True + async for request_output in stream: + text = request_output.outputs[0].text + assistant_message = text[cursor:] + if not strip_flag: # strip the leading whitespace + yield assistant_message + elif assistant_message.strip(): + strip_flag = False + yield assistant_message.lstrip() + cursor = len(text) + except Exception as e: + logger.error(f"Error in chat API:{e}") + yield f"Error in chat API:{e}" diff --git 
a/src/_bentoml_impl/frameworks/unsloth/train.py b/src/_bentoml_impl/frameworks/unsloth/train.py new file mode 100644 index 00000000000..87a0a89a6d8 --- /dev/null +++ b/src/_bentoml_impl/frameworks/unsloth/train.py @@ -0,0 +1,67 @@ +# ruff: ignore + +def prep_dataset(tokenizer): + alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. + +### Instruction: +{} + +### Input: +{} + +### Response: +{}""" + + EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN + + def formatting_prompts_func(examples): + instructions = examples["instruction"] + inputs = examples["input"] + outputs = examples["output"] + texts = [] + for instruction, input, output in zip(instructions, inputs, outputs): + # Must add EOS_TOKEN, otherwise your generation will go on forever! + text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN + texts.append(text) + return {"text": texts} + + from datasets import load_dataset + + dataset = load_dataset("yahma/alpaca-cleaned", split="train") + dataset = dataset.map( + formatting_prompts_func, + batched=True, + ) + return dataset + + +def main(max_seq_length: int = 8196) -> int: + import unsloth, bentoml, trl, transformers + + model, tokenizer = unsloth.FastLanguageModel.from_pretrained("unsloth/Meta-Llama-3.1-8B-bnb-4bit", max_seq_length=max_seq_length, load_in_4bit=True) + model = unsloth.FastLanguageModel.get_peft_model( + model, target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], + r=16, lora_alpha=16, lora_dropout=0, bias="none", random_state=3407, + use_gradient_checkpointing="unsloth", # True or "unsloth" for very long context + ) + trl.SFTTrainer( + model=model, tokenizer=tokenizer, + train_dataset=prep_dataset(tokenizer), + dataset_text_field="text", max_seq_length=max_seq_length, + dataset_num_proc=2, packing=False, # Can make training 5x faster for short sequences. + args=transformers.TrainingArguments( + per_device_train_batch_size=2, gradient_accumulation_steps=4, + warmup_steps=5, num_train_epochs = 1, max_steps=60, learning_rate=2e-4, + weight_decay=0.01, seed=3407, + optim="adamw_8bit", + fp16=not unsloth.is_bfloat16_supported(), bf16=unsloth.is_bfloat16_supported(), + logging_steps=1, + lr_scheduler_type="linear", + output_dir="outputs", + ), + ).train() + + bentoml.unsloth.build_bento(model, tokenizer, chat_template="alpaca", quantization_method="bitsandbytes", engine_config={"quantization": "bitsandbytes"}) + return 0 + +if __name__ == "__main__": raise SystemExit(main()) diff --git a/src/bentoml/__init__.py b/src/bentoml/__init__.py index 89f4337a015..1dcc8adb5b3 100644 --- a/src/bentoml/__init__.py +++ b/src/bentoml/__init__.py @@ -60,6 +60,7 @@ from _bentoml_impl.frameworks import lightgbm from _bentoml_impl.frameworks import mlflow from _bentoml_impl.frameworks import sklearn + from _bentoml_impl.frameworks import unsloth from _bentoml_impl.frameworks import xgboost from . 
import diffusers_simple @@ -125,6 +126,9 @@ lightgbm = _LazyLoader( "bentoml.lightgbm", globals(), "_bentoml_impl.frameworks.lightgbm" ) + unsloth = _LazyLoader( + "bentoml.unsloth", globals(), "_bentoml_impl.frameworks.unsloth" + ) mlflow = _LazyLoader("bentoml.mlflow", globals(), "_bentoml_impl.frameworks.mlflow") detectron = _LazyLoader( "bentoml.detectron", globals(), "bentoml._internal.frameworks.detectron" @@ -276,6 +280,7 @@ def __getattr__(name: str) -> Any: "torchscript", "transformers", "xgboost", + "unsloth", # integrations "ray", "cloud",