Merge pull request #345 from EricLBuehler/gguf_to_hf_tokenizer
Add support for using GGUF tokenizer
EricLBuehler authored May 28, 2024
2 parents 511a616 + c374297 commit 34275f4
Showing 15 changed files with 431 additions and 83 deletions.
44 changes: 29 additions & 15 deletions README.md
@@ -155,7 +155,7 @@ Please submit more benchmarks via raising an issue!

## Usage
### Installation and Build
To install mistral.rs, one should ensure they have Rust installed by following [this](https://rustup.rs/) link. Additionally, the Hugging Face token should be provided in `~/.cache/huggingface/token` when using the server to enable automatic download of gated models.
To install mistral.rs, one should ensure they have Rust installed by following [this](https://rustup.rs/) link. Additionally, the Hugging Face token should be provided in `~/.cache/huggingface/token` by running `huggingface-cli login` to enable automatic download of gated models.

1) Install required packages
- `openssl` (ex., `sudo apt install libssl-dev`)
@@ -169,9 +169,7 @@ To install mistral.rs, one should ensure they have Rust installed by following [

3) Set HF token correctly (skip if already set or your model is not gated, or if you want to use the `token_source` parameters in Python or the command line.)
```bash
mkdir ~/.cache/huggingface
touch ~/.cache/huggingface/token
echo <HF_TOKEN_HERE> > ~/.cache/huggingface/token
huggingface-cli login
```

4) Download the code
@@ -220,7 +218,13 @@ To install mistral.rs, one should ensure they have Rust installed by following [

You can install Python support by following the guide [here](mistralrs-pyo3/README.md).

### Getting models from HF Hub
## Getting models

There are 2 ways to run a model with mistral.rs:
- From Hugging Face Hub (easiest)
- From local files

### Getting models from Hugging Face Hub

Mistral.rs can automatically download models from HF Hub. To access gated models, you should provide a token source. It may be one of:
- `literal:<value>`: Load from a specified literal
@@ -240,17 +244,12 @@ This is passed in the following ways:

If the token cannot be loaded, no token will be used (i.e., effectively `none`).
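
As an illustration of this fallback (a standalone sketch, not the crate's actual implementation): the cached token written by `huggingface-cli login` lives at `~/.cache/huggingface/token`, and a missing file simply degrades to no token at all.

```rust
// Sketch only: resolve a cached Hugging Face token, degrading to "no token"
// when the file is absent, mirroring the fallback described above.
use std::{fs, path::PathBuf};

fn cached_hf_token() -> Option<String> {
    let home = std::env::var_os("HOME")?;
    let token_path = PathBuf::from(home).join(".cache/huggingface/token");
    fs::read_to_string(token_path)
        .ok()
        .map(|token| token.trim().to_string())
}

fn main() {
    match cached_hf_token() {
        Some(_) => println!("Using the cached Hugging Face token"),
        None => println!("No token found; continuing unauthenticated (`none`)"),
    }
}
```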

## Loading models from local files:**
### Loading models from local files:

You can also instruct mistral.rs to load models locally by modifying the `*_model_id` arguments or options:
You can also instruct mistral.rs to load models fully locally by modifying the `*_model_id` arguments or options:
```bash
./mistralrs_server --port 1234 plain -m . -a mistral
```
or

```bash
./mistralrs-server gguf -m . -t . -f Phi-3-mini-128k-instruct-q4_K_M.gguf
```

Throughout mistral.rs, any model ID argument or option may be a local path and should contain the following files for each model ID option:
- `--model-id` (server) or `model_id` (python/rust) or `--tok-model-id` (server) or `tok_model_id` (python/rust):
@@ -267,7 +266,22 @@ Throughout mistral.rs, any model ID argument or option may be a local path and s
- `--adapters-model-id` (server) or `adapters_model_id` (python/rust):
- Adapters `.safetensors` and `adapter_config.json` files in their respective directories
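
If you want to verify a local directory before pointing mistral.rs at it, a small standard-library-only sketch (the directory name is illustrative) can check for the files listed above, e.g. for an adapters directory:

```rust
// Sketch: confirm a local adapters directory contains `adapter_config.json`
// and at least one `.safetensors` file before using it as `--adapters-model-id`.
use std::{fs, path::Path};

fn looks_like_adapter_dir(dir: &Path) -> bool {
    let has_config = dir.join("adapter_config.json").is_file();
    let has_weights = fs::read_dir(dir)
        .map(|entries| {
            entries
                .flatten()
                .any(|e| e.path().extension().map_or(false, |ext| ext == "safetensors"))
        })
        .unwrap_or(false);
    has_config && has_weights
}

fn main() {
    let dir = Path::new("./my-adapter"); // illustrative path
    println!("{} looks usable: {}", dir.display(), looks_like_adapter_dir(dir));
}
```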

### Run
### Running GGUF models locally

To run GGUF models fully locally, you do not need to specify the tokenizer model ID argument. Instead, pass a path to a
chat template JSON file (examples [here](chat_templates); you will need to create your own by specifying the chat template and the `bos`/`eos` tokens) and specify a local model ID. For example:

```bash
./mistralrs-server --chat-template <chat_template> gguf -m . -f Phi-3-mini-128k-instruct-q4_K_M.gguf
```

The following tokenizer model types are currently supported. If you would like one to be added, please raise an issue. Otherwise,
please consider using the method demonstrated in the examples below, where the tokenizer is sourced from Hugging Face.

**Supported GGUF tokenizer types**
- `llama`
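
Because only the `llama` tokenizer type is recognised so far, it can help to check what a GGUF file actually embeds before running it. The sketch below reads the file's metadata with candle's GGUF reader; it assumes the `candle-core` and `anyhow` crates are available and relies on the conventional `tokenizer.ggml.model` metadata key:

```rust
// Sketch: print the tokenizer type declared in a GGUF file's metadata.
use candle_core::quantized::gguf_file;
use std::fs::File;

fn main() -> anyhow::Result<()> {
    let mut file = File::open("Phi-3-mini-128k-instruct-q4_K_M.gguf")?;
    let content = gguf_file::Content::read(&mut file)?;
    // "llama" here means the SentencePiece-style tokenizer supported above.
    let tokenizer_type = content.metadata["tokenizer.ggml.model"].to_string()?;
    println!("GGUF tokenizer type: {tokenizer_type}");
    Ok(())
}
```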

## Run

To start a server serving Mistral GGUF on `localhost:1234`,
```bash
@@ -290,7 +304,7 @@ Additionally, for models without quantization, the model architecture should be
You can launch interactive mode, a simple chat application running in the terminal, by passing `-i`:

```bash
./mistralrs_server -i gguf -t mistralai/Mistral-7B-Instruct-v0.1 -m TheBloke/Mistral-7B-Instruct-v0.1-GGUF -f mistral-7b-instruct-v0.1.Q4_K_M.gguf
./mistralrs_server -i plain -m microsoft/Phi-3-mini-128k-instruct -a phi3
```

### Quick examples:
@@ -333,7 +347,7 @@ To start a server running Llama from GGML:
To start a server running Mistral from safetensors:

```bash
./mistralrs_server --port 1234 gguf -m mistralai/Mistral-7B-Instruct-v0.1
./mistralrs_server --port 1234 plain -m mistralai/Mistral-7B-Instruct-v0.1 -a mistral
```

### Structured selection with a `.toml` file
1 change: 1 addition & 0 deletions mistralrs-core/Cargo.toml
@@ -56,6 +56,7 @@ toml = "0.8.12"
strum = { version = "0.26", features = ["derive"] }
derive_more = { version = "0.99.17", default-features = false, features = ["from"] }
tracing-subscriber.workspace = true
reqwest = { version = "0.12.4", features = ["blocking"] }

[features]
pyo3_macros = ["pyo3"]
8 changes: 1 addition & 7 deletions mistralrs-core/src/model_loader.rs
@@ -150,22 +150,19 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
.build(arch),
ModelSelected::GGUF {
tok_model_id,
tokenizer_json,
quantized_model_id,
quantized_filename,
repeat_last_n,
} => GGUFLoaderBuilder::new(
GGUFSpecificConfig { repeat_last_n },
args.chat_template,
tokenizer_json,
Some(tok_model_id),
tok_model_id,
quantized_model_id,
quantized_filename,
)
.build(),
ModelSelected::XLoraGGUF {
tok_model_id,
tokenizer_json,
quantized_model_id,
quantized_filename,
repeat_last_n,
@@ -175,7 +172,6 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
} => GGUFLoaderBuilder::new(
GGUFSpecificConfig { repeat_last_n },
args.chat_template,
tokenizer_json,
tok_model_id,
quantized_model_id,
quantized_filename,
@@ -192,7 +188,6 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
.build(),
ModelSelected::LoraGGUF {
tok_model_id,
tokenizer_json,
quantized_model_id,
quantized_filename,
repeat_last_n,
@@ -201,7 +196,6 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
} => GGUFLoaderBuilder::new(
GGUFSpecificConfig { repeat_last_n },
args.chat_template,
tokenizer_json,
tok_model_id,
quantized_model_id,
quantized_filename,
26 changes: 10 additions & 16 deletions mistralrs-core/src/model_selected.rs
@@ -95,13 +95,11 @@ pub enum ModelSelected {

/// Select a GGUF model.
GGUF {
/// Model ID to load the tokenizer from. This may be a HF hub repo or a local path.
/// `tok_model_id` is the local or remote model ID where you can find a `tokenizer_config.json` file.
/// If the `chat_template` is specified, then it will be treated as a path and used over remote files,
/// removing all remote accesses.
#[arg(short, long)]
tok_model_id: String,

/// Path to local tokenizer.json file. If this is specified it is used over any remote file.
#[arg(long)]
tokenizer_json: Option<String>,
tok_model_id: Option<String>,

/// Quantized model ID to find the `quantized_filename`, only applicable if `quantized` is set.
/// This may be a HF hub repo or a local path.
@@ -119,14 +117,12 @@ pub enum ModelSelected {

/// Select a GGUF model with X-LoRA.
XLoraGGUF {
/// Model ID to load the tokenizer from. This may be a HF hub repo or a local path.
/// `tok_model_id` is the local or remote model ID where you can find a `tokenizer_config.json` file.
/// If the `chat_template` is specified, then it will be treated as a path and used over remote files,
/// removing all remote accesses.
#[arg(short, long)]
tok_model_id: Option<String>,

/// Path to local tokenizer.json file. If this is specified it is used over any remote file.
#[arg(long)]
tokenizer_json: Option<String>,

/// Quantized model ID to find the `quantized_filename`, only applicable if `quantized` is set.
/// This may be a HF hub repo or a local path.
#[arg(short = 'm', long)]
@@ -156,14 +152,12 @@ pub enum ModelSelected {

/// Select a GGUF model with LoRA.
LoraGGUF {
/// Model ID to load the tokenizer from. This may be a HF hub repo or a local path.
/// `tok_model_id` is the local or remote model ID where you can find a `tokenizer_config.json` file.
/// If the `chat_template` is specified, then it will be treated as a path and used over remote files,
/// removing all remote accesses.
#[arg(short, long)]
tok_model_id: Option<String>,

/// Path to local tokenizer.json file. If this is specified it is used over any remote file.
#[arg(long)]
tokenizer_json: Option<String>,

/// Quantized model ID to find the `quantized_filename`, only applicable if `quantized` is set.
/// This may be a HF hub repo or a local path.
#[arg(short = 'm', long)]
24 changes: 10 additions & 14 deletions mistralrs-core/src/pipeline/gguf.rs
@@ -7,14 +7,14 @@ use crate::aici::bintokens::build_tok_trie
use crate::aici::toktree::TokTrie;
use crate::lora::Ordering;
use crate::pipeline::chat_template::{calculate_eos_tokens, GenerationConfig};
use crate::pipeline::gguf_tokenizer::convert_ggml_to_hf_tokenizer;
use crate::pipeline::{get_chat_template, Cache};
use crate::pipeline::{ChatTemplate, LocalModelPaths};
use crate::prefix_cacher::PrefixCacheManager;
use crate::sequence::Sequence;
use crate::utils::tokenizer::get_tokenizer;
use crate::utils::varbuilder_utils::{from_mmaped_safetensors, load_preload_adapters};
use crate::xlora_models::NonGranularState;
use crate::{do_sample, get_mut_arcmutex, get_paths, DeviceMapMetadata, DEBUG};
use crate::{do_sample, get_mut_arcmutex, get_paths_gguf, DeviceMapMetadata, DEBUG};
use crate::{
models::quantized_llama::ModelWeights as QLlama,
models::quantized_phi2::ModelWeights as QPhi,
@@ -69,7 +69,6 @@ pub struct GGUFLoader {
xlora_order: Option<Ordering>,
no_kv_cache: bool,
chat_template: Option<String>,
tokenizer_json: Option<String>,
kind: ModelKind,
tgt_non_granular_index: Option<usize>,
}
@@ -119,24 +118,24 @@ pub struct GGUFLoaderBuilder {
xlora_order: Option<Ordering>,
no_kv_cache: bool,
chat_template: Option<String>,
tokenizer_json: Option<String>,
tgt_non_granular_index: Option<usize>,
}

impl GGUFLoaderBuilder {
/// Create a loader builder for a GGUF model. `tok_model_id` is the model ID where you can find a
/// `tokenizer_config.json` file. If the `chat_template` is specified, then it will be treated as a
/// path and used over remote files, removing all remote accesses.
pub fn new(
config: GGUFSpecificConfig,
chat_template: Option<String>,
tokenizer_json: Option<String>,
model_id: Option<String>,
tok_model_id: Option<String>,
quantized_model_id: String,
quantized_filename: String,
) -> Self {
Self {
config,
chat_template,
tokenizer_json,
model_id,
model_id: tok_model_id,
kind: ModelKind::QuantizedGGUF,
quantized_filename,
quantized_model_id,
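
For reference, a minimal sketch of calling the builder with this new signature; the chat template path and model IDs are illustrative, and the sketch assumes `GGUFLoaderBuilder` and `GGUFSpecificConfig` are importable from `mistralrs_core`:

```rust
use mistralrs_core::{GGUFLoaderBuilder, GGUFSpecificConfig};

fn main() {
    let _loader = GGUFLoaderBuilder::new(
        GGUFSpecificConfig { repeat_last_n: 64 },
        Some("chat_templates/mistral.json".to_string()), // local chat template (example path)
        None, // tok_model_id omitted: the tokenizer can come from the GGUF file itself
        "TheBloke/Mistral-7B-Instruct-v0.1-GGUF".to_string(),
        "mistral-7b-instruct-v0.1.Q4_K_M.gguf".to_string(),
    )
    .build();
}
```

Here `tok_model_id` is omitted, matching the fully local GGUF flow described in the README change above.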
@@ -197,7 +196,6 @@ impl GGUFLoaderBuilder {
xlora_order: self.xlora_order,
no_kv_cache: self.no_kv_cache,
chat_template: self.chat_template,
tokenizer_json: self.tokenizer_json,
tgt_non_granular_index: self.tgt_non_granular_index,
quantized_filename: Some(self.quantized_filename),
quantized_model_id: Some(self.quantized_model_id),
@@ -217,7 +215,6 @@ impl GGUFLoader {
xlora_order: Option<Ordering>,
no_kv_cache: bool,
chat_template: Option<String>,
tokenizer_json: Option<String>,
tgt_non_granular_index: Option<usize>,
) -> Self {
let model_id = if let Some(id) = model_id {
@@ -238,7 +235,6 @@ impl GGUFLoader {
xlora_order,
no_kv_cache,
chat_template,
tokenizer_json,
kind,
tgt_non_granular_index,
}
@@ -279,7 +275,7 @@ impl Loader for GGUFLoader {
mapper: DeviceMapMetadata,
in_situ_quant: Option<GgmlDType>,
) -> Result<Arc<Mutex<dyn Pipeline + Send + Sync>>> {
let paths: anyhow::Result<Box<dyn ModelPaths>> = get_paths!(
let paths: anyhow::Result<Box<dyn ModelPaths>> = get_paths_gguf!(
LocalModelPaths,
&token_source,
revision,
@@ -360,6 +356,8 @@ impl Loader for GGUFLoader {
info!("Debug is enabled, wrote the names and information about each tensor to `mistralrs_gguf_tensors.txt`.");
}

let tokenizer = convert_ggml_to_hf_tokenizer(&model)?;

let mut is_lora = false;
let model = match self.kind {
ModelKind::QuantizedGGUF => match arch {
@@ -480,8 +478,6 @@ impl Loader for GGUFLoader {
_ => unreachable!(),
};

let tokenizer = get_tokenizer(paths.get_tokenizer_filename())?;

let gen_conf: Option<GenerationConfig> = paths
.get_gen_conf_filename()
.map(|f| serde_json::from_str(&fs::read_to_string(f).unwrap()).unwrap());