diff --git a/Cargo.lock b/Cargo.lock index 7270fdf34..663093a52 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2287,7 +2287,7 @@ dependencies = [ [[package]] name = "mistralrs" -version = "0.2.1" +version = "0.2.2" dependencies = [ "anyhow", "candle-core", @@ -2301,7 +2301,7 @@ dependencies = [ [[package]] name = "mistralrs-bench" -version = "0.2.1" +version = "0.2.2" dependencies = [ "anyhow", "candle-core", @@ -2317,7 +2317,7 @@ dependencies = [ [[package]] name = "mistralrs-core" -version = "0.2.1" +version = "0.2.2" dependencies = [ "accelerate-src", "akin", @@ -2381,7 +2381,7 @@ dependencies = [ [[package]] name = "mistralrs-paged-attn" -version = "0.2.1" +version = "0.2.2" dependencies = [ "anyhow", "bindgen_cuda 0.1.6", @@ -2391,7 +2391,7 @@ dependencies = [ [[package]] name = "mistralrs-pyo3" -version = "0.2.1" +version = "0.2.2" dependencies = [ "accelerate-src", "base64 0.22.1", @@ -2412,7 +2412,7 @@ dependencies = [ [[package]] name = "mistralrs-server" -version = "0.2.1" +version = "0.2.2" dependencies = [ "accelerate-src", "anyhow", @@ -2440,7 +2440,7 @@ dependencies = [ [[package]] name = "mistralrs-vision" -version = "0.2.1" +version = "0.2.2" dependencies = [ "candle-core", "image", diff --git a/Cargo.toml b/Cargo.toml index 02e927723..af7e64d6f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ exclude = [ resolver = "2" [workspace.package] -version = "0.2.1" +version = "0.2.2" edition = "2021" description = "Fast and easy LLM serving." homepage = "https://github.com/EricLBuehler/mistral.rs" diff --git a/README.md b/README.md index 86deaf9ab..f42162409 100644 --- a/README.md +++ b/README.md @@ -35,10 +35,10 @@ Please submit requests for new models [here](https://github.com/EricLBuehler/mis ./mistralrs_server -i toml -f toml-selectors/anymoe_lora.toml ``` -- 💎 Run the Gemma 2 model +- 🦙 Run the Llama 3.1 model ``` - ./mistralrs_server -i plain -m google/gemma-2-9b-it -a gemma2 + ./mistralrs_server -i plain -m meta-llama/Meta-Llama-3.1-8B-Instruct -a llama ``` - φ³ Run the Phi 3 model with 128K context window @@ -189,7 +189,7 @@ Please submit more benchmarks via raising an issue! > Note: You can use our [Docker containers here](https://github.com/EricLBuehler/mistral.rs/pkgs/container/mistral.rs). > Learn more about running Docker containers: https://docs.docker.com/engine/reference/run/ -> Note: You can use pre-built `mistralrs-server` binaries [here](https://github.com/EricLBuehler/mistral.rs/releases/tag/v0.2.1) +> Note: You can use pre-built `mistralrs-server` binaries [here](https://github.com/EricLBuehler/mistral.rs/releases/tag/v0.2.2) - Install the [Python package here](mistralrs-pyo3/README.md). diff --git a/mistralrs-bench/Cargo.toml b/mistralrs-bench/Cargo.toml index 6012b4cda..d6175421d 100644 --- a/mistralrs-bench/Cargo.toml +++ b/mistralrs-bench/Cargo.toml @@ -18,7 +18,7 @@ candle-core.workspace = true serde.workspace = true serde_json.workspace = true clap.workspace = true -mistralrs-core = { version = "0.2.1", path = "../mistralrs-core" } +mistralrs-core = { version = "0.2.2", path = "../mistralrs-core" } tracing.workspace = true either.workspace = true tokio.workspace = true diff --git a/mistralrs-core/Cargo.toml b/mistralrs-core/Cargo.toml index b5e9aa445..921867453 100644 --- a/mistralrs-core/Cargo.toml +++ b/mistralrs-core/Cargo.toml @@ -64,13 +64,13 @@ tracing-subscriber.workspace = true derive-new = "0.6.0" itertools = "0.13.0" sysinfo = "0.30.12" -mistralrs-vision = { version = "0.2.1", path = "../mistralrs-vision" } +mistralrs-vision = { version = "0.2.2", path = "../mistralrs-vision" } csv = "1.3.0" reqwest.workspace = true base64.workspace = true bytemuck_derive = "1.7.0" plotly = { version = "0.9.0", features = ["kaleido"], optional = true } -mistralrs-paged-attn = { version = "0.2.1", path = "../mistralrs-paged-attn", optional = true } +mistralrs-paged-attn = { version = "0.2.2", path = "../mistralrs-paged-attn", optional = true } [features] default = ["plotly"] diff --git a/mistralrs-core/src/pipeline/isq.rs b/mistralrs-core/src/pipeline/isq.rs index b7d6c6f38..88409e76b 100644 --- a/mistralrs-core/src/pipeline/isq.rs +++ b/mistralrs-core/src/pipeline/isq.rs @@ -142,74 +142,78 @@ pub trait IsqModel { { let (tensors, mapper) = self.get_biases(); let total_tensors = tensors.len(); - info!("Applying in-situ quantization bias device mapping to {total_tensors} biases."); - let bar = ProgressBar::new(total_tensors as u64); - bar.set_style( - ProgressStyle::default_bar() - .template("[{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} ({eta})") - .unwrap() - .progress_chars("#>-"), - ); - - let mut devices = Vec::new(); - for (_, layer) in &tensors { - let device = if let Some(layer) = layer { - mapper.device_for(*layer, false).unwrap_or(&device) - } else { - &device - }; - devices.push(device.clone()); - } - - let t_start = Instant::now(); - #[cfg(not(feature = "metal"))] - { - // NOTE(EricLBuehler): On version 0.2.0, remove this - let isq_low_mem = std::env::var("ISQ_LOW_MEMORY").is_ok(); - if isq_low_mem { - warn!("ISQ_LOW_MEMORY is set but as of version 0.1.24, this is irrelevant"); + if total_tensors > 0 { + info!( + "Applying in-situ quantization bias device mapping to {total_tensors} biases." + ); + let bar = ProgressBar::new(total_tensors as u64); + bar.set_style( + ProgressStyle::default_bar() + .template("[{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} ({eta})") + .unwrap() + .progress_chars("#>-"), + ); + + let mut devices = Vec::new(); + for (_, layer) in &tensors { + let device = if let Some(layer) = layer { + mapper.device_for(*layer, false).unwrap_or(&device) + } else { + &device + }; + devices.push(device.clone()); } - info!("Applying ISQ on {} threads.", rayon::current_num_threads()); - - use indicatif::ParallelProgressIterator; - use rayon::iter::{ - IndexedParallelIterator, IntoParallelIterator, ParallelIterator, - }; - tensors - .into_par_iter() - .zip(devices) - .progress_with(bar) - .for_each(|((tensor, _), device)| { - if let Some(tensor) = tensor { - *tensor = tensor - .to_device(&device) - .unwrap() - .to_dtype(DType::F32) - .unwrap(); - } - }); - } + let t_start = Instant::now(); + #[cfg(not(feature = "metal"))] + { + // NOTE(EricLBuehler): On version 0.2.0, remove this + let isq_low_mem = std::env::var("ISQ_LOW_MEMORY").is_ok(); + if isq_low_mem { + warn!("ISQ_LOW_MEMORY is set but as of version 0.1.24, this is irrelevant"); + } + + info!("Applying ISQ on {} threads.", rayon::current_num_threads()); + + use indicatif::ParallelProgressIterator; + use rayon::iter::{ + IndexedParallelIterator, IntoParallelIterator, ParallelIterator, + }; + tensors + .into_par_iter() + .zip(devices) + .progress_with(bar) + .for_each(|((tensor, _), device)| { + if let Some(tensor) = tensor { + *tensor = tensor + .to_device(&device) + .unwrap() + .to_dtype(DType::F32) + .unwrap(); + } + }); + } - #[cfg(feature = "metal")] - { - use indicatif::ProgressIterator; - tensors - .into_iter() - .zip(devices) - .progress_with(bar) - .for_each(|((tensor, _), device)| { - if let Some(tensor) = tensor { - *tensor = tensor - .to_device(&device) - .unwrap() - .to_dtype(DType::F32) - .unwrap(); - } - }); + #[cfg(feature = "metal")] + { + use indicatif::ProgressIterator; + tensors + .into_iter() + .zip(devices) + .progress_with(bar) + .for_each(|((tensor, _), device)| { + if let Some(tensor) = tensor { + *tensor = tensor + .to_device(&device) + .unwrap() + .to_dtype(DType::F32) + .unwrap(); + } + }); + } + let delta = Instant::now().duration_since(t_start).as_secs_f32(); + info!("Applied in-situ quantization device mapping. Took {delta:.2}s",); } - let delta = Instant::now().duration_since(t_start).as_secs_f32(); - info!("Applied in-situ quantization device mapping. Took {delta:.2}s",); } Ok(()) } diff --git a/mistralrs-pyo3/Cargo.toml b/mistralrs-pyo3/Cargo.toml index 75ec404e1..e582d94e5 100644 --- a/mistralrs-pyo3/Cargo.toml +++ b/mistralrs-pyo3/Cargo.toml @@ -17,7 +17,7 @@ doc = false [dependencies] pyo3.workspace = true -mistralrs-core = { version = "0.2.1", path = "../mistralrs-core", features = ["pyo3_macros"] } +mistralrs-core = { version = "0.2.2", path = "../mistralrs-core", features = ["pyo3_macros"] } serde.workspace = true serde_json.workspace = true candle-core.workspace = true diff --git a/mistralrs-pyo3/Cargo_template.toml b/mistralrs-pyo3/Cargo_template.toml index de0990a85..2e12e79cb 100644 --- a/mistralrs-pyo3/Cargo_template.toml +++ b/mistralrs-pyo3/Cargo_template.toml @@ -17,7 +17,7 @@ doc = false [dependencies] pyo3.workspace = true -mistralrs-core = { version = "0.2.1", path = "../mistralrs-core", features=["pyo3_macros","$feature_name"] } +mistralrs-core = { version = "0.2.2", path = "../mistralrs-core", features=["pyo3_macros","$feature_name"] } serde.workspace = true serde_json.workspace = true candle-core = { git = "https://github.com/EricLBuehler/candle.git", version = "0.6.0", rev = "c967be9", features=["$feature_name"] } diff --git a/mistralrs-pyo3/pyproject.toml b/mistralrs-pyo3/pyproject.toml index d235cbaf4..e3af1c14b 100644 --- a/mistralrs-pyo3/pyproject.toml +++ b/mistralrs-pyo3/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "mistralrs" -version = "0.2.1" +version = "0.2.2" requires-python = ">=3.8" classifiers = [ "Programming Language :: Rust", diff --git a/mistralrs-pyo3/pyproject_template.toml b/mistralrs-pyo3/pyproject_template.toml index 50d61eee2..ad15af097 100644 --- a/mistralrs-pyo3/pyproject_template.toml +++ b/mistralrs-pyo3/pyproject_template.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "$name" -version = "0.2.1" +version = "0.2.2" requires-python = ">=3.8" classifiers = [ "Programming Language :: Rust", diff --git a/mistralrs-server/Cargo.toml b/mistralrs-server/Cargo.toml index aa41865f2..87375804d 100644 --- a/mistralrs-server/Cargo.toml +++ b/mistralrs-server/Cargo.toml @@ -22,7 +22,7 @@ axum = { version = "0.7.4", features = ["tokio"] } tower-http = { version = "0.5.1", features = ["cors"]} utoipa = { version = "4.2", features = ["axum_extras"] } utoipa-swagger-ui = { version = "7.1.0", features = ["axum"]} -mistralrs-core = { version = "0.2.1", path = "../mistralrs-core" } +mistralrs-core = { version = "0.2.2", path = "../mistralrs-core" } indexmap.workspace = true accelerate-src = { workspace = true, optional = true } intel-mkl-src = { workspace = true, optional = true } diff --git a/mistralrs/Cargo.toml b/mistralrs/Cargo.toml index 8b0eb3210..1a1e45dfd 100644 --- a/mistralrs/Cargo.toml +++ b/mistralrs/Cargo.toml @@ -12,7 +12,7 @@ license.workspace = true homepage.workspace = true [dependencies] -mistralrs-core = { version = "0.2.1", path = "../mistralrs-core" } +mistralrs-core = { version = "0.2.2", path = "../mistralrs-core" } anyhow.workspace = true tokio.workspace = true candle-core.workspace = true