From 4c175c49a4468b4476c8917c13d90b4fee9b1b3c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=81=E8=A1=8C?=
Date: Fri, 22 Dec 2023 17:28:12 +0800
Subject: [PATCH] support phi-2.

---
 README.md         |  14 +--
 README_en.md      | 283 +++++++++++++++++++++++++---------------
 include/llm.hpp   |  13 +++
 ios/README.md     |   2 +-
 src/llm.cpp       |  13 +++
 src/tokenizer.cpp |   2 -
 6 files changed, 190 insertions(+), 137 deletions(-)

diff --git a/README.md b/README.md
index e8ee06f5..ca5d6b57 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 [![License](https://img.shields.io/github/license/wangzhaode/mnn-llm)](LICENSE.txt)
 [![Download](https://img.shields.io/github/downloads/wangzhaode/mnn-llm/total)](https://github.com/wangzhaode/mnn-llm/releases)
 
-[Read me in english ](./README_en.md)
+[English](./README_en.md)
 
 ## 示例工程
 
@@ -15,7 +15,7 @@
 
 ## 模型支持
 
-llm模型导出onnx模型请使用[llm-export](https://github.com/wangzhaode/llm-export)
+llm模型导出`onnx`和`mnn`模型请使用[llm-export](https://github.com/wangzhaode/llm-export)
 
 当前支持以模型：
 
@@ -153,17 +153,17 @@
 
 ```bash
 # linux/macos
-./cli_demo # cli demo
-./web_demo # web ui demo
+./cli_demo qwen-1.8b-int4 # cli demo
+./web_demo qwen-1.8b-int4 ../web # web ui demo
 
 # windows
-.\Debug\cli_demo.exe
-.\Debug\web_demo.exe
+.\Debug\cli_demo.exe qwen-1.8b-int4
+.\Debug\web_demo.exe qwen-1.8b-int4 ../web
 
 # android
 adb push libs/*.so build/libllm.so build/cli_demo /data/local/tmp
 adb push model_dir /data/local/tmp
-adb shell "cd /data/local/tmp && export LD_LIBRARY_PATH=. && ./cli_demo -m model"
+adb shell "cd /data/local/tmp && export LD_LIBRARY_PATH=. && ./cli_demo qwen-1.8b-int4"
 ```
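The usage hunks above change the demo invocation: the first argument is now the converted model directory (for example `qwen-1.8b-int4`), and `web_demo` additionally takes the path to the web resources. For orientation, a minimal sketch of how a demo consumes that argument is given below. It is illustrative only and not part of this patch: it assumes the `Llm::createLLM` factory shown later in `src/llm.cpp` plus the `load`/`response` entry points used by the existing demo sources, and the directory name is just a placeholder.

```cpp
// Illustrative sketch (not part of this patch): argv[1] is the converted model
// directory, matching the updated README usage.
#include <iostream>
#include <string>
#include "llm.hpp"

int main(int argc, const char* argv[]) {
    if (argc < 2) {
        std::cout << "Usage: ./cli_demo <model_dir>" << std::endl;
        return 0;
    }
    std::string model_dir = argv[1];                  // e.g. ./qwen-1.8b-int4
    // The factory in src/llm.cpp picks the model class by substring match on this string.
    Llm* llm = Llm::createLLM(model_dir, model_dir);
    llm->load(model_dir);   // assumed API: load the converted *.mnn weights and tokenizer
    llm->response("你好");   // assumed API: one prefill + decode round
    delete llm;
    return 0;
}
```

A similar end-to-end sketch for the new phi-2 entry appears after the last hunk of this patch.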
diff --git a/README_en.md b/README_en.md
index a04a45d9..77502306 100644
--- a/README_en.md
+++ b/README_en.md
@@ -1,148 +1,177 @@
-# mnn-llm
-## Support
-- chatglm-6b
-- chatglm2-6b
-- codegeex2-6b
-- Qwen-7B-Chat
-- Baichuan2-7B-Chat
-
-## Usage
-### 0. Model export and convert
-Using [LLMExporter](https://github.com/wangzhaode/LLMExporter) export llm model to `onnx` format,and then using `mnnconvert` convert to `mnn` model.
-
-
-### 1. Download this project
-```bash
-git clone https://github.com/wangzhaode/mnn-llm.git
-```
+![mnn-llm](resource/logo.png)
 
-### 2. Compile MNN library
-Compile MNN from source code, the latest release version is 2.5.0
-
-```bash
-git clone https://github.com/alibaba/MNN.git -b 2.5.0
+# mnn-llm
+[![License](https://img.shields.io/github/license/wangzhaode/mnn-llm)](LICENSE.txt)
+[![Download](https://img.shields.io/github/downloads/wangzhaode/mnn-llm/total)](https://github.com/wangzhaode/mnn-llm/releases)
+
+[Chinese](./README.md)
+
+## Example Projects
+
+- [cli](./demo/cli_demo.cpp): Compile using the command line; for Android, refer to [android_build.sh](./script/android_build.sh)
+- [web](./demo/web_demo.cpp): Compile using the command line; at runtime the [web resources](./web) directory must be specified
+- [android](./android/): Open with Android Studio to build; APK download: [![Download][download-qwen-1.8b-apk]][release-qwen-1.8b-apk]
+- [ios](./ios/README.md): Open with Xcode to build; 🚀🚀🚀**This sample code is 100% generated by ChatGPT**🚀🚀🚀
+
+## Model Support
+
+To export an LLM to `onnx` and `mnn` models, please use [llm-export](https://github.com/wangzhaode/llm-export)
+
+Currently supported models:
+
+| model | onnx-fp32 | mnn-int4 |
+|-------|-----------|----------|
+| chatglm-6b | [![Download][download-chatglm-6b-onnx]][release-chatglm-6b-onnx] | [![Download][download-chatglm-6b-mnn]][release-chatglm-6b-mnn] |
+| chatglm2-6b | [![Download][download-chatglm2-6b-onnx]][release-chatglm2-6b-onnx] | [![Download][download-chatglm2-6b-mnn]][release-chatglm2-6b-mnn] |
+| chatglm3-6b | [![Download][download-chatglm3-6b-onnx]][release-chatglm3-6b-onnx] | [![Download][download-chatglm3-6b-mnn]][release-chatglm3-6b-mnn] |
+| codegeex2-6b | [![Download][download-codegeex2-6b-onnx]][release-codegeex2-6b-onnx] | [![Download][download-codegeex2-6b-mnn]][release-codegeex2-6b-mnn] |
+| Qwen-7B-Chat | [![Download][download-qwen-7b-chat-onnx]][release-qwen-7b-chat-onnx] | [![Download][download-qwen-7b-chat-mnn]][release-qwen-7b-chat-mnn] |
+| Baichuan2-7B-Chat | [![Download][download-baichuan2-7b-chat-onnx]][release-baichuan2-7b-chat-onnx] | [![Download][download-baichuan2-7b-chat-mnn]][release-baichuan2-7b-chat-mnn] |
+| Llama-2-7b-chat | [![Download][download-llama2-7b-chat-onnx]][release-llama2-7b-chat-onnx] | [![Download][download-llama2-7b-chat-mnn]][release-llama2-7b-chat-mnn] |
+| Qwen-1_8B-Chat | [![Download][download-qwen-1.8b-onnx]][release-qwen-1.8b-onnx] | [![Download][download-qwen-1.8b-mnn]][release-qwen-1.8b-mnn] |
+
+Other versions:
+- Qwen-1_8B-Chat-int8: [![Download][download-qwen-1.8b-mnn-int8]][release-qwen-1.8b-mnn-int8]
+
+[download-chatglm-6b-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/chatglm-6b-onnx/total
+[download-chatglm2-6b-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/chatglm2-6b-onnx/total
+[download-chatglm3-6b-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/chatglm3-6b-onnx/total
+[download-codegeex2-6b-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/codegeex2-6b-onnx/total
+[download-qwen-7b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/qwen-7b-chat-onnx/total
+[download-baichuan2-7b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/baichuan2-7b-chat-onnx/total
+[download-llama2-7b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/llama2-7b-chat-onnx/total
+[download-qwen-1.8b-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/qwen-1.8b-onnx/total
+[release-chatglm-6b-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/chatglm-6b-onnx
+[release-chatglm2-6b-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/chatglm2-6b-onnx
+[release-chatglm3-6b-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/chatglm3-6b-onnx
+[release-codegeex2-6b-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/codegeex2-6b-onnx
+[release-qwen-7b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/qwen-7b-chat-onnx
+[release-baichuan2-7b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/baichuan2-7b-chat-onnx
+[release-llama2-7b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/llama2-7b-chat-onnx
+[release-qwen-1.8b-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/qwen-1.8b-onnx
+
+[download-chatglm-6b-mnn]: https://img.shields.io/github/downloads/wangzhaode/mnn-llm/chatglm-6b-mnn/total
+[download-chatglm2-6b-mnn]: https://img.shields.io/github/downloads/wangzhaode/mnn-llm/chatglm2-6b-mnn/total
+[download-chatglm3-6b-mnn]: https://img.shields.io/github/downloads/wangzhaode/mnn-llm/chatglm3-6b-mnn/total
+[download-codegeex2-6b-mnn]: https://img.shields.io/github/downloads/wangzhaode/mnn-llm/codegeex2-6b-mnn/total
+[download-qwen-7b-chat-mnn]: https://img.shields.io/github/downloads/wangzhaode/mnn-llm/qwen-7b-chat-mnn/total
+[download-baichuan2-7b-chat-mnn]: https://img.shields.io/github/downloads/wangzhaode/mnn-llm/baichuan2-7b-chat-mnn/total
+[download-llama2-7b-chat-mnn]: https://img.shields.io/github/downloads/wangzhaode/mnn-llm/llama2-7b-chat-mnn/total
+[download-qwen-1.8b-mnn]: https://img.shields.io/github/downloads/wangzhaode/mnn-llm/qwen-1.8b-mnn/total
+[download-qwen-1.8b-mnn-int8]: https://img.shields.io/github/downloads/wangzhaode/mnn-llm/qwen-1.8b-mnn-int8/total
+[download-qwen-1.8b-apk]: https://img.shields.io/github/downloads/wangzhaode/mnn-llm/qwen-1.8b-apk/total
+
+[release-chatglm-6b-mnn]: https://github.com/wangzhaode/mnn-llm/releases/tag/chatglm-6b-mnn
+[release-chatglm2-6b-mnn]: https://github.com/wangzhaode/mnn-llm/releases/tag/chatglm2-6b-mnn
+[release-chatglm3-6b-mnn]: https://github.com/wangzhaode/mnn-llm/releases/tag/chatglm3-6b-mnn
+[release-codegeex2-6b-mnn]: https://github.com/wangzhaode/mnn-llm/releases/tag/codegeex2-6b-mnn
+[release-qwen-7b-chat-mnn]: https://github.com/wangzhaode/mnn-llm/releases/tag/qwen-7b-chat-mnn
+[release-baichuan2-7b-chat-mnn]: https://github.com/wangzhaode/mnn-llm/releases/tag/baichuan2-7b-chat-mnn
+[release-llama2-7b-chat-mnn]: https://github.com/wangzhaode/mnn-llm/releases/tag/llama2-7b-chat-mnn
+[release-qwen-1.8b-mnn]: https://github.com/wangzhaode/mnn-llm/releases/tag/qwen-1.8b-mnn
+[release-qwen-1.8b-mnn-int8]: https://github.com/wangzhaode/mnn-llm/releases/tag/qwen-1.8b-mnn-int8
+[release-qwen-1.8b-apk]: https://github.com/wangzhaode/mnn-llm/releases/tag/qwen-1.8b-apk
+
+
+### Performance
+
+#### CPU 4-thread speed: `prefill / decode` in `tok/s`
+
+| model | android (f16/32) | macos (f32) | linux (f32) | windows (f32) |
+|:-----------------:|:--------------:|:-------------:|:--------------:|:--------------:|
+| qwen-1.8b-int4 | 100.21 / 22.22 | 84.85 / 19.93 | 151.00 / 35.89 | 117.30 / 33.40 |
+| qwen-1.8b-int8 | 99.95 / 16.94 | 67.70 / 13.45 | 118.51 / 24.90 | 97.19 / 22.76 |
+| chatglm-6b-int4 | 17.37 / 6.69 | 19.79 / 6.10 | 34.05 / 10.82 | 30.73 / 10.63 |
+| chatglm2-6b-int4 | 26.41 / 8.21 | 20.78 / 6.70 | 36.99 / 11.50 | 33.25 / 11.47 |
+| chatglm3-6b-int4 | 26.24 / 7.94 | 19.67 / 6.67 | 37.33 / 11.92 | 33.61 / 11.21 |
+| qwen-7b-int4 | 14.60 / 6.96 | 19.79 / 6.06 | 33.55 / 10.20 | 29.05 / 9.62 |
+| baichuan2-7b-int4 | 13.87 / 6.08 | 17.21 / 6.10 | 30.11 / 10.87 | 26.31 / 9.84 |
+| llama-2-7b-int4 | 17.98 / 5.17 | 19.72 / 5.06 | 34.47 / 9.29 | 28.66 / 8.90 |
+
+The tested systems and devices are as follows:
+
+| os | device | CPU | Memory |
+|:--:|:-------:|:----:|:--------:|
+| android | Xiaomi 12 | Snapdragon 8 Gen 1 | 8 GB |
+| macos | MacBook Pro 2019 | Intel(R) Core(TM) i7-9750H | 16 GB |
+| linux | PC | Intel(R) Core(TM) i7-13700K | 32 GB |
+| windows | PC | Intel(R) Core(TM) i7-13700K | 32 GB |
+
+### Downloading INT4 Models
 ```
+# e.g. `chatglm-6b`
+# linux/macos
+./script/download_model.sh
 
-- Enter the MNN project, and build a Build directory ready to compile
-```bash
-cd MNN
-mkdir build && cd build
+# windows
+./script/download_model.ps1
 ```
 
-- Formally compiled, CPU/CUDA/OpenCL can be selected. It is recommended to choose CUDA if you have an NVIDIA graphics card, choose CPU if you don't have a graphics card, and choose OpenCL if you have an AMD graphics card.
-```bash
-# CPU only(Suport Linux/Mac/Windows)
-cmake -DCMAKE_BUILD_TYPE=Release ..
-
-# using CUDA(Support Linux)
-cmake -DCMAKE_BUILD_TYPE=Release -DMNN_CUDA=ON ..
+## Building
 
-# using OPENCL
-cmake -DCMAKE_BUILD_TYPE=Release -DMNN_OPENCL=ON -DMNN_USE_SYSTEM_LIB=ON -DMNN_SEP_BUILD=OFF ..
+Current build status:
 
-# start build(support Linux/Mac)
-make -j$(nproc)
+| System | Build Status |
+|:------:|:------------:|
+| Linux | [![Build Status][pass-linux]][ci-linux] |
+| macOS | [![Build Status][pass-macos]][ci-macos] |
+| Windows | [![Build Status][pass-windows]][ci-windows] |
+| Android | [![Build Status][pass-android]][ci-android] |
 
-# start build(support Windows)
-cmake --build . -- /m:8
+[pass-linux]: https://github.com/wangzhaode/mnn-llm/actions/workflows/linux.yml/badge.svg
+[pass-macos]: https://github.com/wangzhaode/mnn-llm/actions/workflows/macos.yml/badge.svg
+[pass-windows]: https://github.com/wangzhaode/mnn-llm/actions/workflows/windows.yml/badge.svg
+[pass-android]: https://github.com/wangzhaode/mnn-llm/actions/workflows/android.yml/badge.svg
+[ci-linux]: https://github.com/wangzhaode/mnn-llm/actions/workflows/linux.yml
+[ci-macos]: https://github.com/wangzhaode/mnn-llm/actions/workflows/macos.yml
+[ci-windows]: https://github.com/wangzhaode/mnn-llm/actions/workflows/windows.yml
+[ci-android]: https://github.com/wangzhaode/mnn-llm/actions/workflows/android.yml
+
+### Local Compilation
 ```
+# linux
+./script/linux_build.sh
 
-- Back to ChatGLM-MNN
-```bash
-cd ../..
-```
+# macos
+./script/macos_build.sh
 
-- Copy the compilation result of the MNN library to `mnn-llm/libs`
-```bash
-# for Linux/Mac
-cp -r MNN/include/MNN include
-cp MNN/build/libMNN.so libs/
-cp MNN/build/express/*.so libs/
-
-# for windows
-cp -r MNN/include/MNN include
-cp MNN/build/Debug/MNN.dll libs/
-cp MNN/build/Debug/MNN.lib libs/
-```
-- For Windows, you also need to download the third-party library pthread from [here](https://gigenet.dl.sourceforge.net/project/pthreads4w/pthreads-w32-2-9-1-release.zip), unzip it, open `Pre-built.2\lib\x64`, and copy the pthreadVC2.lib file to the libs folder of ChatGLM-MNN. Then open `Pre-built.2\include` and place the following three .h files in the include folder of ChatGLM-MNN.
-For Windows, the final file structure of the project is as follows:
-```bash
-├───libs
-│   ├───MNN.dll
-│   ├───MNN.lib
-│   └───pthreadVC2.lib
-├───include
-│   ├───cppjieba
-│   ├───limonp
-│   ├───MNN
-│   ├───chat.hpp
-│   ├───httplib.h
-│   ├───pthread.h
-│   ├───sched.h
-│   └───semaphore.h
-```
+# windows msvc
+./script/windows_build.ps1
 
-### 3. Download Models
-Download model files from github release to /path/to/ChatGLM-MNN/resource/models, as follows:
-- 对于Linux/Mac
-```bash
-cd resource/models
-# download fp16(almost no loss of precision)
-./download_models.sh fp16
-# For Chinese users, you can use third-party services to speed up downloading the fp16 model
-./download_models.sh fp16 proxy
-
-# download int8(little loss of precision,recommend)
-./download_models.sh int8
-# For Chinese users, you can use third-party services to speed up downloading the int8 model
-./download_models.sh int8 proxy
-
-# download int4(some precision loss)
-./download_models.sh int4
-# For Chinese users, you can use third-party services to speed up downloading the int4 model
-./download_models.sh int4 proxy
+# android
+./script/android_build.sh
 ```
 
-- For Windows, replace 'xxx.sh' above with the 'xxx.ps1' file, for example:
-```powershell
-cd resource/models
+The default backend is `CPU`. To use a different backend, add the corresponding MNN compile macro inside the build script:
+- cuda: `-DMNN_CUDA=ON`
+- opencl: `-DMNN_OPENCL=ON`
 
-# download fp16(almost no loss of precision)
-./download_models.ps1 fp16
-# For Chinese users, you can use third-party services to speed up downloading the fp16 model
-./download_models.ps1 fp16 proxy
-```
-### 4. Build and Run
+### 4. Execution
 
-##### Mac/Linux/Windows:
 ```bash
-mkdir build
-cd build
-# for CPU
-cmake ..
-# for GPU
-cmake -D WITH_CUDA=on ..
-# for mini memory device
-cmake -D BUILD_MINI_MEM_MODE=on ..
-
-# start build(support Linux/Mac)
-make -j$(nproc)
-# start build(support Windows)
-cmake --build . -- /m:8
-
-# run (for Linux/Mac)
-./cli_demo # cli demo
-./web_demo # web ui demo
-# run (for Windows)
-.\Debug\cli_demo.exe
-.\Debug\web_demo.exe
+# linux/macos
+./cli_demo qwen-1.8b-int4 # cli demo
+./web_demo qwen-1.8b-int4 ../web # web ui demo
+
+# windows
+.\Debug\cli_demo.exe qwen-1.8b-int4
+.\Debug\web_demo.exe qwen-1.8b-int4 ../web
+
+# android
+adb push libs/*.so build/libllm.so build/cli_demo /data/local/tmp
+adb push model_dir /data/local/tmp
+adb shell "cd /data/local/tmp && export LD_LIBRARY_PATH=. && ./cli_demo qwen-1.8b-int4"
 ```
 
-##### Android:
-```
-mkdir build
-cd build
-../android_build.sh
-make -j8
-```
\ No newline at end of file
+
+## Reference
+- [chatglm-6b](https://modelscope.cn/models/ZhipuAI/chatglm-6b/summary)
+- [chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary)
+- [chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary)
+- [codegeex2-6b](https://modelscope.cn/models/ZhipuAI/codegeex2-6b/summary)
+- [Baichuan2-7B-Chat](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary)
+- [Qwen-7B-Chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary)
+- [Qwen-1.8B-Chat](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat/summary)
+- [cpp-httplib](https://github.com/yhirose/cpp-httplib)
+- [chatgpt-web](https://github.com/xqdoo00o/chatgpt-web)
+- [ChatViewDemo](https://github.com/BrettFX/ChatViewDemo)
\ No newline at end of file
diff --git a/include/llm.hpp b/include/llm.hpp
index 2af4989d..ea88a5b3 100644
--- a/include/llm.hpp
+++ b/include/llm.hpp
@@ -130,6 +130,19 @@ class Chatglm2_6b : public Llm {
     virtual bool is_stop(int token_id) override;
 };
 
+class Phi_2 : public Chatglm2_6b {
+public:
+    Phi_2() {
+        model_name_ = "Phi_2";
+        layer_nums_ = 32;
+        key_value_shape_ = {1, 0, 2, 32, 80};
+        hidden_size_ = 2560;
+        tokenizer_.reset(new Tiktoken);
+    }
+private:
+    virtual std::vector<int> tokenizer(const std::string& query) override;
+    virtual bool is_stop(int token_id) override;
+};
 
 class Qwen_7b : public Llm {
 public:
diff --git a/ios/README.md b/ios/README.md
index 8882da6e..bf4e70c5 100644
--- a/ios/README.md
+++ b/ios/README.md
@@ -1,6 +1,6 @@
 # mnn-llm ios demo
 
-🚀 本项目全部代码由`ChatGPT-4`生成。
+🚀 本示例代码全部由`ChatGPT-4`生成。
 
 ## 速度
 
diff --git a/src/llm.cpp b/src/llm.cpp
index ec7d7363..b8f9bbbd 100644
--- a/src/llm.cpp
+++ b/src/llm.cpp
@@ -50,6 +50,8 @@ Llm* Llm::createLLM(const std::string& path, std::string model_type) {
     } else if (model_type.find("baichuan") != std::string::npos) {
         llm = new Llama2_7b;
         llm->model_name_ = "Baichuan2_7b";
+    } else if (model_type.find("phi2") != std::string::npos) {
+        llm = new Phi_2;
     }
     if (!llm) {
         std::cerr << "model type can't judge!" << std::endl;
@@ -409,6 +411,17 @@ bool Chatglm2_6b::is_stop(int token_id) {
     return token_id <= 2;
 }
 
+// Phi_2
+std::vector<int> Phi_2::tokenizer(const std::string& query) {
+    auto prompt = query;
+    auto ids = tokenizer_encode(prompt);
+    return ids;
+}
+
+bool Phi_2::is_stop(int token_id) {
+    return token_id == 50256;
+}
+
 // Qwen_7b
 std::vector<int> Qwen_7b::tokenizer(const std::string& query) {
     auto ids = tokenizer_encode(query);
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 9534899a..df975d80 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -303,8 +303,6 @@ bool Sentencepiece::is_control(int id) const {
     return sentence_pieces_[id].type == PieceType::CONTROL;
 }
 
-const int CHARACTER_VOCABULARY_SIZE = 256;
-
 bool Tiktoken::load(const std::string& filename) {
     std::ifstream tok_file(filename);
     std::string token;
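Taken together, the new `Phi_2` class registers phi-2 as a 32-layer model with a 2560-dimensional hidden state (`key_value_shape_ = {1, 0, 2, 32, 80}` reads as key/value caches for 32 heads of head dimension 80, and 32 × 80 = 2560), reuses the `Tiktoken` tokenizer, and stops decoding at token id 50256, phi-2's GPT-2-style `<|endoftext|>`. A minimal sketch of how the new dispatch is reached is shown below; it is illustrative only, the model path is a placeholder, and the `load`/`response` calls are assumed from the existing demo flow rather than taken from this diff.

```cpp
// Illustrative sketch (not part of this patch): exercising the new "phi2" branch
// added to Llm::createLLM in src/llm.cpp.
#include <string>
#include "llm.hpp"

int main() {
    const std::string model_dir = "./phi-2-mnn";   // placeholder: directory with the converted mnn weights
    // A model_type containing "phi2" selects the Phi_2 class declared in include/llm.hpp.
    Llm* llm = Llm::createLLM(model_dir, "phi2");
    llm->load(model_dir);                          // assumed API, as used by the demos
    llm->response("Write a haiku about winter.");  // decoding stops once is_stop() sees token 50256
    delete llm;
    return 0;
}
```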