From 4c175c49a4468b4476c8917c13d90b4fee9b1b3c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=81=E8=A1=8C?=
Date: Fri, 22 Dec 2023 17:28:12 +0800
Subject: [PATCH] support phi-2.

---
 README.md         |  14 +--
 README_en.md      | 283 +++++++++++++++++++++++++---------------
 include/llm.hpp   |  13 +++
 ios/README.md     |   2 +-
 src/llm.cpp       |  13 +++
 src/tokenizer.cpp |   2 -
 6 files changed, 190 insertions(+), 137 deletions(-)

diff --git a/README.md b/README.md
index e8ee06f5..ca5d6b57 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 [![License](https://img.shields.io/github/license/wangzhaode/mnn-llm)](LICENSE.txt)
 [![Download](https://img.shields.io/github/downloads/wangzhaode/mnn-llm/total)](https://github.com/wangzhaode/mnn-llm/releases)
 
-[Read me in english ](./README_en.md)
+[English](./README_en.md)
 
 ## 示例工程
 
@@ -15,7 +15,7 @@
 
 ## 模型支持
 
-llm模型导出onnx模型请使用[llm-export](https://github.com/wangzhaode/llm-export)
+llm模型导出`onnx`和`mnn`模型请使用[llm-export](https://github.com/wangzhaode/llm-export)
 
 当前支持以模型：
 
@@ -153,17 +153,17 @@
 
 ```bash
 # linux/macos
-./cli_demo # cli demo
-./web_demo # web ui demo
+./cli_demo qwen-1.8b-int4 # cli demo
+./web_demo qwen-1.8b-int4 ../web # web ui demo
 
 # windows
-.\Debug\cli_demo.exe
-.\Debug\web_demo.exe
+.\Debug\cli_demo.exe qwen-1.8b-int4
+.\Debug\web_demo.exe qwen-1.8b-int4 ../web
 
 # android
 adb push libs/*.so build/libllm.so build/cli_demo /data/local/tmp
 adb push model_dir /data/local/tmp
-adb shell "cd /data/local/tmp && export LD_LIBRARY_PATH=. && ./cli_demo -m model"
+adb shell "cd /data/local/tmp && export LD_LIBRARY_PATH=. && ./cli_demo qwen-1.8b-int4"
 ```
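The usage hunks above change the demo invocation: the first argument is now the converted model directory (for example `qwen-1.8b-int4`), and `web_demo` additionally takes the path to the web resources. For orientation, a minimal sketch of how a demo consumes that argument is given below. It is illustrative only and not part of this patch: it assumes the `Llm::createLLM` factory shown later in `src/llm.cpp` plus the `load`/`response` entry points used by the existing demo sources, and the directory name is just a placeholder.

```cpp
// Illustrative sketch (not part of this patch): argv[1] is the converted model
// directory, matching the updated README usage.
#include <iostream>
#include <string>
#include "llm.hpp"

int main(int argc, const char* argv[]) {
    if (argc < 2) {
        std::cout << "Usage: ./cli_demo <model_dir>" << std::endl;
        return 0;
    }
    std::string model_dir = argv[1];                  // e.g. ./qwen-1.8b-int4
    // The factory in src/llm.cpp picks the model class by substring match on this string.
    Llm* llm = Llm::createLLM(model_dir, model_dir);
    llm->load(model_dir);   // assumed API: load the converted *.mnn weights and tokenizer
    llm->response("你好");   // assumed API: one prefill + decode round
    delete llm;
    return 0;
}
```

A similar end-to-end sketch for the new phi-2 entry appears after the last hunk of this patch.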
diff --git a/README_en.md b/README_en.md
index a04a45d9..77502306 100644
--- a/README_en.md
+++ b/README_en.md
@@ -1,148 +1,177 @@
-# mnn-llm
-## Support
-- chatglm-6b
-- chatglm2-6b
-- codegeex2-6b
-- Qwen-7B-Chat
-- Baichuan2-7B-Chat
-
-## Usage
-### 0. Model export and convert
-Using [LLMExporter](https://github.com/wangzhaode/LLMExporter) export llm model to `onnx` format,and then using `mnnconvert` convert to `mnn` model.
-
-
-### 1. Download this project
-```bash
-git clone https://github.com/wangzhaode/mnn-llm.git
-```
+![mnn-llm](resource/logo.png)
 
-### 2. Compile MNN library
-Compile MNN from source code, the latest release version is 2.5.0
-
-```bash
-git clone https://github.com/alibaba/MNN.git -b 2.5.0
+# mnn-llm
+[![License](https://img.shields.io/github/license/wangzhaode/mnn-llm)](LICENSE.txt)
+[![Download](https://img.shields.io/github/downloads/wangzhaode/mnn-llm/total)](https://github.com/wangzhaode/mnn-llm/releases)
+
+[Chinese](./README.md)
+
+## Example Projects
+
+- [cli](./demo/cli_demo.cpp): Compile using the command line; for Android, refer to [android_build.sh](./script/android_build.sh)
+- [web](./demo/web_demo.cpp): Compile using the command line; at runtime the [web resources](./web) directory must be specified
+- [android](./android/): Open with Android Studio to build; APK download: [![Download][download-qwen-1.8b-apk]][release-qwen-1.8b-apk]
+- [ios](./ios/README.md): Open with Xcode to build; 🚀🚀🚀**This sample code is 100% generated by ChatGPT**🚀🚀🚀
+
+## Model Support
+
+To export an LLM to `onnx` and `mnn` models, please use [llm-export](https://github.com/wangzhaode/llm-export)
+
+Currently supported models:
+
+| model | onnx-fp32 | mnn-int4 |
+|-------|-----------|----------|
+| chatglm-6b | [![Download][download-chatglm-6b-onnx]][release-chatglm-6b-onnx] | [![Download][download-chatglm-6b-mnn]][release-chatglm-6b-mnn] |
+| chatglm2-6b | [![Download][download-chatglm2-6b-onnx]][release-chatglm2-6b-onnx] | [![Download][download-chatglm2-6b-mnn]][release-chatglm2-6b-mnn] |
+| chatglm3-6b | [![Download][download-chatglm3-6b-onnx]][release-chatglm3-6b-onnx] | [![Download][download-chatglm3-6b-mnn]][release-chatglm3-6b-mnn] |
+| codegeex2-6b | [![Download][download-codegeex2-6b-onnx]][release-codegeex2-6b-onnx] | [![Download][download-codegeex2-6b-mnn]][release-codegeex2-6b-mnn] |
+| Qwen-7B-Chat | [![Download][download-qwen-7b-chat-onnx]][release-qwen-7b-chat-onnx] | [![Download][download-qwen-7b-chat-mnn]][release-qwen-7b-chat-mnn] |
+| Baichuan2-7B-Chat | [![Download][download-baichuan2-7b-chat-onnx]][release-baichuan2-7b-chat-onnx] | [![Download][download-baichuan2-7b-chat-mnn]][release-baichuan2-7b-chat-mnn] |
+| Llama-2-7b-chat | [![Download][download-llama2-7b-chat-onnx]][release-llama2-7b-chat-onnx] | [![Download][download-llama2-7b-chat-mnn]][release-llama2-7b-chat-mnn] |
+| Qwen-1_8B-Chat | [![Download][download-qwen-1.8b-onnx]][release-qwen-1.8b-onnx] | [![Download][download-qwen-1.8b-mnn]][release-qwen-1.8b-mnn] |
+
+Other versions:
+- Qwen-1_8B-Chat-int8: [![Download][download-qwen-1.8b-mnn-int8]][release-qwen-1.8b-mnn-int8]
+
+[download-chatglm-6b-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/chatglm-6b-onnx/total
+[download-chatglm2-6b-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/chatglm2-6b-onnx/total
+[download-chatglm3-6b-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/chatglm3-6b-onnx/total
+[download-codegeex2-6b-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/codegeex2-6b-onnx/total
+[download-qwen-7b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/qwen-7b-chat-onnx/total
+[download-baichuan2-7b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/baichuan2-7b-chat-onnx/total
+[download-llama2-7b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/llama2-7b-chat-onnx/total
+[download-qwen-1.8b-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/qwen-1.8b-onnx/total
+[release-chatglm-6b-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/chatglm-6b-onnx
+[release-chatglm2-6b-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/chatglm2-6b-onnx
+[release-chatglm3-6b-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/chatglm3-6b-onnx
+[release-codegeex2-6b-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/codegeex2-6b-onnx
+[release-qwen-7b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/qwen-7b-chat-onnx
+[release-baichuan2-7b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/baichuan2-7b-chat-onnx
+[release-llama2-7b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/llama2-7b-chat-onnx
+[release-qwen-1.8b-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/qwen-1.8b-onnx
+
+[download-chatglm-6b-mnn]: https://img.shields.io/github/downloads/wangzhaode/mnn-llm/chatglm-6b-mnn/total
+[download-chatglm2-6b-mnn]: https://img.shields.io/github/downloads/wangzhaode/mnn-llm/chatglm2-6b-mnn/total
+[download-chatglm3-6b-mnn]: https://img.shields.io/github/downloads/wangzhaode/mnn-llm/chatglm3-6b-mnn/total
+[download-codegeex2-6b-mnn]: https://img.shields.io/github/downloads/wangzhaode/mnn-llm/codegeex2-6b-mnn/total
+[download-qwen-7b-chat-mnn]: https://img.shields.io/github/downloads/wangzhaode/mnn-llm/qwen-7b-chat-mnn/total
+[download-baichuan2-7b-chat-mnn]: https://img.shields.io/github/downloads/wangzhaode/mnn-llm/baichuan2-7b-chat-mnn/total
+[download-llama2-7b-chat-mnn]: https://img.shields.io/github/downloads/wangzhaode/mnn-llm/llama2-7b-chat-mnn/total
+[download-qwen-1.8b-mnn]: https://img.shields.io/github/downloads/wangzhaode/mnn-llm/qwen-1.8b-mnn/total
+[download-qwen-1.8b-mnn-int8]: https://img.shields.io/github/downloads/wangzhaode/mnn-llm/qwen-1.8b-mnn-int8/total
+[download-qwen-1.8b-apk]: https://img.shields.io/github/downloads/wangzhaode/mnn-llm/qwen-1.8b-apk/total
+
+[release-chatglm-6b-mnn]: https://github.com/wangzhaode/mnn-llm/releases/tag/chatglm-6b-mnn
+[release-chatglm2-6b-mnn]: https://github.com/wangzhaode/mnn-llm/releases/tag/chatglm2-6b-mnn
+[release-chatglm3-6b-mnn]: https://github.com/wangzhaode/mnn-llm/releases/tag/chatglm3-6b-mnn
+[release-codegeex2-6b-mnn]: https://github.com/wangzhaode/mnn-llm/releases/tag/codegeex2-6b-mnn
+[release-qwen-7b-chat-mnn]: https://github.com/wangzhaode/mnn-llm/releases/tag/qwen-7b-chat-mnn
+[release-baichuan2-7b-chat-mnn]: https://github.com/wangzhaode/mnn-llm/releases/tag/baichuan2-7b-chat-mnn
+[release-llama2-7b-chat-mnn]: https://github.com/wangzhaode/mnn-llm/releases/tag/llama2-7b-chat-mnn
+[release-qwen-1.8b-mnn]: https://github.com/wangzhaode/mnn-llm/releases/tag/qwen-1.8b-mnn
+[release-qwen-1.8b-mnn-int8]: https://github.com/wangzhaode/mnn-llm/releases/tag/qwen-1.8b-mnn-int8
+[release-qwen-1.8b-apk]: https://github.com/wangzhaode/mnn-llm/releases/tag/qwen-1.8b-apk
+
+
+### Performance
+
+#### CPU 4-thread speed: `prefill / decode` in `tok/s`
+
+| model | android (f16/32) | macos (f32) | linux (f32) | windows (f32) |
+|:-----------------:|:--------------:|:-------------:|:--------------:|:--------------:|
+| qwen-1.8b-int4 | 100.21 / 22.22 | 84.85 / 19.93 | 151.00 / 35.89 | 117.30 / 33.40 |
+| qwen-1.8b-int8 | 99.95 / 16.94 | 67.70 / 13.45 | 118.51 / 24.90 | 97.19 / 22.76 |
+| chatglm-6b-int4 | 17.37 / 6.69 | 19.79 / 6.10 | 34.05 / 10.82 | 30.73 / 10.63 |
+| chatglm2-6b-int4 | 26.41 / 8.21 | 20.78 / 6.70 | 36.99 / 11.50 | 33.25 / 11.47 |
+| chatglm3-6b-int4 | 26.24 / 7.94 | 19.67 / 6.67 | 37.33 / 11.92 | 33.61 / 11.21 |
+| qwen-7b-int4 | 14.60 / 6.96 | 19.79 / 6.06 | 33.55 / 10.20 | 29.05 / 9.62 |
+| baichuan2-7b-int4 | 13.87 / 6.08 | 17.21 / 6.10 | 30.11 / 10.87 | 26.31 / 9.84 |
+| llama-2-7b-int4 | 17.98 / 5.17 | 19.72 / 5.06 | 34.47 / 9.29 | 28.66 / 8.90 |
+
+The tested systems and devices are as follows:
+
+| os | device | CPU | Memory |
+|:--:|:-------:|:----:|:--------:|
+| android | Xiaomi 12 | Snapdragon 8 Gen 1 | 8 GB |
+| macos | MacBook Pro 2019 | Intel(R) Core(TM) i7-9750H | 16 GB |
+| linux | PC | Intel(R) Core(TM) i7-13700K | 32 GB |
+| windows | PC | Intel(R) Core(TM) i7-13700K | 32 GB |
+
+### Downloading INT4 Models
 ```
+# e.g. `chatglm-6b`
+# linux/macos
+./script/download_model.sh
 
-- Enter the MNN project, and build a Build directory ready to compile
-```bash
-cd MNN
-mkdir build && cd build
+# windows
+./script/download_model.ps1
 ```
 
-- Formally compiled, CPU/CUDA/OpenCL can be selected. It is recommended to choose CUDA if you have an NVIDIA graphics card, choose CPU if you don't have a graphics card, and choose OpenCL if you have an AMD graphics card.
-```bash
-# CPU only(Suport Linux/Mac/Windows)
-cmake -DCMAKE_BUILD_TYPE=Release ..
-
-# using CUDA(Support Linux)
-cmake -DCMAKE_BUILD_TYPE=Release -DMNN_CUDA=ON ..
+## Building
 
-# using OPENCL
-cmake -DCMAKE_BUILD_TYPE=Release -DMNN_OPENCL=ON -DMNN_USE_SYSTEM_LIB=ON -DMNN_SEP_BUILD=OFF ..
+Current build status:
 
-# start build(support Linux/Mac)
-make -j$(nproc)
+| System | Build Status |
+|:------:|:------------:|
+| Linux | [![Build Status][pass-linux]][ci-linux] |
+| macOS | [![Build Status][pass-macos]][ci-macos] |
+| Windows | [![Build Status][pass-windows]][ci-windows] |
+| Android | [![Build Status][pass-android]][ci-android] |
 
-# start build(support Windows)
-cmake --build . -- /m:8
+[pass-linux]: https://github.com/wangzhaode/mnn-llm/actions/workflows/linux.yml/badge.svg
+[pass-macos]: https://github.com/wangzhaode/mnn-llm/actions/workflows/macos.yml/badge.svg
+[pass-windows]: https://github.com/wangzhaode/mnn-llm/actions/workflows/windows.yml/badge.svg
+[pass-android]: https://github.com/wangzhaode/mnn-llm/actions/workflows/android.yml/badge.svg
+[ci-linux]: https://github.com/wangzhaode/mnn-llm/actions/workflows/linux.yml
+[ci-macos]: https://github.com/wangzhaode/mnn-llm/actions/workflows/macos.yml
+[ci-windows]: https://github.com/wangzhaode/mnn-llm/actions/workflows/windows.yml
+[ci-android]: https://github.com/wangzhaode/mnn-llm/actions/workflows/android.yml
+
+### Local Compilation
 ```
+# linux
+./script/linux_build.sh
 
-- Back to ChatGLM-MNN
-```bash
-cd ../..
-```
+# macos
+./script/macos_build.sh
 
-- Copy the compilation result of the MNN library to `mnn-llm/libs`
-```bash
-# for Linux/Mac
-cp -r MNN/include/MNN include
-cp MNN/build/libMNN.so libs/
-cp MNN/build/express/*.so libs/
-
-# for windows
-cp -r MNN/include/MNN include
-cp MNN/build/Debug/MNN.dll libs/
-cp MNN/build/Debug/MNN.lib libs/
-```
-- For Windows, you also need to download the third-party library pthread from [here](https://gigenet.dl.sourceforge.net/project/pthreads4w/pthreads-w32-2-9-1-release.zip), unzip it, open `Pre-built.2\lib\x64`, and copy the pthreadVC2.lib file to the libs folder of ChatGLM-MNN. Then open `Pre-built.2\include` and place the following three .h files in the include folder of ChatGLM-MNN.
-For Windows, the final file structure of the project is as follows:
-```bash
-├───libs
-│   ├───MNN.dll
-│   ├───MNN.lib
-│   └───pthreadVC2.lib
-├───include
-│   ├───cppjieba
-│   ├───limonp
-│   ├───MNN
-│   ├───chat.hpp
-│   ├───httplib.h
-│   ├───pthread.h
-│   ├───sched.h
-│   └───semaphore.h
-```
+# windows msvc
+./script/windows_build.ps1
 
-### 3. Download Models
-Download model files from github release to /path/to/ChatGLM-MNN/resource/models, as follows:
-- 对于Linux/Mac
-```bash
-cd resource/models
-# download fp16(almost no loss of precision)
-./download_models.sh fp16
-# For Chinese users, you can use third-party services to speed up downloading the fp16 model
-./download_models.sh fp16 proxy
-
-# download int8(little loss of precision,recommend)
-./download_models.sh int8
-# For Chinese users, you can use third-party services to speed up downloading the int8 model
-./download_models.sh int8 proxy
-
-# download int4(some precision loss)
-./download_models.sh int4
-# For Chinese users, you can use third-party services to speed up downloading the int4 model
-./download_models.sh int4 proxy
+# android
+./script/android_build.sh
 ```
 
-- For Windows, replace 'xxx.sh' above with the 'xxx.ps1' file, for example:
-```powershell
-cd resource/models
+The default backend is `CPU`. To use a different backend, add the corresponding MNN compile macro inside the build script:
+- cuda: `-DMNN_CUDA=ON`
+- opencl: `-DMNN_OPENCL=ON`
 
-# download fp16(almost no loss of precision)
-./download_models.ps1 fp16
-# For Chinese users, you can use third-party services to speed up downloading the fp16 model
-./download_models.ps1 fp16 proxy
-```
-### 4. Build and Run
+### 4. Execution
 
-##### Mac/Linux/Windows:
 ```bash
-mkdir build
-cd build
-# for CPU
-cmake ..
-# for GPU
-cmake -D WITH_CUDA=on ..
-# for mini memory device
-cmake -D BUILD_MINI_MEM_MODE=on ..
-
-# start build(support Linux/Mac)
-make -j$(nproc)
-# start build(support Windows)
-cmake --build . -- /m:8
-
-# run (for Linux/Mac)
-./cli_demo # cli demo
-./web_demo # web ui demo
-# run (for Windows)
-.\Debug\cli_demo.exe
-.\Debug\web_demo.exe
+# linux/macos
+./cli_demo qwen-1.8b-int4 # cli demo
+./web_demo qwen-1.8b-int4 ../web # web ui demo
+
+# windows
+.\Debug\cli_demo.exe qwen-1.8b-int4
+.\Debug\web_demo.exe qwen-1.8b-int4 ../web
+
+# android
+adb push libs/*.so build/libllm.so build/cli_demo /data/local/tmp
+adb push model_dir /data/local/tmp
+adb shell "cd /data/local/tmp && export LD_LIBRARY_PATH=. && ./cli_demo qwen-1.8b-int4"
 ```
 
-##### Android:
-```
-mkdir build
-cd build
-../android_build.sh
-make -j8
-```
\ No newline at end of file
+
+## Reference
+- [chatglm-6b](https://modelscope.cn/models/ZhipuAI/chatglm-6b/summary)
+- [chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary)
+- [chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary)
+- [codegeex2-6b](https://modelscope.cn/models/ZhipuAI/codegeex2-6b/summary)
+- [Baichuan2-7B-Chat](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary)
+- [Qwen-7B-Chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary)
+- [Qwen-1.8B-Chat](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat/summary)
+- [cpp-httplib](https://github.com/yhirose/cpp-httplib)
+- [chatgpt-web](https://github.com/xqdoo00o/chatgpt-web)
+- [ChatViewDemo](https://github.com/BrettFX/ChatViewDemo)
\ No newline at end of file
diff --git a/include/llm.hpp b/include/llm.hpp
index 2af4989d..ea88a5b3 100644
--- a/include/llm.hpp
+++ b/include/llm.hpp
@@ -130,6 +130,19 @@ class Chatglm2_6b : public Llm {
     virtual bool is_stop(int token_id) override;
 };
 
+class Phi_2 : public Chatglm2_6b {
+public:
+    Phi_2() {
+        model_name_ = "Phi_2";
+        layer_nums_ = 32;
+        key_value_shape_ = {1, 0, 2, 32, 80};
+        hidden_size_ = 2560;
+        tokenizer_.reset(new Tiktoken);
+    }
+private:
+    virtual std::vector<int> tokenizer(const std::string& query) override;
+    virtual bool is_stop(int token_id) override;
+};
 
 class Qwen_7b : public Llm {
 public:
diff --git a/ios/README.md b/ios/README.md
index 8882da6e..bf4e70c5 100644
--- a/ios/README.md
+++ b/ios/README.md
@@ -1,6 +1,6 @@
 # mnn-llm ios demo
 
-🚀 本项目全部代码由`ChatGPT-4`生成。
+🚀 本示例代码全部由`ChatGPT-4`生成。
 
 ## 速度
 
diff --git a/src/llm.cpp b/src/llm.cpp
index ec7d7363..b8f9bbbd 100644
--- a/src/llm.cpp
+++ b/src/llm.cpp
@@ -50,6 +50,8 @@ Llm* Llm::createLLM(const std::string& path, std::string model_type) {
     } else if (model_type.find("baichuan") != std::string::npos) {
         llm = new Llama2_7b;
         llm->model_name_ = "Baichuan2_7b";
+    } else if (model_type.find("phi2") != std::string::npos) {
+        llm = new Phi_2;
     }
     if (!llm) {
         std::cerr << "model type can't judge!" << std::endl;
@@ -409,6 +411,17 @@ bool Chatglm2_6b::is_stop(int token_id) {
     return token_id <= 2;
 }
 
+// Phi_2
+std::vector<int> Phi_2::tokenizer(const std::string& query) {
+    auto prompt = query;
+    auto ids = tokenizer_encode(prompt);
+    return ids;
+}
+
+bool Phi_2::is_stop(int token_id) {
+    return token_id == 50256;
+}
+
 // Qwen_7b
 std::vector<int> Qwen_7b::tokenizer(const std::string& query) {
     auto ids = tokenizer_encode(query);
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 9534899a..df975d80 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -303,8 +303,6 @@ bool Sentencepiece::is_control(int id) const {
     return sentence_pieces_[id].type == PieceType::CONTROL;
 }
 
-const int CHARACTER_VOCABULARY_SIZE = 256;
-
 bool Tiktoken::load(const std::string& filename) {
     std::ifstream tok_file(filename);
     std::string token;
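Taken together, the new `Phi_2` class registers phi-2 as a 32-layer model with a 2560-dimensional hidden state (`key_value_shape_ = {1, 0, 2, 32, 80}` reads as key/value caches for 32 heads of head dimension 80, and 32 × 80 = 2560), reuses the `Tiktoken` tokenizer, and stops decoding at token id 50256, phi-2's GPT-2-style `<|endoftext|>`. A minimal sketch of how the new dispatch is reached is shown below; it is illustrative only, the model path is a placeholder, and the `load`/`response` calls are assumed from the existing demo flow rather than taken from this diff.

```cpp
// Illustrative sketch (not part of this patch): exercising the new "phi2" branch
// added to Llm::createLLM in src/llm.cpp.
#include <string>
#include "llm.hpp"

int main() {
    const std::string model_dir = "./phi-2-mnn";   // placeholder: directory with the converted mnn weights
    // A model_type containing "phi2" selects the Phi_2 class declared in include/llm.hpp.
    Llm* llm = Llm::createLLM(model_dir, "phi2");
    llm->load(model_dir);                          // assumed API, as used by the demos
    llm->response("Write a haiku about winter.");  // decoding stops once is_stop() sees token 50256
    delete llm;
    return 0;
}
```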