From e68767bb69a6786cccf764f1fc0f8fbe23b92561 Mon Sep 17 00:00:00 2001
From: yanxing
Date: Thu, 25 Apr 2024 21:46:22 +0800
Subject: [PATCH] add python function

---
 include/llm.hpp   |  16 ++--
 python/mnnllm.cpp | 191 ++++++++++++++++++++++++++++++++++++++++------
 python/setup.py   |   3 +-
 src/llm.cpp       |  47 ++++++++----
 4 files changed, 215 insertions(+), 42 deletions(-)

diff --git a/include/llm.hpp b/include/llm.hpp
index 49f8f144..9ef2491e 100644
--- a/include/llm.hpp
+++ b/include/llm.hpp
@@ -80,14 +80,24 @@ class Llm {
     void warmup();
     std::string response(const std::string& input_str, std::ostream* os = &std::cout, const char* end_with = nullptr);
     std::string response_nohistory(const std::string& input_str, std::ostream* os = &std::cout, const char* end_with = nullptr);
+    void generate_init();
+    std::string generate(const std::vector<int>& input_ids, std::ostream* os, const char* end_with);
+    std::vector<int> generate(const std::vector<int>& input_ids);
+    int forward(const std::vector<int>& input_ids);
     float load_progress() { return load_progress_; }
     void reset();
     void print_speed();
     friend class Pipeline;
 public:
     std::vector<int> history_;
+    std::string model_name_ = "";
+    // config
+    int max_new_tokens_ = 1024;
+    int backend_type_ = 0;
+    int thread_num_ = 4;
+    bool low_precision_ = true;
+    bool chatml_ = true;
     // forward info
-    int max_seq_len_ = 1024;
     int prompt_len_ = 0;
     int gen_seq_len_ = 0;
     int all_seq_len_ = 0;
@@ -95,11 +105,8 @@ class Llm {
     int64_t prefill_us_ = 0;
     int64_t decode_us_ = 0;
 protected:
-    void response_init();
-    std::string response_impl(const std::vector<int>& input_ids, std::ostream* os, const char* end_with);
     VARP embedding(const std::vector<int>& input_ids);
     VARP txt_embedding(const std::vector<int>& input_ids);
-    int forward(const std::vector<int>& input_ids);
     std::vector<int> tokenizer_encode(const std::string& input_str);
     std::string decode(int id);
 protected:
@@ -111,7 +118,6 @@ class Llm {
     int layer_nums_ = 0;
     int hidden_size_ = 4096;
     std::vector<int> key_value_shape_ = {};
-    std::string model_name_ = "";
     std::string disk_embedding_file_ = "";
     // gen info
     float load_progress_ = 0.f;
diff --git a/python/mnnllm.cpp b/python/mnnllm.cpp
index 1b285ae4..723843c3 100644
--- a/python/mnnllm.cpp
+++ b/python/mnnllm.cpp
@@ -11,6 +11,23 @@
 
 using namespace std;
 
+// macros
+#define def_attr(NAME) \
+static PyObject* PyLLM_get_##NAME(LLM *self, void *closure) {\
+    return PyLong_FromLong(self->llm->NAME##_);\
+}\
+static int PyLLM_set_##NAME(LLM *self, PyObject *value, void *closure) {\
+    if (self->llm) {\
+        self->llm->NAME##_ = PyLong_AsLong(value);\
+    }\
+    return 0;\
+}
+
+#define register_attr(NAME) \
+    {#NAME, (getter)PyLLM_get_##NAME, (setter)PyLLM_set_##NAME, "___"#NAME"__", NULL},
+// end
+
+// type convert start
 inline PyObject* string2Object(const std::string& str) {
 #if PY_MAJOR_VERSION == 2
     return PyString_FromString(str.c_str());
@@ -19,6 +36,114 @@ inline PyObject* string2Object(const std::string& str) {
 #endif
 }
 
+static inline PyObject* toPyObj(string val) {
+    return string2Object(val);
+}
+
+static inline PyObject* toPyObj(int val) {
+    return PyLong_FromLong(val);
+}
+
+template <typename T, PyObject*(*Func)(T)=toPyObj>
+static PyObject* toPyObj(vector<T> values) {
+    PyObject* obj = PyList_New(values.size());
+    for (int i = 0; i < values.size(); i++) {
+        PyList_SetItem(obj, i, Func(values[i]));
+    }
+    return obj;
+}
+
+/*
+static inline PyObject* toPyArray(MNN::Express::VARP var) {
+    auto info = var->getInfo();
+    auto shape = info->dim;
+    size_t total_length = info->size;
+    auto var_ptr = const_cast<float*>(var->readMap<float>());
+    std::vector<npy_intp> npy_dims;
+    for(const auto dim : shape) {
+        npy_dims.push_back(dim);
+    }
+    // auto data = PyArray_SimpleNewFromData(npy_dims.size(), npy_dims.data(), NPY_FLOAT, ptr);
+    auto ndarray = PyArray_SimpleNew(npy_dims.size(), npy_dims.data(), NPY_FLOAT);
+    void* npy_ptr = PyArray_DATA((PyArrayObject*)ndarray);
+    std::memcpy(npy_ptr, var_ptr, total_length * sizeof(float));
+    return (PyObject*)ndarray;
+}
+
+static inline PyObject* toPyArray(std::vector<int> vec) {
+    npy_intp dims[1] = { static_cast<npy_intp>(vec.size()) };
+    auto ndarray = PyArray_SimpleNew(1, dims, NPY_INT);
+    void* npy_ptr = PyArray_DATA((PyArrayObject*)ndarray);
+    std::memcpy(npy_ptr, vec.data(), vec.size() * sizeof(int));
+    return (PyObject*)ndarray;
+}
+*/
+
+static inline bool isInt(PyObject* obj) {
+    return PyLong_Check(obj)
+#if PY_MAJOR_VERSION < 3
+        || PyInt_Check(obj)
+#endif
+    ;
+}
+
+template <bool (*Func)(PyObject*)>
+static bool isVec(PyObject* obj) {
+    if (PyTuple_Check(obj)) {
+        if (PyTuple_Size(obj) > 0) {
+            return Func(PyTuple_GetItem(obj, 0));
+        } else return true;
+    } else if (PyList_Check(obj)) {
+        if (PyList_Size(obj) > 0) {
+            return Func(PyList_GetItem(obj, 0));
+        } else return true;
+    }
+    return false;
+}
+
+static inline bool isInts(PyObject* obj) {
+    return isInt(obj) || isVec<isInt>(obj);
+}
+
+inline int64_t unpackLong(PyObject* obj) {
+    int overflow;
+    long long value = PyLong_AsLongLongAndOverflow(obj, &overflow);
+    return (int64_t)value;
+}
+
+static inline int toInt(PyObject* obj) {
+    return static_cast<int>(unpackLong(obj));
+}
+
+template <typename T, T (*Func)(PyObject*)>
+static vector<T> toVec(PyObject* obj) {
+    vector<T> values;
+    if (PyTuple_Check(obj)) {
+        size_t size = PyTuple_Size(obj);
+        values.resize(size);
+        for (int i = 0; i < size; i++) {
+            values[i] = Func(PyTuple_GetItem(obj, i));
+        }
+        return values;
+    }
+    if (PyList_Check(obj)) {
+        size_t size = PyList_Size(obj);
+        values.resize(size);
+        for (int i = 0; i < size; i++) {
+            values[i] = Func(PyList_GetItem(obj, i));
+        }
+        return values;
+    }
+    values.push_back(Func(obj));
+    return values;
+}
+
+static inline std::vector<int> toInts(PyObject* obj) {
+    if (isInt(obj)) { return { toInt(obj) }; }
+    return toVec<int, toInt>(obj);
+}
+// type convert end
+
 typedef struct {
     PyObject_HEAD
     Llm* llm;
@@ -30,17 +155,36 @@ static PyObject* PyLLM_new(struct _typeobject *type, PyObject *args, PyObject *k
 }
 
 static PyObject* Py_str(PyObject *self) {
-    char str[50];
     LLM* llm = (LLM*)self;
-    sprintf(str, "Llm object: %p", llm->llm);
-    return Py_BuildValue("s", str);
+    if (!llm->llm) {
+        Py_RETURN_NONE;
+    }
+    return toPyObj(llm->llm->model_name_);
+}
+
+static PyObject* PyLLM_load(LLM *self, PyObject *args) {
+    const char* model_dir = NULL;
+    if (!PyArg_ParseTuple(args, "s", &model_dir)) {
+        Py_RETURN_NONE;
+    }
+    self->llm->load(model_dir);
+    Py_RETURN_NONE;
+}
+
+static PyObject* PyLLM_generate(LLM *self, PyObject *args) {
+    PyObject *input_ids = nullptr;
+    if (!PyArg_ParseTuple(args, "O", &input_ids) || !isInts(input_ids)) {
+        Py_RETURN_NONE;
+    }
+    auto output_ids = self->llm->generate(toInts(input_ids));
+    return toPyObj(output_ids);
 }
 
 static PyObject* PyLLM_response(LLM *self, PyObject *args) {
     const char* query = NULL;
     int stream = 0;
     if (!PyArg_ParseTuple(args, "s|p", &query, &stream)) {
-        return NULL;
+        Py_RETURN_NONE;
     }
     LlmStreamBuffer buffer(nullptr);
     std::ostream null_os(&buffer);
@@ -49,24 +193,24 @@ static PyObject* PyLLM_response(LLM *self, PyObject *args) {
 }
 
 static PyMethodDef PyLLM_methods[] = {
-    {"response", (PyCFunction)PyLLM_response, METH_VARARGS, "response without hsitory."},
+    {"load", (PyCFunction)PyLLM_load, METH_VARARGS, "load model from `dir`."},
+    {"generate", (PyCFunction)PyLLM_generate, METH_VARARGS, "generate `output_ids` by `input_ids`."},
+    {"response", (PyCFunction)PyLLM_response, METH_VARARGS, "response `query` without history."},
     {NULL}  /* Sentinel */
 };
-
-static PyObject* PyLLM_get_mgl(LLM *self, void *closure) {
-    return PyLong_FromLong(self->llm->max_seq_len_);
-}
-
-static int PyLLM_set_mgl(LLM *self, PyObject *value, void *closure) {
-    if (self->llm) {
-        self->llm->max_seq_len_ = (int)PyLong_AsLong(value);
-    }
-    return 0;
-}
+def_attr(backend_type)
+def_attr(thread_num)
+def_attr(low_precision)
+def_attr(chatml)
+def_attr(max_new_tokens)
 
 static PyGetSetDef PyLLM_getsetters[] = {
-    {"max_gen_len", (getter)PyLLM_get_mgl, (setter)PyLLM_set_mgl, "___max_gen_len___", NULL},
+    register_attr(backend_type)
+    register_attr(thread_num)
+    register_attr(low_precision)
+    register_attr(chatml)
+    register_attr(max_new_tokens)
     {NULL}  /* Sentinel */
 };
@@ -111,11 +255,11 @@ static PyTypeObject PyLLM = {
     PyLLM_new, /* tp_new */
 };
 
-static PyObject *py_load(PyObject *self, PyObject *args) {
+static PyObject* py_create(PyObject *self, PyObject *args) {
     if (!PyTuple_Size(args)) {
         return NULL;
     }
-    const char *model_dir = NULL;
+    const char* model_dir = NULL;
     const char* model_type = "auto";
     if (!PyArg_ParseTuple(args, "s|s", &model_dir, &model_type)) {
         return NULL;
@@ -125,19 +269,19 @@ static PyObject *py_load(PyObject *self, PyObject *args) {
         return NULL;
     }
     llm->llm = Llm::createLLM(model_dir, model_type);
-    llm->llm->load(model_dir);
+    // llm->llm->load(model_dir);
     return (PyObject*)llm;
 }
 
 static PyMethodDef Methods[] = {
-    {"load", py_load, METH_VARARGS},
-    {NULL, NULL}
+    {"create", py_create, METH_VARARGS},
+    {NULL, NULL}
 };
 
 static struct PyModuleDef mnnllmModule = {
     PyModuleDef_HEAD_INIT,
     "cmnnllm", /*module name*/
-    "", /* module documentation, may be NULL */
+    "mnnllm cpython module.", /* module documentation, may be NULL */
     -1, /* size of per-interpreter state of the module,
            or -1 if the module keeps state in global variables. */
     Methods
 };
@@ -151,6 +295,7 @@ PyMODINIT_FUNC PyInit_cmnnllm(void) {
     if (PyType_Ready(&PyLLM) < 0) {
         PyErr_SetString(PyExc_Exception, "init LLM: PyType_Ready PyLLM failed");
     }
+    // _import_array();
     PyModule_AddObject(m, "LLM", (PyObject *)&PyLLM);
     def(m, &Methods[0]);
     return m;
diff --git a/python/setup.py b/python/setup.py
index 8fc67cfc..d691ca3d 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -32,9 +32,10 @@ def make_relative_rpath(path):
 
 setup(name='mnnllm',
       version='0.1',
+      language='c++',
      description='mnn-llm python',
      ext_modules=[module],
      packages=packages,
      data_files=lib_files,
      author='wangzhaode',
-      author_email='hi@zhaode.wang')
\ No newline at end of file
+      author_email='hi@zhaode.wang')
diff --git a/src/llm.cpp b/src/llm.cpp
index fb32219e..48acfc05 100644
--- a/src/llm.cpp
+++ b/src/llm.cpp
@@ -119,7 +119,7 @@ void Llm::chat() {
     reset();
 }
 
-void Llm::response_init() {
+void Llm::generate_init() {
     // init status
     gen_seq_len_ = 0;
     all_seq_len_ = 0;
@@ -135,7 +135,25 @@ void Llm::response_init() {
     }
 }
 
-std::string Llm::response_impl(const std::vector<int>& input_ids, std::ostream* os, const char* end_with) {
+std::vector<int> Llm::generate(const std::vector<int>& input_ids) {
+    generate_init();
+    std::vector<int> output_ids;
+    prompt_len_ = static_cast<int>(input_ids.size());
+    // prefill
+    int token = forward(input_ids);
+    output_ids.push_back(token);
+    // decode
+    while (gen_seq_len_ < max_new_tokens_) {
+        token = forward({token});
+        if (is_stop(token)) {
+            break;
+        }
+        output_ids.push_back(token);
+    }
+    return output_ids;
+}
+
+std::string Llm::generate(const std::vector<int>& input_ids, std::ostream* os, const char* end_with) {
     prompt_len_ = static_cast<int>(input_ids.size());
     auto st = std::chrono::system_clock::now();
     int token = forward(input_ids);
@@ -144,7 +162,7 @@ std::string Llm::response_impl(const std::vector<int>& input_ids, std::ostream*
     std::string output_str = decode(token);
     prefill_us_ = std::chrono::duration_cast<std::chrono::microseconds>(et - st).count();
     *os << output_str << std::flush;
-    while (gen_seq_len_ < max_seq_len_) {
+    while (gen_seq_len_ < max_new_tokens_) {
         st = std::chrono::system_clock::now();
         token = forward({token});
         et = std::chrono::system_clock::now();
@@ -165,29 +183,30 @@ std::string Llm::response_impl(const std::vector<int>& input_ids, std::ostream*
 }
 
 std::string Llm::response(const std::string& query, std::ostream* os, const char* end_with) {
-    response_init();
+    generate_init();
     if (!end_with) {
         end_with = "\n";
     }
     // response
-    auto input_ids = tokenizer(query);
+    auto input_ids = chatml_ ? tokenizer(query) : tokenizer_encode(query);
+    // printf("ids = "); for (int id : input_ids) printf("%d, ", id); printf("\n");
     if (!history_.empty()) {
         std::copy(input_ids.begin(), input_ids.end(), std::back_inserter(history_));
         input_ids = history_;
     } else {
         history_ = input_ids;
     }
-    return response_impl(input_ids, os, end_with);
+    return generate(input_ids, os, end_with);
 }
 
 std::string Llm::response_nohistory(const std::string& query, std::ostream* os, const char* end_with) {
-    response_init();
+    generate_init();
     if (!end_with) {
         end_with = "\n";
     }
     // response
-    auto input_ids = tokenizer(query);
-    return response_impl(input_ids, os, end_with);
+    auto input_ids = chatml_ ? tokenizer(query) : tokenizer_encode(query);
+    return generate(input_ids, os, end_with);
 }
 
 void Llm::print_speed() {
@@ -217,10 +236,11 @@ void Llm::load(const std::string& model_dir) {
     // init
     ScheduleConfig config;
     BackendConfig cpuBackendConfig;
-    config.type = MNN_FORWARD_CPU;
-    // config.type = MNN_FORWARD_OPENCL;
-    config.numThread = 4;
-    cpuBackendConfig.precision = BackendConfig::Precision_Low;
+    config.type = static_cast<MNNForwardType>(backend_type_);
+    config.numThread = thread_num_;
+    if (low_precision_) {
+        cpuBackendConfig.precision = BackendConfig::Precision_Low;
+    }
     cpuBackendConfig.memory = BackendConfig::Memory_Low;
     config.backendConfig = &cpuBackendConfig;
     runtime_manager_.reset(Executor::RuntimeManager::createRuntimeManager(config));
@@ -385,6 +405,7 @@ VARP Llm::txt_embedding(const std::vector<int>& input_ids) {
     if (needNewVar(inputs_embeds_, 0, seq_len)) {
         inputs_embeds_ = _Input({seq_len, 1, hidden_size_}, NCHW);
     }
+    size_t size = hidden_size_ * sizeof(int16_t);
     FILE* file = fopen(disk_embedding_file_.c_str(), "rb");
     std::unique_ptr<int16_t[]> buffer(new int16_t[hidden_size_]);
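
Usage sketch (not part of the patch): the hunks above only define the C++ side of the binding. Assuming the extension builds as the `cmnnllm` module named in PyModuleDef (a higher-level `mnnllm` wrapper package from setup.py is an assumption), and using a hypothetical model directory and token ids, the new API would be driven from Python roughly like this:

    import cmnnllm   # extension module name taken from PyModuleDef

    # py_create() only constructs the Llm object now; the former automatic
    # llm->load(model_dir) call is commented out, so load() must be called explicitly.
    llm = cmnnllm.create("./qwen-1.8b-int4")     # hypothetical model directory

    # config attributes exposed through def_attr/register_attr (integer-valued setters)
    llm.backend_type = 0        # 0 == MNN_FORWARD_CPU
    llm.thread_num = 4
    llm.low_precision = 1
    llm.max_new_tokens = 256

    llm.load("./qwen-1.8b-int4")

    # token-level API: a list of input ids in, the generated ids out
    output_ids = llm.generate([151644, 872, 198])   # hypothetical token ids

    # string-level API: tokenize, prefill/decode, and return the response text
    print(llm.response("hello"))

Splitting create() from load() lets callers set backend_type, thread_num and the other attributes before the model weights are loaded, which is why the automatic load call was dropped from py_create.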