From dd29962958bf3bcfac99fbf9499d34f376674ce1 Mon Sep 17 00:00:00 2001
From: "Xiaolong.Gao" <1506957902@qq.com>
Date: Tue, 19 Jul 2022 19:45:15 +0800
Subject: [PATCH 1/8] feat: impl GPU tensor

Signed-off-by: Xiaolong.Gao <1506957902@qq.com>
---
 .../fate_tensor_gpu/__init__.py               |   21 +
 .../fate_tensor_gpu/gpu_engine.py             | 3726 +++++++++++++++++
 .../fate_tensor_gpu/gpu_tensor.py             |  428 ++
 .../fate_tensor_gpu/secureprotol/__init__.py  |   15 +
 .../secureprotol/fate_paillier.py             |  343 ++
 .../secureprotol/fixedpoint.py                |  298 ++
 .../fate_tensor_gpu/secureprotol/gmpy_math.py |  133 +
 gpu/fate-tensor-gpu/pyproject.toml            |   17 +
 gpu/fate-tensor-gpu/tests/__init__.py         |    0
 .../tests/test_fate_tensor_gpu.py             |    5 +
 10 files changed, 4986 insertions(+)
 create mode 100644 gpu/fate-tensor-gpu/fate_tensor_gpu/__init__.py
 create mode 100644 gpu/fate-tensor-gpu/fate_tensor_gpu/gpu_engine.py
 create mode 100644 gpu/fate-tensor-gpu/fate_tensor_gpu/gpu_tensor.py
 create mode 100644 gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/__init__.py
 create mode 100644 gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fate_paillier.py
 create mode 100644 gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fixedpoint.py
 create mode 100644 gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/gmpy_math.py
 create mode 100644 gpu/fate-tensor-gpu/pyproject.toml
 create mode 100644 gpu/fate-tensor-gpu/tests/__init__.py
 create mode 100644 gpu/fate-tensor-gpu/tests/test_fate_tensor_gpu.py

diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/__init__.py b/gpu/fate-tensor-gpu/fate_tensor_gpu/__init__.py
new file mode 100644
index 0000000000..5d9d7241b5
--- /dev/null
+++ b/gpu/fate-tensor-gpu/fate_tensor_gpu/__init__.py
@@ -0,0 +1,21 @@
+#
+#  Copyright 2022 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+
+from .gpu_tensor import keygen, SK, PK, Cipherblock
+
+__version__ = '0.1.0'
+__all__ = ['keygen', "SK", "PK", "Cipherblock"]
diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/gpu_engine.py b/gpu/fate-tensor-gpu/fate_tensor_gpu/gpu_engine.py
new file mode 100644
index 0000000000..0e96d2cb56
--- /dev/null
+++ b/gpu/fate-tensor-gpu/fate_tensor_gpu/gpu_engine.py
@@ -0,0 +1,3726 @@
+#
+#  Copyright 2022 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import os
+import random
+import numpy as np
+
+from ctypes import cdll, sizeof, c_buffer, cast, c_int32
+from ctypes import c_char, c_char_p, c_void_p, c_uint32, c_double, c_int64, c_int, c_size_t
+
+from .secureprotol.fate_paillier import PaillierPublicKey, PaillierPrivateKey, PaillierEncryptedNumber
+from .secureprotol.fixedpoint import FixedPointNumber
+
+from concurrent.futures import ProcessPoolExecutor as Executor
+
+# define memory types
+MEM_HOST = 1
+MEM_DEVICE = 2
+
+# define device type
+# TODO: actually make use of these parameters
+# device_type = 00: CPU
+# device_type = 10: GPU
+# device_type = 20: FPGA_num_0
+# device_type = 21: FPGA_num_1
+device_type = 1
+
+# aliases defined by WeBank
+PaillierPublicKeyStorage = PaillierPublicKey
+PaillierPrivateKeyStorage = PaillierPrivateKey
+
+'''##############import ctypes to implement py2c and c2py#################'''
+'''############## load the .so library written in C     ##################'''
+
+# we made 3 libraries, each one indicating a different CIPHER_BIT length
+# here use absolute path to locate the shared library
+GPU_LIB = cdll.LoadLibrary(os.path.dirname(__file__) + "/GPU_2048.so")
+# GPU_LIB = cdll.LoadLibrary("../../../Documents/GPU_2048.so")
+
+# set the CIPHER_BIT according to the library chosen.
+CIPHER_BITS = 2048
+PLAIN_BITS = 2048
+BYTE_LEN = 8
+CIPHER_BYTE = CIPHER_BITS // BYTE_LEN
+PLAIN_BYTE = PLAIN_BITS // BYTE_LEN
+
+# ### DEFINE THE BYTE_LENGTHS OF DATA TYPES ####
+CHAR_BYTE = sizeof(c_char)
+U_INT32_BYTE = sizeof(c_uint32)
+DOUBLE_BYTE = sizeof(c_double)
+INT64_BYTE = sizeof(c_int64)
+
+# DEFINE THE RETURN TYPE OF C_malloc####
+GPU_LIB.c_malloc.restype = c_void_p
+GPU_LIB.c_direct_malloc.restype = c_void_p
+GPU_LIB.cuda_malloc.restype = c_void_p
+
+GPU_LIB.init_pub_key.restype = c_void_p
+GPU_LIB.init_priv_key.restype = c_void_p
+
+GPU_LIB.get_cur_device.restype = c_int
+GPU_LIB.get_cur_context.restype = c_void_p
+GPU_LIB.create_cuda_context.restype = c_void_p
+
+# DEFINE TWO DIFFERENT TYPE OF DATA IN TensorStorage
+INT64_TYPE = 1  # datatype flag for int32 and int64
+FLOAT_TYPE = 2  # datatype flag for float and double
+
+# define BASE for Paillier encrypted numbers
+PEN_BASE = 16
+# as there's no BASE defined in Python PaillierEncryptedNumber,
+# and we need this in CUDA, we define PEN_BASE as 16
+
+''' Device Initializer '''
+
+
+def initialize_device():  # two native calls, in this order
+    GPU_LIB.gpu_init()  # forwards to the shared library's gpu_init()
+    GPU_LIB.print_example_banner()  # forwards to the shared library's print_example_banner()
+
+
+def _MEM_ON_HOST(mem_type):
+    # True for MEM_HOST, False for MEM_DEVICE, None for any other value
+    if mem_type in (MEM_HOST, MEM_DEVICE):
+        return mem_type == MEM_HOST
+    return None
+
+
+# ############################################################################
+'''
+    functions for getting nvidia GPU info
+    And set the cuda context for the new data
+'''
+
+
+def GPU_driver_init():  # thin wrapper: forwards to GPU_LIB.cuda_driver_init()
+    GPU_LIB.cuda_driver_init()
+
+
+def set_GPU_device(device_num):  # device_num: Python int, handed to C as c_int
+    GPU_LIB.set_gpu_device(c_int(device_num))
+
+
+def get_GPU_device():  # returns the c_int result of GPU_LIB.get_cur_device() as a Python int
+    return GPU_LIB.get_cur_device()
+
+
+def get_GPU_context():  # restype is c_void_p: result is an int address, or None for NULL
+    return GPU_LIB.get_cur_context()
+
+
+def create_GPU_context(device_num):  # device_num goes to C as c_int; result is a c_void_p value
+    context_pointer = GPU_LIB.create_cuda_context(c_int(device_num))
+    return context_pointer
+
+
+def bind_GPU_context(context_ptr):  # context_ptr: int address, wrapped as c_void_p for the call
+    GPU_LIB.bind_cuda_context(c_void_p(context_ptr))
+
+
+def free_GPU_context(context_pointer):  # context_pointer: int address, wrapped as c_void_p
+    GPU_LIB.free_cuda_context(c_void_p(context_pointer))
+
+
+# ############################################################################
+# ######################Useful independent functions##########################
+# ###################Reconstruct ndarray from C memory type###################
+# ############################################################################
+
+def __get_C_fpn(fpn_space, size):
+    '''
+    copy FixedPointNumber (FPN) object out from C memory space,
+    reform a ndarray, return it to upper python level
+    --------------------
+    Para:
+    fpn_space: int, the start address of a C memory space
+    size: int, the number of FPN in the C memory space (PLAIN_BYTE bytes each)
+    Return:
+    A ndarray, each element is a bigint
+    '''
+    res_fpn = []
+    get_res = c_buffer(PLAIN_BYTE)  # one scratch buffer, reused for every element
+    for i in range(size):
+        GPU_LIB.c_memcpy(cast(get_res, c_void_p),
+                         c_void_p(fpn_space + i * PLAIN_BYTE),
+                         c_size_t(PLAIN_BYTE))
+        res_fpn.append(int.from_bytes(get_res.raw, 'little'))  # bytes are read little-endian
+    return np.asarray(res_fpn)
+
+
+def __get_C_pen(pen_space, index, size):
+    '''
+    copy PaillierEncryptedNumber(PEN) object out from C memory space,
+    reform a ndarray, return it to upper python level
+    ------------------
+    Para:
+    pen_space: int, the start address of a continuous C memory space
+    index: int, the offset from start address, counted in CIPHER_BYTE-sized elements
+    size:  int, the number of PEN ought to get
+    Return:
+    A ndarray, each element is a bigint
+    '''
+    res_pen = []
+    get_res = c_buffer(CIPHER_BYTE)  # one scratch buffer, reused for every element
+    for i in range(size):
+        GPU_LIB.c_memcpy(cast(get_res, c_void_p),
+                         c_void_p(pen_space + (index + i) * CIPHER_BYTE),
+                         c_size_t(CIPHER_BYTE))
+        res_pen.append(int.from_bytes(get_res.raw, 'little'))  # bytes are read little-endian
+    return np.asarray(res_pen)
+
+
+bi_c2p = __get_C_pen
+
+
+def __get_C_uint32(uint32_space, size):
+    '''
+    copy uint32 out from C memory space, form a ndarray
+    numpy can wrap a ctypes array of a basic C numeric type directly,
+    so a single memcpy into such an array is sufficient
+    ------------------------
+    Para:
+    uint32_space: int, the start address of a continuous C memory space
+    size: int, the number of uint32 ought to get
+    '''
+    # ctypes arrays are zero-filled on creation, no explicit initialiser needed
+    host_buf = (c_uint32 * size)()
+    GPU_LIB.c_memcpy(host_buf, c_void_p(uint32_space), c_size_t(size * U_INT32_BYTE))
+    return np.asarray(host_buf)
+
+
+def __get_C_double(double_space, size):
+    '''read `size` doubles starting at address double_space into a ndarray'''
+    # ctypes arrays are zero-filled on creation, no explicit initialiser needed
+    host_buf = (c_double * size)()
+    n_bytes = c_size_t(size * DOUBLE_BYTE)
+    GPU_LIB.c_memcpy(host_buf, c_void_p(double_space), n_bytes)
+    return np.asarray(host_buf)
+
+
+def __get_C_int64(int64_space, size):
+    '''read `size` int64 values starting at address int64_space into a ndarray'''
+    # ctypes arrays are zero-filled on creation, no explicit initialiser needed
+    host_buf = (c_int64 * size)()
+    n_bytes = c_size_t(size * INT64_BYTE)
+    GPU_LIB.c_memcpy(host_buf, c_void_p(int64_space), n_bytes)
+    return np.asarray(host_buf)
+
+
+def __get_c_fpn_storage(fpn, base, exp, vec_size, n, max_int):
+    '''
+    Rebuild an array of FixedPointNumber from the given C memory spaces
+    -------------------
+    Para:
+    fpn:  int, start address of a C memory space,
+               inside which stores FPN's encodings(bigint, PLAIN_BITS long)
+    base: int, start address of a C memory space holding FPN's base(uint32);
+               accepted but not read by this function
+    exp:  int, start address of a C memory space,
+               inside which stores FPN's exp(uint32)
+    vec_size: int, the number of bigint
+    n, max_int: int, forwarded unchanged to every FixedPointNumber
+
+    Return:
+    A ndarray, each element is a FixedPointNumber
+    '''
+    encodings = __get_C_fpn(fpn, vec_size)
+    exponents = __get_C_uint32(exp, vec_size)
+    # pair encoding i with exponent i; the exponent comes back as a numpy
+    # scalar and is converted to a plain Python int for the constructor
+    fixed_points = [
+        FixedPointNumber(encodings[i], int(round(exponents[i])), n, max_int)
+        for i in range(vec_size)]
+    return np.asarray(fixed_points)
+
+
+def __get_c_pen_storage_raw(pen, base, exp, vec_size, n):
+    # returns the three arrays (cipher, base, exp) as read from C memory;
+    # n is accepted but not used here
+    ciphers = __get_C_pen(pen, 0, vec_size)
+    return (ciphers, __get_C_uint32(base, vec_size),
+            __get_C_uint32(exp, vec_size))
+
+
+def __get_c_pen_storage_mp(pen, base, exp, vec_size, n, thread_num=4):
+    '''
+    Use multi-process to accelerate __get_C_pen process.
+
+    Since on Linux, python use fork to create sub-process,
+    thus the C memory space is shared between father and child processes.
+    And the whole process concerns no CUDA and cuda-context,
+    even the return result is in python object form.
+    So we can use multi-process for acceleration here safely
+    ---------------------------------
+    Para:
+        pen, base, exp, vec_size: same meaning as in __get_c_pen_storage
+        n: accepted but not used by this function
+        thread_num: number of chunks the read is split into (one task each)
+    Return:
+        tuple, (ndarray, ndarray, ndarray)
+    '''
+    # floor division keeps every chunk inside [0, vec_size); round() could
+    # overshoot vec_size and leave the last chunk with a negative length
+    job_cnt = vec_size // thread_num
+    job_idx_list = [i * job_cnt for i in range(thread_num)]
+    job_cnt_list = [job_cnt] * (thread_num - 1)
+    # the last chunk also takes the remainder
+    job_cnt_list.append(vec_size - job_cnt * (thread_num - 1))
+    # for __get_C_pen, use multiprocess to accelerate;
+    # the with-block shuts the worker pool down once all chunks are collected
+    with Executor() as executor:
+        futures = [executor.submit(__get_C_pen, pen, idx, cnt)
+                   for idx, cnt in zip(job_idx_list, job_cnt_list)]
+        res_list = [r.result() for r in futures]
+    res_pen = []
+    for res in res_list:
+        res_pen.extend(res)
+    # for uint32, no special demand for multiprocess
+    res_base = __get_C_uint32(base, vec_size)
+    res_exp = __get_C_uint32(exp, vec_size)
+    return np.asarray(res_pen), res_base, res_exp
+
+
+def __get_c_pen_storage(pen, base, exp, vec_size, n):
+    '''
+    Rebuild an array of PaillierEncryptedNumber (PEN) from the given memory spaces
+    ------------------
+    pen:  int, start address of a C memory space,
+               inside which stores PEN's encodings(bigint, CIPHER_BITS long)
+    base: int, start address of a C memory space holding PEN's base(uint32);
+               accepted but not read by this function
+    exp:  int, start address of a C memory space,
+               inside which stores PEN's exp(uint32)
+    vec_size:   int, the number of bigint
+    n:    int, the value passed to PaillierPublicKey() to rebuild the public key
+
+    Return:
+    A ndarray, each element is a PaillierEncryptedNumber (PEN)
+    '''
+    ciphers = __get_C_pen(pen, 0, vec_size)
+    exponents = __get_C_uint32(exp, vec_size)
+
+    # every element shares one public key object rebuilt from n
+    public_key = PaillierPublicKey(n)
+    res_PaillierEncryptedNumber = [
+        PaillierEncryptedNumber(public_key, ciphers[i], int(round(exponents[i])))
+        for i in range(vec_size)]
+
+    return np.asarray(res_PaillierEncryptedNumber)
+
+
+#######################################################################
+# #########################DEFINITION OF CLASSES#######################
+#######################################################################
+'''#############  the definition of functions and classes #################'''
+
+'''
+    TensorStorage.data Containing the address pointing to a double type
+    All the int32/int64 have been transformed to int64_t type
+    All the float32/float64 have been transformed to double type
+    We assume that TensorStorage has 2 types:
+    1. data is ndarray, calculation can be performed directly by ndarray.
+    2. data is C memory pointer, used for performing further encoding for
+       the lower bound
+'''
+
+
+class TensorStorage(object):
+    '''
+    TensorStorage Class is used to store plaintexts.
+    Currently support
+    1. int32, int64 (all transformed to int64_t type)
+    2. float32, float64 (all transformed to double type)
+
+    Attributes:
+        data: ndarray or int,
+            1. ndarray means data is a python object
+            2. int means data is a C memory object, the value of int is the C memory's
+               start address
+        vec_size: int, the number of data stored in current class
+                       saved here since it may be lost when data is transferred to C memory
+        mem_type: int, value is MEM_HOST or MEM_DEVICE, where the data is stored
+                       (required constructor argument, there is no default)
+        data_type: int, value is INT64_TYPE or FLOAT_TYPE, the data type of plaintext,
+                        saved here since it may be lost when data is transferred to C memory
+    '''
+
+    def __init__(self, data, vec_size, mem_type: int, data_type: int):
+        # an ndarray may be a non-contiguous view (e.g. a slice or transpose);
+        # np.ascontiguousarray makes sure the stored array is one dense buffer
+        self.data = np.ascontiguousarray(data) if isinstance(data, np.ndarray) else data
+        self.vec_size = vec_size
+        self.mem_type = mem_type
+        self.data_type = data_type  # INT64_TYPE or FLOAT_TYPE flag
+
+    def __str__(self):
+        return f"{self.__class__}:{self.data}"
+
+    def __del__(self):
+        te_free(self)
+
+
+class BigIntStorage(object):
+    '''
+    Used to store bigint objects:
+
+    Attributes:
+        bigint_storage: int, the start address of the C memory storing bigint
+        elem_size:      int, the size of the bigint,
+                            useless since we unified into CIPHER_BITS
+        vec_size:       int, the number of bigint stored in this class
+        mem_type:       int, MEM_HOST or MEM_DEVICE, where data is stored
+
+    '''
+
+    def __init__(self, data, vec_size, mem_type: int, elem_size: int):
+        # 1:cpu/host  2:gpu/device
+        self.mem_type = mem_type
+        # the address passed as `data` is kept under the name bigint_storage
+        self.bigint_storage = data
+        self.elem_size = elem_size
+        self.vec_size = vec_size
+
+    def __len__(self):
+        return self.vec_size  # no `data` attribute exists; the element count is vec_size
+
+    def __del__(self):
+        bi_free(self)
+
+
+class FixedPointStorage:
+    '''
+    Contains the 3 pointers indicating start address of C memory,
+    which can be handled directly by passing it to C functions in GPU_LIB
+    ------------------
+    Attributes:
+        bigint_storage: int, start address of C memory,
+                                in which stores the mantissa of a fpn array
+        base_storage:   int, start address of C memory,
+                                in which stores the base array of the fpn array
+        exp_storage:    int, start address of C memory,
+                                in which stores the exponent array of fpn array
+        vec_size:       int, the number of data stored in current class
+                                saved here since it may be lost when data is transferred to C memory
+        mem_type:       int, value is MEM_HOST or MEM_DEVICE, where the data is stored
+                                (required constructor argument, there is no default)
+        data_type:      int, value is INT64_TYPE or FLOAT_TYPE, the data type of plaintext,
+                                saved here since it may be lost when data is transferred to C memory
+        encode_n, max_int: bigint, the para used for encode the plaintext
+    '''
+
+    def __init__(self, bigint_storage, base_storage, exp_storage, vec_size,
+                 n, max_int, mem_type: int, data_type):
+        # 1:cpu/host  2:gpu/device
+        self.mem_type = mem_type
+        '''Actual data and length for fpn'''
+        self.bigint_storage = bigint_storage
+        self.base_storage = base_storage
+        self.exp_storage = exp_storage
+        self.vec_size = vec_size
+        '''TensorStorage needed paras'''
+        self.data_type = data_type
+        '''En/Decode needed paras '''
+        # these 2 are plain python ints (neither BigIntStorage nor ctypes); n is stored as encode_n
+        self.encode_n = n
+        self.max_int = max_int
+
+    def __len__(self):
+        return self.vec_size
+        # return len(self.data)
+
+    def __del__(self):
+        fp_free(self)
+
+
+class PaillierEncryptedStorage:
+    '''
+    Contains the 3 pointers indicating start address of C memory,
+    which can be handled directly by passing it to C functions in GPU_LIB
+    --------------------
+    Attributes:
+        pen_storage:    int, start address of C memory,
+                                in which stores the mantissa of the pen array
+        base_storage:   int, start address of C memory,
+                                in which stores the bases of the pen array
+        exp_storage:    int, start address of C memory,
+                                in which stores the exponents of the pen array
+        vec_size:       int, the number of data stored in current class
+                                saved here since it may be lost when data is transferred to C memory
+        mem_type:       int, value is MEM_HOST or MEM_DEVICE, where the data is stored
+                                (required constructor argument, there is no default)
+        data_type:      int, value is INT64_TYPE or FLOAT_TYPE, the data type of plaintext,
+                                saved here since it may be lost when data is transferred to C memory
+        encode_n, encode_max_int: bigint, the para used for encode the plaintext
+    '''
+
+    def __init__(self, pen_storage, base_storage, exp_storage, vec_size,
+                 mem_type: int, data_type, fpn_encode_n, fpn_encode_max_int):
+        self.mem_type = mem_type
+        '''Actual data and length for pen'''
+        self.pen_storage = pen_storage
+        self.base_storage = base_storage
+        self.exp_storage = exp_storage
+        self.vec_size = vec_size
+        '''TensorStorage needed paras'''
+        self.data_type = data_type
+        '''En/Decode needed paras '''
+        self.encode_n = fpn_encode_n  # constructor arg fpn_encode_n
+        self.encode_max_int = fpn_encode_max_int  # constructor arg fpn_encode_max_int
+        '''Pub_key paras'''
+
+    def __len__(self):
+        return self.vec_size
+
+    def __del__(self):
+        pi_free(self)
+
+
+class TensorShapeStorage:
+    '''
+    Shape holder for tensors with at most 2 dimensions.
+    Indexing and len() operate on the tuple returned by to_tuple()
+    -------------------
+    Attributes:
+        dim1: the 1st dim, aka the row; None when unset
+        dim2: the 2nd dim, aka the col; None when unset
+    '''
+
+    def __init__(self, dim1=None, dim2=None):
+        for dim in (dim1, dim2):
+            # a dimension is either unset (None) or an int
+            if not (dim is None or isinstance(dim, int)):
+                raise TypeError("invalid dimension")
+        self.dim1 = dim1
+        self.dim2 = dim2
+
+    def size(self):
+        rows = self.dim1 if self.dim1 is not None else 1
+        cols = self.dim2 if self.dim2 is not None else 1
+        return rows * cols
+
+    def __getitem__(self, item):
+        return self.to_tuple()[item]
+
+    def __len__(self):
+        return len(self.to_tuple())
+
+    def to_tuple(self):
+        # an unset dim1 yields the empty shape, whatever dim2 holds
+        if self.dim1 is None:
+            return ()
+        # only dim1 set: 1-element shape
+        if self.dim2 is None:
+            return (self.dim1,)
+        return (self.dim1, self.dim2)
+
+    def from_tuple(self, v):
+        # lengths other than 1 or 2 (the empty tuple included)
+        # reset both dims to None
+        rank = len(v)
+        if rank == 1:
+            self.dim1, self.dim2 = v[0], None
+        elif rank == 2:
+            self.dim1, self.dim2 = v[0], v[1]
+        else:
+            self.dim1, self.dim2 = None, None
+        return self
+
+    def transpose(self):
+        return TensorShapeStorage(dim1=self.dim2, dim2=self.dim1)
+
+    def matmul(self, other):
+        return TensorShapeStorage(dim1=self.dim1, dim2=other.dim2)
+
+
+class PubKeyStorage:
+    '''
+    Used to store PaillierPublicKey info as C-acceptable data type
+    -------------
+    Attributes:
+       n,g, nsquare, max_int:
+            c_char_p, each wrapping the value as CIPHER_BYTE little-endian bytes
+            all identical to PaillierPublicKey, which is defined in fate_script
+    '''
+
+    def __init__(self, n, g, nsquare, max_int):
+        self.n = c_char_p(n.to_bytes(CIPHER_BYTE, 'little'))  # to_bytes raises OverflowError if the int is wider
+        self.g = c_char_p(g.to_bytes(CIPHER_BYTE, 'little'))
+        self.nsquare = c_char_p(nsquare.to_bytes(CIPHER_BYTE, 'little'))
+        self.max_int = c_char_p(max_int.to_bytes(CIPHER_BYTE, 'little'))
+
+
+class PrivKeyStorage:
+    '''
+    Used to store PaillierPrivateKey info as C-acceptable data type
+    ------------
+    Every attribute is a c_char_p wrapping the value as CIPHER_BYTE little-endian bytes
+    '''
+
+    def __init__(self, p, q, psquare, qsquare, q_inverse, hp, hq):
+        self.p = c_char_p(p.to_bytes(CIPHER_BYTE, 'little'))  # to_bytes raises OverflowError if the int is wider
+        self.q = c_char_p(q.to_bytes(CIPHER_BYTE, 'little'))
+        self.psquare = c_char_p(psquare.to_bytes(CIPHER_BYTE, 'little'))
+        self.qsquare = c_char_p(qsquare.to_bytes(CIPHER_BYTE, 'little'))
+        self.q_inverse = c_char_p(q_inverse.to_bytes(CIPHER_BYTE, 'little'))
+        self.hp = c_char_p(hp.to_bytes(CIPHER_BYTE, 'little'))
+        self.hq = c_char_p(hq.to_bytes(CIPHER_BYTE, 'little'))
+
+
+class Dev_PubKeyStorage:
+    '''
+    Used to store PaillierPublicKey info in GPU memory
+    -----------------
+    Attributes:
+        pub_key_ptr:
+            int, actually a pointer: the c_void_p value returned by
+            GPU_LIB.init_pub_key for the given PubKeyStorage
+    '''
+
+    def __init__(self, pubkey_storage):
+        self.pub_key_ptr = GPU_LIB.init_pub_key(  # the four c_char_p fields are passed straight through
+            pubkey_storage.n,
+            pubkey_storage.g,
+            pubkey_storage.nsquare,
+            pubkey_storage.max_int)
+
+    def __del__(self):
+        pi_free_d_pub_key(self.pub_key_ptr)  # hand the pointer back for release when the object dies
+
+
+class Dev_PrivKeyStorage:
+    '''
+    Used to store PaillierPrivateKey info in GPU memory
+    ------------------
+    Attributes:
+       priv_key_ptr:
+            int, actually a pointer: the c_void_p value returned by
+            GPU_LIB.init_priv_key for the given PrivKeyStorage
+    '''
+
+    def __init__(self, privkey_storage):
+        self.priv_key_ptr = GPU_LIB.init_priv_key(  # the seven c_char_p fields are passed straight through
+            privkey_storage.p,
+            privkey_storage.q,
+            privkey_storage.psquare,
+            privkey_storage.qsquare,
+            privkey_storage.q_inverse,
+            privkey_storage.hp,
+            privkey_storage.hq)
+
+    def __del__(self):
+        pi_free_d_priv_key(self.priv_key_ptr)  # hand the pointer back for release when the object dies
+
+
+##########################################################################
+# ####################FUNCTION DEFINITION ################################
+##########################################################################
+def te_p2c_shape(shape, res):
+    '''
+    Fill a TensorShapeStorage object from a shape tuple
+    -------------
+    Para:
+        shape:   tuple, with no more than 2 elements
+        res:     TensorShapeStorage to overwrite, or None to create a new one
+    Return:
+        res,     TensorShapeStorage
+    '''
+    # reuse the caller's object when given, otherwise start from an empty shape
+    target = TensorShapeStorage() if res is None else res
+    # from_tuple mutates target in place and returns it
+    return target.from_tuple(shape)
+
+
+def te_c2p_shape(shape):
+    '''
+    recover the shape tuple from a TensorShapeStorage
+    --------------
+    Para:   shape:   TensorShapeStorage
+    Return: tuple, the result of shape.to_tuple()
+    '''
+    return shape.to_tuple()
+
+
+def te_free(tes):
+    '''
+    free the c memory space in a TensorStorage class
+    --------------
+    Para:
+        tes:    TensorStorage,
+                only when tes.data is an int is it treated as a C memory pointer
+    Return:
+        None
+    '''
+    if (isinstance(tes.data, int)):  # any non-int data (e.g. ndarray, None) is left untouched
+        GPU_LIB.c_free(c_void_p(tes.data))
+        tes.data = None  # clear the address so a later call cannot free it again
+
+
+def te_p2c(data, res):
+    '''
+    transmit the data storage form from Python to C
+    we assume data's structure has already been preserved by the upper layer
+    using the TensorShapeStorage class
+    ------------------
+    Args:
+        data, list or ndarray, the original data array;
+              int32/int64 are stored as int64, float32/float64 as double
+        res,  TensorStorage or None; when given, its existing C buffer
+              (res.data) is reused instead of allocating a new one
+    Return:
+        TensorStorage, and data is a C pointer
+    Raise:
+        TypeError,       data is neither a list nor a ndarray
+        PermissionError, data's dtype is not one of the four supported ones
+    '''
+    # flatten the current ndarray for get the actual vec_size
+    if isinstance(data, list):
+        data = np.asarray(data)
+    if not isinstance(data, np.ndarray):
+        raise TypeError("Unsupported Data Structure")
+    vec_size = data.size
+
+    # pick the target C type first: rejecting a bad dtype before c_malloc
+    # means the error path cannot leak a freshly allocated buffer
+    if data.dtype in ('int32', 'int64'):
+        data_type, c_dtype, elem_byte = INT64_TYPE, np.int64, INT64_BYTE
+    elif data.dtype in ('float32', 'float64'):
+        data_type, c_dtype, elem_byte = FLOAT_TYPE, np.float64, DOUBLE_BYTE
+    else:
+        raise PermissionError("Invalid Data Type")
+    # memcpy reads raw bytes, so the source must be one dense C-ordered
+    # buffer; this converts and copies only when data is not one already
+    new_data = np.ascontiguousarray(data, dtype=c_dtype)
+
+    # malloc the space
+    if res is None:
+        storage_pointer = GPU_LIB.c_malloc(c_size_t(vec_size * DOUBLE_BYTE))
+    else:
+        storage_pointer = res.data
+
+    GPU_LIB.c_memcpy(c_void_p(storage_pointer),
+                     new_data.ctypes.data_as(c_void_p),
+                     c_size_t(vec_size * elem_byte))
+    return _te_init_store(res, storage_pointer, vec_size, MEM_HOST, data_type)
+
+
+def te_c2p(store):
+    '''
+    copy a TensorStorage's C memory back into a Python ndarray;
+    the dtype of the returned array follows store.data_type
+    -----------
+    Para:
+        store: TensorStorage whose data is a C memory address
+    Return:
+        res_array: np.ndarray, float64 for FLOAT_TYPE, int64 for INT64_TYPE
+    '''
+    kind = store.data_type
+    # doubles come back as float64
+    if kind == FLOAT_TYPE:
+        return __get_C_double(store.data, store.vec_size).astype(np.float64)
+    # int64_t values come back as int64
+    if kind == INT64_TYPE:
+        return __get_C_int64(store.data, store.vec_size).astype(np.int64)
+    # any other flag is rejected
+    # (the exception type stays PermissionError for existing callers)
+    raise PermissionError("Invalid Data Type")
+
+
+def te_c2bytes(data, res):
+    '''
+    serialize a TensorStorage held in C memory into a bytes stream.
+    Used for communication between sites, since C memory is not shared
+    --------------------
+    Para:
+        data: TensorStorage, data is a C memory ptr
+        res:  not used by this function
+    Return:
+        bytes, U_INT32_BYTE + DOUBLE_BYTE * vec_size long
+    '''
+    bytes_res = c_buffer(DOUBLE_BYTE * data.vec_size + U_INT32_BYTE)
+    # first 4 bytes: contains the data_type info
+    # remaining bytes:  contains the data
+    GPU_LIB.te_get_bytes(cast(bytes_res, c_void_p),
+                         c_char_p(data.data_type.to_bytes(U_INT32_BYTE, 'little')),
+                         c_void_p(data.data), c_size_t(data.vec_size))
+    return bytes_res.raw
+    # return pickle.dumps(data)
+
+
+def fp_c2bytes(store, res):
+    '''
+    serialize a FixedPointStorage into a bytes stream;
+    Used for communication between sites, since C memory is not shared
+    Other info besides the C memory, including data_type, mem_type,
+    encode_n and max_int, are also included
+    -----------------
+    Para:
+        store: FixedPointStorage
+        res:   not used by this function
+    Return:
+        bytes, the raw content of the buffer filled by GPU_LIB.fp_get_bytes
+    '''
+    # uint32
+    data_type = store.data_type
+    mem_type = store.mem_type
+    # bigint
+    encode_n = store.encode_n
+    max_int = store.max_int
+    # buffer size: per element one bigint + two uint32, plus two uint32 and two bigint once
+    bytes_res = c_buffer((PLAIN_BYTE + U_INT32_BYTE * 2) * store.vec_size + U_INT32_BYTE * 2 + PLAIN_BYTE * 2)
+    GPU_LIB.fp_get_bytes(cast(bytes_res, c_void_p),
+                         c_char_p(data_type.to_bytes(U_INT32_BYTE, 'little')),
+                         c_char_p(mem_type.to_bytes(U_INT32_BYTE, 'little')),
+                         c_char_p(encode_n.to_bytes(PLAIN_BYTE, 'little')),
+                         c_char_p(max_int.to_bytes(PLAIN_BYTE, 'little')),
+                         c_void_p(store.bigint_storage),
+                         c_void_p(store.base_storage),
+                         c_void_p(store.exp_storage),
+                         c_size_t(store.vec_size))
+    return bytes_res.raw
+
+
+def pi_c2bytes(store, res):
+    '''
+    serialize a PaillierEncryptedStorage into a bytes stream
+    Used for communication between sites, since C memory is not shared
+    ----------------
+    Para:
+        store: PaillierEncryptedStorage
+        res:   not used by this function
+    Return:
+        bytes, the raw content of the buffer filled by GPU_LIB.pi_get_bytes
+    '''
+    # uint32
+    data_type = store.data_type
+    mem_type = store.mem_type
+    # bigint
+    encode_n = store.encode_n
+    max_int = store.encode_max_int
+    # buffer size: per element one bigint + two uint32, plus two uint32 and two bigint once
+    bytes_res = c_buffer((CIPHER_BYTE + U_INT32_BYTE * 2) * store.vec_size + U_INT32_BYTE * 2 + CIPHER_BYTE * 2)
+    GPU_LIB.pi_get_bytes(cast(bytes_res, c_void_p),
+                         c_char_p(data_type.to_bytes(U_INT32_BYTE, 'little')),
+                         c_char_p(mem_type.to_bytes(U_INT32_BYTE, 'little')),
+                         c_char_p(encode_n.to_bytes(CIPHER_BYTE, 'little')),
+                         c_char_p(max_int.to_bytes(CIPHER_BYTE, 'little')),
+                         c_void_p(store.pen_storage),
+                         c_void_p(store.base_storage),
+                         c_void_p(store.exp_storage),
+                         c_size_t(store.vec_size))
+
+    return bytes_res.raw
+
+
+def _te_init_store(store, data, vec_size, mem_type,
+                   data_type):
+    '''
+    Build a TensorStorage, or refill an existing one in place.
+    -----------
+    Para:
+        store:     TensorStorage or None; when None a new object is created
+        data:      value stored as TensorStorage.data
+        vec_size:  value stored as TensorStorage.vec_size
+        mem_type:  value stored as TensorStorage.mem_type; when refilling an
+                   existing store, None leaves its mem_type untouched
+        data_type: value stored as TensorStorage.data_type
+    Return:
+        TensorStorage, either the new object or the refilled `store`
+    '''
+    if store is not None:
+        store.data = data
+        store.vec_size = vec_size
+        # an explicit None keeps whatever mem_type the store already has
+        if mem_type is not None:
+            store.mem_type = mem_type
+        store.data_type = data_type
+        return store
+    return TensorStorage(data, vec_size, mem_type, data_type)
+
+
+def te_bytes2c(data, res):
+    '''
+    Restore a TensorStorage from a bytes buffer.
+    TensorStorage.data is a pointer to the C memory that receives the payload.
+    -------------
+    Para:
+        data: bytes, the serialized tensor
+        res:  TensorStorage or None; when given, its `data` pointer is reused
+              as the destination instead of allocating new C memory
+    Return:
+        TensorStorage, refilled `res` or a new object, with mem_type MEM_HOST
+    '''
+    # everything except one uint32 worth of bytes is payload
+    payload_len = len(data) - U_INT32_BYTE
+    type_buf = c_buffer(U_INT32_BYTE)
+    if res is not None:
+        dest = res.data
+    else:
+        dest = GPU_LIB.c_malloc(c_size_t(payload_len))
+    GPU_LIB.te_from_bytes_get_c(cast(type_buf, c_void_p),
+                                c_void_p(dest),
+                                c_char_p(data), c_size_t(payload_len))
+    # TODO: change according to different data_types' length,
+    # now just use DOUBLE BYTE because we have only INT64 and DOUBLE,
+    # all of them are 8 bytes(Equal to DOUBLE_BYTE)
+    elem_count = payload_len // DOUBLE_BYTE
+    restored_type = int.from_bytes(type_buf, 'little')
+    return _te_init_store(res, dest, elem_count, MEM_HOST, restored_type)
+
+
+def fp_bytes2c(data, res):
+    '''
+    Restore a FixedPointStorage from a bytes buffer.
+    ---------------
+    Para:
+        data: bytes, the serialized storage
+        res:  FixedPointStorage or None; when given, its three C buffers are
+              reused as destinations instead of allocating new ones
+    Return:
+        FixedPointStorage, refilled `res` or a new object
+    '''
+    # element count implied by the buffer length
+    meta_len = 2 * (U_INT32_BYTE + PLAIN_BYTE)
+    per_elem_len = U_INT32_BYTE * 2 + PLAIN_BYTE
+    vec_size = (len(data) - meta_len) // per_elem_len
+    # scratch buffers for the two uint32 fields
+    data_type_buf = c_buffer(U_INT32_BYTE)
+    mem_type_buf = c_buffer(U_INT32_BYTE)
+    # scratch buffers for the two bigint fields
+    encode_n_buf = c_buffer(PLAIN_BYTE)
+    max_int_buf = c_buffer(PLAIN_BYTE)
+    # destination storage
+    if res is not None:
+        fpn = res.bigint_storage
+        base = res.base_storage
+        exp = res.exp_storage
+    else:
+        fpn = GPU_LIB.c_malloc(c_size_t(PLAIN_BYTE * vec_size))
+        base = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * vec_size))
+        exp = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * vec_size))
+
+    GPU_LIB.fp_from_bytes_get_c(
+        cast(data_type_buf, c_void_p), cast(mem_type_buf, c_void_p),
+        cast(encode_n_buf, c_void_p), cast(max_int_buf, c_void_p),
+        cast(fpn, c_void_p), cast(base, c_void_p), cast(exp, c_void_p),
+        c_char_p(data), c_size_t(vec_size))
+    encode_n = int.from_bytes(encode_n_buf, 'little')
+    max_int = int.from_bytes(max_int_buf, 'little')
+    mem_type = int.from_bytes(mem_type_buf, 'little')
+    data_type = int.from_bytes(data_type_buf, 'little')
+    return _fp_init_store(res, fpn, base, exp, vec_size,
+                          encode_n, max_int, mem_type, data_type)
+
+
+def pi_bytes2c(data, res):
+    '''
+    Restore a PaillierEncryptedStorage from a bytes buffer.
+    --------------
+    Para:
+        data: the bytes string
+        res:  PaillierEncryptedStorage or None; when given, its three C
+              buffers are reused as destinations instead of allocating new ones
+    Return:
+        res:  PaillierEncryptedStorage, the restored struct from para.data
+    '''
+    # calculate vec_size from the buffer length: two uint32 + two
+    # CIPHER_BYTE-wide fields are subtracted, the rest is divided by the
+    # per-element size (one CIPHER_BYTE-wide bigint + two uint32)
+    vec_size = ((len(data) - 2 * (U_INT32_BYTE + CIPHER_BYTE)) // (U_INT32_BYTE * 2 + CIPHER_BYTE))
+    # uint32
+    data_type = c_buffer(U_INT32_BYTE)
+    mem_type = c_buffer(U_INT32_BYTE)
+    # bigint
+    encode_n = c_buffer(CIPHER_BYTE)
+    max_int = c_buffer(CIPHER_BYTE)
+    # storage
+    if res is None:
+        pen = GPU_LIB.c_malloc(c_size_t(CIPHER_BYTE * vec_size))
+        base = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * vec_size))
+        exp = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * vec_size))
+    else:
+        pen = res.pen_storage
+        base = res.base_storage
+        exp = res.exp_storage
+
+    # NOTE(review): this calls the fp_ (fixed-point) deserializer although
+    # every buffer here is CIPHER_BYTE-wide and pi_c2bytes serializes with
+    # GPU_LIB.pi_get_bytes -- looks like a copy-paste from fp_bytes2c;
+    # confirm whether a pi_ counterpart should be called instead.
+    GPU_LIB.fp_from_bytes_get_c(
+        cast(data_type, c_void_p), cast(mem_type, c_void_p),
+        cast(encode_n, c_void_p), cast(max_int, c_void_p),
+        cast(pen, c_void_p), cast(base, c_void_p),
+        cast(exp, c_void_p), c_char_p(data), c_size_t(vec_size))
+    # _pi_init_store takes mem_type/data_type before encode_n/max_int
+    return _pi_init_store(res, pen, base, exp, vec_size,
+                          int.from_bytes(mem_type, 'little'),
+                          int.from_bytes(data_type, 'little'),
+                          int.from_bytes(encode_n, 'little'),
+                          int.from_bytes(max_int, 'little'))
+
+
+def _te_init_shape(shape_store, shape_tuple):
+    '''
+    Fill a TensorShapeStorage from a tuple.
+    ----------
+    Para:
+        shape_store: TensorShapeStorage or None; when None a new one is created
+        shape_tuple: tuple handed to TensorShapeStorage.from_tuple
+    Return:
+        TensorShapeStorage, the (possibly new) object after from_tuple
+    '''
+    target = TensorShapeStorage() if shape_store is None else shape_store
+    target.from_tuple(shape_tuple)
+    return target
+
+
+def _te_init_ss(res_store, res_data, vec_size,
+                res_shape, shape_tuple, mem_type, data_type):
+    '''
+    Initialize a TensorStorage and its TensorShapeStorage in one call.
+    ------------
+    Para:
+        res_store:   TensorStorage or None, forwarded to _te_init_store
+        res_data:    forwarded to _te_init_store as the data
+        vec_size:    forwarded to _te_init_store
+        res_shape:   TensorShapeStorage or None, forwarded to _te_init_shape
+        shape_tuple: tuple, forwarded to _te_init_shape
+        mem_type:    forwarded to _te_init_store
+        data_type:   forwarded to _te_init_store
+    Return:
+        tuple, (TensorStorage, TensorShapeStorage)
+    '''
+    filled_store = _te_init_store(res_store, res_data, vec_size, mem_type, data_type)
+    filled_shape = _te_init_shape(res_shape, shape_tuple)
+    return filled_store, filled_shape
+
+
+'''''''''
+The following calculators operate on TensorStorage.
+Their definitions are the same as numpy's.
+TensorStorage.data should always be an ndarray in order to support numpy.
+
+NOT USED IN OUR FATE IMPLEMENTATION,
+but Webank's implementation seems to have used them.
+'''''''''
+
+
+def te_slice(store, shape, start, stop, axis, res_store, res_shape, stream):
+    '''
+    Slice store.data with [start:stop] along `axis` (0 or 1).
+    `shape` and `stream` are not used. Any other axis raises
+    NotImplementedError. Returns (TensorStorage, TensorShapeStorage).
+    '''
+    if axis == 0:
+        piece = store.data[start:stop]
+    elif axis == 1:
+        piece = store.data[:, start:stop]
+    else:
+        raise NotImplementedError()
+    return _te_init_ss(
+        res_store, piece, piece.size,
+        res_shape, piece.shape,
+        store.mem_type, store.data_type)
+
+
+def te_cat(stores, axis, res_store, res_shape):
+    '''
+    Concatenate the data of several TensorStorage objects:
+    np.vstack for axis 0, np.hstack for axis 1, NotImplementedError otherwise.
+    mem_type and data_type are taken from the first store.
+    Returns (TensorStorage, TensorShapeStorage).
+    '''
+    if axis == 0:
+        stack = np.vstack
+    elif axis == 1:
+        stack = np.hstack
+    else:
+        raise NotImplementedError()
+    merged = stack([item.data for item in stores])
+    first = stores[0]
+    return _te_init_ss(
+        res_store, merged, merged.size,
+        res_shape, merged.shape,
+        first.mem_type, first.data_type)
+
+
+# TODO: precise data_type
+
+def te_pow(left_store, right, left_shape, res_store, res_shape, stream):
+    '''
+    Raise left_store.data to the power `right` with the `**` operator.
+    `left_shape` and `stream` are not used; mem_type and data_type are
+    inherited from left_store. Returns (TensorStorage, TensorShapeStorage).
+    '''
+    powered = left_store.data ** right
+    return _te_init_ss(
+        res_store, powered, powered.size,
+        res_shape, powered.shape,
+        left_store.mem_type, left_store.data_type)
+
+
+# TODO: precise data_type
+
+def te_add(left_store, right_store, left_shape, right_shape,
+           res_store, res_shape, stream):
+    '''
+    Add the data of two TensorStorage objects with the `+` operator.
+    The shape arguments and `stream` are not used; mem_type and data_type
+    are inherited from left_store.
+    Returns (TensorStorage, TensorShapeStorage).
+    '''
+    total = left_store.data + right_store.data
+    return _te_init_ss(
+        res_store, total, total.size,
+        res_shape, total.shape,
+        left_store.mem_type, left_store.data_type)
+
+
+# TODO: precise data_type
+
+def te_mul(left_store, right_store, left_shape, right_shape,
+           res_store, res_shape, stream):
+    '''
+    Multiply the data of two TensorStorage objects with the `*` operator.
+    The shape arguments and `stream` are not used; mem_type and data_type
+    are inherited from left_store.
+    Returns (TensorStorage, TensorShapeStorage).
+    '''
+    product = left_store.data * right_store.data
+    return _te_init_ss(
+        res_store, product, product.size,
+        res_shape, product.shape,
+        left_store.mem_type, left_store.data_type)
+
+
+# TODO: precise data_type
+
+def te_truediv(left_store, right_store, left_shape, right_shape,
+               res_store, res_shape, stream):
+    '''
+    Divide left_store.data by right_store.data with the `/` operator.
+    The shape arguments and `stream` are not used; the result keeps
+    left_store's mem_type and is always tagged FLOAT_TYPE.
+    Returns (TensorStorage, TensorShapeStorage).
+    '''
+    quotient = left_store.data / right_store.data
+    return _te_init_ss(
+        res_store, quotient, quotient.size,
+        res_shape, quotient.shape,
+        left_store.mem_type, FLOAT_TYPE)
+
+
+def te_floordiv(left_store, right_store, left_shape, right_shape,
+                res_store, res_shape, stream):
+    '''
+    Floor-divide left_store.data by right_store.data with the `//` operator.
+    The shape arguments and `stream` are not used; the result keeps
+    left_store's mem_type and is always tagged INT64_TYPE.
+    Returns (TensorStorage, TensorShapeStorage).
+    '''
+    quotient = left_store.data // right_store.data
+    return _te_init_ss(
+        res_store, quotient, quotient.size,
+        res_shape, quotient.shape,
+        left_store.mem_type, INT64_TYPE)
+
+
+def te_sub(left_store, right_store, left_shape, right_shape,
+           res_store, res_shape, stream):
+    '''
+    Subtract right_store.data from left_store.data with the `-` operator.
+    The shape arguments and `stream` are not used; mem_type and data_type
+    are inherited from left_store.
+    Returns (TensorStorage, TensorShapeStorage).
+    '''
+    difference = left_store.data - right_store.data
+    return _te_init_ss(
+        res_store, difference, difference.size,
+        res_shape, difference.shape,
+        left_store.mem_type, left_store.data_type)
+
+
+# TODO: precise data_type, currently only inherent from left
+
+def te_matmul(left_store, right_store, left_shape, right_shape,
+              res_store, res_shape, stream):
+    '''
+    Matrix-multiply left_store.data by right_store.data with the `@` operator.
+    The shape arguments and `stream` are not used; mem_type and data_type
+    are inherited from left_store.
+    Returns (TensorStorage, TensorShapeStorage).
+    '''
+    product = left_store.data @ right_store.data
+    return _te_init_ss(
+        res_store, product, product.size,
+        res_shape, product.shape,
+        left_store.mem_type, left_store.data_type)
+
+
+def te_abs(left_store, left_shape, res_store, res_shape, stream):
+    '''
+    Apply the builtin abs() to left_store.data.
+    vec_size, shape, mem_type and data_type are copied from the inputs;
+    `stream` is not used. Returns (TensorStorage, TensorShapeStorage).
+    '''
+    magnitude = abs(left_store.data)
+    return _te_init_ss(
+        res_store, magnitude, left_store.vec_size,
+        res_shape, left_shape.to_tuple(),
+        left_store.mem_type, left_store.data_type)
+
+
+def te_neg(left_store, left_shape, res_store, res_shape, stream):
+    '''
+    Negate left_store.data with the unary `-` operator.
+    vec_size, shape, mem_type and data_type are copied from the inputs;
+    `stream` is not used. Returns (TensorStorage, TensorShapeStorage).
+    '''
+    negated = -left_store.data
+    return _te_init_ss(
+        res_store, negated, left_store.vec_size,
+        res_shape, left_shape.to_tuple(),
+        left_store.mem_type, left_store.data_type)
+
+
+def te_transpose(left_store, left_shape, res_store, res_shape, stream):
+    '''
+    Transpose left_store.data via its transpose() method.
+    `left_shape` and `stream` are not used; mem_type and data_type are
+    inherited from left_store. Returns (TensorStorage, TensorShapeStorage).
+    '''
+    flipped = left_store.data.transpose()
+    return _te_init_ss(
+        res_store, flipped, flipped.size,
+        res_shape, flipped.shape,
+        left_store.mem_type, left_store.data_type)
+
+
+def te_sum(left_store, left_shape, axis, res_store, res_shape, stream):
+    '''
+    Sum left_store.data along `axis` via its sum(axis=...) method.
+    `left_shape` and `stream` are not used; mem_type and data_type are
+    inherited from left_store. Returns (TensorStorage, TensorShapeStorage).
+    '''
+    reduced = left_store.data.sum(axis=axis)
+    return _te_init_ss(
+        res_store, reduced, reduced.size,
+        res_shape, reduced.shape,
+        left_store.mem_type, left_store.data_type)
+
+
+def te_reshape(store, shape, new_shape, res_store, res_shape, stream):
+    '''
+    Reshape the data of a TensorStorage.
+    ------------
+    Para:
+        store:     TensorStorage, store.data must provide reshape()
+        shape:     not used
+        new_shape: object providing to_tuple(), the requested shape
+        res_store: TensorStorage or None, return value
+        res_shape: TensorShapeStorage or None, return value
+        stream:    not used
+    Return:
+        tuple, (TensorStorage, TensorShapeStorage)
+    '''
+    # hand reshape() a plain tuple instead of the shape object itself, so it
+    # does not depend on the shape class behaving like a sequence
+    new_shape_tuple = new_shape.to_tuple()
+    return _te_init_ss(res_store, store.data.reshape(new_shape_tuple),
+                       store.vec_size,
+                       res_shape, new_shape_tuple,
+                       store.mem_type, store.data_type)
+
+
+def te_exp(store, shape, res_store, res_shape, stream):
+    '''
+    Apply np.exp to store.data.
+    vec_size, shape and mem_type are copied from the inputs, the result is
+    always tagged FLOAT_TYPE; `stream` is not used.
+    Returns (TensorStorage, TensorShapeStorage).
+    '''
+    exponentiated = np.exp(store.data)
+    return _te_init_ss(
+        res_store, exponentiated, store.vec_size,
+        res_shape, shape.to_tuple(),
+        store.mem_type, FLOAT_TYPE)
+
+
+def te_hstack(left_store, right_store, left_shape, right_shape,
+              res_store, res_shape, stream):
+    '''
+    Horizontally stack two TensorStorage objects by calling te_cat on axis 1,
+    then re-initialize the result with left_store's mem_type and data_type.
+    The input shape arguments and `stream` are not used.
+    Returns (TensorStorage, TensorShapeStorage).
+    '''
+    cat_store, cat_shape = te_cat(
+        [left_store, right_store], 1, res_store, res_shape)
+    # local names differ from the parameters to avoid a naming collision
+    return _te_init_ss(
+        res_store, cat_store.data, cat_store.vec_size,
+        cat_shape, cat_shape.to_tuple(),
+        left_store.mem_type, left_store.data_type)
+
+
+def te_c2p_first(store):
+    '''
+    Read the first element of the C data storage of a TensorStorage.
+    ---------------
+    Para:
+        store: TensorStorage, store.data must be a pointer to C memory
+    Return:
+        np.float64 for FLOAT_TYPE, np.int64 for INT64_TYPE
+    Raise:
+        PermissionError for any other data_type
+    '''
+    kind = store.data_type
+    if kind == FLOAT_TYPE:
+        # fetch a single double and normalize its dtype
+        return __get_C_double(store.data, 1).astype(np.float64)[0]
+    if kind == INT64_TYPE:
+        # fetch a single int64 and normalize its dtype
+        return __get_C_int64(store.data, 1).astype(np.int64)[0]
+    raise PermissionError("Invalid Data Type")
+
+
+'''################malloc a space with size elements############### '''
+'''
+    function: allocate space and form a new PaillierEncryptedStorage Class
+    res:    split into 3 different parts, indicating the 3 parts
+            that are needed for the PaillierEncryptedStorage
+    size:   is the number of elements that need to be allocated
+    return: A PaillierEncryptedStorage class, wrapping res as a class
+'''
+
+
+def direct_bi_alloc(res, vec_size, elem_size, mem_type):
+    '''
+    Allocate vec_size * elem_size bytes with GPU_LIB.c_direct_malloc and wrap
+    the pointer in a BigIntStorage (new, or `res` refilled in place).
+    '''
+    raw_ptr = GPU_LIB.c_direct_malloc(c_size_t(vec_size * elem_size))
+    return _bi_init_store(res, raw_ptr, vec_size, elem_size, mem_type)
+
+
+def direct_pi_alloc(res, size, mem_type):
+    '''
+    Allocate the three buffers of a PaillierEncryptedStorage, using
+    GPU_LIB.c_direct_malloc for host memory.
+    -------------
+    Para:
+        res:      PaillierEncryptedStorage or None, return value
+        size:     int, number of elements
+        mem_type: MEM_HOST or MEM_DEVICE; any other value leaves the
+                  ciphertext pointer as None
+    Return:
+        PaillierEncryptedStorage with data_type, encode_n and encode_max_int
+        all set to 0
+    '''
+    if mem_type == MEM_HOST:
+        res_pen = GPU_LIB.c_direct_malloc(c_size_t(size * CIPHER_BYTE))
+    elif mem_type == MEM_DEVICE:
+        # one CIPHER_BYTE-wide slot per element, the same byte count pi_h2d
+        # passes to cuda_malloc; `size` bytes alone is too small
+        res_pen = GPU_LIB.cuda_malloc(c_size_t(size * CIPHER_BYTE))
+    else:
+        res_pen = None
+    # base and exponent arrays are host memory for every mem_type
+    res_base = GPU_LIB.c_direct_malloc(c_size_t(size * U_INT32_BYTE))
+    res_exp = GPU_LIB.c_direct_malloc(c_size_t(size * U_INT32_BYTE))
+    # data_type, encode_n and encode_max_int all set to 0
+    return _pi_init_store(res, res_pen, res_base, res_exp,
+                          size, mem_type, 0, 0, 0)
+
+
+def direct_fp_alloc(res, size, mem_type):
+    '''
+    Allocate the three buffers of a FixedPointStorage, using
+    GPU_LIB.c_direct_malloc for host memory.
+    -------------
+    Para:
+        res:      FixedPointStorage or None, return value
+        size:     int, number of elements
+        mem_type: MEM_HOST or MEM_DEVICE; any other value leaves the
+                  bigint pointer as None
+    Return:
+        FixedPointStorage with encode_n, max_int and data_type all set to 0
+    '''
+    if mem_type == MEM_HOST:
+        res_fpn = GPU_LIB.c_direct_malloc(c_size_t(size * PLAIN_BYTE))
+    elif mem_type == MEM_DEVICE:
+        # one PLAIN_BYTE-wide slot per element, matching the host branch;
+        # `size` bytes alone is too small
+        res_fpn = GPU_LIB.cuda_malloc(c_size_t(size * PLAIN_BYTE))
+    else:
+        res_fpn = None
+    # base and exponent arrays are host memory for every mem_type
+    res_base = GPU_LIB.c_direct_malloc(c_size_t(size * U_INT32_BYTE))
+    res_exp = GPU_LIB.c_direct_malloc(c_size_t(size * U_INT32_BYTE))
+    return _fp_init_store(res, res_fpn, res_base, res_exp,
+                          size, 0, 0, mem_type, 0)
+
+
+def direct_te_alloc(res, size, mem_type):
+    '''
+    Allocate size * DOUBLE_BYTE bytes with GPU_LIB.c_direct_malloc and wrap
+    the pointer in a TensorStorage whose data_type is set to 0.
+    '''
+    raw_ptr = GPU_LIB.c_direct_malloc(c_size_t(size * DOUBLE_BYTE))
+    return _te_init_store(res, raw_ptr, size, mem_type, 0)
+
+
+def bi_alloc(res, vec_size, elem_size, mem_type):
+    '''
+    Allocate vec_size * elem_size bytes with GPU_LIB.c_malloc and wrap the
+    pointer in a BigIntStorage (new, or `res` refilled in place).
+    '''
+    raw_ptr = GPU_LIB.c_malloc(c_size_t(vec_size * elem_size))
+    return _bi_init_store(res, raw_ptr, vec_size, elem_size, mem_type)
+
+
+def pi_alloc(res, size, mem_type):
+    '''
+    Allocate the three buffers of a PaillierEncryptedStorage, using
+    GPU_LIB.c_malloc for host memory.
+    -------------
+    Para:
+        res:      PaillierEncryptedStorage or None, return value
+        size:     int, number of elements
+        mem_type: MEM_HOST or MEM_DEVICE; any other value leaves the
+                  ciphertext pointer as None
+    Return:
+        PaillierEncryptedStorage with data_type, encode_n and encode_max_int
+        all set to 0
+    '''
+    if mem_type == MEM_HOST:
+        res_pen = GPU_LIB.c_malloc(c_size_t(size * CIPHER_BYTE))
+    elif mem_type == MEM_DEVICE:
+        # one CIPHER_BYTE-wide slot per element, the same byte count pi_h2d
+        # passes to cuda_malloc; `size` bytes alone is too small
+        res_pen = GPU_LIB.cuda_malloc(c_size_t(size * CIPHER_BYTE))
+    else:
+        res_pen = None
+    # base and exponent arrays are host memory for every mem_type
+    res_base = GPU_LIB.c_malloc(c_size_t(size * U_INT32_BYTE))
+    res_exp = GPU_LIB.c_malloc(c_size_t(size * U_INT32_BYTE))
+    # data_type, encode_n and encode_max_int all set to 0
+    return _pi_init_store(res, res_pen, res_base, res_exp,
+                          size, mem_type, 0, 0, 0)
+
+
+def fp_alloc(res, size, mem_type):
+    '''
+    Allocate the three buffers of a FixedPointStorage, using
+    GPU_LIB.c_malloc for host memory.
+    -------------
+    Para:
+        res:      FixedPointStorage or None, return value
+        size:     int, number of elements
+        mem_type: MEM_HOST or MEM_DEVICE; any other value leaves the
+                  bigint pointer as None
+    Return:
+        FixedPointStorage with encode_n, max_int and data_type all set to 0
+    '''
+    if mem_type == MEM_HOST:
+        res_fpn = GPU_LIB.c_malloc(c_size_t(size * PLAIN_BYTE))
+    elif mem_type == MEM_DEVICE:
+        # one PLAIN_BYTE-wide slot per element, matching the host branch;
+        # `size` bytes alone is too small
+        res_fpn = GPU_LIB.cuda_malloc(c_size_t(size * PLAIN_BYTE))
+    else:
+        res_fpn = None
+    # base and exponent arrays are host memory for every mem_type
+    res_base = GPU_LIB.c_malloc(c_size_t(size * U_INT32_BYTE))
+    res_exp = GPU_LIB.c_malloc(c_size_t(size * U_INT32_BYTE))
+    return _fp_init_store(res, res_fpn, res_base, res_exp,
+                          size, 0, 0, mem_type, 0)
+
+
+def te_alloc(res, size, mem_type):
+    '''
+    Allocate size * DOUBLE_BYTE bytes with GPU_LIB.c_malloc and wrap the
+    pointer in a TensorStorage whose data_type is set to 0.
+    '''
+    raw_ptr = GPU_LIB.c_malloc(c_size_t(size * DOUBLE_BYTE))
+    return _te_init_store(res, raw_ptr, size, mem_type, 0)
+
+
+def pi_free(ptr):
+    '''
+    Release the three buffers of a PaillierEncryptedStorage and clear its
+    pointers. The ciphertext buffer is freed with c_free or cuda_free
+    depending on mem_type; base and exponent buffers always use c_free.
+    --------------
+    Para:
+        ptr: PaillierEncryptedStorage
+    '''
+    if not _MEM_ON_HOST(ptr.mem_type):
+        print("free space on gpu")
+        GPU_LIB.cuda_free(c_void_p(ptr.pen_storage))
+    else:
+        GPU_LIB.c_free(c_void_p(ptr.pen_storage))
+    for host_buf in (ptr.base_storage, ptr.exp_storage):
+        GPU_LIB.c_free(c_void_p(host_buf))
+    # drop the now-dangling pointers
+    ptr.pen_storage = None
+    ptr.base_storage = None
+    ptr.exp_storage = None
+
+
+def fp_h2d(target, src, stream=None):
+    '''
+    Placeholder for a host-to-device transfer of FixedPointStorage.
+    TODO: currently not implemented because it is not used;
+    `target` and `stream` are ignored and `src` is returned unchanged.
+    '''
+    return src
+
+
+def fp_d2h(target, src, stream):
+    '''
+    Placeholder for a device-to-host transfer of FixedPointStorage.
+    TODO: currently not implemented because it is not used;
+    `target` and `stream` are ignored and `src` is returned unchanged.
+    '''
+    return src
+
+
+def pi_h2d(pub_key, target, src, stream):
+    '''
+    Move a host-side PaillierEncryptedStorage to GPU memory through
+    GPU_LIB.pen_host2device_exp_align (exponent alignment included).
+    ---------------
+    Para:
+        pub_key: Dev_PubKeyStorage, its pub_key_ptr is handed to the C call
+        target:  PaillierEncryptedStorage or None, return value; when given,
+                 its three buffers are reused as destinations
+        src:     PaillierEncryptedStorage, source data,
+                 src.pen_storage points to CPU C-memory
+        stream:  not used
+    Return:
+        PaillierEncryptedStorage tagged MEM_DEVICE
+    '''
+    vec_size = src.vec_size
+    # Only the ciphertexts live in GPU memory. The base and exponent arrays
+    # stay in CPU C-memory: they are rarely used in computation and are
+    # cheap to copy compared with the encrypted bigints.
+    if target is not None:
+        dst_pen = target.pen_storage
+        dst_base = target.base_storage
+        dst_exp = target.exp_storage
+    else:
+        dst_pen = GPU_LIB.cuda_malloc(c_size_t(vec_size * CIPHER_BYTE))
+        dst_base = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        dst_exp = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+
+    GPU_LIB.pen_host2device_exp_align(
+        c_char_p(src.pen_storage), c_void_p(src.base_storage), c_void_p(src.exp_storage),
+        c_void_p(dst_pen), c_void_p(dst_base), c_void_p(dst_exp),
+        c_size_t(vec_size), c_void_p(pub_key.pub_key_ptr))
+    return _pi_init_store(
+        target, dst_pen, dst_base, dst_exp,
+        vec_size, MEM_DEVICE, src.data_type, src.encode_n, src.encode_max_int)
+
+
+def pi_d2h(target, src, stream):
+    '''
+    Transfer GPU-memory stored PaillierEncryptedStorage into C-memory stored ones.
+    --------------
+    Para:
+        target: PaillierEncryptedStorage or None, return value;
+                target.pen_storage is a pointer pointing to CPU C-memory
+        src:    PaillierEncryptedStorage, source value;
+                src.pen_storage is a pointer pointing to GPU-memory
+        stream: not used
+    Return:
+        PaillierEncryptedStorage tagged MEM_HOST
+    '''
+    vec_size = src.vec_size
+    if target is None:
+        # ciphertexts are CIPHER_BYTE wide (as in pi_h2d / pi_p2c); sizing
+        # this buffer with PLAIN_BYTE would let pen_device2host overrun it
+        pen_storage = GPU_LIB.c_malloc(c_size_t(vec_size * CIPHER_BYTE))
+        base_storage = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        exp_storage = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+    else:
+        pen_storage = target.pen_storage
+        base_storage = target.base_storage
+        exp_storage = target.exp_storage
+
+    # base and exponent arrays are host memory on both sides: plain memcpy
+    GPU_LIB.c_memcpy(c_void_p(base_storage),
+                     c_void_p(src.base_storage),
+                     c_size_t(vec_size * U_INT32_BYTE))
+    GPU_LIB.c_memcpy(c_void_p(exp_storage),
+                     c_void_p(src.exp_storage),
+                     c_size_t(vec_size * U_INT32_BYTE))
+
+    # ciphertexts come back from the device
+    GPU_LIB.pen_device2host(
+        c_void_p(src.pen_storage),
+        c_char_p(pen_storage),
+        c_size_t(vec_size))
+    return _pi_init_store(
+        target, pen_storage, base_storage, exp_storage,
+        vec_size, MEM_HOST, src.data_type,
+        src.encode_n, src.encode_max_int)
+
+
+def pi_h2d_pub_key(target, src):
+    '''
+    Wrap a PubKeyStorage in a new Dev_PubKeyStorage.
+    ----------------
+    target:  ignored; a new object is always created
+    src:     PubKeyStorage, passed to the Dev_PubKeyStorage constructor
+    Return:  Dev_PubKeyStorage
+    '''
+    return Dev_PubKeyStorage(src)
+
+
+def pi_h2d_priv_key(target, src):
+    '''
+    Wrap a PrivKeyStorage in a new Dev_PrivKeyStorage.
+    ----------------
+    target:  ignored; a new object is always created
+    src:     PrivKeyStorage, passed to the Dev_PrivKeyStorage constructor
+    Return:  Dev_PrivKeyStorage
+    '''
+    return Dev_PrivKeyStorage(src)
+
+
+def pi_free_d_pub_key(target):
+    '''
+    Free the memory allocated for a Dev_PubKeyStorage with GPU_LIB.cuda_free.
+    -----------------
+    target:  pointer value handed to cuda_free
+    '''
+    device_ptr = c_void_p(target)
+    GPU_LIB.cuda_free(device_ptr)
+
+
+def pi_free_d_priv_key(target):
+    '''
+    Free the memory allocated for a Dev_PrivKeyStorage with GPU_LIB.cuda_free.
+    ------------------
+    target:  pointer value handed to cuda_free
+    '''
+    device_ptr = c_void_p(target)
+    GPU_LIB.cuda_free(device_ptr)
+
+
+def pi_p2c_pub_key(target, src):
+    '''
+    Convert a Python PaillierPublicKey into a C-form PubKeyStorage, built
+    from src.n, src.g, src.nsquare and src.max_int.
+    `target` is ignored; a new PubKeyStorage is always returned.
+    '''
+    return PubKeyStorage(src.n, src.g, src.nsquare, src.max_int)
+
+
+def pi_p2c_priv_key(target, src):
+    '''
+    Convert a Python PaillierPrivateKey into a C-form PrivKeyStorage, built
+    from src.p, src.q, src.psquare, src.qsquare, src.q_inverse, src.hp and
+    src.hq. `target` is ignored; a new PrivKeyStorage is always returned.
+    '''
+    return PrivKeyStorage(
+        src.p, src.q,
+        src.psquare, src.qsquare,
+        src.q_inverse,
+        src.hp, src.hq)
+
+
+# ###########PaillierEncrypted STORAGE INITIALIZE#################
+def _pi_init_store(res_store, pen_storage, base_storage, exp_storage, vec_size,
+                   mem_type, data_type, encode_n, encode_max_int):
+    '''
+    Build a PaillierEncryptedStorage, or refill an existing one in place.
+    ---------------
+    Para:
+        res_store: PaillierEncryptedStorage or None, return value;
+                   when None a new object is created
+        The other paras are stored in the attributes of the same name.
+    Return:
+        PaillierEncryptedStorage
+    '''
+    if res_store is None:
+        return PaillierEncryptedStorage(
+            pen_storage, base_storage, exp_storage, vec_size,
+            mem_type, data_type, encode_n, encode_max_int)
+    res_store.pen_storage = pen_storage
+    res_store.base_storage = base_storage
+    res_store.exp_storage = exp_storage
+    res_store.vec_size = vec_size
+    res_store.mem_type = mem_type
+    # para needed by TensorStorage
+    res_store.data_type = data_type
+    # para needed by FixedPointNumber
+    res_store.encode_n = encode_n
+    res_store.encode_max_int = encode_max_int
+    return res_store
+
+
+# shape handling for PaillierEncryptedStorage reuses the tensor shape initializer
+_pi_init_shape = _te_init_shape
+
+
+def _pi_init_ss(res_store, pen_storage, base_storage, exp_storage, vec_size,
+                res_shape, res_shape_tuple,
+                mem_type, data_type, encode_n, encode_max_int):
+    '''
+    Initialize a PaillierEncryptedStorage and its TensorShapeStorage in one
+    call. Paras are forwarded to _pi_init_store and _pi_init_shape.
+    Returns (PaillierEncryptedStorage, TensorShapeStorage).
+    '''
+    filled_store = _pi_init_store(
+        res_store, pen_storage, base_storage, exp_storage, vec_size,
+        mem_type, data_type, encode_n, encode_max_int)
+    filled_shape = _pi_init_shape(res_shape, res_shape_tuple)
+    return filled_store, filled_shape
+
+
+''' Transfer a PEN tensor from Python memory to C memory '''
+
+
+def pi_p2c(target, src, data_type=FLOAT_TYPE):
+    '''
+    Copy a collection of PaillierEncryptedNumber into a C-memory
+    PaillierEncryptedStorage.
+    --------------------
+    Para:
+        target:     PaillierEncryptedStorage or None, return value; when
+                    given, its three buffers are reused as destinations
+        src:        list or ndarray, each element is a PaillierEncryptedNumber
+        data_type:  int, src's original datatype, default FLOAT_TYPE
+    Return:
+        PaillierEncryptedStorage tagged MEM_HOST
+    Raise:
+        TypeError when src is neither a list nor an ndarray
+    '''
+    if isinstance(src, np.ndarray):
+        vec_size = src.size
+        src = src.flat
+    elif isinstance(src, list):
+        vec_size = len(src)
+    else:
+        raise TypeError("Unsupported Data Structure")
+    # pick the destination buffers
+    if target is not None:
+        res_pen = target.pen_storage
+        res_base = target.base_storage
+        res_exp = target.exp_storage
+    else:
+        res_pen = GPU_LIB.c_malloc(c_size_t(vec_size * CIPHER_BYTE))
+        res_base = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        res_exp = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+    # the two encoding parameters come from the first element's public key
+    first_key = src[0].public_key
+    n = first_key.n
+    max_int = first_key.max_int
+    # Bigints inside an ndarray are Python objects, not a contiguous int
+    # buffer, so every ciphertext is serialized and memcpy'd on its own.
+    exponents = []
+    for idx in range(vec_size):
+        number = src[idx]
+        cipher_raw = number.ciphertext(False).to_bytes(CIPHER_BYTE, 'little')
+        GPU_LIB.c_memcpy(
+            c_void_p(res_pen + idx * CIPHER_BYTE), c_char_p(cipher_raw),
+            c_size_t(CIPHER_BYTE))
+        exponents.append(number.exponent)
+    # base and exp are copied into the C buffers in order to prevent a
+    # potential double free
+    base_array = np.asarray([PEN_BASE] * vec_size, np.uint32)
+    exp_array = np.asarray(exponents, np.uint32)
+    GPU_LIB.c_memcpy(
+        c_void_p(res_base), base_array.ctypes.data_as(c_void_p),
+        c_size_t(vec_size * U_INT32_BYTE))
+    GPU_LIB.c_memcpy(
+        c_void_p(res_exp), exp_array.ctypes.data_as(c_void_p),
+        c_size_t(vec_size * U_INT32_BYTE))
+    return _pi_init_store(
+        target, res_pen, res_base, res_exp,
+        vec_size, MEM_HOST, data_type, n, max_int)
+
+
+def _bi_init_store(res_store, data, count, elem_size, mem_type):
+    '''
+    Build a BigIntStorage, or refill an existing one in place.
+    `data` becomes bigint_storage and `count` becomes vec_size.
+    '''
+    if res_store is not None:
+        res_store.bigint_storage = data
+        res_store.vec_size = count
+        res_store.elem_size = elem_size
+        res_store.mem_type = mem_type
+        return res_store
+    return BigIntStorage(data, count, mem_type, elem_size)
+
+
+# shape handling for BigIntStorage reuses the tensor shape initializer
+_bi_init_shape = _te_init_shape
+
+
+def _bi_init_ss(res_store, res_data, vec_size, res_shape, res_shape_tuple,
+                elem_size, mem_type):
+    '''
+    Initialize a BigIntStorage and its TensorShapeStorage in one call.
+    Returns (BigIntStorage, TensorShapeStorage).
+    '''
+    filled_store = _bi_init_store(res_store, res_data, vec_size, elem_size, mem_type)
+    filled_shape = _bi_init_shape(res_shape, res_shape_tuple)
+    return filled_store, filled_shape
+
+
+def _fp_init_store(res_store, fpn_storage, base_storage, exp_storage,
+                   vec_size, n, max_int, mem_type,
+                   data_type):
+    '''
+    Build a FixedPointStorage, or refill an existing one in place.
+    `fpn_storage` becomes bigint_storage and `n` becomes encode_n; the other
+    paras are stored in the attributes of the same name.
+    '''
+    if res_store is None:
+        return FixedPointStorage(
+            fpn_storage, base_storage, exp_storage,
+            vec_size, n, max_int, mem_type, data_type)
+    res_store.bigint_storage = fpn_storage
+    res_store.base_storage = base_storage
+    res_store.exp_storage = exp_storage
+    res_store.vec_size = vec_size
+    res_store.mem_type = mem_type
+    # paras needed by TensorStorage
+    res_store.data_type = data_type
+    # paras needed for en/decoding
+    res_store.encode_n = n
+    res_store.max_int = max_int
+    return res_store
+
+
+def _fp_init_ss(res_store, fpn_storage, base_storage, exp_storage,
+                vec_size, n, max_int,
+                res_shape, res_shape_tuple, mem_type, data_type):
+    '''
+    Initialize a FixedPointStorage and its TensorShapeStorage in one call.
+    Returns (FixedPointStorage, TensorShapeStorage).
+    '''
+    filled_store = _fp_init_store(
+        res_store, fpn_storage, base_storage, exp_storage,
+        vec_size, n, max_int, mem_type, data_type)
+    filled_shape = _te_init_shape(res_shape, res_shape_tuple)
+    return filled_store, filled_shape
+
+
+def get_add_mul_size(left_shape: TensorShapeStorage, right_shape: TensorShapeStorage):
+    '''
+    Result size of the pi_add, pi_mul and fp_mul calculators.
+    --------------------
+    Para:
+        left_shape, right_shape: TensorShapeStorage, the two operands' shapes
+    Return:
+        max(P, R) * max(Q, S) for the four values returned by __shape_resolve
+    Raise:
+        RuntimeError when either argument is not a TensorShapeStorage
+    '''
+    if not isinstance(left_shape, TensorShapeStorage):
+        raise RuntimeError(f"Illegal shape type : {type(left_shape)}, params need type: {TensorShapeStorage}")
+    if not isinstance(right_shape, TensorShapeStorage):
+        raise RuntimeError(f"Illegal shape type : {type(right_shape)}, params need type: {TensorShapeStorage}")
+
+    # the fifth value (the resolved shape tuple) is not needed here
+    P, Q, R, S, _ = __shape_resolve(left_shape, right_shape)
+    return max(P, R) * max(Q, S)
+
+
+def get_matmul_rmatmul_size(left_shape: TensorShapeStorage, right_shape: TensorShapeStorage):
+    '''
+    Result size of the matmul and rmatmul calculators.
+    ----------------------
+    Para:
+        left_shape, right_shape: TensorShapeStorage, the two operands' shapes
+    Return:
+        first value of __shape_decompose(left_shape) times the second value
+        of __shape_decompose(right_shape)
+    Raise:
+        RuntimeError when either argument is not a TensorShapeStorage
+    '''
+    if not isinstance(left_shape, TensorShapeStorage):
+        raise RuntimeError(f"Illegal shape type : {type(left_shape)}, params need type: {TensorShapeStorage}")
+    if not isinstance(right_shape, TensorShapeStorage):
+        raise RuntimeError(f"Illegal shape type : {type(right_shape)}, params need type: {TensorShapeStorage}")
+    left_rows, _ = __shape_decompose(left_shape)
+    _, right_cols = __shape_decompose(right_shape)
+    return left_rows * right_cols
+
+
+def get_sum_size(shape: TensorShapeStorage, axis):
+    '''
+    Result size of pi_sum, which depends on axis.
+    ----------------------
+    Para:
+        shape: TensorShapeStorage, the input store's shape
+        axis:  int or None, the dim along which the sum is performed,
+               0 means vertical sum, 1 means horizontal sum, None means sum all data
+    Return:
+        1 when axis is None; shape.size() for shapes with fewer than 2 dims;
+        for 2-dim shapes the second dim (axis 0) or the first dim (axis 1)
+    Raise:
+        RuntimeError on a wrong argument type or any other shape/axis combination
+    '''
+    if not isinstance(shape, TensorShapeStorage):
+        raise RuntimeError(f"Illegal shape type : {type(shape)}, params need type: {TensorShapeStorage}")
+    if axis is None:
+        return 1
+    dims = shape.to_tuple()
+    if len(dims) < 2:
+        return shape.size()
+    if len(dims) == 2:
+        if axis == 0:
+            return dims[1]
+        if axis == 1:
+            return dims[0]
+    raise RuntimeError("illegal shape or axis!")
+
+
+def get_slice_size(shape: TensorShapeStorage, start: int, stop: int, axis):
+    '''
+    Result size of fp_slice and pi_slice, which depends on axis.
+    ------------------------
+    Para:
+        shape: TensorShapeStorage, the input store's shape
+        start, stop: int, the slice bounds
+        axis:  the dim along which the slice is performed,
+               0 means slice horizontally
+               1 means slice vertically
+    Return:
+        dim0 * (stop - start) for axis 0, dim1 * (stop - start) for axis 1,
+        None for any other axis. A 1-dim shape (n,) is treated as (1, n);
+        shapes that are neither 1-dim nor 2-dim count as (0, 0).
+    Raise:
+        RuntimeError when an argument has the wrong type
+    NOTE(review): for a 2-dim input te_slice's numpy slicing yields
+    (stop - start) * dim1 elements on axis 0, whereas this returns
+    dim0 * (stop - start) -- confirm which convention the fp/pi slice
+    kernels expect.
+    '''
+    if not isinstance(shape, TensorShapeStorage):
+        raise RuntimeError(f"Illegal shape type : {type(shape)}, params need type: {TensorShapeStorage}")
+    if not isinstance(start, int):
+        raise RuntimeError(f"Illegal start type : {type(start)}, params need type : {int}")
+    if not isinstance(stop, int):
+        raise RuntimeError(f"Illegal stop type : {type(stop)}, params need type : {int}")
+    dims = shape.to_tuple()
+    if len(dims) == 1:
+        dim0, dim1 = 1, dims[0]
+    elif len(dims) == 2:
+        dim0, dim1 = dims[0], dims[1]
+    else:
+        dim0, dim1 = 0, 0
+    span = stop - start
+    if axis == 0:
+        # axis == 0 means that we need to cut the matrix horizontally
+        return dim0 * span
+    if axis == 1:
+        # axis == 1 means that we need to cut the matrix vertically
+        return dim1 * span
+    return None
+
+
+def get_cat_size(shapes: list):
+    '''
+    Get the result size of fp_cat, pi_cat
+    -------------------
+    Para:
+        shapes: List[TensorShapeStorage], the to-be-concated stores' shapes
+    Return:
+        int, the sum of all shapes' sizes, 0 for an empty list
+    '''
+    if not isinstance(shapes, list):
+        raise RuntimeError(f"Illegal shapes type : {type(shapes)}, params need type : {list}")
+    # builtin sum keeps the result a plain int (np.sum([]) would be 0.0)
+    return sum(v.size() for v in shapes)
+
+
+def pi_encrypt(pub_key, fps, res, stream):
+    '''
+    Paillier-encrypt a FixedPointStorage.
+    This is the raw encryption, no obfuscation is applied here
+    ----------------
+    Para:
+        pub_key: Dev_PubKeyPtr, the PaillierPublicKey class stored in GPU memory
+        fps:     FixedPointStorage, fpn value waiting to be encrypted
+        res:     None or PaillierEncryptedStorage, the store to write into;
+                 new buffers are malloced when it is None
+        stream:  None, currently not used
+    Return:
+        PaillierEncryptedStorage, the encrypted value
+    '''
+    in_fpn, in_base, in_exp = fps.bigint_storage, fps.base_storage, fps.exp_storage
+    count = fps.vec_size
+
+    if res is not None:
+        # write into the buffers of the given result store
+        out_pen, out_base, out_exp = res.pen_storage, res.base_storage, res.exp_storage
+    else:
+        # no result store given, malloc fresh buffers
+        out_pen = GPU_LIB.c_malloc(c_size_t(count * CIPHER_BYTE))
+        out_base = GPU_LIB.c_malloc(c_size_t(count * U_INT32_BYTE))
+        out_exp = GPU_LIB.c_malloc(c_size_t(count * U_INT32_BYTE))
+    # source pointers first, destination pointers second
+    GPU_LIB.encrypt_paillier(
+        c_char_p(in_fpn), c_void_p(in_base), c_void_p(in_exp),
+        c_char_p(out_pen), c_void_p(out_base), c_void_p(out_exp),
+        c_void_p(pub_key.pub_key_ptr),
+        c_size_t(PLAIN_BITS), c_size_t(CIPHER_BITS),
+        c_size_t(count), c_uint32(device_type))
+    # wrap the buffers, the meta data is taken over from fps
+    return _pi_init_store(
+        res, out_pen, out_base, out_exp, count,
+        fps.mem_type, fps.data_type,
+        fps.encode_n, fps.max_int)
+
+
+def pi_decrypt(pub_key, priv_key, pes, res, stream, fps=None):
+    '''
+    Decrypt a PaillierEncryptedStorage and decode it in one go
+    ---------------------
+    Para:
+        pub_key:   Dev_PubKeyStorage, PaillierPublicKey stored in GPU mem
+        priv_key:  Dev_PrivKeyStorage, PaillierPrivateKey stored in GPU mem
+        pes:       PaillierEncryptedStorage, pens waiting to be decrypted
+        res:       TensorStorage, the return value, handed to fp_decode
+        stream:    None, only forwarded to fp_decode
+        fps:       None or FixedPointStorage, the intermediate space holding
+                   the decrypted fpn before decode; malloced if None
+    Return:
+        TensorStorage, the decrypted then decoded value
+    '''
+    enc_pen, enc_base, enc_exp = pes.pen_storage, pes.base_storage, pes.exp_storage
+    count = pes.vec_size
+    # pick the intermediate buffers for the decrypted fixed point numbers
+    if fps is not None:
+        # the caller supplied the intermediate space
+        mid_fpn, mid_base, mid_exp = fps.bigint_storage, fps.base_storage, fps.exp_storage
+    else:
+        # malloc the intermediate space
+        mid_fpn = GPU_LIB.c_malloc(c_size_t(count * PLAIN_BYTE))
+        mid_base = GPU_LIB.c_malloc(c_size_t(count * U_INT32_BYTE))
+        mid_exp = GPU_LIB.c_malloc(c_size_t(count * U_INT32_BYTE))
+    # decrypt: ciphertext pointers in, plaintext pointers out
+    GPU_LIB.decrypt_paillier(
+        c_char_p(enc_pen), c_void_p(enc_base), c_void_p(enc_exp),
+        c_char_p(mid_fpn), c_void_p(mid_base), c_void_p(mid_exp),
+        c_void_p(pub_key.pub_key_ptr), c_void_p(priv_key.priv_key_ptr),
+        c_size_t(PLAIN_BITS), c_size_t(CIPHER_BITS),
+        c_size_t(count), c_uint32(device_type))
+
+    # wrap the decrypted fpn, the meta data comes from pes
+    plain_store = FixedPointStorage(
+        mid_fpn, mid_base, mid_exp, count,
+        pes.encode_n, pes.encode_max_int,
+        pes.mem_type, pes.data_type)
+    # decode the fixed point numbers into the result
+    return fp_decode(plain_store, res, stream)
+
+
+def pi_obfuscate(pub_key, pes, obf_seeds, res, stream):
+    '''
+    Obfuscate a PaillierEncryptedStorage with the given obfuscation
+    seeds, which is actually a modular multiplication
+    ----------------------
+    Para:
+        pub_key:   Dev_PubKeyStorage, PaillierPublicKey stored in GPU mem
+        pes:       PaillierEncryptedStorage, raw pen not obfuscated yet
+        obf_seeds: BigIntStorage, random bigint generated by pi_gen_obf_seed
+        res:       None or PaillierEncryptedStorage, the store to write
+                   into; new buffers are malloced when it is None
+        stream:    currently not used
+    Return:
+        PaillierEncryptedStorage, the obfuscated value
+    '''
+    raw_pen, raw_base, raw_exp = pes.pen_storage, pes.base_storage, pes.exp_storage
+    count = pes.vec_size
+    # the random bigints used as obfuscation seeds
+    seed_ptr = obf_seeds.bigint_storage
+
+    if res is not None:
+        # write into the buffers of the given result store
+        out_pen, out_base, out_exp = res.pen_storage, res.base_storage, res.exp_storage
+    else:
+        # no result store given, malloc fresh buffers
+        out_pen = GPU_LIB.c_malloc(c_size_t(count * CIPHER_BYTE))
+        out_base = GPU_LIB.c_malloc(c_size_t(count * U_INT32_BYTE))
+        out_exp = GPU_LIB.c_malloc(c_size_t(count * U_INT32_BYTE))
+    # run the modular mul function
+    GPU_LIB.obf_modular_multiplication(
+        c_char_p(raw_pen), c_void_p(raw_base), c_void_p(raw_exp),
+        c_char_p(seed_ptr),
+        c_char_p(out_pen), c_void_p(out_base), c_void_p(out_exp),
+        c_void_p(pub_key.pub_key_ptr),
+        c_size_t(CIPHER_BITS), c_size_t(CIPHER_BITS),
+        c_size_t(count), c_uint32(device_type))
+    # wrap the buffers, the meta data is taken over from pes
+    return _pi_init_store(
+        res, out_pen, out_base, out_exp, count,
+        pes.mem_type, pes.data_type,
+        pes.encode_n, pes.encode_max_int)
+
+
+def pi_gen_obf_seed(res_store, pub_key, count, elem_size, rand_seed, stream):
+    '''
+    Generate random bigints and run expmod on them with the given public key.
+    The result is then used as obfuscation seed for further encrypt.
+    --------------
+    Para:
+        res_store:   None or BigIntStorage, the return value
+        pub_key:     Dev_PubKeyStorage, PaillierPublicKey stored in GPU mem
+        count:       int, the number of random numbers need to be generated
+        elem_size:   int, the length of the random bigint
+        rand_seed, stream: both are passed on to bi_gen_rand
+    Return:
+        BigIntStorage, same as res_store
+    '''
+    seeds = bi_gen_rand(elem_size, count, res_store, rand_seed, stream)
+    seed_ptr = seeds.bigint_storage
+    if res_store is not None:
+        out_ptr = res_store.bigint_storage
+    else:
+        out_ptr = GPU_LIB.c_malloc(c_size_t(count * CIPHER_BYTE))
+    GPU_LIB.obf_modular_exponentiation(
+        c_char_p(seed_ptr), c_size_t(CIPHER_BITS),
+        c_void_p(pub_key.pub_key_ptr),
+        c_char_p(out_ptr), c_size_t(CIPHER_BITS),
+        c_size_t(count), c_uint32(device_type))
+    return _bi_init_store(res_store, out_ptr, count, elem_size, MEM_DEVICE)
+
+
+def __shape_decompose(shape):
+    '''
+    Turn a TensorShapeStorage into a 2-D pair for the cuda computation:
+    () -> (1, 1), (n,) -> (1, n), (m, n) -> (m, n), else PermissionError
+    '''
+    dims = shape.to_tuple()
+    rank = len(dims)
+    if rank > 2:
+        raise PermissionError("Invalid Shape")
+    if rank == 2:
+        return dims[0], dims[1]
+    if rank == 1:
+        return 1, dims[0]
+    return 1, 1
+
+
+def __shape_resolve(shape_1, shape_2):
+    '''
+    Check whether shape_1 and shape_2 can be aligned, broadcast included.
+    Return (P, Q, R, S, res_tuple): both shapes decomposed to 2-D plus the
+    result shape as a tuple.
+    Raise PermissionError if the shapes cannot be aligned
+    '''
+    P, Q = __shape_decompose(shape_1)
+    R, S = __shape_decompose(shape_2)
+    rank = max(len(shape_1.to_tuple()), len(shape_2.to_tuple()))
+    # a dim pair aligns when both are equal or one of them is 1
+    if not all(a == b or a == 1 or b == 1 for a, b in ((P, R), (Q, S))):
+        raise PermissionError("shape cannot align", shape_1, shape_2)
+    if rank > 2:
+        raise PermissionError("Invalid shape", shape_1, shape_2)
+    # to suit numpy's shape output, config output shape here
+    if rank == 0:
+        return P, Q, R, S, ()
+    if rank == 1:
+        return P, Q, R, S, (max(Q, S),)
+    return P, Q, R, S, (max(P, R), max(Q, S))
+
+
+def pi_add(pub_key, left_store, right_store, left_shape, right_shape,
+           res_store, res_shape, stream):
+    '''
+    Element-wise add of two encrypted stores, rows or cols can be broadcast
+    ---------------
+    Paras:
+        pub_key:     Dev_PubKeyStorage, PaillierPublicKey stored in GPU mem
+        left_store:  PaillierEncryptedStorage, left_operator
+        right_store: PaillierEncryptedStorage, right_operator
+        left_shape:  TensorShapeStorage, left_operator's  shape
+        right_shape: TensorShapeStorage, right_operator's shape
+        res_store:   None or PaillierEncryptedStorage, return value's data
+        res_shape:   TensorShapeStorage, return value's shape
+        stream:      currently not used
+    Return:
+        tuple: (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        PermissionError, if left/right operators cannot be aligned for
+                         compute, even if broadcast is supported
+    '''
+    # __shape_resolve raises when the two shapes cannot be aligned
+    P, Q, R, S, res_shape_tuple = __shape_resolve(left_shape, right_shape)
+    res_size = max(P, R) * max(Q, S)
+
+    # (pen, base, exp) pointers of the left operand
+    lhs = (left_store.pen_storage,
+           left_store.base_storage,
+           left_store.exp_storage)
+    # (pen, base, exp) pointers of the right operand
+    rhs = (right_store.pen_storage,
+           right_store.base_storage,
+           right_store.exp_storage)
+    # result buffers: reuse res_store when given, malloc otherwise
+    if res_store is not None:
+        out_pen = res_store.pen_storage
+        out_base = res_store.base_storage
+        out_exp = res_store.exp_storage
+    else:
+        out_pen = GPU_LIB.c_malloc(c_size_t(res_size * CIPHER_BYTE))
+        out_base = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+        out_exp = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    # left operand, right operand, result, then the two 2-D shapes
+    GPU_LIB.pen_matrix_add_pen_matrix(
+        c_char_p(lhs[0]), c_void_p(lhs[1]), c_void_p(lhs[2]),
+        c_char_p(rhs[0]), c_void_p(rhs[1]), c_void_p(rhs[2]),
+        c_char_p(out_pen), c_void_p(out_base), c_void_p(out_exp),
+        c_size_t(P), c_size_t(Q), c_size_t(R), c_size_t(S),
+        c_void_p(pub_key.pub_key_ptr),
+        c_size_t(CIPHER_BITS), c_uint32(device_type))
+    # the result is INT64_TYPE only if both operands are, else FLOAT_TYPE
+    both_int64 = (left_store.data_type == INT64_TYPE
+                  and right_store.data_type == INT64_TYPE)
+    data_type = INT64_TYPE if both_int64 else FLOAT_TYPE
+
+    return _pi_init_ss(
+        res_store, out_pen, out_base, out_exp, res_size,
+        res_shape, res_shape_tuple,
+        left_store.mem_type, data_type,
+        left_store.encode_n, left_store.encode_max_int)
+
+
+def pi_mul(pub_key, left_store, right_store, left_shape, right_shape,
+           res_store, res_shape, stream):
+    '''
+    Element-wise multiply of an encrypted store with a fixed point store,
+    rows or cols can be broadcast
+    --------------------
+    Paras:
+        pub_key:     Dev_PubKeyStorage, PaillierPublicKey stored in GPU mem
+        left_store:  PaillierEncryptedStorage, left_operator
+        right_store: FixedPointStorage, right_operator
+        left_shape:  TensorShapeStorage, left_operator's  shape
+        right_shape: TensorShapeStorage, right_operator's shape
+        res_store:   None or PaillierEncryptedStorage, return value's data
+        res_shape:   TensorShapeStorage, return value's shape
+    Return:
+        tuple: (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        PermissionError, if left/right operators cannot be aligned for
+                         compute, even if broadcast is supported
+    '''
+    # __shape_resolve raises when the two shapes cannot be aligned
+    P, Q, R, S, res_shape_tuple = __shape_resolve(left_shape, right_shape)
+    res_size = max(P, R) * max(Q, S)
+
+    # (pen, base, exp) pointers of the encrypted left operand
+    pen = (left_store.pen_storage,
+           left_store.base_storage,
+           left_store.exp_storage)
+    # (fpn, base, exp) pointers of the fixed point right operand
+    fpn = (right_store.bigint_storage,
+           right_store.base_storage,
+           right_store.exp_storage)
+    # result buffers: reuse res_store when given, malloc otherwise
+    if res_store is not None:
+        out_pen = res_store.pen_storage
+        out_base = res_store.base_storage
+        out_exp = res_store.exp_storage
+    else:
+        out_pen = GPU_LIB.c_malloc(c_size_t(res_size * CIPHER_BYTE))
+        out_base = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+        out_exp = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    # the fpn operand and its shape (R, S) are passed before the pen one
+    GPU_LIB.fpn_matrix_elementwise_multiply_pen_matrix(
+        c_char_p(fpn[0]), c_void_p(fpn[1]), c_void_p(fpn[2]),
+        c_char_p(pen[0]), c_void_p(pen[1]), c_void_p(pen[2]),
+        c_char_p(out_pen), c_void_p(out_base), c_void_p(out_exp),
+        c_size_t(R), c_size_t(S), c_size_t(P), c_size_t(Q),
+        c_void_p(pub_key.pub_key_ptr),
+        c_size_t(PLAIN_BITS), c_size_t(CIPHER_BITS), c_uint32(device_type))
+    # the result is INT64_TYPE only if both operands are, else FLOAT_TYPE
+    both_int64 = (left_store.data_type == INT64_TYPE
+                  and right_store.data_type == INT64_TYPE)
+    data_type = INT64_TYPE if both_int64 else FLOAT_TYPE
+
+    return _pi_init_ss(
+        res_store, out_pen, out_base, out_exp, res_size,
+        res_shape, res_shape_tuple,
+        left_store.mem_type, data_type,
+        left_store.encode_n, left_store.encode_max_int)
+
+
+def fp_transpose(left_store, left_shape, res_store, res_shape, stream):
+    '''
+    Transpose the C-memory stored matrix of a FixedPointStorage,
+    support at most 2-D matrix
+    -----------------
+    Para:
+        left_store:  FixedPointStorage, left_operator
+        left_shape:  TensorShapeStorage, left_operator's  shape
+        res_store:   None or FixedPointStorage, return value's data
+        res_shape:   None or TensorShapeStorage, return value's shape
+                     (also honoured for 0-D / 1-D input)
+        stream:      currently not used
+    Return:
+        tuple: (FixedPointStorage, TensorShapeStorage)
+    Raise:
+        PermissionError, if dimension is higher than 2-D, not supported
+    '''
+    left_shape_tuple = left_shape.to_tuple()
+    # reject unsupported shapes before any buffer is malloced,
+    # otherwise the freshly malloced result buffers would be leaked
+    if len(left_shape_tuple) > 2:
+        raise PermissionError("Unsupported shape")
+    # get the left_store parameters
+    src_fpn = left_store.bigint_storage
+    src_base = left_store.base_storage
+    src_exp = left_store.exp_storage
+    vec_size = left_store.vec_size
+    # malloc space for the res value
+    if res_store is None:
+        res_fpn = GPU_LIB.c_malloc(c_size_t(vec_size * PLAIN_BYTE))
+        res_base = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        res_exp = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+    else:
+        res_fpn = res_store.bigint_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+
+    if len(left_shape_tuple) < 2:
+        # 0-D or 1-D: numpy's transpose returns the input unchanged,
+        # so a plain copy of the three buffers is enough
+        GPU_LIB.c_memcpy(c_void_p(res_fpn), c_void_p(src_fpn), c_size_t(vec_size * PLAIN_BYTE))
+        GPU_LIB.c_memcpy(c_void_p(res_base), c_void_p(src_base), c_size_t(vec_size * U_INT32_BYTE))
+        GPU_LIB.c_memcpy(c_void_p(res_exp), c_void_p(src_exp), c_size_t(vec_size * U_INT32_BYTE))
+        # hand the caller's res_shape on, exactly like the 2-D branch does
+        res_shape_tuple = left_shape_tuple
+    else:
+        # 2-D: do a normal transpose
+        res_shape_tuple = (left_shape_tuple[1], left_shape_tuple[0])
+        GPU_LIB.transpose(
+            c_char_p(src_fpn), c_void_p(src_base), c_void_p(src_exp),
+            c_char_p(res_fpn), c_void_p(res_base), c_void_p(res_exp),
+            # first and second dim of the source matrix
+            c_size_t(res_shape_tuple[1]), c_size_t(res_shape_tuple[0]))
+
+    return _fp_init_ss(
+        res_store, res_fpn, res_base, res_exp,
+        vec_size, left_store.encode_n, left_store.max_int,
+        res_shape, res_shape_tuple,
+        left_store.mem_type, left_store.data_type)
+
+
+'''
+    In the cuda code the right matrix is expected to be vertically flattened
+    (column by column). For instance:
+    [[1,2,3],[4,5,6]]
+    should be flattened to [1,4,2,5,3,6] rather than [1,2,3,4,5,6].
+    This aims at a better use of spatial locality.
+
+    So we need a transpose to make the memory look like [1,4,2,5,3,6]:
+    the horizontal flattening of [[1,4],[2,5],[3,6]] is exactly [1,4,2,5,3,6],
+    and we know that [[1,2,3],[4,5,6]]^T = [[1,4],[2,5],[3,6]].
+    As a result, we do a transpose while keeping the shape unchanged,
+    and get the vertically flattened matrix.
+'''
+
+
+def pi_matmul(pub_key, left_store, right_store, left_shape, right_shape,
+              res_store, res_shape, stream):
+    '''
+    Matrix multiply of an encrypted left operand with a plain right one.
+    Due to implementation of cuda code, right_store needs to be transposed
+    -------------------------
+    Paras:
+        pub_key:     Dev_PubKeyStorage, PaillierPublicKey stored in GPU mem
+        left_store:  PaillierEncryptedStorage, left_operator
+        right_store: FixedPointStorage, right_operator
+        left_shape:  TensorShapeStorage, left_operator's  shape
+        right_shape: TensorShapeStorage, right_operator's shape
+        res_store:   None or PaillierEncryptedStorage, return value's data
+        res_shape:   TensorShapeStorage, return value's shape
+        stream:      only forwarded to fp_transpose
+    Return:
+        tuple: (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        PermissionError, if shape is invalid for 1-D or 2-D matrix mul
+        ValueError, if left/right operators' shape can't align for matmul
+    '''
+
+    left_rank = len(left_shape.to_tuple())
+    right_rank = len(right_shape.to_tuple())
+    # only 1-D and 2-D operands are supported
+    if left_rank not in (1, 2) or right_rank not in (1, 2):
+        raise PermissionError("Invalid shape")
+    P, Q = __shape_decompose(left_shape)
+    R, S = __shape_decompose(right_shape)
+    if right_rank == 1:
+        # a 1-D right operand (n,) is used as an (n, 1) matrix
+        R, S = S, R
+    if Q != R:
+        raise ValueError("shape not aligned")
+    # a 1-D operand drops its own dimension from the result shape
+    shape_by_rank = {
+        (1, 1): (),
+        (1, 2): (S,),
+        (2, 1): (P,),
+        (2, 2): (P, S),
+    }
+    if (left_rank, right_rank) not in shape_by_rank:
+        raise RuntimeError("You should never ever see this error unless something VERY STRANGE occurs")
+    res_shape_tuple = shape_by_rank[(left_rank, right_rank)]
+    res_size = P * S
+
+    # the right matrix must be vertically flattened, so transpose it first
+    right_t, _ = fp_transpose(right_store, right_shape, None, None, stream)
+    # (pen, base, exp) pointers of the encrypted left operand
+    pen = (left_store.pen_storage,
+           left_store.base_storage,
+           left_store.exp_storage)
+    # (fpn, base, exp) pointers of the transposed right operand
+    fpn = (right_t.bigint_storage,
+           right_t.base_storage,
+           right_t.exp_storage)
+    # result buffers: reuse res_store when given, malloc otherwise
+    if res_store is not None:
+        out_pen = res_store.pen_storage
+        out_base = res_store.base_storage
+        out_exp = res_store.exp_storage
+    else:
+        out_pen = GPU_LIB.c_malloc(c_size_t(res_size * CIPHER_BYTE))
+        out_base = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+        out_exp = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    # left operand, right operand, result, then the three matrix dims
+    GPU_LIB.pen_matrix_multiply_fpn_matrix(
+        c_char_p(pen[0]), c_void_p(pen[1]), c_void_p(pen[2]),
+        c_char_p(fpn[0]), c_void_p(fpn[1]), c_void_p(fpn[2]),
+        c_char_p(out_pen), c_void_p(out_base), c_void_p(out_exp),
+        c_size_t(P), c_size_t(Q), c_size_t(S),
+        c_void_p(pub_key.pub_key_ptr),
+        c_size_t(PLAIN_BITS), c_size_t(CIPHER_BITS), c_uint32(device_type))
+
+    # the result is INT64_TYPE only if both operands are, else FLOAT_TYPE
+    both_int64 = (left_store.data_type == INT64_TYPE
+                  and right_store.data_type == INT64_TYPE)
+    data_type = INT64_TYPE if both_int64 else FLOAT_TYPE
+
+    # the transposed copy is no longer needed
+    del right_t
+
+    return _pi_init_ss(
+        res_store, out_pen, out_base, out_exp, res_size,
+        res_shape, res_shape_tuple,
+        left_store.mem_type, data_type,
+        left_store.encode_n, left_store.encode_max_int)
+
+
+def pi_rmatmul(pub_key, left_store, right_store, left_shape, right_shape,
+               res_store, res_shape, stream):
+    '''
+    Matrix multiply of a plain left operand with an encrypted right one.
+    rmatmul means right_op is PaillierEncryptedStorage, differ from pi_matmul
+    Due to implementation of cuda code, right_store needs to be transposed
+    -------------------------
+    Paras:
+        pub_key:     Dev_PubKeyStorage, PaillierPublicKey stored in GPU mem
+        left_store:  FixedPointStorage, left_operator
+        right_store: PaillierEncryptedStorage, right_operator
+        left_shape:  TensorShapeStorage, left_operator's  shape
+        right_shape: TensorShapeStorage, right_operator's shape
+        res_store:   None or PaillierEncryptedStorage, return value's data
+        res_shape:   TensorShapeStorage, return value's shape
+        stream:      only forwarded to pi_transpose
+    Return:
+        tuple: (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        PermissionError, if shape is invalid for 1-D or 2-D matrix mul
+        ValueError, if left/right operators' shape can't align for matmul
+        RuntimeError,  default error for shape evaluation
+    '''
+    left_rank = len(left_shape.to_tuple())
+    right_rank = len(right_shape.to_tuple())
+    # only 1-D and 2-D operands are supported
+    if left_rank not in (1, 2) or right_rank not in (1, 2):
+        raise PermissionError("Invalid shape")
+    P, Q = __shape_decompose(left_shape)
+    R, S = __shape_decompose(right_shape)
+    if right_rank == 1:
+        # a 1-D right operand (n,) is used as an (n, 1) matrix
+        R, S = S, R
+    if Q != R:
+        raise ValueError("shape not aligned")
+    # a 1-D operand drops its own dimension from the result shape
+    shape_by_rank = {
+        (1, 1): (),
+        (1, 2): (S,),
+        (2, 1): (P,),
+        (2, 2): (P, S),
+    }
+    if (left_rank, right_rank) not in shape_by_rank:
+        raise RuntimeError("You should never ever see this error unless something VERY STRANGE occurs")
+    res_shape_tuple = shape_by_rank[(left_rank, right_rank)]
+    res_size = P * S
+
+    # the right matrix must be vertically flattened, so transpose it first
+    right_t, _ = pi_transpose(right_store, right_shape, None, None, stream)
+    # (fpn, base, exp) pointers of the fixed point left operand
+    fpn = (left_store.bigint_storage,
+           left_store.base_storage,
+           left_store.exp_storage)
+    # (pen, base, exp) pointers of the transposed right operand
+    pen = (right_t.pen_storage,
+           right_t.base_storage,
+           right_t.exp_storage)
+    # result buffers: reuse res_store when given, malloc otherwise
+    if res_store is not None:
+        out_pen = res_store.pen_storage
+        out_base = res_store.base_storage
+        out_exp = res_store.exp_storage
+    else:
+        out_pen = GPU_LIB.c_malloc(c_size_t(res_size * CIPHER_BYTE))
+        out_base = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+        out_exp = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    # left operand, right operand, result, then the three matrix dims
+    GPU_LIB.fpn_matrix_multiply_pen_matrix(
+        c_char_p(fpn[0]), c_void_p(fpn[1]), c_void_p(fpn[2]),
+        c_char_p(pen[0]), c_void_p(pen[1]), c_void_p(pen[2]),
+        c_char_p(out_pen), c_void_p(out_base), c_void_p(out_exp),
+        c_size_t(P), c_size_t(Q), c_size_t(S),
+        c_void_p(pub_key.pub_key_ptr),
+        c_size_t(PLAIN_BITS), c_size_t(CIPHER_BITS), c_uint32(device_type))
+
+    # the result is INT64_TYPE only if both operands are, else FLOAT_TYPE
+    both_int64 = (left_store.data_type == INT64_TYPE
+                  and right_store.data_type == INT64_TYPE)
+    data_type = INT64_TYPE if both_int64 else FLOAT_TYPE
+    # the transposed copy is no longer needed
+    del right_t
+
+    return _pi_init_ss(
+        res_store, out_pen, out_base, out_exp, res_size,
+        res_shape, res_shape_tuple,
+        right_store.mem_type, data_type,
+        right_store.encode_n, right_store.encode_max_int)
+
+
+def pi_transpose(left_store, left_shape, res_store, res_shape, stream):
+    '''
+    Transpose the C-memory stored matrix of a PaillierEncryptedStorage,
+    support at most 2-D matrix
+    -----------------
+    Para:
+        left_store:  PaillierEncryptedStorage, left_operator
+        left_shape:  TensorShapeStorage, left_operator's  shape
+        res_store:   None or PaillierEncryptedStorage, return value's data
+        res_shape:   None or TensorShapeStorage, return value's shape
+                     (also honoured for 0-D / 1-D input)
+        stream:      currently not used
+    Return:
+        tuple: (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        PermissionError, if dimension is higher than 2-D, not supported
+    '''
+    left_shape_tuple = left_shape.to_tuple()
+    # reject unsupported shapes before any buffer is malloced,
+    # otherwise the freshly malloced result buffers would be leaked
+    if len(left_shape_tuple) > 2:
+        raise PermissionError("Invalid Shape")
+    # get the left_store parameters
+    src_pen = left_store.pen_storage
+    src_base = left_store.base_storage
+    src_exp = left_store.exp_storage
+    vec_size = left_store.vec_size
+    # malloc space for the res value
+    if res_store is None:
+        res_pen = GPU_LIB.c_malloc(c_size_t(vec_size * CIPHER_BYTE))
+        res_base = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        res_exp = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+    else:
+        res_pen = res_store.pen_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+    if len(left_shape_tuple) < 2:
+        # 0-D or 1-D: just a raw memcpy, no transpose needed for this scene
+        GPU_LIB.c_memcpy(c_void_p(res_pen), c_void_p(src_pen), c_size_t(vec_size * CIPHER_BYTE))
+        GPU_LIB.c_memcpy(c_void_p(res_base), c_void_p(src_base), c_size_t(vec_size * U_INT32_BYTE))
+        GPU_LIB.c_memcpy(c_void_p(res_exp), c_void_p(src_exp), c_size_t(vec_size * U_INT32_BYTE))
+        # hand the caller's res_shape on, exactly like the 2-D branch does
+        res_shape_tuple = left_shape_tuple
+    else:
+        res_shape_tuple = (left_shape_tuple[1], left_shape_tuple[0])
+        # call the C transpose function with both dims of the source
+        GPU_LIB.transpose(
+            c_char_p(src_pen), c_void_p(src_base), c_void_p(src_exp),
+            c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
+            c_size_t(res_shape_tuple[1]), c_size_t(res_shape_tuple[0]))
+    return _pi_init_ss(
+        res_store, res_pen, res_base, res_exp, vec_size,
+        res_shape, res_shape_tuple,
+        left_store.mem_type, left_store.data_type,
+        left_store.encode_n, left_store.encode_max_int)
+
+
+# WARNING:  NOW ALMOST ABANDONED DUE TO NOT IDEAL PERFORMANCE!
+def pi_sum_multi_stream(pub_key, left_store, left_shape, axis=None, res_store=None, res_shape=None, stream=None):
+    '''
+    Do pi_sum with multiple cuda streams on a 2-D left_store; the result
+    is 1-D and sized by the first dim. axis and stream are not used here
+    '''
+    in_pen, in_base, in_exp = (
+        left_store.pen_storage, left_store.base_storage, left_store.exp_storage)
+    count = left_store.vec_size
+    # the buffers are sized by the input's vec_size when malloced here
+    if res_store is not None:
+        out_pen = res_store.pen_storage
+        out_base = res_store.base_storage
+        out_exp = res_store.exp_storage
+    else:
+        out_pen = GPU_LIB.c_malloc(c_size_t(count * CIPHER_BYTE))
+        out_base = GPU_LIB.c_malloc(c_size_t(count * U_INT32_BYTE))
+        out_exp = GPU_LIB.c_malloc(c_size_t(count * U_INT32_BYTE))
+    dims = left_shape.to_tuple()
+    GPU_LIB.pen_sum_multi_stream(
+        c_char_p(in_pen), c_void_p(in_base), c_void_p(in_exp),
+        c_char_p(out_pen), c_void_p(out_base), c_void_p(out_exp),
+        c_size_t(dims[0]), c_size_t(dims[1]),
+        c_void_p(pub_key.pub_key_ptr),
+        c_size_t(CIPHER_BITS), c_uint32(device_type))
+
+    # the result has as many elements as the first dim of left_shape
+    res_size = dims[0]
+    return _pi_init_ss(
+        res_store, out_pen, out_base, out_exp, res_size,
+        res_shape, (res_size,),
+        left_store.mem_type, left_store.data_type,
+        left_store.encode_n, left_store.encode_max_int)
+
+
+def pi_sum(pub_key, left_store, left_shape, axis,
+           res_store, res_shape, stream):
+    '''
+    Perform sum according to the axis
+    ----------------------
+    Para:
+        pub_key:     Dev_PubKeyStorage, PaillierPublicKey stored in GPU mem
+        left_store:  PaillierEncryptedStorage, left_operator
+        left_shape:  TensorShapeStorage, left_operator's  shape
+        axis:        int or None, the dimension which sum is performed
+                        None: sum over all elements
+                        0:    sum vertically, over the 1st demension
+                        1:    sum horizontally, over the 2nd demension
+        res_store:   PaillierEncrpytedStorage, return value's data
+        res_shape:   TensorShapeStorage, return value's shape
+    Return:
+        tuple, (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        Permission error: when the input axis is not aligned to input shape
+    '''
+    # return shape are tuned to be the same as numpy's output
+    src_pen = left_store.pen_storage
+    src_base = left_store.base_storage
+    src_exp = left_store.exp_storage
+    vec_size = left_store.vec_size
+    res_pen, res_base, res_exp, res_size = 0, 0, 0, 0
+    res_shape_tuple = ()
+    left_shape_tuple = left_shape.to_tuple()
+
+    if len(left_shape_tuple) == 0:
+        # handling shape (), meaning only one element in left_store
+        if axis is not None and axis != 0:
+            raise PermissionError("Cannot set axis other than 0 or None for dimension 0")
+        if res_store is None:
+            res_pen = GPU_LIB.c_malloc(c_size_t(vec_size * CIPHER_BYTE))
+            res_base = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+            res_exp = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        else:
+            res_pen = res_store.pen_storage
+            res_base = res_store.base_storage
+            res_exp = res_store.exp_storage
+        GPU_LIB.c_memcpy(c_void_p(res_pen), c_void_p(src_pen), c_size_t(vec_size * CIPHER_BYTE))
+        GPU_LIB.c_memcpy(c_void_p(res_base), c_void_p(src_base), c_size_t(vec_size * U_INT32_BYTE))
+        GPU_LIB.c_memcpy(c_void_p(res_exp), c_void_p(src_exp), c_size_t(vec_size * U_INT32_BYTE))
+        return _pi_init_ss(
+            left_store, res_pen, res_base, res_exp, vec_size,
+            left_shape, left_shape_tuple,
+            left_store.mem_type, left_store.data_type,
+            left_store.encode_n, left_store.encode_max_int)
+    elif axis is None or len(left_shape_tuple) == 1:
+        # handling shape (n,) or axis == None
+        # both mean sum for all elements
+        if len(left_shape_tuple) == 1 and axis is not None and axis >= 1:
+            raise PermissionError("axis is out of bounds for array of dimension 1")
+        if res_store is None:
+            res_pen = GPU_LIB.c_malloc(c_size_t(1 * CIPHER_BYTE))
+            res_base = GPU_LIB.c_malloc(c_size_t(1 * U_INT32_BYTE))
+            res_exp = GPU_LIB.c_malloc(c_size_t(1 * U_INT32_BYTE))
+        else:
+            res_pen = res_store.pen_storage
+            res_base = res_store.base_storage
+            res_exp = res_store.exp_storage
+        res_size = 1
+        res_shape_tuple = ()
+        GPU_LIB.pen_sum(
+            c_char_p(src_pen), c_void_p(src_base), c_void_p(src_exp),
+            c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
+            c_size_t(1), c_size_t(vec_size),
+            c_void_p(pub_key.pub_key_ptr),
+            c_size_t(CIPHER_BITS), c_uint32(device_type))
+    elif axis == 0:
+        # handling 2-D matrix, axis == 0 means sum vertically
+        # since our gpu sum support only horizontal sum
+        # aka batch sum over continuous memory space
+        transpose_store, transpose_shape = pi_transpose(
+            left_store, left_shape, None, None, stream)
+        src_pen = transpose_store.pen_storage
+        src_base = transpose_store.base_storage
+        src_exp = transpose_store.exp_storage
+        transpose_tuple = transpose_shape.to_tuple()
+        '''perform sum on the transposed matrix'''
+        if res_store is None:
+            res_pen = GPU_LIB.c_malloc(c_size_t(transpose_tuple[0] * CIPHER_BYTE))
+            res_base = GPU_LIB.c_malloc(c_size_t(transpose_tuple[0] * U_INT32_BYTE))
+            res_exp = GPU_LIB.c_malloc(c_size_t(
+                transpose_tuple[0] * U_INT32_BYTE))
+        else:
+            res_pen = res_store.pen_storage
+            res_base = res_store.base_storage
+            res_exp = res_store.exp_storage
+        res_size = transpose_tuple[0]
+        res_shape_tuple = (transpose_tuple[0],)
+        GPU_LIB.pen_sum(
+            c_char_p(src_pen), c_void_p(src_base), c_void_p(src_exp),
+            c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
+            c_size_t(transpose_tuple[0]), c_size_t(transpose_tuple[1]),
+            c_void_p(pub_key.pub_key_ptr),
+            c_size_t(CIPHER_BITS), c_uint32(device_type))
+    elif axis == 1:
+        # handling 2-D matrix, axis == 1 means sum horizontally
+        if res_store is None:
+            res_pen = GPU_LIB.c_malloc(c_size_t(left_shape_tuple[0] * CIPHER_BYTE))
+            res_base = GPU_LIB.c_malloc(c_size_t(left_shape_tuple[0] * U_INT32_BYTE))
+            res_exp = GPU_LIB.c_malloc(c_size_t(
+                left_shape_tuple[0] * U_INT32_BYTE))
+        else:
+            res_pen = res_store.pen_storage
+            res_base = res_store.base_storage
+            res_exp = res_store.exp_storage
+        res_size = left_shape_tuple[0]
+        res_size = left_shape_tuple[0]
+        res_shape_tuple = (left_shape_tuple[0],)
+        GPU_LIB.pen_sum(
+            c_char_p(src_pen), c_void_p(src_base), c_void_p(src_exp),
+            c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
+            c_size_t(left_shape_tuple[0]), c_size_t(left_shape_tuple[1]),
+            c_void_p(pub_key.pub_key_ptr),
+            c_size_t(CIPHER_BITS), c_uint32(device_type))
+    else:
+        raise PermissionError("Invalid Axis or Shape")
+
+    return _pi_init_ss(
+        res_store, res_pen, res_base, res_exp, res_size,
+        res_shape, res_shape_tuple,
+        left_store.mem_type, left_store.data_type,
+        left_store.encode_n, left_store.encode_max_int)
+
+
+# WARNING: ABANDONED BECAUSE OF NOT IDEAL PERFORMANCE
+def pi_sum_with_index_v2(pub_key, left_store, left_shape, valid_index):
+    '''
+    A different version of C-implemetation of pen_sum_with_index,
+    details is that it generates a concrete new vector by traverse all
+    elements in left_store.
+    '''
+    src_pen = left_store.pen_storage
+    src_base = left_store.base_storage
+    src_exp = left_store.exp_storage
+    vec_size = left_store.vec_size
+    left_shape_tuple = left_shape.to_tuple()
+    valid_store = te_p2c(valid_index, None)
+
+    valid_size = np.asarray(valid_index).sum()
+    if len(left_shape_tuple) == 0:
+        res_pen = GPU_LIB.c_malloc(c_size_t(vec_size * CIPHER_BYTE))
+        res_base = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        res_exp = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        GPU_LIB.c_memcpy(c_void_p(res_pen), c_void_p(src_pen), c_size_t(vec_size * CIPHER_BYTE))
+        GPU_LIB.c_memcpy(c_void_p(res_base), c_void_p(src_base), c_size_t(vec_size * U_INT32_BYTE))
+        GPU_LIB.c_memcpy(c_void_p(res_exp), c_void_p(src_exp), c_size_t(vec_size * U_INT32_BYTE))
+        return _pi_init_ss(
+            left_store, res_pen, res_base, res_exp, vec_size,
+            left_shape, left_shape_tuple,
+            left_store.mem_type, left_store.data_type,
+            left_store.encode_n, left_store.encode_max_int)
+
+    res_pen = GPU_LIB.c_malloc(c_size_t(1 * CIPHER_BYTE))
+    res_base = GPU_LIB.c_malloc(c_size_t(1 * U_INT32_BYTE))
+    res_exp = GPU_LIB.c_malloc(c_size_t(1 * U_INT32_BYTE))
+    res_size = 1
+    res_shape_tuple = ()
+
+    GPU_LIB.pen_sum_with_index_v2(
+        c_void_p(src_pen), c_void_p(src_base), c_void_p(src_exp),
+        c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
+        c_size_t(1), c_size_t(vec_size),
+        c_size_t(valid_size), c_void_p(valid_store.data),
+        c_void_p(pub_key.pub_key_ptr),
+        c_size_t(CIPHER_BITS), c_uint32(device_type))
+
+    return _pi_init_ss(
+        None, res_pen, res_base, res_exp, res_size,
+        None, res_shape_tuple,
+        MEM_HOST, left_store.data_type,
+        left_store.encode_n, left_store.encode_max_int)
+
+
+def pi_sum_with_index(pub_key, left_store, left_shape, valid_index):
+    '''
+    Run pi_sum with an index list indicating which indices are used
+    Only support sum the whole list now, no axis is valid
+    ----------------
+    Paras:
+        pub_key: dev_pubkey_storage class
+        left_store: PaillierEncryptedStorage
+        left_shape: TensorShapeStorage class
+        valid_index: list, contents like [0,1,1,1,0,1,0,1],
+                        valid_index[i] == 1 means the ith value in left_store
+                        should be added to the sum result
+                        valid_index[i] == 0 means the ith value in left_store
+                        should not be counted into sum result
+    Return:
+        tuple, (PaillierEncryptedStorage, TensorShapeStorage)
+    '''
+    src_pen = left_store.pen_storage
+    src_base = left_store.base_storage
+    src_exp = left_store.exp_storage
+    vec_size = left_store.vec_size
+    valid_store = te_p2c(valid_index, None)
+
+    res_pen, res_base, res_exp, res_size = 0, 0, 0, 0
+    res_shape_tuple = ()
+    left_shape_tuple = left_shape.to_tuple()
+
+    # TODO: check for the result of shape (), with only one elements
+    # TODO: check for the result of shape (0,) with no elements
+    if len(left_shape_tuple) == 0:
+        res_pen = GPU_LIB.c_malloc(c_size_t(vec_size * CIPHER_BYTE))
+        res_base = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        res_exp = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        GPU_LIB.c_memcpy(c_void_p(res_pen), c_void_p(src_pen), c_size_t(vec_size * CIPHER_BYTE))
+        GPU_LIB.c_memcpy(c_void_p(res_base), c_void_p(src_base), c_size_t(vec_size * U_INT32_BYTE))
+        GPU_LIB.c_memcpy(c_void_p(res_exp), c_void_p(src_exp), c_size_t(vec_size * U_INT32_BYTE))
+        return _pi_init_ss(
+            left_store, res_pen, res_base, res_exp, vec_size,
+            left_shape, left_shape_tuple,
+            left_store.mem_type, left_store.data_type,
+            left_store.encode_n, left_store.encode_max_int)
+
+    res_pen = GPU_LIB.c_malloc(c_size_t(1 * CIPHER_BYTE))
+    res_base = GPU_LIB.c_malloc(c_size_t(1 * U_INT32_BYTE))
+    res_exp = GPU_LIB.c_malloc(c_size_t(1 * U_INT32_BYTE))
+    # sum result number is fixed to 1
+    res_size = 1
+    res_shape_tuple = ()
+    GPU_LIB.pen_sum_with_index(
+        c_char_p(src_pen), c_void_p(src_base), c_void_p(src_exp),
+        c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
+        c_size_t(1), c_size_t(vec_size),
+        c_void_p(valid_store.data),
+        c_void_p(pub_key.pub_key_ptr),
+        c_size_t(CIPHER_BITS), c_uint32(device_type))
+    return _pi_init_ss(
+        None, res_pen, res_base, res_exp, res_size,
+        None, res_shape_tuple,
+        MEM_HOST, left_store.data_type,
+        left_store.encode_n, left_store.encode_max_int)
+
+
+def pi_sum_multi_index(pub_key, left_store, left_shape, valid_index, min_value=0, max_value=None):
+    '''
+    Run sum for data with the same index indicated in the valid_index list
+    Return: A PEN_Storage class with max_value-min_value+1 number of PEN values
+    ------------
+    Parameters:
+        left_store:   PaillierEncryptedStorage, the original PEN_storage class
+        valid_index:  list, contains indices like [-1, 1, 2, 1, 3, 3, 2, -1],
+                        -1 means that this value will not be calculated if min_value >= 0
+                        1,2,3 means the different groups that it belongs to
+        min_value:    int, The min valid value of the valid index, default 0,
+                           in the above example, if min_value == 1, then -1 will be invalid
+                           if min_value == -1, -1 is also valid
+        max_value:    int, The max valid value of the valid index
+    Return:
+        tuple   (PaillierEncryptedStorage, TensorShapeStorage)
+    '''
+    src_pen = left_store.pen_storage
+    src_base = left_store.base_storage
+    src_exp = left_store.exp_storage
+    vec_size = left_store.vec_size
+    valid_store = te_p2c(valid_index, None)
+    # set max_value to maximum number if it is not designated
+    max_value = max(valid_index) if max_value is None else max_value
+    res_size = max_value - min_value + 1
+
+    res_pen = GPU_LIB.c_malloc(c_size_t(res_size * CIPHER_BYTE))
+    res_base = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    res_exp = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    res_shape_tuple = (res_size,)
+    GPU_LIB.pen_sum_with_multi_index_v2(
+        c_void_p(src_pen), c_void_p(src_base), c_void_p(src_exp),
+        c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
+        c_size_t(res_size), c_size_t(vec_size), c_int64(min_value),
+        c_void_p(valid_store.data),
+        c_void_p(pub_key.pub_key_ptr),
+        c_size_t(CIPHER_BITS), c_uint32(device_type))
+    return _pi_init_ss(
+        None, res_pen, res_base, res_exp, res_size,
+        None, res_shape_tuple,
+        MEM_HOST, left_store.data_type,
+        left_store.encode_n, left_store.encode_max_int)
+
+
+# WARNING: CURRENTLY NOT IN USE BECAUSE THERE IS NO APPARENT IMPROVEMENT WHEN left_store.vec_size is very large
+# TODO: apply this to stores with a small size
+def pi_sum_batch_multi_index(pub_key, left_store, left_shape, valid_index, min_value=0, max_value=None):
+    '''
+    Rum sum for data with the same index indicated in valid index
+    Basic logic is same with pi_sum_multi_index,
+    differ in that valid_indx may have multiple rows, given the name "batch"
+    means that we have multiple valid_index to the same PaillierEncryptedStorage
+    So there may be parallel computation between multiple valid_index list
+    -------------------
+    Paras:
+        valid_index: List[List[int]], in brief, multiple valid_index
+    Return:
+        tuple, (PaillierEncryptedStorage, TensorShapeStorage)
+    '''
+    pen_storage = left_store.pen_storage
+    base_storage = left_store.base_storage
+    exp_storage = left_store.exp_storage
+    vec_size = left_store.vec_size
+
+    max_value = max(valid_index) if max_value is None else max_value
+    valid_index_num = max_value - min_value + 1
+    batch_num = valid_index.shape[0]
+    if valid_index.shape[1] != vec_size:
+        raise PermissionError("valid index shape and raw data shape cannot align!!!")
+
+    res_size = batch_num * valid_index_num
+    res_pen = GPU_LIB.c_malloc(c_size_t(res_size * CIPHER_BYTE))
+    res_base = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    res_exp = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    res_shape_tuple = (batch_num, valid_index_num)
+
+    valid_store = te_p2c(valid_index, None)
+
+    GPU_LIB.batch_pen_sum_with_multi_index(
+        c_void_p(pen_storage), c_void_p(base_storage), c_void_p(exp_storage),
+        c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
+        c_size_t(valid_index_num), c_size_t(vec_size),
+        c_size_t(min_value), c_size_t(batch_num),
+        c_void_p(valid_store.data),
+        c_void_p(pub_key.pub_key_ptr), c_size_t(CIPHER_BITS),
+        c_size_t(device_type))
+
+    return _pi_init_ss(
+        None, res_pen, res_base, res_exp, res_size,
+        None, res_shape_tuple,
+        MEM_HOST, left_store.data_type,
+        left_store.encode_n, left_store.encode_max_int)
+
+
+# WARNING: ABANDONED FOR THE SAME REASON AS pi_sum_batch_multi_index
+def pi_sum_batch_multi_index_v2(pub_key, left_store, left_shape, valid_index, min_value=0, max_value=None):
+    '''
+    Almost the same with pi_sum_batch_multi_index,
+    differ in the C implementation
+    This implementation create a concrete C memory by
+    doing a for loop before actual computation.
+    '''
+    src_pen = left_store.pen_storage
+    src_base = left_store.base_storage
+    src_exp = left_store.exp_storage
+    vec_size = left_store.vec_size
+
+    max_value = max(valid_index) if max_value is None else max_value
+    valid_index_num = max_value - min_value + 1
+    batch_num = valid_index.shape[0] // valid_index_num
+
+    res_size = batch_num * valid_index_num
+    res_pen = GPU_LIB.c_malloc(c_size_t(res_size * CIPHER_BYTE))
+    res_base = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    res_exp = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    res_shape_tuple = (batch_num, valid_index_num)
+
+    valid_store = te_p2c(valid_index, None)
+
+    GPU_LIB.batch_pen_sum_with_multi_index_v2(
+        c_void_p(src_pen), c_void_p(src_base), c_void_p(src_exp),
+        c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
+        c_size_t(valid_index_num), c_size_t(vec_size),
+        c_size_t(valid_index.shape[1]), c_size_t(batch_num),
+        c_size_t(min_value), c_void_p(valid_store.data),
+        c_void_p(pub_key.pub_key_ptr), c_size_t(CIPHER_BITS),
+        c_size_t(device_type))
+
+    return _pi_init_ss(
+        None, res_pen, res_base, res_exp, res_size,
+        None, res_shape_tuple,
+        MEM_HOST, left_store.data_type,
+        left_store.encode_n, left_store.encode_max_int)
+
+
+def fp_encode(store, n, max_int,
+              precision=None, max_exponent=None, res=None, stream=None):
+    '''
+    Perform encode to a TensorStorage
+    -----------------
+    Paras:
+        store:        TensorStorage, raw data to be encoded
+        n:            big int, the same n in pubkey used for encryption
+        max_int:      big int, same max_int in pubkey.
+        precision:    int, the precision of encoding, default None
+        max_exponent: None or int, currently not used
+        res:          FixedPointStorage, the return value
+    Return:
+        FixedPointStorage, same as res
+    Raise:
+        PermissionError: For unsupported data type or encoding style
+    '''
+    if max_exponent is not None:
+        raise PermissionError("max_exponent not supported")
+    if precision is None:
+        precision = -1
+    src_data = store.data
+    vec_size = store.vec_size
+    # malloc the return memory space
+    if res is None:
+        res_fpn = GPU_LIB.c_malloc(c_size_t(PLAIN_BYTE * vec_size))
+        res_base = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * vec_size))
+        res_exp = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * vec_size))
+    else:
+        res_fpn = res.bigint_storage
+        res_base = res.base_storage
+        res_exp = res.exp_storage
+    # Due to the different nature of encoding float/int
+    # Handle the two different data type seperately
+    if store.data_type == FLOAT_TYPE:
+        GPU_LIB.encode_double(
+            c_void_p(src_data),
+            c_void_p(res_fpn), c_void_p(res_base), c_void_p(res_exp),
+            c_int32(precision),
+            c_char_p(n.to_bytes(PLAIN_BYTE, 'little')),
+            c_char_p(max_int.to_bytes(PLAIN_BYTE, 'little')),
+            c_size_t(PLAIN_BITS), c_size_t(vec_size), c_uint32(device_type))
+    elif store.data_type == INT64_TYPE:
+        GPU_LIB.encode_int(
+            c_void_p(src_data),
+            c_void_p(res_fpn), c_void_p(res_base), c_void_p(res_exp),
+            c_int32(precision),
+            c_char_p(n.to_bytes(PLAIN_BYTE, 'little')),
+            c_char_p(max_int.to_bytes(PLAIN_BYTE, 'little')),
+            c_size_t(PLAIN_BITS), c_size_t(vec_size), c_uint32(device_type))
+    else:
+        raise PermissionError("Invalid Data Type")
+
+    '''get the three elements, store it in a FPNStorage'''
+
+    return _fp_init_store(
+        res, res_fpn, res_base, res_exp, vec_size,
+        n, max_int, store.mem_type, store.data_type)
+
+
+def __fp_decode(store, res, stream):
+    '''
+    Decode a FixedPointStorage in CPU, using fp_c2p to implement
+    Currently not used, as a GPU version has been done
+    ------------------
+    Paras:
+        store:   FixedPointStorage, the raw data to be decoded
+        res:     TensorStorage, the decoded result
+    Return:
+        TensorStorage, same as res
+    '''
+    res_fpn = store.bigint_storage
+    res_base = store.base_storage
+    res_exp = store.exp_storage
+    vec_size = store.vec_size
+    fpn_array = __get_c_fpn_storage(
+        res_fpn, res_base, res_exp, vec_size,
+        store.encode_n, store.max_int)
+
+    CPU_decode = []
+    if (store.data_type == INT64_TYPE):
+        for i in range(vec_size):
+            CPU_decode.append(int(fpn_array[i].decode()))
+    elif (store.data_type == FLOAT_TYPE):
+        for i in range(vec_size):
+            CPU_decode.append(fpn_array[i].decode())
+    else:
+        raise PermissionError("Invalid Data Type")
+
+    # reform the value to TensorStorage
+    decode_data = te_p2c(CPU_decode, None)
+    res_data = decode_data.data
+    decode_data.data = None
+    return _te_init_store(res, res_data, vec_size,
+                          store.mem_type, store.data_type)
+
+
+def fp_decode(store, res, stream):
+    '''
+    Decode a FixedPointStorage in GPU
+    ------------------
+    Paras:
+        store:   FixedPointStorage, the raw data to be decoded
+        res:     TensorStorage, the decoded result
+    Return:
+        TensorStorage, same as res
+    '''
+    if store.data_type == FLOAT_TYPE:
+        if res is None:
+            res_store = GPU_LIB.c_malloc(c_size_t(store.vec_size * DOUBLE_BYTE))
+        else:
+            res_store = res.data
+        GPU_LIB.decode_double(
+            c_void_p(store.bigint_storage), c_void_p(store.base_storage),
+            c_void_p(store.exp_storage),
+            c_char_p(store.encode_n.to_bytes(PLAIN_BYTE, 'little')),
+            c_char_p(store.max_int.to_bytes(PLAIN_BYTE, 'little')),
+            c_size_t(PLAIN_BITS),
+            c_void_p(res_store), c_size_t(store.vec_size))
+    elif store.data_type == INT64_TYPE:
+        res_store = GPU_LIB.c_malloc(c_size_t(store.vec_size * INT64_BYTE)) \
+            if res is None else res.data
+        GPU_LIB.decode_int(
+            c_void_p(store.bigint_storage), c_void_p(store.base_storage),
+            c_void_p(store.exp_storage),
+            c_char_p(store.encode_n.to_bytes(PLAIN_BYTE, 'little')),
+            c_char_p(store.max_int.to_bytes(PLAIN_BYTE, 'little')),
+            c_size_t(PLAIN_BITS),
+            c_void_p(res_store), c_size_t(store.vec_size))
+    else:
+        raise PermissionError("Invalid Data Type")
+    return _te_init_store(res, res_store, store.vec_size,
+                          store.mem_type, store.data_type)
+
+
+def bi_free(src):
+    GPU_LIB.c_free(c_void_p(src.bigint_storage))
+    src.bigint_storage = None
+
+
+def fp_free(src):
+    GPU_LIB.c_free(c_void_p(src.bigint_storage))
+    GPU_LIB.c_free(c_void_p(src.base_storage))
+    GPU_LIB.c_free(c_void_p(src.exp_storage))
+    src.bigint_storage, src.base_storage, src.exp_storage = None, None, None
+
+
+'''
+    function: convert the FixedPointStorage's data from C memory
+    into Python objects (C to Python, as the name fp_c2p says).
+    As there is no shape involved in the function,
+    we cannot know the return shape of the function
+    input:
+            src: FixedPointStorage, containing the data that need to be changed
+    output:
+            return value: the result of __get_c_fpn_storage
+'''
+
+
+def fp_c2p(src):
+    src_fpn = src.bigint_storage
+    src_base = src.base_storage
+    src_exp = src.exp_storage
+    vec_size = src.vec_size
+    return __get_c_fpn_storage(
+        src_fpn, src_base, src_exp,
+        vec_size, src.encode_n, src.max_int)
+
+
+def pi_c2p_mp(src):
+    '''
+    convert PaillierEncryptedStorage from C mem type to Python one
+    this one use multiprocess to accelerate
+    --------------
+    Para:    src, PaillierEncryptedStorage
+    Return:  tuple, each element is a ndarray,
+                    identical to sequence of encoding, base, exponent
+    '''
+    src_pen = src.pen_storage
+    src_base = src.base_storage
+    src_exp = src.exp_storage
+    vec_size = src.vec_size
+    return __get_c_pen_storage_mp(
+        src_pen, src_base, src_exp, vec_size, src.encode_n)
+
+
+def pi_c2p(src):
+    '''convert PaillierEncryptedStorage from C mem type to Python one'''
+    src_pen = src.pen_storage
+    src_base = src.base_storage
+    src_exp = src.exp_storage
+    vec_size = src.vec_size
+    return __get_c_pen_storage_raw(
+        src_pen, src_base, src_exp, vec_size, src.encode_n)
+
+
+def fp_mul(left_store, right_store, left_shape, right_shape,
+           res_store, res_shape, stream):
+    '''
+    Perform element-wise multiplication between two FixedPointStorage.
+    This is a plaintext computation rather than an encrypted one.
+    ------------------
+    Paras:
+        left_store, right_store: FixedPointStorage
+        left_shape, right_shape: TensorShapeStorage
+    Return:
+        tuple, (FixedPointStorage, TensorShapeStorage)
+    '''
+    # P,Q is the dim of the left_store(pen)
+    # R,S is the dim of the right_store(fpn)
+    P, Q, R, S, res_shape_tuple = __shape_resolve(left_shape, right_shape)
+    res_size = max(P, R) * max(Q, S)
+    # the left_store data
+    l_fpn = left_store.bigint_storage
+    l_base = left_store.base_storage
+    l_exp = left_store.exp_storage
+    # the right_store data
+    r_fpn = right_store.bigint_storage
+    r_base = right_store.base_storage
+    r_exp = right_store.exp_storage
+    # malloc space for the return value
+    if res_store is None:
+        res_fpn = GPU_LIB.c_malloc(c_size_t(res_size * PLAIN_BYTE))
+        res_base = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+        res_exp = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    else:
+        res_fpn = res_store.bigint_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+    GPU_LIB.fpn_mul(
+        c_char_p(l_fpn), c_void_p(l_base), c_void_p(l_exp),
+        c_char_p(r_fpn), c_void_p(r_base), c_void_p(r_exp),
+        c_char_p(res_fpn), c_void_p(res_base), c_void_p(res_exp),
+        c_size_t(P), c_size_t(Q), c_size_t(R), c_size_t(S),
+        c_char_p(left_store.encode_n.to_bytes(PLAIN_BYTE, 'little')),
+        c_size_t(PLAIN_BITS), c_uint32(device_type))
+    # handle the data_type according to left & right's data_type
+    data_type = 0
+    if left_store.data_type == INT64_TYPE and \
+            right_store.data_type == INT64_TYPE:
+        data_type = INT64_TYPE
+    else:
+        data_type = FLOAT_TYPE
+    return _fp_init_ss(
+        res_store, res_fpn, res_base, res_exp, res_size,
+        left_store.encode_n, left_store.max_int, res_shape, res_shape_tuple,
+        left_store.mem_type, data_type)
+
+
+def fp_p2c(target, src, data_type=FLOAT_TYPE):
+    '''change a FixedPointNumber ndarray into a FixedPointStorage Class'''
+    if isinstance(src, list):
+        vec_size = len(src)
+    elif isinstance(src, np.ndarray):
+        vec_size = src.size
+        src = src.flat
+    else:
+        raise TypeError("Unsupported Data Structure")
+    # malloc the space for the type
+    if target is None:
+        res_fpn = GPU_LIB.c_malloc(c_size_t(vec_size * PLAIN_BYTE))
+        res_base = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        res_exp = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+    else:
+        res_fpn = target.bigint_storage
+        res_base = target.base_storage
+        res_exp = target.exp_storage
+    # the temp ndarray buffer
+    base_temp = []
+    exp_temp = []
+    # get the two encoding parameters
+    n = src[0].n
+    max_int = src[0].max_int
+    for i in range(vec_size):
+        src_number = src[i].encoding.to_bytes(PLAIN_BYTE, 'little')
+        GPU_LIB.c_memcpy(c_void_p(res_fpn + i * PLAIN_BYTE),
+                         c_char_p(src_number), c_size_t(PLAIN_BYTE))
+        base_temp.append(src[i].BASE)
+        exp_temp.append(src[i].exponent)
+
+    base_array_pointer = np.asarray(base_temp, np.uint32).ctypes.data_as(c_void_p)
+    exp_array_pointer = np.asarray(exp_temp, np.uint32).ctypes.data_as(c_void_p)
+    GPU_LIB.c_memcpy(
+        c_void_p(res_base), base_array_pointer,
+        c_size_t(vec_size * U_INT32_BYTE))
+    GPU_LIB.c_memcpy(
+        c_void_p(res_exp), exp_array_pointer,
+        c_size_t(vec_size * U_INT32_BYTE))
+
+    return _fp_init_store(target, res_fpn, res_base, res_exp,
+                          vec_size, n, max_int, MEM_HOST, data_type)
+
+
+def _index_reset(index, dim_size):
+    if index < 0:
+        res_index = index + dim_size
+        res_index = max(0, res_index)
+    elif index > dim_size:
+        res_index = dim_size
+    else:
+        res_index = index
+    return res_index
+
+
+def fp_slice(store, shape, start, stop, axis, res_store, res_shape, stream):
+    '''
+    slice a contiguous memory space, now support two directions.
+    -----------------------------
+    Para:
+    store: FixedPointStorage, the data to be sliced
+    shape: TensorShapeStorage, the original shape of the storage
+    start: int, the start index of the slice (included)
+    end:   int, the end index of the slice(not included),
+           if larger than the last index, concatencate it into the dim size
+    axis:  0 or 1, 0 means cut it horizontally, 1 means cut it vertically
+    stream: the current stream of the task, not used now
+    -----------------------------
+    Return:
+    res_store, res_shape, FixedPointStorage, TensorShapeStorage
+    Raise:
+        PermissionError: if the input start/stop/axis is not valid
+    '''
+    src_fpn = store.bigint_storage
+    src_base = store.base_storage
+    src_exp = store.exp_storage
+    fpn_shape_tuple = shape.to_tuple()
+    dim0, dim1 = 0, 0
+    '''handle shape and index'''
+    if len(fpn_shape_tuple) == 0:
+        raise PermissionError("Cannot slice 0 dim!")
+    elif len(fpn_shape_tuple) == 1:
+        dim0, dim1 = 1, fpn_shape_tuple[0]
+        if axis == 0:
+            raise PermissionError("Cannot slice 1 dim horizontally!")
+        start = _index_reset(start, dim1)
+        stop = _index_reset(stop, dim1)
+    elif len(fpn_shape_tuple) == 2:
+        dim0, dim1 = fpn_shape_tuple[0], fpn_shape_tuple[1]
+        if axis == 0:
+            start = _index_reset(start, dim0)
+            stop = _index_reset(stop, dim0)
+        if axis == 1:
+            start = _index_reset(start, dim1)
+            stop = _index_reset(stop, dim1)
+    else:
+        raise PermissionError("Invalid shape")
+    # handle condition that a[k: l] k>=l for 2-d array
+    # will cause the result shape to be (0, dim1)
+    if axis == 0 and start >= stop:
+        res_fpn, res_base, res_exp = None, None, None
+        return _fp_init_ss(
+            None, res_fpn, res_base, res_exp, 0,
+            store.encode_n, store.encode_max_int,
+            None, (0, dim1),
+            store.mem_type, store.data_type)
+    # handle condition that a[:,k:l] k>=l for 2-d array
+    # will cause the result shape to be (dim0, 0)
+    if axis == 1 and start >= stop:
+        res_fpn, res_base, res_exp = None, None, None
+        res_shape_tuple = (dim0, 0) if len(fpn_shape_tuple) == 2 else (0,)
+        return _fp_init_ss(
+            None, res_fpn, res_base, res_exp, 0,
+            store.encode_n, store.encode_max_int,
+            None, res_shape_tuple,
+            store.mem_type, store.data_type)
+        # handle the normal slice
+    res_shape_tuple, vec_size = (), 0
+    '''useful paras'''
+    bigint_row_bytelen = dim1 * PLAIN_BYTE
+    uint32_row_bytelen = dim1 * U_INT32_BYTE
+    gap_length = stop - start
+    # start normal slice
+    if axis == 1:
+        'axis == 1 means that we need to cut the matrix vertically'
+        res_bigint_row_bytelen = gap_length * PLAIN_BYTE
+        res_uint32_row_bytelen = gap_length * U_INT32_BYTE
+        if res_store is None:
+            res_fpn = GPU_LIB.c_malloc(c_size_t(res_bigint_row_bytelen * dim0))
+            res_base = GPU_LIB.c_malloc(c_size_t(res_uint32_row_bytelen * dim0))
+            res_exp = GPU_LIB.c_malloc(c_size_t(res_uint32_row_bytelen * dim0))
+        else:
+            res_fpn = res_store.bigint_storage
+            res_base = res_store.base_storage
+            res_exp = res_store.exp_storage
+        GPU_LIB.slice_vertical(
+            c_char_p(src_fpn), c_void_p(src_base), c_void_p(src_exp),
+            c_char_p(res_fpn), c_void_p(res_base), c_void_p(res_exp),
+            c_size_t(dim0), c_size_t(dim1), c_size_t(start), c_size_t(stop),
+            c_size_t(PLAIN_BITS), c_uint32(device_type))
+        if len(fpn_shape_tuple) == 1:
+            res_shape_tuple = (gap_length,)
+            vec_size = res_shape_tuple[0]
+        else:
+            res_shape_tuple = (dim0, gap_length)
+            vec_size = res_shape_tuple[0] * res_shape_tuple[1]
+
+    elif axis == 0:
+        'axis == 0 means that we nned to cut the matrix horizontally '
+        if res_store is None:
+            res_fpn = GPU_LIB.c_malloc(c_size_t(bigint_row_bytelen * gap_length))
+            res_base = GPU_LIB.c_malloc(c_size_t(uint32_row_bytelen * gap_length))
+            res_exp = GPU_LIB.c_malloc(c_size_t(uint32_row_bytelen * gap_length))
+        else:
+            res_fpn = res_store.bigint_storage
+            res_base = res_store.base_storage
+            res_exp = res_store.exp_storage
+        GPU_LIB.slice_horizontal(
+            c_char_p(src_fpn), c_void_p(src_base), c_void_p(src_exp),
+            c_char_p(res_fpn), c_void_p(res_base), c_void_p(res_exp),
+            c_size_t(dim0), c_size_t(dim1), c_size_t(start), c_size_t(stop),
+            c_size_t(PLAIN_BITS), c_uint32(device_type))
+        res_shape_tuple = (gap_length, dim1)
+        vec_size = res_shape_tuple[0] * res_shape_tuple[1]
+    else:
+        raise NotImplementedError("Only support 2 dimensional slice")
+
+    return _fp_init_ss(
+        res_store, res_fpn, res_base, res_exp,
+        vec_size, store.encode_n, store.max_int,
+        res_shape, res_shape_tuple, store.mem_type, store.data_type)
+
+
+def pi_slice(store, shape, start, stop, axis, res_store, res_shape, stream):
+    '''
+    Slice a contiguous range of rows or columns out of an encrypted tensor.
+    -----------------------------
+    Para:
+    store: PaillierEncryptedStorage, the data to be sliced
+    shape: TensorShapeStorage, the original shape of the storage
+    start: int, the start index of the slice (included), passed through _index_reset
+    stop:  int, the end index of the slice (not included), passed through _index_reset
+    axis:  0 or 1, 0 means cut it horizontally, 1 means cut it vertically
+    res_store, res_shape: optional pre-allocated result holders, may be None
+    stream: the current stream of the task, not used now
+    -----------------------------
+    Return:
+    the value built by _pi_init_ss for the sliced buffers and the result shape
+    '''
+    src_pen = store.pen_storage
+    src_base = store.base_storage
+    src_exp = store.exp_storage
+    # get the two dims and check for illegal status
+    pen_shape_tuple = shape.to_tuple()
+    dim0, dim1 = 0, 0
+    if len(pen_shape_tuple) == 0:
+        raise PermissionError("Cannot slice 0 dim!")
+    elif len(pen_shape_tuple) == 1:
+        dim0, dim1 = 1, pen_shape_tuple[0]
+        if axis == 0:
+            raise PermissionError("Cannot slice 1 dim horizontally!")
+        start = _index_reset(start, dim1)
+        stop = _index_reset(stop, dim1)
+    elif len(pen_shape_tuple) == 2:
+        dim0, dim1 = pen_shape_tuple[0], pen_shape_tuple[1]
+        if axis == 0:
+            start = _index_reset(start, dim0)
+            stop = _index_reset(stop, dim0)
+        if axis == 1:
+            start = _index_reset(start, dim1)
+            stop = _index_reset(stop, dim1)
+    else:
+        raise PermissionError("Invalid shape")
+
+    # empty row slice (axis 0 with start >= stop): nothing is allocated and
+    # the result shape is (0, dim1); a caller-supplied res_store is not used
+    if axis == 0 and start >= stop:
+        res_pen, res_base, res_exp = None, None, None
+        return _pi_init_ss(
+            None, res_pen, res_base, res_exp, 0, None, (0, dim1),
+            store.mem_type, store.data_type,
+            store.encode_n, store.encode_max_int)
+    # empty column slice (axis 1 with start >= stop): nothing is allocated and
+    # the result shape is (dim0, 0) for 2-D input or (0,) for 1-D input
+    if axis == 1 and start >= stop:
+        res_pen, res_base, res_exp = None, None, None
+        res_shape_tuple = (dim0, 0) if len(pen_shape_tuple) == 2 else (0,)
+        return _pi_init_ss(
+            None, res_pen, res_base, res_exp, 0, None, res_shape_tuple,
+            store.mem_type, store.data_type,
+            store.encode_n, store.encode_max_int)
+    # handle the normal slice
+    res_shape_tuple = ()
+    vec_size = 0
+    # byte length of one source row, and the number of rows/columns kept
+    bigint_row_bytelen = dim1 * PLAIN_BYTE
+    uint32_row_bytelen = dim1 * U_INT32_BYTE
+    gap_length = stop - start
+    # start slice
+    if axis == 1:
+        # axis == 1 means that we need to cut the matrix vertically
+        res_bigint_row_bytelen = gap_length * PLAIN_BYTE
+        res_uint32_row_bytelen = gap_length * U_INT32_BYTE
+        # allocate result buffers, or reuse the ones held by res_store
+        if res_store is None:
+            res_pen = GPU_LIB.c_malloc(c_size_t(res_bigint_row_bytelen * dim0))
+            res_base = GPU_LIB.c_malloc(c_size_t(res_uint32_row_bytelen * dim0))
+            res_exp = GPU_LIB.c_malloc(c_size_t(res_uint32_row_bytelen * dim0))
+        else:
+            res_pen = res_store.pen_storage
+            res_base = res_store.base_storage
+            res_exp = res_store.exp_storage
+        # call the raw function
+        GPU_LIB.slice_vertical(
+            c_char_p(src_pen), c_void_p(src_base), c_void_p(src_exp),
+            c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
+            c_size_t(dim0), c_size_t(dim1), c_size_t(start), c_size_t(stop),
+            c_size_t(CIPHER_BITS), c_uint32(device_type))
+        if len(pen_shape_tuple) == 1:
+            res_shape_tuple = (gap_length,)
+            vec_size = res_shape_tuple[0]
+        else:
+            res_shape_tuple = (dim0, gap_length)
+            vec_size = res_shape_tuple[0] * res_shape_tuple[1]
+    elif axis == 0:
+        # axis == 0 means that we need to cut the matrix horizontally
+        if res_store is None:
+            res_pen = GPU_LIB.c_malloc(c_size_t(bigint_row_bytelen * gap_length))
+            res_base = GPU_LIB.c_malloc(c_size_t(uint32_row_bytelen * gap_length))
+            res_exp = GPU_LIB.c_malloc(c_size_t(uint32_row_bytelen * gap_length))
+        else:
+            res_pen = res_store.pen_storage
+            res_base = res_store.base_storage
+            res_exp = res_store.exp_storage
+        GPU_LIB.slice_horizontal(
+            c_char_p(src_pen), c_void_p(src_base), c_void_p(src_exp),
+            c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
+            c_size_t(dim0), c_size_t(dim1), c_size_t(start), c_size_t(stop),
+            c_size_t(CIPHER_BITS), c_uint32(device_type))
+        # since 1-dim shape will not occur here, result shape is always 2-D
+        res_shape_tuple = (gap_length, dim1)
+        vec_size = res_shape_tuple[0] * res_shape_tuple[1]
+    else:
+        raise NotImplementedError()
+
+    return _pi_init_ss(
+        res_store, res_pen, res_base, res_exp, vec_size,
+        res_shape, res_shape_tuple,
+        store.mem_type, store.data_type,
+        store.encode_n, store.encode_max_int)
+
+
+def fp_cat(stores, shapes, axis, res_store, res_shape):
+    '''
+    Concatenate several FixedPointStorage along one axis (at most 2-D).
+    --------------------
+    Para:
+        stores: List or ndarray, elements are FixedPointStorage (at least 2)
+        shapes: List or ndarray, elements are TensorShapeStorage, one per store
+        axis:   int, how stores will be stacked
+                    0 means a vertical stack, stack along 1st dim
+                    1 means a horizontal stack, stack along 2nd dim
+        res_store: FixedPointStorage or None; when None new buffers are allocated
+        res_shape: ignored, a fresh TensorShapeStorage is always built here
+    Return:
+        the value built by _fp_init_ss from the stacked buffers and new shape
+    Raise:
+        PermissionError: Invalid input data, invalid axis or misaligned shapes
+        NotImplementedError: Current only support at most 2-D matrix
+    '''
+    stores = list(stores)
+    shapes = list(shapes)
+    num_stores = len(stores)
+    res_vec_size = np.sum([v.vec_size for v in stores])
+    # Sanity checks on the inputs
+    if num_stores < 2:
+        raise PermissionError("At least 2 Storages required for concatenation")
+    if len(shapes) != num_stores:
+        raise PermissionError("The number of storages and that of shapes didn't match")
+    for v in stores:
+        if v.data_type != stores[0].data_type:
+            raise PermissionError("All storages should have the same data type")
+        if v.encode_n != stores[0].encode_n:
+            raise PermissionError("All storages should have the same n")
+        if v.max_int != stores[0].max_int:
+            raise PermissionError("All storages should have the same max_int")
+        if v.mem_type != stores[0].mem_type:
+            raise PermissionError("All storages should have the same memory type")
+    # num_rows, num_cols are the values passed to the C stack functions;
+    # res_rows, res_cols describe the returned shape and should match numpy's
+    # output. They are kept apart so the two levels don't affect each other
+    if axis == 0:
+        first_shape_decomposed = __shape_decompose(shapes[0])
+        num_rows, num_cols = 0, first_shape_decomposed[1]
+        for v in shapes:
+            shape_tuple = __shape_decompose(v)
+            if shape_tuple[1] != num_cols:
+                raise PermissionError("Shapes didn't align")
+            num_rows += shape_tuple[0]
+        res_rows = num_rows
+        res_cols = num_cols
+    elif axis == 1:
+        first_shape = shapes[0].to_tuple()
+        if len(first_shape) <= 1:
+            num_rows, num_cols = 1, 0
+            for v in shapes:
+                if len(v.to_tuple()) == 0:
+                    num_cols += 1
+                if len(v.to_tuple()) == 1:
+                    num_cols += v.to_tuple()[0]
+                if len(v.to_tuple()) >= 2:
+                    raise PermissionError("Shape cannot align!!!")
+            res_rows = num_cols
+            res_cols = None
+        elif len(first_shape) == 2:
+            num_rows, num_cols = first_shape[0], 0
+            for v in shapes:
+                v_shape = v.to_tuple()
+                if len(v_shape) != 2 or num_rows != v_shape[0]:
+                    raise PermissionError("Shape cannot align!")
+                num_cols += v_shape[1]
+            res_rows = num_rows
+            res_cols = num_cols
+        else:
+            raise NotImplementedError("Now only support up to 2-D array")
+    else:
+        raise PermissionError("Invalid Axis")
+    res_shape = TensorShapeStorage(res_rows, res_cols)
+
+    fpn_pointers = [c_void_p(v.bigint_storage) for v in stores]
+    base_pointers = [c_void_p(v.base_storage) for v in stores]
+    exp_pointers = [c_void_p(v.exp_storage) for v in stores]
+
+    if res_store is None:
+        res_fpn = GPU_LIB.c_malloc(c_size_t(PLAIN_BYTE * res_vec_size))
+        res_base = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * res_vec_size))
+        res_exp = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * res_vec_size))
+    else:
+        res_fpn = res_store.bigint_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+
+    fpn_arr = (c_void_p * num_stores)(*fpn_pointers)
+    base_arr = (c_void_p * num_stores)(*base_pointers)
+    exp_arr = (c_void_p * num_stores)(*exp_pointers)
+    vec_sizes = (c_size_t * num_stores)(*[v.vec_size for v in stores])
+
+    if axis == 0:
+        '''means that we should cat stores vertically'''
+        GPU_LIB.vstack(fpn_arr, base_arr, exp_arr,
+                       c_void_p(res_fpn), c_void_p(res_base), c_void_p(res_exp),
+                       c_size_t(num_stores), vec_sizes, c_size_t(num_cols), c_size_t(PLAIN_BITS))
+    elif axis == 1:
+        '''means that we should cat stores horizontally'''
+        GPU_LIB.hstack(fpn_arr, base_arr, exp_arr,
+                       c_void_p(res_fpn), c_void_p(res_base), c_void_p(res_exp),
+                       c_size_t(num_stores), vec_sizes, c_size_t(num_rows), c_size_t(PLAIN_BITS))
+    else:
+        raise NotImplementedError()
+
+    return _fp_init_ss(
+        res_store, res_fpn, res_base, res_exp, int(round(res_vec_size)),
+        stores[0].encode_n, stores[0].max_int, res_shape, res_shape.to_tuple(),
+        stores[0].mem_type, stores[0].data_type)
+
+
+def pi_cat(stores, shapes, axis, res_store, res_shape):
+    '''
+    Concatenate several PaillierEncryptedStorage along one axis (at most 2-D).
+    --------------------
+    Para:
+        stores: List or ndarray, elements are PaillierEncryptedStorage (at least 2)
+        shapes: List or ndarray, elements are TensorShapeStorage, one per store
+        axis:   int, how stores will be stacked
+                    0 means a vertical stack, stack along 1st dim
+                    1 means a horizontal stack, stack along 2nd dim
+        res_store: PaillierEncryptedStorage or None; None allocates new buffers
+        res_shape: ignored, a fresh TensorShapeStorage is always built here
+    Return:
+        the value built by _pi_init_ss from the stacked buffers and new shape
+    Raise:
+        PermissionError: Invalid input data, invalid axis or misaligned shapes
+        NotImplementedError: Current only support at most 2-D matrix
+    '''
+    stores = list(stores)
+    shapes = list(shapes)
+    num_stores = len(stores)
+    res_vec_size = np.sum([v.vec_size for v in stores])
+
+    # Sanity checks on the inputs
+    if num_stores < 2:
+        raise PermissionError("At least 2 Storages required for concatenation")
+    if len(shapes) != num_stores:
+        raise PermissionError("The number of storages and that of shapes didn't match")
+    for v in stores:
+        if v.data_type != stores[0].data_type:
+            raise PermissionError("All storages should have the same data type")
+        if v.encode_n != stores[0].encode_n:
+            raise PermissionError("All storages should have the same n")
+        if v.encode_max_int != stores[0].encode_max_int:
+            raise PermissionError("All storages should have the same max_int")
+        if v.mem_type != stores[0].mem_type:
+            raise PermissionError("All storages should have the same memory type")
+    # num_rows, num_cols are the values passed to the C stack functions;
+    # res_rows, res_cols describe the returned shape and should match numpy's
+    # output. They are kept apart so the two levels don't affect each other
+    if axis == 0:
+        first_shape_decomposed = __shape_decompose(shapes[0])
+        num_rows, num_cols = 0, first_shape_decomposed[1]
+        for v in shapes:
+            shape_tuple = __shape_decompose(v)
+            if shape_tuple[1] != num_cols:
+                raise PermissionError("Shapes didn't align")
+            num_rows += shape_tuple[0]
+        res_rows = num_rows
+        res_cols = num_cols
+    elif axis == 1:
+        '''the horizontal cat'''
+        first_shape = shapes[0].to_tuple()
+        if len(first_shape) <= 1:
+            num_rows = 1
+            num_cols = 0
+            for v in shapes:
+                if len(v.to_tuple()) == 0:
+                    num_cols += 1
+                if len(v.to_tuple()) == 1:
+                    num_cols += v.to_tuple()[0]
+                if len(v.to_tuple()) >= 2:
+                    raise PermissionError("Shape cannot align!!!")
+            res_rows = num_cols
+            res_cols = None
+        elif len(first_shape) == 2:
+            num_rows = first_shape[0]
+            num_cols = 0
+            for v in shapes:
+                v_shape = v.to_tuple()
+                if len(v_shape) != 2 or num_rows != v_shape[0]:
+                    raise PermissionError("Shape cannot align!")
+                # every input has the same row count; only columns accumulate
+                num_cols += v_shape[1]
+            res_rows = num_rows
+            res_cols = num_cols
+        else:
+            raise NotImplementedError("Now only support up to 2-D array")
+    else:
+        raise PermissionError("Invalid Axis")
+    res_shape = TensorShapeStorage(res_rows, res_cols)
+
+    pen_pointers = [c_void_p(v.pen_storage) for v in stores]
+    base_pointers = [c_void_p(v.base_storage) for v in stores]
+    exp_pointers = [c_void_p(v.exp_storage) for v in stores]
+
+    if res_store is None:
+        res_pen = GPU_LIB.c_malloc(c_size_t(CIPHER_BYTE * res_vec_size))
+        res_base = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * res_vec_size))
+        res_exp = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * res_vec_size))
+    else:
+        res_pen = res_store.pen_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+    # build the ctypes argument arrays, then call the C stack functions
+    pen_arr = (c_void_p * num_stores)(*pen_pointers)
+    base_arr = (c_void_p * num_stores)(*base_pointers)
+    exp_arr = (c_void_p * num_stores)(*exp_pointers)
+    vec_sizes = (c_size_t * num_stores)(*[v.vec_size for v in stores])
+
+    if axis == 0:
+        GPU_LIB.vstack(pen_arr, base_arr, exp_arr,
+                       c_void_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
+                       c_size_t(num_stores), vec_sizes, c_size_t(num_cols), c_size_t(CIPHER_BITS))
+    elif axis == 1:
+        GPU_LIB.hstack(pen_arr, base_arr, exp_arr,
+                       c_void_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
+                       c_size_t(num_stores), vec_sizes, c_size_t(num_rows), c_size_t(CIPHER_BITS))
+    else:
+        raise NotImplementedError()
+
+    return _pi_init_ss(
+        res_store, res_pen, res_base, res_exp,
+        int(round(res_vec_size)), res_shape, res_shape.to_tuple(),
+        stores[0].mem_type, stores[0].data_type,
+        stores[0].encode_n, stores[0].encode_max_int)
+
+
+def bi_p2c(data, res):
+    '''
+    Copy every big integer in data to the C memory pointed to by res
+    -------------------
+    Para:
+        data: ndarray-like (uses .size) of ints, each written as CIPHER_BYTE little-endian bytes
+        res:  int, actually a pointer pointing to C memory
+    Return:
+        None, but the contents in c_void_p(res) has been changed
+    '''
+    vec_size = data.size
+    for i in range(vec_size):
+        GPU_LIB.c_memcpy(c_void_p(res + i * CIPHER_BYTE),
+                         c_char_p(data[i].to_bytes(CIPHER_BYTE, 'little')), c_size_t(CIPHER_BYTE))
+
+
+def bi_gen_rand(elem_size, count, res, rand_seed, stream):
+    '''
+    generate random bigint for pi_obfuscation
+    ------------------
+    Para:
+        elem_size: int, sizes the random range: values lie in [1, 8 ** elem_size)
+        count:     int, number of random bigint to be generated
+        res:       BigintStorage or None; when None a new buffer is allocated
+        rand_seed: seed for Python's random module; stream: not used here
+    Return:
+        the value built by _bi_init_store for the filled buffer
+    '''
+    # Values are drawn one by one (no vectorize) because bi_p2c calls to_bytes()
+    # NOTE(review): `random` is not a CSPRNG - confirm that is acceptable here
+    random.seed(rand_seed)
+    rands = np.asarray([random.randrange(1, 8 ** elem_size)
+                        for i in range(count)])
+    if res is None:
+        data_storage = GPU_LIB.c_malloc(c_size_t(count * CIPHER_BYTE))
+    else:
+        data_storage = res.bigint_storage
+    bi_p2c(rands, data_storage)
+    # CIPHER_BYTE is the upper bound of the length of the rand number
+    return _bi_init_store(
+        res, data_storage, count, CIPHER_BYTE, MEM_DEVICE)
+
+
+def __get_shape_size(shape_tuple):
+    '''Return the number of elements described by a 0-D, 1-D or 2-D shape tuple.'''
+    rank = len(shape_tuple)
+    # anything beyond two dimensions is rejected
+    if rank > 2:
+        raise PermissionError("Invalid Shape Tuple")
+    if rank == 0:
+        return 1
+    if rank == 1:
+        return shape_tuple[0]
+    return shape_tuple[0] * shape_tuple[1]
+
+
+def pi_reshape(store, shape, new_shape, res_store, res_shape, stream):
+    '''
+    Change a PaillierEncryptedStorage's shape.
+    The data is copied as-is into the result buffers; only the shape changes.
+    -------------------
+    Paras:
+        store, shape:  PaillierEncryptedStorage, TensorShapeStorage
+        new_shape:     TensorShapeStorage, the new shape for the pi_storage
+        res_store, res_shape: optional result holders, may be None; stream: unused
+    Returns: the value built by _pi_init_ss for the copied buffers and new_shape
+    Raise:
+        ValueError:    If shape and new_shape's size is unequal
+    '''
+    res_shape_tuple = new_shape.to_tuple()
+    old_shape_tuple = shape.to_tuple()
+    res_shape_size = __get_shape_size(res_shape_tuple)
+    old_shape_size = __get_shape_size(old_shape_tuple)
+    res_vec_size = store.vec_size
+    if res_shape_size != old_shape_size:
+        raise ValueError("total size of new array must be unchanged!")
+    # The buffers are copied rather than shared, to avoid a double free in python
+    if res_store is None:
+        res_pen = GPU_LIB.c_malloc(c_size_t(CIPHER_BYTE * res_vec_size))
+        res_base = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * res_vec_size))
+        res_exp = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * res_vec_size))
+    else:
+        res_pen = res_store.pen_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+
+    GPU_LIB.c_memcpy(c_void_p(res_pen), c_void_p(store.pen_storage), c_size_t(CIPHER_BYTE * res_vec_size))
+    GPU_LIB.c_memcpy(c_void_p(res_base), c_void_p(store.base_storage), c_size_t(U_INT32_BYTE * res_vec_size))
+    GPU_LIB.c_memcpy(c_void_p(res_exp), c_void_p(store.exp_storage), c_size_t(U_INT32_BYTE * res_vec_size))
+
+    return _pi_init_ss(
+        res_store, res_pen, res_base, res_exp,
+        store.vec_size, res_shape, res_shape_tuple,
+        store.mem_type, store.data_type,
+        store.encode_n, store.encode_max_int)
+
+
+def pi_accumulate(gpu_pubkey, pubkey_n, left_store, left_shape):
+    '''
+    Perform accumulate add over a vector via GPU_LIB.gmp_accumulate
+    ----------------
+    Paras:
+        gpu_pubkey:  Dev_PubKeyStorage, its pub_key_ptr is handed to the C call
+        pubkey_n:    big int, n in PaillierPublicKey
+        left_store:  PaillierEncryptedStorage, the input vector
+        left_shape:  TensorShapeStorage, reused as the result shape
+    Return:
+        the value built by _pi_init_ss for the newly allocated result buffers
+    '''
+    src_pen = left_store.pen_storage
+    src_base = left_store.base_storage
+    src_exp = left_store.exp_storage
+    vec_size = left_store.vec_size
+
+    res_pen = GPU_LIB.c_malloc(c_size_t(CIPHER_BYTE * vec_size))
+    res_base = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * vec_size))
+    res_exp = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * vec_size))
+    res_shape_tuple = left_shape.to_tuple()
+
+    c_pubkey_n = c_char_p(pubkey_n.to_bytes(CIPHER_BYTE, "little"))
+
+    GPU_LIB.gmp_accumulate(
+        c_char_p(src_pen), c_void_p(src_base), c_void_p(src_exp),
+        c_void_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
+        c_size_t(vec_size), c_size_t(CIPHER_BITS),
+        c_void_p(gpu_pubkey.pub_key_ptr), c_pubkey_n)
+
+    return _pi_init_ss(
+        None, res_pen, res_base, res_exp,
+        vec_size, None, res_shape_tuple,
+        left_store.mem_type, left_store.data_type,
+        left_store.encode_n, left_store.encode_max_int)
+
+
+def pi_add_with_index(gpu_pubkey, pubkey_n, l_store, l_shape, r_store, r_shape, valid_index):
+    '''
+    Add a single PaillierEncryptedNumber to the designated index in a vector
+    ----------------------------
+    Para:
+        gpu_pubkey:  Dev_PubKeyStorage,
+        pubkey_n:    big int, n in PaillierPublicKey
+        l_store:     PaillierEncryptedStorage, the vector
+        l_shape:     TensorShapeStorage, also used as the result shape
+        r_store:     PaillierEncryptedStorage with vec_size == 1
+        r_shape:     TensorShapeStorage, not read by this function
+        valid_index: int, the index offset in l_store that r_store is added to
+    Return:
+        the value built by _pi_init_ss for the newly allocated result buffers
+    Raise: NotImplementedError if r_store.vec_size != 1
+    '''
+    # check for data format
+    if r_store.vec_size != 1:
+        raise NotImplementedError("Now only support r_store with only one vector size")
+    # transform data format
+    vec_size = l_store.vec_size
+    c_pubkey_n = c_char_p(pubkey_n.to_bytes(CIPHER_BYTE, "little"))
+    res_shape_tuple = l_shape.to_tuple()
+    # alias for parameters
+    l_pen = l_store.pen_storage
+    l_base = l_store.base_storage
+    l_exp = l_store.exp_storage
+    r_pen = r_store.pen_storage
+    r_base = r_store.base_storage
+    r_exp = r_store.exp_storage
+    # alloc memory for return value
+    res_pen = GPU_LIB.c_malloc(c_size_t(CIPHER_BYTE * vec_size))
+    res_base = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * vec_size))
+    res_exp = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * vec_size))
+    # call the C functions
+    GPU_LIB.pen_add_with_index(
+        c_char_p(l_pen), c_void_p(l_base), c_void_p(l_exp),
+        c_char_p(r_pen), c_void_p(r_base), c_void_p(r_exp),
+        c_void_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
+        c_size_t(vec_size), c_size_t(valid_index), c_size_t(CIPHER_BITS),
+        c_void_p(gpu_pubkey.pub_key_ptr), c_pubkey_n)
+    return _pi_init_ss(
+        None, res_pen, res_base, res_exp,
+        vec_size, None, res_shape_tuple,
+        l_store.mem_type, l_store.data_type,
+        l_store.encode_n, l_store.encode_max_int)
+
+
+def pi_partition_by_index(l_store, valid_index, valid_cnt=None):
+    '''
+    Rearrange the store to a number of stores according to valid_index
+    For instance, l_store with value [A,B,C,D,E] and valid_index [0,1,2,-1,2]
+    will become [[A],[B],[C,E]]; entries whose index is -1 are not counted
+    -----------------
+    Para:
+        l_store:     PaillierEncryptedStorage
+        valid_index: List[int], indicating the data in l_store belongs to which bin
+        valid_cnt:   List[int] or None, each bin's length; counted here when None
+    Return:
+        list with one _pi_init_ss result per bin (empty bins get None buffers)
+    '''
+    src_pen = l_store.pen_storage
+    src_base = l_store.base_storage
+    src_exp = l_store.exp_storage
+    vec_size = l_store.vec_size
+    valid_store = te_p2c(valid_index, None)
+    # if not pre-counted, then calculate valid_cnt here
+    if valid_cnt is None:
+        bin_cnt = max(valid_index) + 1
+        valid_cnt = [0 for _ in range(bin_cnt)]
+        for i in range(len(valid_index)):
+            if valid_index[i] == -1:
+                continue
+            bin_idx = valid_index[i]
+            valid_cnt[bin_idx] += 1
+    bin_cnt = len(valid_cnt)
+    # allocate per-bin result buffers, then call the C function
+    res_pen_list, res_base_list, res_exp_list = [], [], []
+    for i in range(bin_cnt):
+        if valid_cnt[i] > 0:
+            res_pen_list.append(GPU_LIB.cuda_malloc(c_size_t(CIPHER_BYTE * valid_cnt[i])))
+            # base/exp are copied from the head of the source for every bin; this
+            # assumes the data has already been aligned to max_exp (done in h2d)
+            base_ptr = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * valid_cnt[i]))
+            exp_ptr = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * valid_cnt[i]))
+            GPU_LIB.c_memcpy(c_void_p(base_ptr), c_void_p(src_base),
+                             c_size_t(U_INT32_BYTE * valid_cnt[i]))
+            GPU_LIB.c_memcpy(c_void_p(exp_ptr), c_void_p(src_exp),
+                             c_size_t(U_INT32_BYTE * valid_cnt[i]))
+            res_base_list.append(base_ptr)
+            res_exp_list.append(exp_ptr)
+        else:
+            res_pen_list.append(None)
+            res_base_list.append(None)
+            res_exp_list.append(None)
+    pen_ptr_list = [c_void_p(x) for x in res_pen_list]
+    cipher_arr = (c_void_p * bin_cnt)(*pen_ptr_list)
+    GPU_LIB.partition_by_index(
+        c_char_p(src_pen), cipher_arr,
+        c_void_p(valid_store.data),
+        c_uint32(vec_size), c_uint32(bin_cnt))
+    # construct return list
+    res_list = []
+    for i in range(bin_cnt):
+        res_list.append(_pi_init_ss(
+            None, res_pen_list[i], res_base_list[i], res_exp_list[i], valid_cnt[i],
+            None, (valid_cnt[i],),
+            l_store.mem_type, l_store.data_type,
+            l_store.encode_n, l_store.encode_max_int))
+    return res_list
diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/gpu_tensor.py b/gpu/fate-tensor-gpu/fate_tensor_gpu/gpu_tensor.py
new file mode 100644
index 0000000000..98cea5f5e8
--- /dev/null
+++ b/gpu/fate-tensor-gpu/fate_tensor_gpu/gpu_tensor.py
@@ -0,0 +1,428 @@
+#
+#  Copyright 2022 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import typing
+
+import numpy as np
+
+from .gpu_engine import PaillierEncryptedStorage, \
+    TensorShapeStorage, pi_add, te_p2c, fp_encode, pi_encrypt, pi_mul, pi_matmul, pi_rmatmul, pi_sum, pi_h2d_pub_key, \
+    pi_p2c_pub_key, pi_decrypt, te_c2p, pi_h2d_priv_key, pi_p2c_priv_key
+from .secureprotol.fate_paillier import PaillierPublicKey, PaillierPrivateKey, PaillierKeypair
+
+
+class Cipherblock:
+    """Paillier-encrypted tensor: an encrypted store, its shape, and the PK used to operate on it."""
+
+    def __init__(self, store: PaillierEncryptedStorage, shape: TensorShapeStorage, pk: "PK"):
+        self.store = store
+        self.shape = shape
+        self.pk = pk
+
+    def get_shape(self):
+        return self.shape.to_tuple()
+
+    def get_size(self):
+        return self.shape.size()
+
+    @staticmethod
+    def gen_shape(other):
+        return TensorShapeStorage().from_tuple(other.shape)
+
+    def _encode(self, other):
+        """Fixed-point encode plaintext ``other`` with this block's public-key parameters."""
+        return fp_encode(te_p2c(other, None), self.pk.pub_key.n, self.pk.pub_key.max_int)
+
+    def _add_plaintext(self, other) -> "Cipherblock":
+        """Encode and encrypt ``other``, then combine the two ciphertexts with ``pi_add``."""
+        encoded = self._encode(other)
+        encrypted = pi_encrypt(self.pk.gpu_pub_key, encoded, None, None)
+        store, shape = pi_add(
+            self.pk.gpu_pub_key, self.store, encrypted, self.shape, self.gen_shape(other), None, None, None)
+        return Cipherblock(store, shape, self.pk)
+
+    def _mul_plaintext(self, other) -> "Cipherblock":
+        encoded = self._encode(other)
+        store, shape = pi_mul(
+            self.pk.gpu_pub_key, self.store, encoded, self.shape, self.gen_shape(other), None, None, None)
+        return Cipherblock(store, shape, self.pk)
+
+    def _matmul_plaintext(self, other) -> "Cipherblock":
+        encoded = self._encode(other)
+        store, shape = pi_matmul(
+            self.pk.gpu_pub_key, self.store, encoded, self.shape, self.gen_shape(other), None, None, None)
+        return Cipherblock(store, shape, self.pk)
+
+    def _rmatmul_plaintext(self, other) -> "Cipherblock":
+        encoded = self._encode(other)
+        store, shape = pi_rmatmul(
+            self.pk.gpu_pub_key, encoded, self.store, self.gen_shape(other), self.shape, None, None, None)
+        return Cipherblock(store, shape, self.pk)
+
+    def add_cipherblock(self, other: "Cipherblock") -> "Cipherblock":
+        store, shape = pi_add(
+            self.pk.gpu_pub_key, self.store, other.store, self.shape, other.shape, None, None, None)
+        return Cipherblock(store, shape, self.pk)
+
+    def add_plaintext_f64(self, other) -> "Cipherblock":
+        return self._add_plaintext(other)
+
+    def add_plaintext_f32(self, other) -> "Cipherblock":
+        return self._add_plaintext(other)
+
+    def add_plaintext_i64(self, other) -> "Cipherblock":
+        return self._add_plaintext(other)
+
+    def add_plaintext_i32(self, other) -> "Cipherblock":
+        return self._add_plaintext(other)
+
+    def add_plaintext_scalar_f64(self, other: typing.Union[float, np.float64]) -> "Cipherblock":
+        return self._add_plaintext(np.asarray([other], dtype=np.float64))
+
+    def add_plaintext_scalar_f32(self, other: typing.Union[float, np.float32]) -> "Cipherblock":
+        return self._add_plaintext(np.asarray([other], dtype=np.float32))
+
+    def add_plaintext_scalar_i64(self, other: typing.Union[int, np.int64]) -> "Cipherblock":
+        return self._add_plaintext(np.asarray([other], dtype=np.int64))
+
+    def add_plaintext_scalar_i32(self, other: typing.Union[int, np.int32]) -> "Cipherblock":
+        return self._add_plaintext(np.asarray([other], dtype=np.int32))
+
+    def sub_cipherblock(self, other: "Cipherblock") -> "Cipherblock":
+        """Subtract by adding ``other`` multiplied by the plaintext scalar -1."""
+        return self.add_cipherblock(other.mul_plaintext_scalar_i32(-1))
+
+    def sub_plaintext_f64(self, other) -> "Cipherblock":
+        return self.add_plaintext_f64(other * -1)
+
+    def sub_plaintext_f32(self, other) -> "Cipherblock":
+        return self.add_plaintext_f32(other * -1)
+
+    def sub_plaintext_i64(self, other) -> "Cipherblock":
+        return self.add_plaintext_i64(other * -1)
+
+    def sub_plaintext_i32(self, other) -> "Cipherblock":
+        return self.add_plaintext_i32(other * -1)
+
+    def sub_plaintext_scalar_f64(self, other: typing.Union[float, np.float64]) -> "Cipherblock":
+        return self.add_plaintext_scalar_f64(other * -1)
+
+    def sub_plaintext_scalar_f32(self, other: typing.Union[float, np.float32]) -> "Cipherblock":
+        return self.add_plaintext_scalar_f32(other * -1)
+
+    def sub_plaintext_scalar_i64(self, other: typing.Union[int, np.int64]) -> "Cipherblock":
+        return self.add_plaintext_scalar_i64(other * -1)
+
+    def sub_plaintext_scalar_i32(self, other: typing.Union[int, np.int32]) -> "Cipherblock":
+        return self.add_plaintext_scalar_i32(other * -1)
+
+    def mul_plaintext_f64(self, other) -> "Cipherblock":
+        return self._mul_plaintext(other)
+
+    def mul_plaintext_f32(self, other) -> "Cipherblock":
+        return self._mul_plaintext(other)
+
+    def mul_plaintext_i64(self, other) -> "Cipherblock":
+        return self._mul_plaintext(other)
+
+    def mul_plaintext_i32(self, other) -> "Cipherblock":
+        return self._mul_plaintext(other)
+
+    def mul_plaintext_scalar_f64(self, other: typing.Union[float, np.float64]) -> "Cipherblock":
+        return self._mul_plaintext(np.asarray([other], dtype=np.float64))
+
+    def mul_plaintext_scalar_f32(self, other: typing.Union[float, np.float32]) -> "Cipherblock":
+        return self._mul_plaintext(np.asarray([other], dtype=np.float32))
+
+    def mul_plaintext_scalar_i64(self, other: typing.Union[int, np.int64]) -> "Cipherblock":
+        return self._mul_plaintext(np.asarray([other], dtype=np.int64))
+
+    def mul_plaintext_scalar_i32(self, other: typing.Union[int, np.int32]) -> "Cipherblock":
+        return self._mul_plaintext(np.asarray([other], dtype=np.int32))
+
+    def matmul_plaintext_ix2_f64(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix2_f32(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix2_i64(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix2_i32(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix1_f64(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix1_f32(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix1_i64(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix1_i32(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def rmatmul_plaintext_ix2_f64(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix2_f32(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix2_i64(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix2_i32(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix1_f64(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix1_f32(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix1_i64(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix1_i32(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def sum(self) -> "Cipherblock":
+        store, shape = pi_sum(self.pk.gpu_pub_key, self.store, self.shape, None, None, None, None)
+        return Cipherblock(store, shape, self.pk)
+
+    def sum_axis(self, axis=None):
+        """Reduce with ``pi_sum``, passing ``axis`` straight through."""
+        store, shape = pi_sum(self.pk.gpu_pub_key, self.store, self.shape, axis, None, None, None)
+        return Cipherblock(store, shape, self.pk)
+
+    def mean(self) -> "Cipherblock":
+        """Compute ``sum()`` and multiply it by the plaintext scalar ``1 / get_size()``."""
+        return self.sum().mul_plaintext_scalar_f64(float(1 / self.get_size()))
+
+    # "_par" variants: each one simply forwards to its non-suffixed counterpart.
+
+    def add_cipherblock_par(self, other: "Cipherblock") -> "Cipherblock":
+        return self.add_cipherblock(other)
+
+    def add_plaintext_f64_par(self, other) -> "Cipherblock":
+        return self.add_plaintext_f64(other)
+
+    def add_plaintext_f32_par(self, other) -> "Cipherblock":
+        return self.add_plaintext_f32(other)
+
+    def add_plaintext_i64_par(self, other) -> "Cipherblock":
+        return self.add_plaintext_i64(other)
+
+    def add_plaintext_scalar_f64_par(self, other: typing.Union[float, np.float64]) -> "Cipherblock":
+        return self.add_plaintext_scalar_f64(other)
+
+    def add_plaintext_scalar_f32_par(self, other: typing.Union[float, np.float32]) -> "Cipherblock":
+        return self.add_plaintext_scalar_f32(other)
+
+    def add_plaintext_scalar_i64_par(self, other: typing.Union[int, np.int64]) -> "Cipherblock":
+        return self.add_plaintext_scalar_i64(other)
+
+    def add_plaintext_scalar_i32_par(self, other: typing.Union[int, np.int32]) -> "Cipherblock":
+        return self.add_plaintext_scalar_i32(other)
+
+    def add_plaintext_i32_par(self, other) -> "Cipherblock":
+        return self.add_plaintext_i32(other)
+
+    def sub_cipherblock_par(self, other: "Cipherblock") -> "Cipherblock":
+        return self.sub_cipherblock(other)
+
+    def sub_plaintext_f64_par(self, other) -> "Cipherblock":
+        return self.sub_plaintext_f64(other)
+
+    def sub_plaintext_f32_par(self, other) -> "Cipherblock":
+        return self.sub_plaintext_f32(other)
+
+    def sub_plaintext_i64_par(self, other) -> "Cipherblock":
+        return self.sub_plaintext_i64(other)
+
+    def sub_plaintext_i32_par(self, other) -> "Cipherblock":
+        return self.sub_plaintext_i32(other)
+
+    def sub_plaintext_scalar_f64_par(self, other: typing.Union[float, np.float64]) -> "Cipherblock":
+        return self.sub_plaintext_scalar_f64(other)
+
+    def sub_plaintext_scalar_f32_par(self, other: typing.Union[float, np.float32]) -> "Cipherblock":
+        return self.sub_plaintext_scalar_f32(other)
+
+    def sub_plaintext_scalar_i64_par(self, other: typing.Union[int, np.int64]) -> "Cipherblock":
+        return self.sub_plaintext_scalar_i64(other)
+
+    def sub_plaintext_scalar_i32_par(self, other: typing.Union[int, np.int32]) -> "Cipherblock":
+        return self.sub_plaintext_scalar_i32(other)
+
+    def mul_plaintext_f64_par(self, other) -> "Cipherblock":
+        return self.mul_plaintext_f64(other)
+
+    def mul_plaintext_f32_par(self, other) -> "Cipherblock":
+        return self.mul_plaintext_f32(other)
+
+    def mul_plaintext_i64_par(self, other) -> "Cipherblock":
+        return self.mul_plaintext_i64(other)
+
+    def mul_plaintext_i32_par(self, other) -> "Cipherblock":
+        return self.mul_plaintext_i32(other)
+
+    def mul_plaintext_scalar_f64_par(self, other: typing.Union[float, np.float64]) -> "Cipherblock":
+        return self.mul_plaintext_scalar_f64(other)
+
+    def mul_plaintext_scalar_f32_par(self, other: typing.Union[float, np.float32]) -> "Cipherblock":
+        return self.mul_plaintext_scalar_f32(other)
+
+    def mul_plaintext_scalar_i64_par(self, other: typing.Union[int, np.int64]) -> "Cipherblock":
+        return self.mul_plaintext_scalar_i64(other)
+
+    def mul_plaintext_scalar_i32_par(self, other: typing.Union[int, np.int32]) -> "Cipherblock":
+        return self.mul_plaintext_scalar_i32(other)
+
+    def matmul_plaintext_ix2_f64_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix2_f64(other)
+
+    def matmul_plaintext_ix2_f32_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix2_f32(other)
+
+    def matmul_plaintext_ix2_i64_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix2_i64(other)
+
+    def matmul_plaintext_ix2_i32_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix2_i32(other)
+
+    def matmul_plaintext_ix1_f64_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix1_f64(other)
+
+    def matmul_plaintext_ix1_f32_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix1_f32(other)
+
+    def matmul_plaintext_ix1_i64_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix1_i64(other)
+
+    def matmul_plaintext_ix1_i32_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix1_i32(other)
+
+    def rmatmul_plaintext_ix2_f64_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix2_f64(other)
+
+    def rmatmul_plaintext_ix2_f32_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix2_f32(other)
+
+    def rmatmul_plaintext_ix2_i64_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix2_i64(other)
+
+    def rmatmul_plaintext_ix2_i32_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix2_i32(other)
+
+    def rmatmul_plaintext_ix1_f64_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix1_f64(other)
+
+    def rmatmul_plaintext_ix1_f32_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix1_f32(other)
+
+    def rmatmul_plaintext_ix1_i64_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix1_i64(other)
+
+    def rmatmul_plaintext_ix1_i32_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix1_i32(other)
+
+    def sum_par(self) -> "Cipherblock":
+        return self.sum()
+
+    def mean_par(self) -> "Cipherblock":
+        return self.mean()
+
+
+class PK:
+    """Public-key handle; every encrypt_* variant funnels into _encrypt (fp_encode, then pi_encrypt)."""
+    def __init__(self, pub_key: PaillierPublicKey):
+        self.pub_key = pub_key
+        self.gpu_pub_key = pi_h2d_pub_key(None, pi_p2c_pub_key(None, pub_key))
+
+    def _encrypt(self, a) -> Cipherblock:
+        tensor_shape = TensorShapeStorage().from_tuple(a.shape)
+        encoded = fp_encode(te_p2c(a, None), self.pub_key.n, self.pub_key.max_int)
+        return Cipherblock(pi_encrypt(self.gpu_pub_key, encoded, None, None), tensor_shape, self)
+
+    def encrypt_f64(self, a) -> Cipherblock:
+        return self._encrypt(a)
+
+    def encrypt_f32(self, a) -> Cipherblock:
+        return self._encrypt(a)
+
+    def encrypt_i64(self, a) -> Cipherblock:
+        return self._encrypt(a)
+
+    def encrypt_i32(self, a) -> Cipherblock:
+        return self._encrypt(a)
+
+    def encrypt_f64_par(self, a) -> Cipherblock:
+        return self.encrypt_f64(a)
+
+    def encrypt_f32_par(self, a) -> Cipherblock:
+        return self.encrypt_f32(a)
+
+    def encrypt_i64_par(self, a) -> Cipherblock:
+        return self.encrypt_i64(a)
+
+    def encrypt_i32_par(self, a) -> Cipherblock:
+        return self.encrypt_i32(a)
+
+
+class SK:
+    def __init__(self, priv_key: PaillierPrivateKey, pk: PK):
+        self.priv_key = priv_key
+        self.gpu_priv_key = pi_h2d_priv_key(None, pi_p2c_priv_key(None, priv_key))
+        self.pk = pk
+
+    def _decrypt(self, a: Cipherblock):
+        if a.store.vec_size != 0:
+            plain = pi_decrypt(a.pk.gpu_pub_key, self.gpu_priv_key, a.store, None, None, None)
+            return te_c2p(plain).reshape(a.get_shape())
+        return np.asarray([])  # empty store: hand back an empty array without calling pi_decrypt
+
+    def decrypt_f64(self, a: Cipherblock):
+        return self._decrypt(a).astype(np.float64)
+
+    def decrypt_f32(self, a: Cipherblock):
+        return self._decrypt(a).astype(np.float32)
+
+    def decrypt_i64(self, a: Cipherblock):
+        return self._decrypt(a).astype(np.int64)
+
+    def decrypt_i32(self, a: Cipherblock):
+        return self._decrypt(a).astype(np.int32)
+
+    def decrypt_f64_par(self, a: Cipherblock):
+        return self.decrypt_f64(a)
+
+    def decrypt_f32_par(self, a: Cipherblock):
+        return self.decrypt_f32(a)
+
+    def decrypt_i64_par(self, a: Cipherblock):
+        return self.decrypt_i64(a)
+
+    def decrypt_i32_par(self, a: Cipherblock):
+        return self.decrypt_i32(a)
+
+
+def keygen(bit_size) -> typing.Tuple[PK, SK]:
+    """Generate a Paillier key pair with ``n_length=bit_size`` and wrap both halves as ``(PK, SK)``."""
+    raw_public, raw_private = PaillierKeypair.generate_keypair(n_length=bit_size)
+    public = PK(raw_public)
+    return public, SK(raw_private, public)
diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/__init__.py b/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/__init__.py
new file mode 100644
index 0000000000..ef471ba686
--- /dev/null
+++ b/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/__init__.py
@@ -0,0 +1,15 @@
+#
+#  Copyright 2019 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
\ No newline at end of file
diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fate_paillier.py b/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fate_paillier.py
new file mode 100644
index 0000000000..adbf383e24
--- /dev/null
+++ b/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fate_paillier.py
@@ -0,0 +1,343 @@
+"""Paillier encryption library for partially homomorphic encryption."""
+
+#
+#  Copyright 2019 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import random
+
+from . import gmpy_math
+from .fixedpoint import FixedPointNumber
+
+
+class PaillierKeypair(object):
+    """Factory for matching Paillier public/private keys."""
+
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def generate_keypair(n_length=1024):
+        """Return a new ``(PaillierPublicKey, PaillierPrivateKey)`` pair.
+
+        Two distinct primes are drawn with ``getprimeover(n_length // 2)`` until
+        their product ``n`` has exactly ``n_length`` bits.
+        """
+        p = q = n = None
+        bits = 0
+        while bits != n_length:
+            p = q = gmpy_math.getprimeover(n_length // 2)
+            while q == p:  # redraw until the two primes differ
+                q = gmpy_math.getprimeover(n_length // 2)
+            n = p * q
+            bits = n.bit_length()
+        public_key = PaillierPublicKey(n)
+        return public_key, PaillierPrivateKey(public_key, p, q)
+
+
+class PaillierPublicKey(object):
+    """Paillier public key ``n`` (with ``g = n + 1``) and the encryption helpers built on it."""
+
+    def __init__(self, n):
+        self.g = n + 1
+        self.n = n
+        self.nsquare = n * n
+        self.max_int = n // 3 - 1
+
+    def __repr__(self):
+        hashcode = hex(hash(self))[2:]
+        return "<PaillierPublicKey {}>".format(hashcode[:10])
+
+    def __eq__(self, other):
+        try:
+            return self.n == other.n
+        except AttributeError:  # foreign type: let Python fall back instead of raising
+            return NotImplemented
+
+    def __hash__(self):
+        return hash(self.n)
+
+    def apply_obfuscator(self, ciphertext, random_value=None):
+        """Return ``ciphertext * r**n mod n**2``; ``r`` is a truthy ``random_value``, else a SystemRandom draw."""
+        r = random_value or random.SystemRandom().randrange(1, self.n)
+        obfuscator = gmpy_math.powmod(r, self.n, self.nsquare)
+        return (ciphertext * obfuscator) % self.nsquare
+
+    def raw_encrypt(self, plaintext, random_value=None):
+        """Encrypt an already-encoded integer ``plaintext``; raises TypeError for non-int input."""
+        if not isinstance(plaintext, int):
+            raise TypeError("plaintext should be int, but got: %s" %
+                            type(plaintext))
+
+        if plaintext >= (self.n - self.max_int) and plaintext < self.n:
+            # Very large plaintext, take a sneaky shortcut using inverses
+            neg_plaintext = self.n - plaintext  # = abs(plaintext - n)
+            neg_ciphertext = (self.n * neg_plaintext + 1) % self.nsquare
+            ciphertext = gmpy_math.invert(neg_ciphertext, self.nsquare)
+        else:
+            ciphertext = (self.n * plaintext + 1) % self.nsquare
+
+        return self.apply_obfuscator(ciphertext, random_value)
+
+    def encrypt(self, value, precision=None, random_value=None):
+        """Encode and Paillier encrypt a real number value.
+
+        With ``random_value`` left as None the result is obfuscated with a fresh
+        random factor; a truthy ``random_value`` is used as ``r`` directly.
+        """
+        if isinstance(value, FixedPointNumber):
+            value = value.decode()
+        encoding = FixedPointNumber.encode(value, self.n, self.max_int, precision)
+        obfuscator = random_value or 1
+        ciphertext = self.raw_encrypt(encoding.encoding, random_value=obfuscator)
+        encryptednumber = PaillierEncryptedNumber(self, ciphertext, encoding.exponent)
+        if random_value is None:
+            encryptednumber.apply_obfuscator()
+
+        return encryptednumber
+
+
+class PaillierPrivateKey(object):
+    """Paillier private key: the primes ``p < q`` plus values precomputed for decryption."""
+
+    def __init__(self, public_key, p, q):
+        """Validate the primes against ``public_key`` and precompute the decryption constants.
+
+        Raises:
+            ValueError: if ``p * q != public_key.n`` or if ``p == q``.
+        """
+        if not p * q == public_key.n:
+            raise ValueError("given public key does not match the given p and q")
+        if p == q:
+            raise ValueError("p and q have to be different")
+        self.public_key = public_key
+        # Normalize so that ``p`` always holds the smaller prime.
+        self.p, self.q = (q, p) if q < p else (p, q)
+        self.psquare = self.p * self.p
+        self.qsquare = self.q * self.q
+        self.q_inverse = gmpy_math.invert(self.q, self.p)
+        self.hp = self.h_func(self.p, self.psquare)
+        self.hq = self.h_func(self.q, self.qsquare)
+
+    def __eq__(self, other):
+        try:
+            return self.p == other.p and self.q == other.q
+        except AttributeError:  # foreign type: let Python fall back instead of raising
+            return NotImplemented
+
+    def __hash__(self):
+        return hash((self.p, self.q))
+
+    def __repr__(self):
+        hashcode = hex(hash(self))[2:]
+        return "<PaillierPrivateKey {}>".format(hashcode[:10])
+
+    def h_func(self, x, xsquare):
+        """Compute the h-function from Paillier's paper: ``invert(L(g**(x - 1) mod xsquare), x)``."""
+        return gmpy_math.invert(self.l_func(gmpy_math.powmod(self.public_key.g,
+                                                             x - 1, xsquare), x), x)
+
+    def l_func(self, x, p):
+        """Compute the L function from Paillier's paper: ``(x - 1) // p``."""
+        return (x - 1) // p
+
+    def crt(self, mp, mq):
+        """Combine ``mp`` (mod p) and ``mq`` (mod q) with the Chinese Remainder Theorem.
+
+        Returns the solution modulo ``n = p * q``.
+        """
+        u = (mp - mq) * self.q_inverse % self.p
+        return (mq + (u * self.q)) % self.public_key.n
+
+    def raw_decrypt(self, ciphertext):
+        """Return the raw integer plaintext of ``ciphertext`` (no fixed-point decoding).
+
+        Raises:
+            TypeError: if ``ciphertext`` is not an ``int``.
+        """
+        if not isinstance(ciphertext, int):
+            raise TypeError("ciphertext should be an int, not: %s" %
+                            type(ciphertext))
+
+        mp = self.l_func(gmpy_math.powmod(ciphertext,
+                                          self.p - 1, self.psquare),
+                         self.p) * self.hp % self.p
+
+        mq = self.l_func(gmpy_math.powmod(ciphertext,
+                                          self.q - 1, self.qsquare),
+                         self.q) * self.hq % self.q
+
+        return self.crt(mp, mq)
+
+    def decrypt(self, encrypted_number):
+        """Return the decrypted and decoded plaintext of ``encrypted_number``.
+
+        Raises:
+            TypeError: if ``encrypted_number`` is not a PaillierEncryptedNumber.
+            ValueError: if it was encrypted under a different public key.
+        """
+        if not isinstance(encrypted_number, PaillierEncryptedNumber):
+            # Adjacent literals: a backslash continuation inside the string leaked indentation into the text.
+            raise TypeError("encrypted_number should be an PaillierEncryptedNumber, "
+                            "not: %s" % type(encrypted_number))
+
+        if self.public_key != encrypted_number.public_key:
+            raise ValueError("encrypted_number was encrypted against a different key!")
+
+        raw = self.raw_decrypt(encrypted_number.ciphertext(be_secure=False))
+        return FixedPointNumber(raw, encrypted_number.exponent, self.public_key.n, self.public_key.max_int).decode()
+
+
+class PaillierEncryptedNumber(object):
+    """A Paillier ciphertext together with the fixed-point exponent of the value it encrypts."""
+
+    def __init__(self, public_key, ciphertext, exponent=0):
+        """Store the fields; raise TypeError for a non-int ciphertext or a non-PaillierPublicKey key."""
+        self.public_key = public_key
+        self.__ciphertext = ciphertext
+        self.exponent = exponent
+        self.__is_obfuscator = False
+
+        if not isinstance(ciphertext, int):
+            raise TypeError("ciphertext should be an int, not: %s" % type(ciphertext))
+
+        if not isinstance(public_key, PaillierPublicKey):
+            raise TypeError("public_key should be a PaillierPublicKey, not: %s" % type(public_key))
+
+    def ciphertext(self, be_secure=True):
+        """Return the raw ciphertext integer.
+
+        With ``be_secure`` set, a ciphertext that has not been obfuscated yet is obfuscated first.
+        """
+        needs_obfuscation = be_secure and not self.__is_obfuscator
+        if needs_obfuscation:
+            self.apply_obfuscator()
+        return self.__ciphertext
+
+    def apply_obfuscator(self):
+        """Replace the ciphertext with ``public_key.apply_obfuscator`` of it and mark it obfuscated."""
+        self.__ciphertext = self.public_key.apply_obfuscator(self.__ciphertext)
+        self.__is_obfuscator = True
+
+    def __add__(self, other):
+        """Add another PaillierEncryptedNumber, or else treat ``other`` as a plaintext scalar."""
+        if isinstance(other, PaillierEncryptedNumber):
+            return self.__add_encryptednumber(other)
+        return self.__add_scalar(other)
+
+    def __radd__(self, other):
+        """Reflected addition reuses ``__add__``."""
+        return self.__add__(other)
+
+    def __sub__(self, other):
+        """Subtract by adding ``other * -1``."""
+        return self + (other * -1)
+
+    def __rsub__(self, other):
+        """Compute ``other - self`` as ``other + self * -1``."""
+        return other + (self * -1)
+
+    def __rmul__(self, scalar):
+        """Scalar multiplication from the left delegates to ``__mul__``."""
+        return self.__mul__(scalar)
+
+    def __truediv__(self, scalar):
+        """Divide by a plaintext scalar by multiplying with ``1 / scalar``."""
+        return self.__mul__(1 / scalar)
+
+    def __mul__(self, scalar):
+        """Multiply the encrypted value by a plaintext scalar (e.g. int, float, FixedPointNumber).
+
+        Raises:
+            ValueError: if the encoded scalar falls outside ``[0, n)``.
+        """
+        key = self.public_key
+        if isinstance(scalar, FixedPointNumber):
+            scalar = scalar.decode()
+        encoded = FixedPointNumber.encode(scalar, key.n, key.max_int)
+        plaintext = encoded.encoding
+        if not 0 <= plaintext < key.n:
+            raise ValueError("Scalar out of bounds: %i" % plaintext)
+
+        base, power = self.ciphertext(False), plaintext
+        if plaintext >= key.n - key.max_int:
+            # Very large plaintext: raise the inverse ciphertext to ``n - plaintext`` instead.
+            base, power = gmpy_math.invert(base, key.nsquare), key.n - plaintext
+
+        product = gmpy_math.powmod(base, power, key.nsquare)
+        return PaillierEncryptedNumber(key, product, self.exponent + encoded.exponent)
+
+    def increase_exponent_to(self, new_exponent):
+        """Return a new PaillierEncryptedNumber with the same value and the greater exponent.
+
+        Raises:
+            ValueError: if ``new_exponent`` is smaller than the current exponent.
+        """
+        if new_exponent < self.exponent:
+            raise ValueError("New exponent %i should be great than old exponent %i" % (new_exponent, self.exponent))
+
+        rescaled = self.__mul__(FixedPointNumber.BASE ** (new_exponent - self.exponent))
+        rescaled.exponent = new_exponent
+        return rescaled
+
+    def __align_exponent(self, x, y):
+        """Return ``x`` and ``y`` brought to a common exponent (the larger of the two)."""
+        if x.exponent < y.exponent:
+            return x.increase_exponent_to(y.exponent), y
+        if x.exponent > y.exponent:
+            return x, y.increase_exponent_to(x.exponent)
+        return x, y
+
+    def __add_scalar(self, scalar):
+        """Return PaillierEncryptedNumber: z = E(x) + y"""
+        if isinstance(scalar, FixedPointNumber):
+            scalar = scalar.decode()
+        key = self.public_key
+        encoded = FixedPointNumber.encode(scalar, key.n, key.max_int, max_exponent=self.exponent)
+        return self.__add_fixpointnumber(encoded)
+
+    def __add_fixpointnumber(self, encoded):
+        """Return PaillierEncryptedNumber: z = E(x) + FixedPointNumber(y)
+
+        Raises:
+            ValueError: if ``encoded`` was encoded against a different ``n``.
+        """
+        if self.public_key.n != encoded.n:
+            raise ValueError("Attempted to add numbers encoded against different public keys!")
+
+        # Bring both operands to the same exponent before combining them.
+        x, y = self.__align_exponent(self, encoded)
+        # r = 1: the plaintext operand is encrypted without a random obfuscator.
+        encrypted_scalar = x.public_key.raw_encrypt(y.encoding, 1)
+        return self.__raw_add(x.ciphertext(False), encrypted_scalar, x.exponent)
+
+    def __add_encryptednumber(self, other):
+        """Return PaillierEncryptedNumber: z = E(x) + E(y)
+
+        Raises:
+            ValueError: if the two numbers carry different public keys.
+        """
+        if self.public_key != other.public_key:
+            raise ValueError("add two numbers have different public key!")
+
+        x, y = self.__align_exponent(self, other)
+
+        return self.__raw_add(x.ciphertext(False), y.ciphertext(False), x.exponent)
+
+    def __raw_add(self, e_x, e_y, exponent):
+        """Return E(x + y) for integer ciphertexts E(x) and E(y): their product modulo n**2."""
+        product = gmpy_math.mpz(e_x) * gmpy_math.mpz(e_y)
+        reduced = int(product % self.public_key.nsquare)
+        return PaillierEncryptedNumber(self.public_key, reduced, exponent)
diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fixedpoint.py b/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fixedpoint.py
new file mode 100644
index 0000000000..af3ae2a754
--- /dev/null
+++ b/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fixedpoint.py
@@ -0,0 +1,298 @@
+#
+#  Copyright 2019 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import math
+import sys
+
+import numpy as np
+
+
+class FixedPointNumber(object):
+    """Represents a float or int fixedpoint encoding;.
+    """
+    BASE = 16
+    LOG2_BASE = math.log(BASE, 2)
+    FLOAT_MANTISSA_BITS = sys.float_info.mant_dig
+
+    Q = 293973345475167247070445277780365744413 ** 2
+
+    def __init__(self, encoding, exponent, n=None, max_int=None):
+        """Store an already-encoded value together with its modulus settings.
+
+        Without ``n`` the class modulus ``Q`` is used and ``max_int`` is always
+        ``Q // 2`` (a supplied ``max_int`` is ignored). With ``n`` given,
+        ``max_int`` falls back to ``n // 2`` only when it is None.
+        """
+        self.encoding = encoding
+        self.exponent = exponent
+        self.n = FixedPointNumber.Q if n is None else n
+        # A caller-supplied max_int is honoured only alongside an explicit n.
+        use_default_bound = n is None or max_int is None
+        self.max_int = self.n // 2 if use_default_bound else max_int
+
+    @classmethod
+    def calculate_exponent_from_precision(cls, precision):
+        """Return floor(log(precision, BASE)) as the exponent for ``precision``."""
+        return math.floor(math.log(precision, cls.BASE))
+
+    @classmethod
+    def encode(cls, scalar, n=None, max_int=None, precision=None, max_exponent=None):
+        """Return a FixedPointNumber encoding of an int or float ``scalar``.
+
+        ``n`` defaults to ``cls.Q`` (then ``max_int`` is forced to ``Q // 2``);
+        with an explicit ``n`` a missing ``max_int`` falls back to ``n // 2``,
+        as ``__init__`` does, instead of failing on ``abs(...) > None``.
+        Raises TypeError for unsupported scalar types when ``precision`` is None
+        and ValueError when the scaled integer exceeds ``max_int``.
+        """
+        # Too low value preprocess: flush it to integer 0 (so exponent 0);
+        # avoid "OverflowError: int too large to convert to float"
+        if np.abs(scalar) < 1e-200:
+            scalar = 0
+
+        if n is None:
+            n = cls.Q
+            max_int = n // 2
+        elif max_int is None:
+            max_int = n // 2
+
+        if precision is None:
+            if isinstance(scalar, (int, np.int16, np.int32, np.int64)):
+                exponent = 0
+            elif isinstance(scalar, (float, np.float16, np.float32, np.float64)):
+                # Choose the exponent from the position of the float's lowest mantissa bit.
+                flt_exponent = math.frexp(scalar)[1]
+                lsb_exponent = cls.FLOAT_MANTISSA_BITS - flt_exponent
+                exponent = math.floor(lsb_exponent / cls.LOG2_BASE)
+            else:
+                raise TypeError("Don't know the precision of type %s."
+                                % type(scalar))
+        else:
+            exponent = cls.calculate_exponent_from_precision(precision)
+
+        if max_exponent is not None:
+            exponent = max(max_exponent, exponent)
+
+        int_fixpoint = int(round(scalar * pow(cls.BASE, exponent)))
+        if abs(int_fixpoint) > max_int:
+            raise ValueError(f"Integer needs to be within +/- {max_int},but got {int_fixpoint},"
+                             f"basic info, scalar={scalar}, base={cls.BASE}, exponent={exponent}")
+        return cls(int_fixpoint % n, exponent, n, max_int)
+
+    def decode(self):
+        """Return the plaintext value represented by this encoding.
+
+        Raises ValueError if the encoding is not reduced mod n and OverflowError
+        if it lies strictly between max_int and n - max_int.
+        """
+        encoding, modulus = self.encoding, self.n
+        if encoding >= modulus:
+            raise ValueError('Attempted to decode corrupted number')
+        if encoding <= self.max_int:
+            mantissa = encoding  # lower range holds non-negative values
+        elif encoding >= modulus - self.max_int:
+            mantissa = encoding - modulus  # upper range holds negative values
+        else:
+            raise OverflowError(f'Overflow detected in decode number, encoding: {self.encoding}，'
+                                f'{self.exponent}'
+                                f' {self.n}')
+        return mantissa * pow(self.BASE, -self.exponent)
+
+    def increase_exponent_to(self, new_exponent):
+        """Return a FixedPointNumber with the same value at a greater-or-equal exponent.
+
+        Raises ValueError when ``new_exponent`` is smaller than the current one.
+        """
+        if new_exponent < self.exponent:
+            raise ValueError('New exponent %i should be greater than '
+                             'old exponent %i' % (new_exponent, self.exponent))
+        # Scale the encoding by BASE**delta to compensate for the larger exponent.
+        factor = pow(self.BASE, new_exponent - self.exponent)
+        return FixedPointNumber(self.encoding * factor % self.n, new_exponent, self.n, self.max_int)
+
+    def __align_exponent(self, x, y):
+        """Return (x, y) with the lower-exponent operand raised to the higher one.
+        """
+        if x.exponent < y.exponent:
+            return x.increase_exponent_to(y.exponent), y
+        if x.exponent > y.exponent:
+            return x, y.increase_exponent_to(x.exponent)
+        # Exponents already agree: hand both operands back untouched.
+        return x, y
+
+    def __truncate(self, a):
+        # Round-trip through decode/encode so the exponent is derived afresh by encode.
+        return FixedPointNumber.encode(a.decode(), n=self.n, max_int=self.max_int)
+
+    def __add__(self, other):
+        """Add a FixedPointNumber, a PaillierEncryptedNumber or a plain scalar."""
+        if isinstance(other, FixedPointNumber):
+            return self.__add_fixedpointnumber(other)
+        if type(other).__name__ == "PaillierEncryptedNumber":
+            return other + self.decode()  # handled by the encrypted operand's own +
+        return self.__add_scalar(other)
+
+    def __radd__(self, other):
+        return self.__add__(other)
+
+    def __sub__(self, other):
+        """Subtract a FixedPointNumber, a PaillierEncryptedNumber or a plain scalar."""
+        if isinstance(other, FixedPointNumber):
+            return self.__sub_fixedpointnumber(other)
+        if type(other).__name__ == "PaillierEncryptedNumber":
+            return (other - self.decode()) * -1  # self - other == -(other - self)
+        return self.__sub_scalar(other)
+
+    def __rsub__(self, other):
+        """Return ``other - self`` for an encrypted number or a plain operand."""
+        if type(other).__name__ == "PaillierEncryptedNumber":
+            return other - self.decode()
+        # other - self == -(self - other): negate in plaintext, then re-encode.
+        negated = -1 * self.__sub__(other).decode()
+        return self.encode(negated, n=self.n, max_int=self.max_int)
+
+    def __rmul__(self, other):
+        return self.__mul__(other)
+
+    def __mul__(self, other):
+        """Multiply by a FixedPointNumber, a PaillierEncryptedNumber or a plain scalar."""
+        if isinstance(other, FixedPointNumber):
+            return self.__mul_fixedpointnumber(other)
+        if type(other).__name__ == "PaillierEncryptedNumber":
+            return other * self.decode()  # handled by the encrypted operand's own *
+        return self.__mul_scalar(other)
+
+    def __truediv__(self, other):
+        """Divide by ``other`` by multiplying with its plaintext reciprocal.
+
+        A FixedPointNumber divisor is decoded before the reciprocal is taken.
+        """
+        divisor = other.decode() if isinstance(other, FixedPointNumber) else other
+        return self.__mul__(1 / divisor)
+
+    def __rtruediv__(self, other):
+        res = 1.0 / self.__truediv__(other).decode()
+        return FixedPointNumber.encode(res, n=self.n, max_int=self.max_int)
+
+    def __lt__(self, other):
+        """Return True when the decoded value is strictly less than ``other``.
+
+        ``other`` may be a FixedPointNumber, which is decoded first, or any
+        plain value that can be compared with the decoded number.
+        """
+        mine = self.decode()
+        theirs = other.decode() if isinstance(other, FixedPointNumber) else other
+        # Collapse the comparison outcome to a plain True/False.
+        return True if mine < theirs else False
+
+    def __gt__(self, other):
+        """Return True when the decoded value is strictly greater than ``other``.
+
+        ``other`` may be a FixedPointNumber, which is decoded first, or any
+        plain value that can be compared with the decoded number.
+        """
+        mine = self.decode()
+        theirs = other.decode() if isinstance(other, FixedPointNumber) else other
+        # Collapse the comparison outcome to a plain True/False.
+        return True if mine > theirs else False
+
+    def __le__(self, other):
+        """Return True when the decoded value is less than or equal to ``other``.
+
+        ``other`` may be a FixedPointNumber, which is decoded first, or any
+        plain value that can be compared with the decoded number.
+        """
+        mine = self.decode()
+        theirs = other.decode() if isinstance(other, FixedPointNumber) else other
+        # Collapse the comparison outcome to a plain True/False.
+        return True if mine <= theirs else False
+
+    def __ge__(self, other):
+        """Return True when the decoded value is greater than or equal to ``other``.
+
+        ``other`` may be a FixedPointNumber, which is decoded first, or any
+        plain value that can be compared with the decoded number.
+        """
+        mine = self.decode()
+        theirs = other.decode() if isinstance(other, FixedPointNumber) else other
+
+        # Collapse the comparison outcome to a plain True/False.
+        return True if mine >= theirs else False
+
+    def __eq__(self, other):
+        """Return True when the decoded value equals ``other``.
+
+        ``other`` may be a FixedPointNumber, which is decoded first, or any
+        plain value that can be compared with the decoded number.
+        """
+        mine = self.decode()
+        theirs = other.decode() if isinstance(other, FixedPointNumber) else other
+        # Collapse the comparison outcome to a plain True/False.
+        return True if mine == theirs else False
+
+    def __ne__(self, other):
+        """Return True when the decoded value differs from ``other``.
+
+        ``other`` may be a FixedPointNumber, which is decoded first, or any
+        plain value that can be compared with the decoded number.
+        """
+        mine = self.decode()
+        theirs = other.decode() if isinstance(other, FixedPointNumber) else other
+        # Collapse the comparison outcome to a plain True/False.
+        return True if mine != theirs else False
+
+    def __add_fixedpointnumber(self, other):
+        if self.n != other.n:  # re-encode a foreign-modulus operand under ours
+            other = self.encode(other.decode(), n=self.n, max_int=self.max_int)
+        lhs, rhs = self.__align_exponent(self, other)
+        total = (lhs.encoding + rhs.encoding) % self.n
+        return FixedPointNumber(total, lhs.exponent, n=self.n, max_int=self.max_int)
+
+    def __add_scalar(self, scalar):
+        encoded = self.encode(scalar, n=self.n, max_int=self.max_int)
+        return self.__add_fixedpointnumber(encoded)
+
+    def __sub_fixedpointnumber(self, other):
+        """Return ``self - other`` for another FixedPointNumber, reduced mod n."""
+        if self.n != other.n:
+            other = self.encode(other.decode(), n=self.n, max_int=self.max_int)
+        lhs, rhs = self.__align_exponent(self, other)
+        diff = (lhs.encoding - rhs.encoding) % self.n
+        return FixedPointNumber(diff, lhs.exponent, n=self.n, max_int=self.max_int)
+
+    def __sub_scalar(self, scalar):
+        scalar = -1 * scalar
+        return self.__add_scalar(scalar)
+
+    def __mul_fixedpointnumber(self, other):
+        return self.__mul_scalar(other.decode())
+
+    def __mul_scalar(self, scalar):
+        """Multiply in plaintext: decode, scale by ``scalar``, then re-encode.
+        """
+        product = self.decode() * scalar
+        return FixedPointNumber.encode(product, n=self.n, max_int=self.max_int)
+
+    def __abs__(self):
+        """Return |self|; raise OverflowError for an encoding in the overflow gap."""
+        if self.encoding <= self.max_int:  # non-negative: already its own absolute value
+            return self
+        if self.encoding >= self.n - self.max_int:  # negative: flip the sign
+            return self * -1
+        raise OverflowError('Overflow detected in abs of encoded number')
+
+    def __mod__(self, other):
+        return FixedPointNumber(self.encoding % other, self.exponent, n=self.n, max_int=self.max_int)
diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/gmpy_math.py b/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/gmpy_math.py
new file mode 100644
index 0000000000..c56c574df6
--- /dev/null
+++ b/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/gmpy_math.py
@@ -0,0 +1,133 @@
+#
+#  Copyright 2019 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import random
+import gmpy2
+
+POWMOD_GMP_SIZE = pow(2, 64)
+
+
+def powmod(a, b, c):
+    """
+    return int: (a ** b) % c
+
+    Operands all below POWMOD_GMP_SIZE use the built-in pow(); anything larger
+    is handed to gmpy2.powmod. A base of exactly 1 short-circuits to 1.
+    """
+    if a == 1:
+        return 1
+    # Small operands: the built-in is used and gmpy2 is skipped entirely.
+    if max(a, b, c) < POWMOD_GMP_SIZE:
+        return pow(a, b, c)
+    return int(gmpy2.powmod(a, b, c))
+
+
+def crt_coefficient(p, q):
+    """
+    return the crt coefficient pair (invert(q, p) * q, invert(p, q) * p)
+    """
+    tq = gmpy2.invert(p, q)
+    tp = gmpy2.invert(q, p)
+    return tp * q, tq * p
+
+
+def powmod_crt(x, d, n, p, q, cp, cq):
+    """
+    return int: (rp * cp + rq * cq) % n; presumably (x ** d) % n for n = p * q (confirm)
+    """
+
+    rp = gmpy2.powmod(x, d % (p - 1), p)
+    rq = gmpy2.powmod(x, d % (q - 1), q)
+    return int((rp * cp + rq * cq) % n)
+
+
+def invert(a, b):
+    """return int: x, where a * x == 1 mod b"""
+    inverse = int(gmpy2.invert(a, b))
+    # A 0 result from gmpy2.invert is treated as "no inverse exists".
+    if inverse == 0:
+        raise ZeroDivisionError("invert(a, b) no inverse exists")
+    # Hand back the int() conversion of the gmpy2 result.
+    return inverse
+
+
+def getprimeover(n):
+    """return the next prime above a random n-bit integer whose bit n - 1 is set"""
+    r = gmpy2.mpz(random.SystemRandom().getrandbits(n))
+    r = gmpy2.bit_set(r, n - 1)
+
+    return int(gmpy2.next_prime(r))
+
+
+def isqrt(n):
+    """ return the integer square root of N """
+
+    return int(gmpy2.isqrt(n))
+
+
+def is_prime(n):
+    """
+    true if n is probably a prime, false otherwise
+    :param n: candidate value; converted with int() before testing
+    :return: the result of gmpy2.is_prime for that integer
+    """
+    return gmpy2.is_prime(int(n))
+
+
+def legendre(a, p):
+    return pow(a, (p - 1) // 2, p)
+
+
+def tonelli(n, p):
+    # Square root of n modulo p (Tonelli-Shanks); expects legendre(n, p) == 1, unchecked.
+    q = p - 1
+    s = 0
+    while q % 2 == 0:  # factor p - 1 as q * 2**s with q odd
+        q //= 2
+        s += 1
+    if s == 1:  # p % 4 == 3: a single exponentiation gives the result
+        return pow(n, (p + 1) // 4, p)
+    for z in range(2, p):  # first z whose legendre value is p - 1
+        if p - 1 == legendre(z, p):
+            break
+    c = pow(z, q, p)
+    r = pow(n, (q + 1) // 2, p)
+    t = pow(n, q, p)
+    m = s
+    while (t - 1) % p != 0:  # loop until t == 1 (mod p)
+        t2 = (t * t) % p
+        for i in range(1, m):  # smallest i with t ** (2 ** i) == 1 (mod p)
+            if (t2 - 1) % p == 0:
+                break
+            t2 = (t2 * t2) % p
+        b = pow(c, 1 << (m - i - 1), p)
+        r = (r * b) % p
+        c = (b * b) % p
+        t = (t * c) % p
+        m = i
+    return r
+
+
+def gcd(a, b):
+    return int(gmpy2.gcd(a, b))
+
+
+def next_prime(n):
+    return int(gmpy2.next_prime(n))
+
+
+def mpz(n):
+    return gmpy2.mpz(n)
diff --git a/gpu/fate-tensor-gpu/pyproject.toml b/gpu/fate-tensor-gpu/pyproject.toml
new file mode 100644
index 0000000000..16df1d9d42
--- /dev/null
+++ b/gpu/fate-tensor-gpu/pyproject.toml
@@ -0,0 +1,17 @@
+[tool.poetry]
+name = "fate-tensor-gpu"
+version = "0.1.0"
+description = "This project is an industrial-level heterogeneous acceleration system to support and speed up federated learning. We've designed and implemented a heterogeneous acceleration solution using GPUs that can significantly accelerate the Paillier cryptosystem while maintaining functionality, accuracy and scalability."
+authors = ["Xiaolong.Gao <1506957902@qq.com>"]
+
+[tool.poetry.dependencies]
+python = "^3.6"
+numpy = "~1.18.4"
+gmpy2 = "^2.0.8"
+
+[tool.poetry.dev-dependencies]
+pytest = "^5.2"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
diff --git a/gpu/fate-tensor-gpu/tests/__init__.py b/gpu/fate-tensor-gpu/tests/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/gpu/fate-tensor-gpu/tests/test_fate_tensor_gpu.py b/gpu/fate-tensor-gpu/tests/test_fate_tensor_gpu.py
new file mode 100644
index 0000000000..8701c384b9
--- /dev/null
+++ b/gpu/fate-tensor-gpu/tests/test_fate_tensor_gpu.py
@@ -0,0 +1,5 @@
+from fate_tensor_gpu import __version__
+
+
+def test_version():
+    assert __version__ == '0.1.0'

From b452120fe1bdede164e69137ec77de483789ad7e Mon Sep 17 00:00:00 2001
From: "Xiaolong.Gao" <1506957902@qq.com>
Date: Wed, 20 Jul 2022 15:38:42 +0800
Subject: [PATCH 2/8] feat: pep8 format of gpu

Signed-off-by: Xiaolong.Gao <1506957902@qq.com>
---
 .../{ => fate_tensor_gpu}/tests/__init__.py   |   0
 .../fate_tensor_gpu/tests/test_gpu_engine.py  | 712 ++++++++++++++++++
 .../tests/test_fate_tensor_gpu.py             |   5 -
 3 files changed, 712 insertions(+), 5 deletions(-)
 rename gpu/fate-tensor-gpu/{ => fate_tensor_gpu}/tests/__init__.py (100%)
 create mode 100755 gpu/fate-tensor-gpu/fate_tensor_gpu/tests/test_gpu_engine.py
 delete mode 100644 gpu/fate-tensor-gpu/tests/test_fate_tensor_gpu.py

diff --git a/gpu/fate-tensor-gpu/tests/__init__.py b/gpu/fate-tensor-gpu/fate_tensor_gpu/tests/__init__.py
similarity index 100%
rename from gpu/fate-tensor-gpu/tests/__init__.py
rename to gpu/fate-tensor-gpu/fate_tensor_gpu/tests/__init__.py
diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/tests/test_gpu_engine.py b/gpu/fate-tensor-gpu/fate_tensor_gpu/tests/test_gpu_engine.py
new file mode 100755
index 0000000000..0cd461f260
--- /dev/null
+++ b/gpu/fate-tensor-gpu/fate_tensor_gpu/tests/test_gpu_engine.py
@@ -0,0 +1,712 @@
+#
+#  Copyright 2022 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import random
+
+import numpy as np
+import unittest
+import functools
+import time
+
+from fate_tensor_gpu.secureprotol.fixedpoint import FixedPointNumber
+from fate_tensor_gpu.secureprotol import gmpy_math
+from fate_tensor_gpu.secureprotol.fate_paillier import (
+    PaillierKeypair,
+    PaillierEncryptedNumber,
+)
+
+from fate_tensor_gpu.gpu_engine import (
+    FLOAT_TYPE,
+    INT64_TYPE,
+    pi_p2c_pub_key,
+    pi_p2c_priv_key,
+    pi_h2d_pub_key,
+    pi_h2d_priv_key,
+    TensorShapeStorage,
+    bi_alloc,
+    PLAIN_BYTE,
+    MEM_HOST,
+    te_alloc,
+    fp_alloc,
+    pi_alloc,
+    te_p2c,
+    fp_encode,
+    fp_decode,
+    te_c2p,
+    pi_encrypt,
+    pi_gen_obf_seed,
+    CIPHER_BITS,
+    pi_obfuscate,
+    pi_c2p,
+    pi_decrypt,
+    fp_mul,
+    fp_c2p,
+    pi_add,
+    pi_mul,
+    pi_sum,
+    bi_free,
+    te_free,
+    fp_free,
+    pi_free,
+)
+
+RAND_TYPE = FLOAT_TYPE  # SWITCH DATA TYPE HERE: EITHER INT64_TYPE OR FLOAT_TYPE
+NUM_ROWS = 200
+NUM_COLS = 200
+TEST_SIZE = NUM_ROWS * NUM_COLS
+KEY_LEN = 1024
+DATA_SIZE = TEST_SIZE * KEY_LEN * 2 // 8
+ERROR_TOLERANCE = 1e-10
+
+
+class TestCaseReport:
+    def __init__(self, name, batch_size, bit_len, data_size):
+        """Hold the metadata and timing entries of one benchmark report."""
+        self.name, self.batch_size, self.bit_len = name, batch_size, bit_len
+        self.data_size = int(data_size)
+        # report name -> item name -> {'time', 'ops', 'bw'}
+        self.content = {}
+        # Layout: total row width and the per-column cell widths used by gen_line.
+        self.width, self.column = 100, [30, 20, 25, 24]
+        # Throughputs are assigned by the caller; dump_summary reports their ratio.
+        self.cpu_throughput = self.gpu_throughput = 0.0
+
+    def add_perf_report(self, name):
+        self.content[name] = {}
+
+    def add_item(self, report_name, item_name, time, ops, bw):
+        """Record one measurement under ``report_name``; that report must already
+        exist in ``self.content``, otherwise the lookup raises KeyError."""
+        entry = {'time': time, 'ops': ops, 'bw': bw}
+        self.content[report_name][item_name] = entry
+
+    def gen_line(self, *args):
+        """Render ``args`` as one '|'-delimited table row.
+
+        Each cell is padded to its width from ``self.column``; rows with fewer
+        than three cells are additionally padded out towards ``self.width``.
+        """
+        cells = [str(v) for v in args]
+        widths = [self.column[i] for i in range(len(cells))]
+        row = ''.join('|' + c + ' ' * (w - len(c) - 1) for c, w in zip(cells, widths))
+        if len(cells) < 3:
+            row += " " * (self.width - sum(widths) - 1)
+        return row + '|'
+
+    def dump_header(self):
+        """Return the report banner: centred title plus the data-information rows."""
+        rule = '=' * self.width
+        free = self.width - len(self.name)
+        left_pad = ' ' * (int(free - 2) // 2)
+        right_pad = ' ' * (int(free - 1) // 2)
+        lines = [
+            rule,
+            '|' + left_pad + self.name + right_pad + '|',
+            rule,
+            self.gen_line("Data Information"),
+            '-' * self.width,
+            self.gen_line("Batch Size", self.batch_size),
+            self.gen_line("Bit Length", self.bit_len),
+            self.gen_line("Data Size (Bytes)", self.data_size),
+        ]
+        return "\n".join(lines)
+
+    def dump_item(self, report_name, item_name):
+        """Return one table row for ``item_name`` with values shown to four decimals.
+
+        The stored bandwidth is divided by 2 ** 20 before formatting.
+        """
+        entry = self.content[report_name][item_name]
+        fmt = "{0:.4f}".format
+        scaled = (entry['time'], entry['ops'], entry['bw'] / (2 ** 20))
+        return self.gen_line(item_name, *(fmt(v) for v in scaled))
+
+    def dump_perf_report(self, report_name):
+        """Return the table for ``report_name``: title, column headings, item rows.
+        Raises KeyError when ``report_name`` is not present in ``self.content``.
+        """
+        dashes = "-" * self.width
+        column_titles = ("Item", "Time Elapsed(s)", "Operations Per Second", "Bandwidth (MB/s)")
+        lines = [
+            "=" * self.width,
+            self.gen_line(report_name),
+            dashes,
+            self.gen_line(*column_titles),
+            dashes,
+        ]
+        lines.extend(self.dump_item(report_name, item) for item in self.content[report_name])
+        return "\n".join(lines)
+
+    def dump_summary(self):
+        """Return the closing block showing the GPU/CPU throughput ratio.
+        The ratio (gpu_throughput / cpu_throughput) is also stored on ``self.ratio``.
+        """
+        self.ratio = self.gpu_throughput / self.cpu_throughput
+        rule = "=" * self.width
+        lines = [
+            rule,
+            self.gen_line("Performance of GPU/CPU"),
+            '-' * self.width,
+            self.gen_line("GPU/CPU Ratio (Speedup)", "{0:.4f}".format(self.ratio)),
+            rule,
+            '\n',
+        ]
+        return "\n".join(lines)
+
+    def dump_result(self):
+        """Print the full report: header, every performance table, then the summary.
+        """
+        sections = [self.dump_header()]
+        sections += [self.dump_perf_report(name) for name in self.content]
+        sections.append(self.dump_summary())
+        # Sections are joined with single newlines; nothing is returned.
+        print("\n".join(sections))
+
+
+def generate_rand(test_size):
+    """Draw ``test_size`` random samples of the kind selected by RAND_TYPE."""
+    if RAND_TYPE == FLOAT_TYPE:
+        return np.random.normal(0, 5, test_size)
+    if RAND_TYPE == INT64_TYPE:
+        return np.random.randint(-(2 ** 10), 2 ** 10, test_size)
+    raise TypeError("Invalid data type")
+
+
+def assert_diff(res, ref):
+    """Assert both values are zero, or agree within ERROR_TOLERANCE relative to each."""
+    if res == 0 or ref == 0:
+        assert res == 0
+        assert ref == 0
+        return
+    for base in (res, ref):
+        assert abs((res - ref) / base) < ERROR_TOLERANCE
+
+
+def assert_ndarray_diff(res, ref):
+    """Assert two arrays have equal shape and agree element-wise (via assert_diff).
+
+    Every mismatch is printed, then an AssertionError is raised so that a failing
+    comparison actually fails the test instead of being silently swallowed.
+    """
+    assert res.shape == ref.shape
+    res, ref = res.flatten(), ref.flatten()
+    mismatches = 0
+    for i, (got, want) in enumerate(zip(res, ref)):
+        try:
+            assert_diff(got, want)
+        except AssertionError:
+            mismatches += 1
+            print("Assertion Error at location", i, ", GPU result:", got, ", reference result:", want)
+    assert mismatches == 0, "{} element(s) differ from the reference".format(mismatches)
+
+
+def profile(func):
+    """Wrap ``func`` so each call returns ``(result, elapsed)`` using time.time()."""
+    @functools.wraps(func)
+    def timed(*args, **kwargs):
+        began = time.time()
+        outcome = func(*args, **kwargs)
+        elapsed = time.time() - began
+        return outcome, elapsed
+    return timed
+
+
+def compare_time(gpu_time, cpu_time, num_instances=TEST_SIZE):
+    """Print both timings, the derived throughputs and the CPU/GPU speedup.
+
+    Throughput is ``num_instances`` divided by the elapsed time.
+    """
+    timings = (("GPU", gpu_time), ("CPU", cpu_time))
+    for label, elapsed in timings:
+        print(label + " time:", elapsed, "second(s)")
+    for label, elapsed in timings:
+        rate = num_instances / elapsed
+        print(label + " throughput:", rate, "instance(s) per second")
+    # A ratio above 1 means the GPU run took less time than the CPU run.
+    speedup = cpu_time / gpu_time
+    print("Speedup:", speedup)
+
+
+def cpu_pi_gen_obf_seed(res_store, public_key, count, elem_size, rand_seed, stream):
+    """CPU reference generator of obfuscation seeds.
+
+    Seeds the ``random`` module with ``rand_seed``, draws ``count`` integers from
+    [1, 8 ** elem_size) and returns each raised to public_key.n mod nsquare.
+    ``res_store`` and ``stream`` are accepted but not used.
+    """
+    random.seed(rand_seed)
+    # Each draw is turned into a seed straight away; powmod does not touch the RNG.
+    seeds = []
+    for _ in range(count):
+        base = random.randrange(1, 8 ** elem_size)
+        seeds.append(gmpy_math.powmod(base, public_key.n, public_key.nsquare))
+    return seeds
+
+
+def cpu_pi_obfuscate(public_key, encrypted_numbers, obf_seeds, exponents, res_store, stream):
+    """CPU reference obfuscation: multiply each ciphertext by its seed mod nsquare.
+
+    Returns PaillierEncryptedNumber objects; ``res_store`` and ``stream`` are unused.
+    """
+    obfuscated = []
+    for i, cipher in enumerate(encrypted_numbers):
+        # Seeds and exponents are looked up by the ciphertext's position.
+        masked = (cipher * obf_seeds[i]) % public_key.nsquare
+        obfuscated.append(PaillierEncryptedNumber(public_key, masked, exponents[i]))
+    return obfuscated
+
+
+def cpu_fp_mul(left, right):
+    """CPU reference product of two position-matched FixedPointNumber lists.
+    Encodings are multiplied mod the left operand's n and exponents are added.
+    """
+    products = []
+    for i, lhs in enumerate(left):
+        rhs = right[i]
+        encoding = (lhs.encoding * rhs.encoding) % lhs.n
+        products.append(FixedPointNumber(encoding, lhs.exponent + rhs.exponent, lhs.n, lhs.max_int))
+    return products
+
+
+def add_to_perf_reports(_perf_reports, name, gpu_time, cpu_time, data_size):
+    """Build a GPU-vs-CPU report for one operation and append it to ``_perf_reports``.
+
+    Throughput is TEST_SIZE / elapsed time and bandwidth is data_size / elapsed
+    time, each computed separately for the GPU and the CPU timing.
+    """
+    report = TestCaseReport(name, TEST_SIZE, KEY_LEN, data_size)
+
+    gpu_rate = TEST_SIZE / gpu_time
+    report.gpu_throughput = gpu_rate
+    gpu_section = "GPU Performance"
+    report.add_perf_report(gpu_section)
+    report.add_item(gpu_section, "Computation on GPU", gpu_time, gpu_rate, data_size / gpu_time)
+
+    cpu_rate = TEST_SIZE / cpu_time
+    report.cpu_throughput = cpu_rate
+    cpu_section = "CPU Performance"
+    report.add_perf_report(cpu_section)
+    report.add_item(cpu_section, "Computation on CPU", cpu_time, cpu_rate, data_size / cpu_time)
+
+    _perf_reports.append(report)
+
+
+class TestOperators(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        """Create one Paillier key pair and its converted CPU/GPU counterparts."""
+        cls._pub_key, cls._priv_key = PaillierKeypair.generate_keypair()
+        cls.n, cls.max_int = cls._pub_key.n, cls._pub_key.max_int
+        # The *_p2c_* results feed the *_h2d_* calls, so convert in that order.
+        cls._cpu_pub_key = pi_p2c_pub_key(cls._pub_key)
+        cls._cpu_priv_key = pi_p2c_priv_key(cls._priv_key)
+        cls._gpu_pub_key = pi_h2d_pub_key(cls._cpu_pub_key)
+        cls._gpu_priv_key = pi_h2d_priv_key(cls._cpu_priv_key)
+        # Performance reports collected while the tests run.
+        cls._perf_reports = []
+        banner = "\n\nInitialization complete\nTest Size:"
+        print("\n\n", "*" * 100, banner, TEST_SIZE)
+
+    def test_performance(self):
+        print("\n\n", "*" * 100, "\n\nTest performance begins")
+
+        print("\n>>>>> generate data and allocate memory spaces")
+        raw, raw2 = generate_rand(TEST_SIZE), generate_rand(TEST_SIZE)
+        shape_tuple, shape_tuple_T = (NUM_ROWS, NUM_COLS), (NUM_COLS, NUM_ROWS)
+        shape_store, _ = TensorShapeStorage(*shape_tuple), TensorShapeStorage(
+            *shape_tuple_T
+        )
+        gpu_bi_store, gpu_bi_store2 = bi_alloc(
+            None, TEST_SIZE, PLAIN_BYTE, MEM_HOST
+        ), bi_alloc(None, TEST_SIZE, PLAIN_BYTE, MEM_HOST)
+        gpu_te_store, gpu_te_store2 = te_alloc(
+            None, TEST_SIZE, MEM_HOST), te_alloc(
+            None, TEST_SIZE, MEM_HOST)
+        gpu_fp_store, gpu_fp_store2 = fp_alloc(
+            None, TEST_SIZE, MEM_HOST), fp_alloc(
+            None, TEST_SIZE, MEM_HOST)
+        gpu_pi_store, gpu_pi_store2 = pi_alloc(
+            None, TEST_SIZE, MEM_HOST), pi_alloc(
+            None, TEST_SIZE, MEM_HOST)
+        gpu_te_store, gpu_te_store2 = te_p2c(raw, gpu_te_store), te_p2c(
+            raw2, gpu_te_store2
+        )
+
+        print("\n>>>>> fp_encode profiling begins")
+        gpu_encoded, gpu_encode_time = profile(fp_encode)(
+            gpu_te_store, self.n, self.max_int, res=gpu_fp_store
+        )
+        print(
+            "GPU computation completed in {} second(s), waiting for CPU".format(
+                gpu_encode_time
+            )
+        )
+        cpu_encoded, cpu_encode_time = profile(
+            lambda l: [
+                FixedPointNumber.encode(
+                    v, self.n, self.max_int) for v in l])(raw)
+        compare_time(gpu_encode_time, cpu_encode_time)
+        add_to_perf_reports(
+            self._perf_reports,
+            "Encode",
+            gpu_encode_time,
+            cpu_encode_time,
+            DATA_SIZE)
+
+        print("\n>>>>> fp_decode profiling begins")
+        gpu_decoded, gpu_decode_time = profile(fp_decode)(
+            gpu_encoded, gpu_te_store, None
+        )
+        print(
+            "GPU computation completed in {} second(s), waiting for CPU".format(
+                gpu_decode_time
+            )
+        )
+        cpu_decoded, cpu_decode_time = profile(
+            lambda l: [v.decode() for v in l])(cpu_encoded)
+        compare_time(gpu_decode_time, cpu_decode_time)
+        add_to_perf_reports(
+            self._perf_reports,
+            "Decode",
+            gpu_decode_time,
+            cpu_decode_time,
+            DATA_SIZE)
+
+        # check decoded results
+        assert_ndarray_diff(te_c2p(gpu_decoded), np.asarray(cpu_decoded))
+
+        print("\n>>>>> pi_encrypt profiling begins")
+        print("This function calculates (encoding * n + 1) % nsquare")
+        gpu_encrypted, gpu_encrypt_time = profile(pi_encrypt)(
+            self._gpu_pub_key, gpu_encoded, gpu_pi_store, None
+        )
+        print(
+            "GPU computation completed in {} second(s), waiting for CPU".format(
+                gpu_encrypt_time
+            )
+        )
+        cpu_encrypted, cpu_encrypt_time = profile(
+            lambda l: [self._pub_key.raw_encrypt(v.encoding, 1) for v in l]
+        )(cpu_encoded)
+        compare_time(gpu_encrypt_time, cpu_encrypt_time)
+        add_to_perf_reports(
+            self._perf_reports,
+            "Encrypt",
+            gpu_encrypt_time,
+            cpu_encrypt_time,
+            DATA_SIZE)
+
+        print("\n>>>>> pi_gen_obf_seed profiling begins")
+        print("This function calculates (rand() ^ n) % nsquare")
+        gpu_obf_seeds, gpu_gen_obf_seeds_time = profile(pi_gen_obf_seed)(
+            gpu_bi_store, self._gpu_pub_key, TEST_SIZE, CIPHER_BITS // 6, 0, None)
+        print(
+            "GPU computation completed in {} second(s), waiting for CPU".format(
+                gpu_gen_obf_seeds_time
+            )
+        )
+        cpu_obf_seeds, cpu_gen_obf_seefs_time = profile(cpu_pi_gen_obf_seed)(
+            None, self._pub_key, TEST_SIZE, CIPHER_BITS // 6, 0, None
+        )
+        compare_time(gpu_gen_obf_seeds_time, cpu_gen_obf_seefs_time)
+        add_to_perf_reports(
+            self._perf_reports,
+            "Generate Obfuscators",
+            gpu_gen_obf_seeds_time,
+            cpu_gen_obf_seefs_time,
+            DATA_SIZE,
+        )
+
+        print("\n>>>>> pi_obfuscate profiling begins")
+        print("This function calculates (raw_cipher * obf_seed) % nsquare,")
+        print(
+            "\twhere raw_cipher and obf_seed are calculated in pi_encrypt and pi_gen_obf_seeds, respectively"
+        )
+        gpu_obfuscated, gpu_obfuscate_time = profile(pi_obfuscate)(
+            self._gpu_pub_key, gpu_encrypted, gpu_obf_seeds, gpu_pi_store, None
+        )
+        print(
+            "GPU computation completed in {} second(s), waiting for CPU".format(
+                gpu_obfuscate_time
+            )
+        )
+        cpu_obfuscated, cpu_obfuscate_time = profile(cpu_pi_obfuscate)(
+            self._pub_key,
+            cpu_encrypted,
+            cpu_obf_seeds,
+            [v.exponent for v in cpu_encoded],
+            None,
+            None,
+        )
+        compare_time(gpu_obfuscate_time, cpu_obfuscate_time)
+        add_to_perf_reports(
+            self._perf_reports,
+            "Obfuscate",
+            gpu_obfuscate_time,
+            cpu_obfuscate_time,
+            DATA_SIZE,
+        )
+
+        # check intermediate result
+        assert_ndarray_diff(
+            np.asarray(pi_c2p(gpu_obfuscated)[0]),
+            np.asarray([v.ciphertext(False) for v in cpu_obfuscated]),
+        )
+
+        print("\n>>>>> pi_decrypt profiling begins")
+        print(
+            "This function calculates L(cipher ^ lambda % nsquare) * L(g ^ lambda % nsquare) ^ -1 % n"
+        )
+        print("fp_decode is by default included in pi_decrypt")
+        fps_buffer = fp_alloc(None, TEST_SIZE, MEM_HOST)
+        gpu_decrypted, gpu_decrypt_time = profile(pi_decrypt)(
+            self._gpu_pub_key,
+            self._gpu_priv_key,
+            gpu_obfuscated,
+            gpu_te_store,
+            fps_buffer,
+        )
+        print(
+            "GPU computation completed in {} second(s), waiting for CPU".format(
+                gpu_decrypt_time
+            )
+        )
+        cpu_decrypted, cpu_decrypt_time = profile(
+            lambda l: [self._priv_key.decrypt(v) for v in l]
+        )(cpu_obfuscated)
+        compare_time(gpu_decrypt_time, cpu_decrypt_time)
+        add_to_perf_reports(
+            self._perf_reports,
+            "Decrypt",
+            gpu_decrypt_time,
+            cpu_decrypt_time,
+            DATA_SIZE)
+
+        # check decrypted results
+        assert_ndarray_diff(te_c2p(gpu_decrypted), np.asarray(cpu_decrypted))
+
+        print("\n>>>>> generating the other array")
+        # encode the other array
+        gpu_encoded2 = fp_encode(
+            gpu_te_store2,
+            self.n,
+            self.max_int,
+            res=gpu_fp_store2)
+        cpu_encoded2 = [
+            FixedPointNumber.encode(
+                v, self.n, self.max_int) for v in raw2]
+        # encrypt the other array
+        gpu_encrypted2 = pi_encrypt(
+            self._gpu_pub_key, gpu_encoded2, gpu_pi_store2, None
+        )
+        cpu_encrypted2 = [
+            self._pub_key.raw_encrypt(v.encoding, 1) for v in cpu_encoded2
+        ]
+        # generate obfuscation seeds (obfuscators) for the other array using a
+        # different random seed
+        gpu_obf_seeds2 = pi_gen_obf_seed(
+            gpu_bi_store2,
+            self._gpu_pub_key,
+            TEST_SIZE,
+            CIPHER_BITS // 6,
+            1,
+            None)
+        cpu_obf_seeds2 = cpu_pi_gen_obf_seed(
+            None, self._pub_key, TEST_SIZE, CIPHER_BITS // 6, 1, None
+        )
+        # obfuscate the other array
+        gpu_obfuscated2 = pi_obfuscate(
+            self._gpu_pub_key,
+            gpu_encrypted2,
+            gpu_obf_seeds2,
+            gpu_pi_store2,
+            None)
+        cpu_obfuscated2 = cpu_pi_obfuscate(
+            self._pub_key,
+            cpu_encrypted2,
+            cpu_obf_seeds2,
+            [v.exponent for v in cpu_encoded2],
+            None,
+            None,
+        )
+        # check intermediate result
+        assert_ndarray_diff(
+            np.asarray(pi_c2p(gpu_obfuscated2)[0]),
+            np.asarray([v.ciphertext(False) for v in cpu_obfuscated2]),
+        )
+
+        print("\n>>>>> fp_mul profiling begins")
+        gpu_fp_mul_store = fp_alloc(None, TEST_SIZE, MEM_HOST)
+        (gpu_fp_mul_res, _), gpu_fp_mul_time = profile(fp_mul)(
+            gpu_encoded,
+            gpu_encoded2,
+            shape_store,
+            shape_store,
+            gpu_fp_mul_store,
+            shape_store,
+            None,
+        )
+        print(
+            "GPU computation completed in {} second(s), waiting for CPU".format(
+                gpu_fp_mul_time
+            )
+        )
+        cpu_fp_mul_res, cpu_fp_mul_time = profile(
+            cpu_fp_mul)(cpu_encoded, cpu_encoded2)
+        compare_time(gpu_fp_mul_time, cpu_fp_mul_time)
+        add_to_perf_reports(
+            self._perf_reports,
+            "Fixed-point Number Multiply",
+            gpu_fp_mul_time,
+            cpu_fp_mul_time,
+            DATA_SIZE * 2,
+        )
+
+        # Compare results
+        received_fp_mul_res = fp_c2p(gpu_fp_mul_res)
+        for i in range(TEST_SIZE):
+            assert_diff(
+                received_fp_mul_res[i].encoding,
+                cpu_fp_mul_res[i].encoding)
+            assert received_fp_mul_res[i].BASE == cpu_fp_mul_res[i].BASE
+            assert received_fp_mul_res[i].exponent == cpu_fp_mul_res[i].exponent
+
+        print("\n>>>>> pi_add profiling begins")
+        (gpu_add_res, _), gpu_add_time = profile(pi_add)(
+            self._gpu_pub_key,
+            gpu_obfuscated,
+            gpu_obfuscated2,
+            shape_store,
+            shape_store,
+            gpu_pi_store,
+            shape_store,
+            None,
+        )
+        print(
+            "GPU computation completed in {} second(s), waiting for CPU".format(
+                gpu_add_time
+            )
+        )
+        cpu_add_res, cpu_add_time = profile(
+            lambda a, b: [a[i] + b[i] for i in range(TEST_SIZE)]
+        )(cpu_obfuscated, cpu_obfuscated2)
+        compare_time(gpu_add_time, cpu_add_time)
+        add_to_perf_reports(
+            self._perf_reports,
+            "Add",
+            gpu_add_time,
+            cpu_add_time,
+            DATA_SIZE * 2)
+
+        print("\n>>>>> pi_mul profiling begins")
+        (gpu_mul_res, _), gpu_mul_time = profile(pi_mul)(
+            self._gpu_pub_key,
+            gpu_add_res,
+            gpu_encoded2,
+            shape_store,
+            shape_store,
+            gpu_pi_store,
+            shape_store,
+            None,
+        )
+        print(
+            "GPU computation completed in {} second(s), waiting for CPU".format(
+                gpu_mul_time
+            )
+        )
+        cpu_mul_res, cpu_mul_time = profile(
+            lambda a, b: [a[i] * b[i] for i in range(TEST_SIZE)]
+        )(cpu_add_res, cpu_encoded2)
+        compare_time(gpu_mul_time, cpu_mul_time)
+        add_to_perf_reports(
+            self._perf_reports,
+            "Multiply",
+            gpu_mul_time,
+            cpu_mul_time,
+            DATA_SIZE * 2)
+
+        gpu_pi_matmul_store = pi_alloc(None, NUM_ROWS * NUM_ROWS, MEM_HOST)
+        gpu_matmul_res, gpu_matmul_shape = gpu_mul_res, shape_store
+        cpu_matmul_res = np.asarray(cpu_mul_res).reshape(shape_tuple)
+
+        print("\n>>>>> pi_sum profiling begins")
+        print("shape is", gpu_matmul_shape.to_tuple())
+        gpu_pi_sum_store = pi_alloc(None, max(NUM_ROWS, NUM_COLS), MEM_HOST)
+        for axis in [0, 1, None]:
+            print(">>> axis:", axis)
+            (gpu_sum_res, _), gpu_sum_time = profile(pi_sum)(
+                self._gpu_pub_key,
+                gpu_matmul_res,
+                gpu_matmul_shape,
+                axis,
+                gpu_pi_sum_store,
+                None,
+                None,
+            )
+            print(
+                "GPU computation completed in {} second(s), waiting for CPU".format(
+                    gpu_sum_time
+                )
+            )
+            cpu_sum_res, cpu_sum_time = profile(lambda a: np.sum(a, axis))(
+                cpu_matmul_res
+            )
+            compare_time(gpu_sum_time, cpu_sum_time)
+            add_to_perf_reports(
+                self._perf_reports,
+                "Sum (axis={})".format(axis),
+                gpu_sum_time,
+                cpu_sum_time,
+                DATA_SIZE,
+            )
+
+            # check result
+            gpu_decrypted = te_c2p(
+                pi_decrypt(
+                    self._gpu_pub_key,
+                    self._gpu_priv_key,
+                    gpu_sum_res,
+                    None,
+                    None,
+                    None))
+            cpu_decrypted = np.asarray(
+                [self._priv_key.decrypt(v) for v in cpu_sum_res.flat]
+                if axis is not None
+                else [self._priv_key.decrypt(cpu_sum_res)]
+            )
+            assert_ndarray_diff(gpu_decrypted, cpu_decrypted)
+
+        print("\n>>>>> free all allocated spaces")
+        bi_free(gpu_bi_store)
+        bi_free(gpu_bi_store2)
+        te_free(gpu_te_store)
+        te_free(gpu_te_store2)
+        fp_free(gpu_fp_store)
+        fp_free(gpu_fp_store2)
+        fp_free(fps_buffer)
+        fp_free(gpu_fp_mul_store)
+        pi_free(gpu_pi_store)
+        pi_free(gpu_pi_store2)
+        pi_free(gpu_pi_matmul_store)
+        pi_free(gpu_pi_sum_store)
+
+    @classmethod
+    def tearDownClass(cls):  # unittest hook, runs once after all tests
+        for v in cls._perf_reports:  # filled via add_to_perf_reports
+            v.dump_result()
+
+
+if __name__ == "__main__":  # only when executed as a script
+    unittest.main()
diff --git a/gpu/fate-tensor-gpu/tests/test_fate_tensor_gpu.py b/gpu/fate-tensor-gpu/tests/test_fate_tensor_gpu.py
deleted file mode 100644
index 8701c384b9..0000000000
--- a/gpu/fate-tensor-gpu/tests/test_fate_tensor_gpu.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from fate_tensor_gpu import __version__
-
-
-def test_version():
-    assert __version__ == '0.1.0'

From 85289a6b16d3968aba162656ce2670f623737f9c Mon Sep 17 00:00:00 2001
From: "Xiaolong.Gao" <1506957902@qq.com>
Date: Wed, 20 Jul 2022 15:38:42 +0800
Subject: [PATCH 3/8] feat: pep8 format of gpu

Signed-off-by: Xiaolong.Gao <1506957902@qq.com>
---
 .../fate_tensor_gpu/gpu_engine.py             | 2680 ++++++++++++-----
 .../fate_tensor_gpu/gpu_tensor.py             |  189 +-
 .../fate_tensor_gpu/secureprotol/__init__.py  |   15 -
 .../secureprotol/fate_paillier.py             |  209 +-
 .../secureprotol/fixedpoint.py                |   91 +-
 .../fate_tensor_gpu/secureprotol/gmpy_math.py |    2 +-
 .../{ => fate_tensor_gpu}/tests/__init__.py   |    0
 .../fate_tensor_gpu/tests/test_gpu_engine.py  |  712 +++++
 .../tests/test_fate_tensor_gpu.py             |    5 -
 9 files changed, 3025 insertions(+), 878 deletions(-)
 rename gpu/fate-tensor-gpu/{ => fate_tensor_gpu}/tests/__init__.py (100%)
 create mode 100755 gpu/fate-tensor-gpu/fate_tensor_gpu/tests/test_gpu_engine.py
 delete mode 100644 gpu/fate-tensor-gpu/tests/test_fate_tensor_gpu.py

diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/gpu_engine.py b/gpu/fate-tensor-gpu/fate_tensor_gpu/gpu_engine.py
index 0e96d2cb56..077c528ec1 100644
--- a/gpu/fate-tensor-gpu/fate_tensor_gpu/gpu_engine.py
+++ b/gpu/fate-tensor-gpu/fate_tensor_gpu/gpu_engine.py
@@ -19,9 +19,22 @@
 import numpy as np
 
 from ctypes import cdll, sizeof, c_buffer, cast, c_int32
-from ctypes import c_char, c_char_p, c_void_p, c_uint32, c_double, c_int64, c_int, c_size_t
-
-from .secureprotol.fate_paillier import PaillierPublicKey, PaillierPrivateKey, PaillierEncryptedNumber
+from ctypes import (
+    c_char,
+    c_char_p,
+    c_void_p,
+    c_uint32,
+    c_double,
+    c_int64,
+    c_int,
+    c_size_t,
+)
+
+from .secureprotol.fate_paillier import (
+    PaillierPublicKey,
+    PaillierPrivateKey,
+    PaillierEncryptedNumber,
+)
 from .secureprotol.fixedpoint import FixedPointNumber
 
 from concurrent.futures import ProcessPoolExecutor as Executor
@@ -140,6 +153,7 @@ def free_GPU_context(context_pointer):
 # ###################Reconstruct ndaray from C memory type####################
 # ############################################################################
 
+
 def __get_C_fpn(fpn_space, size):
     '''
     copy FixedPointNumber (FPN) object out from C memory space,
@@ -154,9 +168,11 @@ def __get_C_fpn(fpn_space, size):
     res_fpn = []
     get_res = c_buffer(PLAIN_BYTE)
     for i in range(size):
-        GPU_LIB.c_memcpy(cast(get_res, c_void_p),
-                         c_void_p(fpn_space + i * PLAIN_BYTE),
-                         c_size_t(PLAIN_BYTE))
+        GPU_LIB.c_memcpy(
+            cast(get_res, c_void_p),
+            c_void_p(fpn_space + i * PLAIN_BYTE),
+            c_size_t(PLAIN_BYTE),
+        )
         res_fpn.append(int.from_bytes(get_res.raw, 'little'))
     return np.asarray(res_fpn)
 
@@ -176,9 +192,11 @@ def __get_C_pen(pen_space, index, size):
     res_pen = []
     get_res = c_buffer(CIPHER_BYTE)
     for i in range(size):
-        GPU_LIB.c_memcpy(cast(get_res, c_void_p),
-                         c_void_p(pen_space + (index + i) * CIPHER_BYTE),
-                         c_size_t(CIPHER_BYTE))
+        GPU_LIB.c_memcpy(
+            cast(get_res, c_void_p),
+            c_void_p(pen_space + (index + i) * CIPHER_BYTE),
+            c_size_t(CIPHER_BYTE),
+        )
         res_pen.append(int.from_bytes(get_res.raw, 'little'))
     return np.asarray(res_pen)
 
@@ -197,16 +215,24 @@ def __get_C_uint32(uint32_space, size):
     size: int, the number of uint32 ought to get
     '''
     uint32_list = (c_uint32 * size)(*[0 for _ in range(size)])
-    GPU_LIB.c_memcpy(uint32_list, c_void_p(uint32_space),
-                     c_size_t(size * U_INT32_BYTE))
+    GPU_LIB.c_memcpy(
+        uint32_list,
+        c_void_p(uint32_space),
+        c_size_t(
+            size *
+            U_INT32_BYTE))
     return np.asarray(uint32_list)
 
 
 def __get_C_double(double_space, size):
     '''copy double out from C memory space, form a ndarray'''
     double_list = (c_double * size)(*[0 for _ in range(size)])
-    GPU_LIB.c_memcpy(double_list, c_void_p(double_space),
-                     c_size_t(size * DOUBLE_BYTE))
+    GPU_LIB.c_memcpy(
+        double_list,
+        c_void_p(double_space),
+        c_size_t(
+            size *
+            DOUBLE_BYTE))
     # convert all the data in one step, no loop
     return np.asarray(double_list)
 
@@ -214,8 +240,12 @@ def __get_C_double(double_space, size):
 def __get_C_int64(int64_space, size):
     '''copy int64 out from C memory space, form a ndarray'''
     int64_list = (c_int64 * size)(*[0 for _ in range(size)])
-    GPU_LIB.c_memcpy(int64_list, c_void_p(int64_space),
-                     c_size_t(size * INT64_BYTE))
+    GPU_LIB.c_memcpy(
+        int64_list,
+        c_void_p(int64_space),
+        c_size_t(
+            size *
+            INT64_BYTE))
     # convert all the data in one step, no loop
     return np.asarray(int64_list)
 
@@ -242,8 +272,9 @@ def __get_c_fpn_storage(fpn, base, exp, vec_size, n, max_int):
     res_exp = __get_C_uint32(exp, vec_size)
     res_FixedPointNumber = []
     for i in range(vec_size):
-        res_FixedPointNumber.append(FixedPointNumber(
-            res_fpn[i], int(round(res_exp[i])), n, max_int))
+        res_FixedPointNumber.append(
+            FixedPointNumber(res_fpn[i], int(round(res_exp[i])), n, max_int)
+        )
     return np.asarray(res_FixedPointNumber)
 
 
@@ -282,8 +313,9 @@ def __get_c_pen_storage_mp(pen, base, exp, vec_size, n, thread_num=4):
     executor = Executor()
     futures = []
     for i in range(thread_num):
-        futures.append(executor.submit(
-            __get_C_pen, pen, job_idx_list[i], job_cnt_list[i]))
+        futures.append(
+            executor.submit(__get_C_pen, pen, job_idx_list[i], job_cnt_list[i])
+        )
     res_list = [r.result() for r in futures]
     res_pen = []
     for res in res_list:
@@ -317,7 +349,10 @@ def __get_c_pen_storage(pen, base, exp, vec_size, n):
     public_key = PaillierPublicKey(n)
     for i in range(vec_size):
         res_PaillierEncryptedNumber.append(
-            PaillierEncryptedNumber(public_key, res_pen[i], int(round(res_exp[i]))))
+            PaillierEncryptedNumber(
+                public_key, res_pen[i], int(
+                    round(
+                        res_exp[i]))))
 
     return np.asarray(res_PaillierEncryptedNumber)
 
@@ -361,7 +396,8 @@ class TensorStorage(object):
     def __init__(self, data, vec_size, mem_type: int, data_type: int):
         # numpy has some strange shallowcopies which causes incontinuous memory space
         # so add np.ascontiguousarray here to prevent potential errors
-        self.data = np.ascontiguousarray(data) if isinstance(data, np.ndarray) else data
+        self.data = np.ascontiguousarray(
+            data) if isinstance(data, np.ndarray) else data
         self.vec_size = vec_size
         self.mem_type = mem_type
         self.data_type = data_type  # new parameter
@@ -422,8 +458,17 @@ class FixedPointStorage:
         encode_n, max_int: bigint, the para used for encode the plaintext
     '''
 
-    def __init__(self, bigint_storage, base_storage, exp_storage, vec_size,
-                 n, max_int, mem_type: int, data_type):
+    def __init__(
+        self,
+        bigint_storage,
+        base_storage,
+        exp_storage,
+        vec_size,
+        n,
+        max_int,
+        mem_type: int,
+        data_type,
+    ):
         # 1:cpu/host  2:gpu/device
         self.mem_type = mem_type
         '''Actual data and length for fpn'''
@@ -467,8 +512,17 @@ class PaillierEncryptedStorage:
         encode_n, max_int: bigint, the para used for encode the plaintext
     '''
 
-    def __init__(self, pen_storage, base_storage, exp_storage, vec_size,
-                 mem_type: int, data_type, fpn_encode_n, fpn_encode_max_int):
+    def __init__(
+        self,
+        pen_storage,
+        base_storage,
+        exp_storage,
+        vec_size,
+        mem_type: int,
+        data_type,
+        fpn_encode_n,
+        fpn_encode_max_int,
+    ):
         self.mem_type = mem_type
         '''Actual data and length for pen'''
         self.pen_storage = pen_storage
@@ -595,7 +649,8 @@ def __init__(self, pubkey_storage):
             pubkey_storage.n,
             pubkey_storage.g,
             pubkey_storage.nsquare,
-            pubkey_storage.max_int)
+            pubkey_storage.max_int,
+        )
 
     def __del__(self):
         pi_free_d_pub_key(self.pub_key_ptr)
@@ -619,7 +674,8 @@ def __init__(self, privkey_storage):
             privkey_storage.qsquare,
             privkey_storage.q_inverse,
             privkey_storage.hp,
-            privkey_storage.hq)
+            privkey_storage.hq,
+        )
 
     def __del__(self):
         pi_free_d_priv_key(self.priv_key_ptr)
@@ -664,12 +720,12 @@ def te_free(tes):
     Return:
         None
     '''
-    if (isinstance(tes.data, int)):
+    if isinstance(tes.data, int):
         GPU_LIB.c_free(c_void_p(tes.data))
         tes.data = None
 
 
-def te_p2c(data, res):
+def te_p2c(data, res=None):
     '''
     transmit the data storage form from Python to C
     we assume data's structure has already been preserved by the upper layer
@@ -694,24 +750,44 @@ def te_p2c(data, res):
         storage_pointer = res.data
 
     # switch the differnt data types
-    if (data.dtype == 'int32'):
+    if data.dtype == 'int32':
         new_data = data.astype(np.int64)
         data_pointer = new_data.ctypes.data_as(c_void_p)
         data_type = INT64_TYPE
-        GPU_LIB.c_memcpy(c_void_p(storage_pointer), data_pointer, c_size_t(vec_size * INT64_BYTE))
-    elif (data.dtype == 'int64'):
+        GPU_LIB.c_memcpy(
+            c_void_p(storage_pointer),
+            data_pointer,
+            c_size_t(
+                vec_size *
+                INT64_BYTE))
+    elif data.dtype == 'int64':
         data_pointer = data.ctypes.data_as(c_void_p)
         data_type = INT64_TYPE
-        GPU_LIB.c_memcpy(c_void_p(storage_pointer), data_pointer, c_size_t(vec_size * INT64_BYTE))
-    elif (data.dtype == 'float32'):
+        GPU_LIB.c_memcpy(
+            c_void_p(storage_pointer),
+            data_pointer,
+            c_size_t(
+                vec_size *
+                INT64_BYTE))
+    elif data.dtype == 'float32':
         new_data = data.astype(np.float64)
         data_pointer = new_data.ctypes.data_as(c_void_p)
         data_type = FLOAT_TYPE
-        GPU_LIB.c_memcpy(c_void_p(storage_pointer), data_pointer, c_size_t(vec_size * DOUBLE_BYTE))
-    elif (data.dtype == 'float64'):
+        GPU_LIB.c_memcpy(
+            c_void_p(storage_pointer),
+            data_pointer,
+            c_size_t(
+                vec_size *
+                DOUBLE_BYTE))
+    elif data.dtype == 'float64':
         data_pointer = data.ctypes.data_as(c_void_p)
         data_type = FLOAT_TYPE
-        GPU_LIB.c_memcpy(c_void_p(storage_pointer), data_pointer, c_size_t(vec_size * DOUBLE_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(storage_pointer),
+            data_pointer,
+            c_size_t(
+                vec_size *
+                DOUBLE_BYTE))
     else:
         raise PermissionError("Invalid Data Type")
     return _te_init_store(res, storage_pointer, vec_size, MEM_HOST, data_type)
@@ -739,7 +815,7 @@ def te_c2p(store):
         raise PermissionError("Invalid Data Type")
 
 
-def te_c2bytes(data, res):
+def te_c2bytes(data, res=None):
     '''
     transmit TensorShapeStorage form from C to bytes stream.
     Used for communication between sites, since C memory is not shared
@@ -753,14 +829,17 @@ def te_c2bytes(data, res):
     bytes_res = c_buffer(DOUBLE_BYTE * data.vec_size + U_INT32_BYTE)
     # first 4 bytes: contains the data_type info
     # remaining bytes:  contains the data
-    GPU_LIB.te_get_bytes(cast(bytes_res, c_void_p),
-                         c_char_p(data.data_type.to_bytes(U_INT32_BYTE, 'little')),
-                         c_void_p(data.data), c_size_t(data.vec_size))
+    GPU_LIB.te_get_bytes(
+        cast(bytes_res, c_void_p),
+        c_char_p(data.data_type.to_bytes(U_INT32_BYTE, 'little')),
+        c_void_p(data.data),
+        c_size_t(data.vec_size),
+    )
     return bytes_res.raw
     # return pickle.dumps(data)
 
 
-def fp_c2bytes(store, res):
+def fp_c2bytes(store, res=None):
     '''
     transmit FixedPointStorage form to bytes stream;
     Used for communication between sites, since C memory is not shared
@@ -780,20 +859,26 @@ def fp_c2bytes(store, res):
     encode_n = store.encode_n
     max_int = store.max_int
     # C memory storage
-    bytes_res = c_buffer((PLAIN_BYTE + U_INT32_BYTE * 2) * store.vec_size + U_INT32_BYTE * 2 + PLAIN_BYTE * 2)
-    GPU_LIB.fp_get_bytes(cast(bytes_res, c_void_p),
-                         c_char_p(data_type.to_bytes(U_INT32_BYTE, 'little')),
-                         c_char_p(mem_type.to_bytes(U_INT32_BYTE, 'little')),
-                         c_char_p(encode_n.to_bytes(PLAIN_BYTE, 'little')),
-                         c_char_p(max_int.to_bytes(PLAIN_BYTE, 'little')),
-                         c_void_p(store.bigint_storage),
-                         c_void_p(store.base_storage),
-                         c_void_p(store.exp_storage),
-                         c_size_t(store.vec_size))
+    bytes_res = c_buffer(
+        (PLAIN_BYTE + U_INT32_BYTE * 2) * store.vec_size
+        + U_INT32_BYTE * 2
+        + PLAIN_BYTE * 2
+    )
+    GPU_LIB.fp_get_bytes(
+        cast(bytes_res, c_void_p),
+        c_char_p(data_type.to_bytes(U_INT32_BYTE, 'little')),
+        c_char_p(mem_type.to_bytes(U_INT32_BYTE, 'little')),
+        c_char_p(encode_n.to_bytes(PLAIN_BYTE, 'little')),
+        c_char_p(max_int.to_bytes(PLAIN_BYTE, 'little')),
+        c_void_p(store.bigint_storage),
+        c_void_p(store.base_storage),
+        c_void_p(store.exp_storage),
+        c_size_t(store.vec_size),
+    )
     return bytes_res.raw
 
 
-def pi_c2bytes(store, res):
+def pi_c2bytes(store, res=None):
     '''
     transmit PaillierEncryptedNumber form to bytes stream
     Used for communication between sites, since C memory is not shared
@@ -811,22 +896,27 @@ def pi_c2bytes(store, res):
     encode_n = store.encode_n
     max_int = store.encode_max_int
     # C memory storage
-    bytes_res = c_buffer((CIPHER_BYTE + U_INT32_BYTE * 2) * store.vec_size + U_INT32_BYTE * 2 + CIPHER_BYTE * 2)
-    GPU_LIB.pi_get_bytes(cast(bytes_res, c_void_p),
-                         c_char_p(data_type.to_bytes(U_INT32_BYTE, 'little')),
-                         c_char_p(mem_type.to_bytes(U_INT32_BYTE, 'little')),
-                         c_char_p(encode_n.to_bytes(CIPHER_BYTE, 'little')),
-                         c_char_p(max_int.to_bytes(CIPHER_BYTE, 'little')),
-                         c_void_p(store.pen_storage),
-                         c_void_p(store.base_storage),
-                         c_void_p(store.exp_storage),
-                         c_size_t(store.vec_size))
+    bytes_res = c_buffer(
+        (CIPHER_BYTE + U_INT32_BYTE * 2) * store.vec_size
+        + U_INT32_BYTE * 2
+        + CIPHER_BYTE * 2
+    )
+    GPU_LIB.pi_get_bytes(
+        cast(bytes_res, c_void_p),
+        c_char_p(data_type.to_bytes(U_INT32_BYTE, 'little')),
+        c_char_p(mem_type.to_bytes(U_INT32_BYTE, 'little')),
+        c_char_p(encode_n.to_bytes(CIPHER_BYTE, 'little')),
+        c_char_p(max_int.to_bytes(CIPHER_BYTE, 'little')),
+        c_void_p(store.pen_storage),
+        c_void_p(store.base_storage),
+        c_void_p(store.exp_storage),
+        c_size_t(store.vec_size),
+    )
 
     return bytes_res.raw
 
 
-def _te_init_store(store, data, vec_size, mem_type,
-                   data_type):
+def _te_init_store(store, data, vec_size, mem_type, data_type):
     '''
     initialize tensor storage,
     -----------
@@ -847,7 +937,7 @@ def _te_init_store(store, data, vec_size, mem_type,
     return store
 
 
-def te_bytes2c(data, res):
+def te_bytes2c(data, res=None):
     '''
     Restore TensorStorage from bytes buffer,
     TensorStorage.data is a ptr pointing to the restored C memory space.
@@ -864,9 +954,12 @@ def te_bytes2c(data, res):
         storage_pointer = GPU_LIB.c_malloc(c_size_t(len_data))
     else:
         storage_pointer = res.data
-    GPU_LIB.te_from_bytes_get_c(cast(data_type_res, c_void_p),
-                                c_void_p(storage_pointer),
-                                c_char_p(data), c_size_t(len_data))
+    GPU_LIB.te_from_bytes_get_c(
+        cast(data_type_res, c_void_p),
+        c_void_p(storage_pointer),
+        c_char_p(data),
+        c_size_t(len_data),
+    )
     data_type = int.from_bytes(data_type_res, 'little')
     # TODO: change according to different data_types' length,
     # now just use DOUBLE BYTE because we have only INT64 and DOUBLE,
@@ -875,7 +968,7 @@ def te_bytes2c(data, res):
     return _te_init_store(res, storage_pointer, vec_size, MEM_HOST, data_type)
 
 
-def fp_bytes2c(data, res):
+def fp_bytes2c(data, res=None):
     '''
     Restore FixedPointStorage from bytes buffer.
     ---------------
@@ -886,7 +979,9 @@ def fp_bytes2c(data, res):
         res:  FixedPointStorage, the restored struct from para.data.
     '''
     # caculate vec_size
-    vec_size = ((len(data) - 2 * (U_INT32_BYTE + PLAIN_BYTE)) // (U_INT32_BYTE * 2 + PLAIN_BYTE))
+    vec_size = (len(data) - 2 * (U_INT32_BYTE + PLAIN_BYTE)) // (
+        U_INT32_BYTE * 2 + PLAIN_BYTE
+    )
     # uint32
     data_type = c_buffer(U_INT32_BYTE)
     mem_type = c_buffer(U_INT32_BYTE)
@@ -904,18 +999,30 @@ def fp_bytes2c(data, res):
         exp = res.exp_storage
 
     GPU_LIB.fp_from_bytes_get_c(
-        cast(data_type, c_void_p), cast(mem_type, c_void_p),
-        cast(encode_n, c_void_p), cast(max_int, c_void_p),
-        cast(fpn, c_void_p), cast(base, c_void_p), cast(exp, c_void_p),
-        c_char_p(data), c_size_t(vec_size))
-    return _fp_init_store(res, fpn, base, exp, vec_size,
-                          int.from_bytes(encode_n, 'little'),
-                          int.from_bytes(max_int, 'little'),
-                          int.from_bytes(mem_type, 'little'),
-                          int.from_bytes(data_type, 'little'))
+        cast(data_type, c_void_p),
+        cast(mem_type, c_void_p),
+        cast(encode_n, c_void_p),
+        cast(max_int, c_void_p),
+        cast(fpn, c_void_p),
+        cast(base, c_void_p),
+        cast(exp, c_void_p),
+        c_char_p(data),
+        c_size_t(vec_size),
+    )
+    return _fp_init_store(
+        res,
+        fpn,
+        base,
+        exp,
+        vec_size,
+        int.from_bytes(encode_n, 'little'),
+        int.from_bytes(max_int, 'little'),
+        int.from_bytes(mem_type, 'little'),
+        int.from_bytes(data_type, 'little'),
+    )
 
 
-def pi_bytes2c(data, res):
+def pi_bytes2c(data, res=None):
     '''
     Restored PaillierEncryptedStorage from bytes buffer
     --------------
@@ -926,7 +1033,9 @@ def pi_bytes2c(data, res):
         res:  PaillierEncryptedStorage, the restored struct from para.data
     '''
     # caculate vec_size
-    vec_size = ((len(data) - 2 * (U_INT32_BYTE + CIPHER_BYTE)) // (U_INT32_BYTE * 2 + CIPHER_BYTE))
+    vec_size = (len(data) - 2 * (U_INT32_BYTE + CIPHER_BYTE)) // (
+        U_INT32_BYTE * 2 + CIPHER_BYTE
+    )
     # uint32
     data_type = c_buffer(U_INT32_BYTE)
     mem_type = c_buffer(U_INT32_BYTE)
@@ -944,15 +1053,27 @@ def pi_bytes2c(data, res):
         exp = res.exp_storage
 
     GPU_LIB.fp_from_bytes_get_c(
-        cast(data_type, c_void_p), cast(mem_type, c_void_p),
-        cast(encode_n, c_void_p), cast(max_int, c_void_p),
-        cast(pen, c_void_p), cast(base, c_void_p),
-        cast(exp, c_void_p), c_char_p(data), c_size_t(vec_size))
-    return _pi_init_store(res, pen, base, exp, vec_size,
-                          int.from_bytes(mem_type, 'little'),
-                          int.from_bytes(data_type, 'little'),
-                          int.from_bytes(encode_n, 'little'),
-                          int.from_bytes(max_int, 'little'))
+        cast(data_type, c_void_p),
+        cast(mem_type, c_void_p),
+        cast(encode_n, c_void_p),
+        cast(max_int, c_void_p),
+        cast(pen, c_void_p),
+        cast(base, c_void_p),
+        cast(exp, c_void_p),
+        c_char_p(data),
+        c_size_t(vec_size),
+    )
+    return _pi_init_store(
+        res,
+        pen,
+        base,
+        exp,
+        vec_size,
+        int.from_bytes(mem_type, 'little'),
+        int.from_bytes(data_type, 'little'),
+        int.from_bytes(encode_n, 'little'),
+        int.from_bytes(max_int, 'little'),
+    )
 
 
 def _te_init_shape(shape_store, shape_tuple):
@@ -971,8 +1092,9 @@ def _te_init_shape(shape_store, shape_tuple):
     return shape_store
 
 
-def _te_init_ss(res_store, res_data, vec_size,
-                res_shape, shape_tuple, mem_type, data_type):
+def _te_init_ss(
+    res_store, res_data, vec_size, res_shape, shape_tuple, mem_type, data_type
+):
     '''
     Init TensorStorage and TensorShapeStorage at the same time
     ------------
@@ -987,154 +1109,341 @@ def _te_init_ss(res_store, res_data, vec_size,
     Return:
         tuple, (TensorStorage, TensorShapeStorage)
     '''
-    return _te_init_store(res_store, res_data, vec_size, mem_type, data_type), _te_init_shape(res_shape, shape_tuple)
+    return _te_init_store(
+        res_store, res_data, vec_size, mem_type, data_type
+    ), _te_init_shape(res_shape, shape_tuple)
 
 
-'''''''''
+'''''' '''
 The following calculators are done on TensorStorage
 Definition are the same with numpy
 TensorStorage.data should all be ndarray datatype in order to support numpy
 
 NOT USED IN OUR FATE IMPLEMENTATION,
 but Webank's implementation seems to have used them
-'''''''''
-
-
-def te_slice(store, shape, start, stop, axis, res_store, res_shape, stream):
+''' ''''''
+
+
+def te_slice(
+        store,
+        shape,
+        start,
+        stop,
+        axis,
+        res_store=None,
+        res_shape=None,
+        stream=None):
     if axis == 1:
         res_data = store.data[:, start:stop]
     elif axis == 0:
         res_data = store.data[start:stop]
     else:
         raise NotImplementedError()
-    return _te_init_ss(res_store, res_data, res_data.size,
-                       res_shape, res_data.shape,
-                       store.mem_type, store.data_type)
-
-
-def te_cat(stores, axis, res_store, res_shape):
+    return _te_init_ss(
+        res_store,
+        res_data,
+        res_data.size,
+        res_shape,
+        res_data.shape,
+        store.mem_type,
+        store.data_type,
+    )
+
+
+def te_cat(stores, axis, res_store=None, res_shape=None):
     if axis == 0:
         res_data = np.vstack([x.data for x in stores])
     elif axis == 1:
         res_data = np.hstack([x.data for x in stores])
     else:
         raise NotImplementedError()
-    return _te_init_ss(res_store, res_data, res_data.size,
-                       res_shape, res_data.shape,
-                       stores[0].mem_type, stores[0].data_type)
+    return _te_init_ss(
+        res_store,
+        res_data,
+        res_data.size,
+        res_shape,
+        res_data.shape,
+        stores[0].mem_type,
+        stores[0].data_type,
+    )
 
 
 # TODO: precise data_type
 
-def te_pow(left_store, right, left_shape, res_store, res_shape, stream):
-    res_data = left_store.data ** right
-    return _te_init_ss(res_store, res_data, res_data.size,
-                       res_shape, res_data.shape,
-                       left_store.mem_type, left_store.data_type)
+
+def te_pow(
+        left_store,
+        right,
+        left_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None):
+    res_data = left_store.data**right
+    return _te_init_ss(
+        res_store,
+        res_data,
+        res_data.size,
+        res_shape,
+        res_data.shape,
+        left_store.mem_type,
+        left_store.data_type,
+    )
 
 
 # TODO: precise data_type
 
-def te_add(left_store, right_store, left_shape, right_shape,
-           res_store, res_shape, stream):
+
+def te_add(
+    left_store,
+    right_store,
+    left_shape,
+    right_shape,
+    res_store=None,
+    res_shape=None,
+    stream=None,
+):
     res_data = left_store.data + right_store.data
-    return _te_init_ss(res_store, res_data, res_data.size,
-                       res_shape, res_data.shape,
-                       left_store.mem_type, left_store.data_type)
+    return _te_init_ss(
+        res_store,
+        res_data,
+        res_data.size,
+        res_shape,
+        res_data.shape,
+        left_store.mem_type,
+        left_store.data_type,
+    )
 
 
 # TODO: precise data_type
 
-def te_mul(left_store, right_store, left_shape, right_shape,
-           res_store, res_shape, stream):
+
+def te_mul(
+    left_store,
+    right_store,
+    left_shape,
+    right_shape,
+    res_store=None,
+    res_shape=None,
+    stream=None,
+):
     res_data = left_store.data * right_store.data
-    return _te_init_ss(res_store, res_data, res_data.size,
-                       res_shape, res_data.shape,
-                       left_store.mem_type, left_store.data_type)
+    return _te_init_ss(
+        res_store,
+        res_data,
+        res_data.size,
+        res_shape,
+        res_data.shape,
+        left_store.mem_type,
+        left_store.data_type,
+    )
 
 
 # TODO: precise data_type
 
-def te_truediv(left_store, right_store, left_shape, right_shape,
-               res_store, res_shape, stream):
-    res_data = left_store.data / right_store.data
-    return _te_init_ss(res_store, res_data, res_data.size,
-                       res_shape, res_data.shape,
-                       left_store.mem_type, FLOAT_TYPE)
-
 
-def te_floordiv(left_store, right_store, left_shape, right_shape,
-                res_store, res_shape, stream):
+def te_truediv(
+    left_store,
+    right_store,
+    left_shape,
+    right_shape,
+    res_store=None,
+    res_shape=None,
+    stream=None,
+):
+    res_data = left_store.data / right_store.data
+    return _te_init_ss(
+        res_store,
+        res_data,
+        res_data.size,
+        res_shape,
+        res_data.shape,
+        left_store.mem_type,
+        FLOAT_TYPE,
+    )
+
+
+def te_floordiv(
+    left_store,
+    right_store,
+    left_shape,
+    right_shape,
+    res_store=None,
+    res_shape=None,
+    stream=None,
+):
     res_data = left_store.data // right_store.data
-    return _te_init_ss(res_store, res_data, res_data.size,
-                       res_shape, res_data.shape,
-                       left_store.mem_type, INT64_TYPE)
-
-
-def te_sub(left_store, right_store, left_shape, right_shape,
-           res_store, res_shape, stream):
+    return _te_init_ss(
+        res_store,
+        res_data,
+        res_data.size,
+        res_shape,
+        res_data.shape,
+        left_store.mem_type,
+        INT64_TYPE,
+    )
+
+
+def te_sub(
+    left_store,
+    right_store,
+    left_shape,
+    right_shape,
+    res_store=None,
+    res_shape=None,
+    stream=None,
+):
     res_data = left_store.data - right_store.data
-    return _te_init_ss(res_store, res_data, res_data.size,
-                       res_shape, res_data.shape,
-                       left_store.mem_type, left_store.data_type)
+    return _te_init_ss(
+        res_store,
+        res_data,
+        res_data.size,
+        res_shape,
+        res_data.shape,
+        left_store.mem_type,
+        left_store.data_type,
+    )
 
 
 # TODO: precise data_type, currently only inherent from left
 
-def te_matmul(left_store, right_store, left_shape, right_shape,
-              res_store, res_shape, stream):
+
+def te_matmul(
+    left_store,
+    right_store,
+    left_shape,
+    right_shape,
+    res_store=None,
+    res_shape=None,
+    stream=None,
+):
     res_data = left_store.data @ right_store.data
-    return _te_init_ss(res_store, res_data, res_data.size,
-                       res_shape, res_data.shape,
-                       left_store.mem_type, left_store.data_type)
+    return _te_init_ss(
+        res_store,
+        res_data,
+        res_data.size,
+        res_shape,
+        res_data.shape,
+        left_store.mem_type,
+        left_store.data_type,
+    )
 
 
 def te_abs(left_store, left_shape, res_store, res_shape, stream):
-    return _te_init_ss(res_store, abs(left_store.data), left_store.vec_size,
-                       res_shape, left_shape.to_tuple(),
-                       left_store.mem_type, left_store.data_type)
-
-
-def te_neg(left_store, left_shape, res_store, res_shape, stream):
-    return _te_init_ss(res_store, -left_store.data, left_store.vec_size,
-                       res_shape, left_shape.to_tuple(),
-                       left_store.mem_type, left_store.data_type)
-
-
-def te_transpose(left_store, left_shape, res_store, res_shape, stream):
+    return _te_init_ss(
+        res_store,
+        abs(left_store.data),
+        left_store.vec_size,
+        res_shape,
+        left_shape.to_tuple(),
+        left_store.mem_type,
+        left_store.data_type,
+    )
+
+
+def te_neg(
+        left_store,
+        left_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None):
+    return _te_init_ss(
+        res_store,
+        -left_store.data,
+        left_store.vec_size,
+        res_shape,
+        left_shape.to_tuple(),
+        left_store.mem_type,
+        left_store.data_type,
+    )
+
+
+def te_transpose(
+        left_store,
+        left_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None):
     res_data = left_store.data.transpose()
-    return _te_init_ss(res_store, res_data, res_data.size,
-                       res_shape, res_data.shape,
-                       left_store.mem_type, left_store.data_type)
-
-
-def te_sum(left_store, left_shape, axis, res_store, res_shape, stream):
+    return _te_init_ss(
+        res_store,
+        res_data,
+        res_data.size,
+        res_shape,
+        res_data.shape,
+        left_store.mem_type,
+        left_store.data_type,
+    )
+
+
+def te_sum(
+        left_store,
+        left_shape,
+        axis,
+        res_store=None,
+        res_shape=None,
+        stream=None):
     res_data = left_store.data.sum(axis=axis)
-    return _te_init_ss(res_store, res_data, res_data.size,
-                       res_shape, res_data.shape,
-                       left_store.mem_type, left_store.data_type)
-
-
-def te_reshape(store, shape, new_shape, res_store, res_shape, stream):
-    return _te_init_ss(res_store, store.data.reshape(new_shape),
-                       store.vec_size,
-                       res_shape, new_shape.to_tuple(),
-                       store.mem_type, store.data_type)
-
-
-def te_exp(store, shape, res_store, res_shape, stream):
-    return _te_init_ss(res_store, np.exp(store.data), store.vec_size,
-                       res_shape, shape.to_tuple(),
-                       store.mem_type, FLOAT_TYPE)
-
-
-def te_hstack(left_store, right_store, left_shape, right_shape,
-              res_store, res_shape, stream):
+    return _te_init_ss(
+        res_store,
+        res_data,
+        res_data.size,
+        res_shape,
+        res_data.shape,
+        left_store.mem_type,
+        left_store.data_type,
+    )
+
+
+def te_reshape(
+        store,
+        shape,
+        new_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None):
+    return _te_init_ss(
+        res_store,
+        store.data.reshape(new_shape),
+        store.vec_size,
+        res_shape,
+        new_shape.to_tuple(),
+        store.mem_type,
+        store.data_type,
+    )
+
+
+def te_exp(store, shape, res_store=None, res_shape=None, stream=None):
+    return _te_init_ss(
+        res_store,
+        np.exp(store.data),
+        store.vec_size,
+        res_shape,
+        shape.to_tuple(),
+        store.mem_type,
+        FLOAT_TYPE,
+    )
+
+
+def te_hstack(
+    left_store,
+    right_store,
+    left_shape,
+    right_shape,
+    res_store=None,
+    res_shape=None,
+    stream=None,
+):
     _store, _shape = te_cat([left_store, right_store], 1, res_store, res_shape)
     # avoid naming collision
-    return _te_init_ss(res_store, _store.data, _store.vec_size,
-                       _shape, _shape.to_tuple(),
-                       left_store.mem_type, left_store.data_type)
+    return _te_init_ss(
+        res_store,
+        _store.data,
+        _store.vec_size,
+        _shape,
+        _shape.to_tuple(),
+        left_store.mem_type,
+        left_store.data_type,
+    )
 
 
 def te_c2p_first(store):
@@ -1170,8 +1479,12 @@ def te_c2p_first(store):
 
 def direct_bi_alloc(res, vec_size, elem_size, mem_type):
     return _bi_init_store(
-        res, GPU_LIB.c_direct_malloc(c_size_t(vec_size * elem_size)),
-        vec_size, elem_size, mem_type)
+        res,
+        GPU_LIB.c_direct_malloc(c_size_t(vec_size * elem_size)),
+        vec_size,
+        elem_size,
+        mem_type,
+    )
 
 
 def direct_pi_alloc(res, size, mem_type):
@@ -1184,8 +1497,16 @@ def direct_pi_alloc(res, size, mem_type):
     res_base = GPU_LIB.c_direct_malloc(c_size_t(size * U_INT32_BYTE))
     res_exp = GPU_LIB.c_direct_malloc(c_size_t(size * U_INT32_BYTE))
     # data_type, encode_n and encode_max_int all set to 0
-    return _pi_init_store(res, res_pen, res_base, res_exp,
-                          size, mem_type, 0, 0, 0)
+    return _pi_init_store(
+        res,
+        res_pen,
+        res_base,
+        res_exp,
+        size,
+        mem_type,
+        0,
+        0,
+        0)
 
 
 def direct_fp_alloc(res, size, mem_type):
@@ -1197,8 +1518,16 @@ def direct_fp_alloc(res, size, mem_type):
         res_fpn = None
     res_base = GPU_LIB.c_direct_malloc(c_size_t(size * U_INT32_BYTE))
     res_exp = GPU_LIB.c_direct_malloc(c_size_t(size * U_INT32_BYTE))
-    return _fp_init_store(res, res_fpn, res_base, res_exp,
-                          size, 0, 0, mem_type, 0)
+    return _fp_init_store(
+        res,
+        res_fpn,
+        res_base,
+        res_exp,
+        size,
+        0,
+        0,
+        mem_type,
+        0)
 
 
 def direct_te_alloc(res, size, mem_type):
@@ -1208,8 +1537,12 @@ def direct_te_alloc(res, size, mem_type):
 
 def bi_alloc(res, vec_size, elem_size, mem_type):
     return _bi_init_store(
-        res, GPU_LIB.c_malloc(c_size_t(vec_size * elem_size)),
-        vec_size, elem_size, mem_type)
+        res,
+        GPU_LIB.c_malloc(c_size_t(vec_size * elem_size)),
+        vec_size,
+        elem_size,
+        mem_type,
+    )
 
 
 def pi_alloc(res, size, mem_type):
@@ -1222,8 +1555,16 @@ def pi_alloc(res, size, mem_type):
     res_base = GPU_LIB.c_malloc(c_size_t(size * U_INT32_BYTE))
     res_exp = GPU_LIB.c_malloc(c_size_t(size * U_INT32_BYTE))
     # data_type, encode_n and encode_max_int all set to 0
-    return _pi_init_store(res, res_pen, res_base, res_exp,
-                          size, mem_type, 0, 0, 0)
+    return _pi_init_store(
+        res,
+        res_pen,
+        res_base,
+        res_exp,
+        size,
+        mem_type,
+        0,
+        0,
+        0)
 
 
 def fp_alloc(res, size, mem_type):
@@ -1235,8 +1576,16 @@ def fp_alloc(res, size, mem_type):
         res_fpn = None
     res_base = GPU_LIB.c_malloc(c_size_t(size * U_INT32_BYTE))
     res_exp = GPU_LIB.c_malloc(c_size_t(size * U_INT32_BYTE))
-    return _fp_init_store(res, res_fpn, res_base, res_exp,
-                          size, 0, 0, mem_type, 0)
+    return _fp_init_store(
+        res,
+        res_fpn,
+        res_base,
+        res_exp,
+        size,
+        0,
+        0,
+        mem_type,
+        0)
 
 
 def te_alloc(res, size, mem_type):
@@ -1262,17 +1611,17 @@ def pi_free(ptr):
     ptr.pen_storage, ptr.base_storage, ptr.exp_storage = None, None, None
 
 
-def fp_h2d(target, src, stream=None):
+def fp_h2d(target, src=None, stream=None):
     '''TODO: currently not Implemented because it is not used'''
     return src
 
 
-def fp_d2h(target, src, stream):
+def fp_d2h(target, src=None, stream=None):
     '''TODO: currently not Implemented because it is not used'''
     return src
 
 
-def pi_h2d(pub_key, target, src, stream):
+def pi_h2d(pub_key, target, src=None, stream=None):
     '''
     Transfer C-memory stored PaillierEncryptedStorage into GPU-memory stored,
     with the internal exponent aligned done.
@@ -1301,16 +1650,30 @@ def pi_h2d(pub_key, target, src, stream):
         exp_storage = target.exp_storage
 
     GPU_LIB.pen_host2device_exp_align(
-        c_char_p(src.pen_storage), c_void_p(src.base_storage), c_void_p(src.exp_storage),
-        c_void_p(pen_storage), c_void_p(base_storage), c_void_p(exp_storage),
-        c_size_t(vec_size), c_void_p(pub_key.pub_key_ptr))
+        c_char_p(src.pen_storage),
+        c_void_p(src.base_storage),
+        c_void_p(src.exp_storage),
+        c_void_p(pen_storage),
+        c_void_p(base_storage),
+        c_void_p(exp_storage),
+        c_size_t(vec_size),
+        c_void_p(pub_key.pub_key_ptr),
+    )
     mem_type = MEM_DEVICE
     return _pi_init_store(
-        target, pen_storage, base_storage, exp_storage,
-        vec_size, mem_type, src.data_type, src.encode_n, src.encode_max_int)
+        target,
+        pen_storage,
+        base_storage,
+        exp_storage,
+        vec_size,
+        mem_type,
+        src.data_type,
+        src.encode_n,
+        src.encode_max_int,
+    )
 
 
-def pi_d2h(target, src, stream):
+def pi_d2h(target, src=None, stream=None):
     '''
     Transfer GPU-memory stored PaillierEncryptedStorage into C-memory stored ones.
     --------------
@@ -1332,25 +1695,36 @@ def pi_d2h(target, src, stream):
         base_storage = target.base_storage
         exp_storage = target.exp_storage
 
-    GPU_LIB.c_memcpy(c_void_p(base_storage),
-                     c_void_p(src.base_storage),
-                     c_size_t(vec_size * U_INT32_BYTE))
-    GPU_LIB.c_memcpy(c_void_p(exp_storage),
-                     c_void_p(src.exp_storage),
-                     c_size_t(vec_size * U_INT32_BYTE))
+    GPU_LIB.c_memcpy(
+        c_void_p(base_storage),
+        c_void_p(src.base_storage),
+        c_size_t(vec_size * U_INT32_BYTE),
+    )
+    GPU_LIB.c_memcpy(
+        c_void_p(exp_storage),
+        c_void_p(src.exp_storage),
+        c_size_t(vec_size * U_INT32_BYTE),
+    )
 
     GPU_LIB.pen_device2host(
-        c_void_p(src.pen_storage),
-        c_char_p(pen_storage),
-        c_size_t(src.vec_size))
+        c_void_p(
+            src.pen_storage), c_char_p(pen_storage), c_size_t(
+            src.vec_size))
     mem_type = MEM_HOST
     return _pi_init_store(
-        target, pen_storage, base_storage, exp_storage,
-        src.vec_size, mem_type, src.data_type,
-        src.encode_n, src.encode_max_int)
+        target,
+        pen_storage,
+        base_storage,
+        exp_storage,
+        src.vec_size,
+        mem_type,
+        src.data_type,
+        src.encode_n,
+        src.encode_max_int,
+    )
 
 
-def pi_h2d_pub_key(target, src):
+def pi_h2d_pub_key(src):
     '''
     Transfer CPU C-memory stored PubKeyStorage to GPU-memory stored Dev_PubKeyStorage
     ----------------
@@ -1361,7 +1735,7 @@ def pi_h2d_pub_key(target, src):
     return target
 
 
-def pi_h2d_priv_key(target, src):
+def pi_h2d_priv_key(src):
     '''
     Transfer CPU C-memory stored PubKeyStorage to GPU-memory stored Dev_PubKeyStorage
     ----------------
@@ -1390,7 +1764,7 @@ def pi_free_d_priv_key(target):
     GPU_LIB.cuda_free(c_void_p(target))
 
 
-def pi_p2c_pub_key(target, src):
+def pi_p2c_pub_key(src):
     '''
     Transfer Python form PaillierPublicKey to C form PubKeyStorage,
     the latter can be used for C/Cuda computing
@@ -1399,16 +1773,26 @@ def pi_p2c_pub_key(target, src):
     return target
 
 
-def pi_p2c_priv_key(target, src):
+def pi_p2c_priv_key(src):
     '''Transfer Python form PaillierPrivateKey to C form PrivKeyStorage'''
-    target = PrivKeyStorage(src.p, src.q, src.psquare, src.qsquare,
-                            src.q_inverse, src.hp, src.hq)
+    target = PrivKeyStorage(
+        src.p, src.q, src.psquare, src.qsquare, src.q_inverse, src.hp, src.hq
+    )
     return target
 
 
 # ###########PaillierEncrypted STORAGE INITIALIZE#################
-def _pi_init_store(res_store, pen_storage, base_storage, exp_storage, vec_size,
-                   mem_type, data_type, encode_n, encode_max_int):
+def _pi_init_store(
+    res_store,
+    pen_storage,
+    base_storage,
+    exp_storage,
+    vec_size,
+    mem_type,
+    data_type,
+    encode_n,
+    encode_max_int,
+):
     '''
     init a new PaillierEncryptedStorage
     ---------------
@@ -1418,8 +1802,15 @@ def _pi_init_store(res_store, pen_storage, base_storage, exp_storage, vec_size,
     '''
     if res_store is None:
         res_store = PaillierEncryptedStorage(
-            pen_storage, base_storage, exp_storage, vec_size,
-            mem_type, data_type, encode_n, encode_max_int)
+            pen_storage,
+            base_storage,
+            exp_storage,
+            vec_size,
+            mem_type,
+            data_type,
+            encode_n,
+            encode_max_int,
+        )
     else:
         res_store.pen_storage = pen_storage
         res_store.base_storage = base_storage
@@ -1437,15 +1828,34 @@ def _pi_init_store(res_store, pen_storage, base_storage, exp_storage, vec_size,
 _pi_init_shape = _te_init_shape
 
 
-def _pi_init_ss(res_store, pen_storage, base_storage, exp_storage, vec_size,
-                res_shape, res_shape_tuple,
-                mem_type, data_type, encode_n, encode_max_int):
+def _pi_init_ss(
+    res_store,
+    pen_storage,
+    base_storage,
+    exp_storage,
+    vec_size,
+    res_shape,
+    res_shape_tuple,
+    mem_type,
+    data_type,
+    encode_n,
+    encode_max_int,
+):
     '''
     init new PaillierEncryptedStorage and corresponding TensorShapeStorage at same time
     Paras are identical to _pi_init_store & _te_init_shape
     '''
-    return _pi_init_store(res_store, pen_storage, base_storage, exp_storage, vec_size, mem_type, data_type, encode_n,
-                          encode_max_int), _pi_init_shape(res_shape, res_shape_tuple)
+    return _pi_init_store(
+        res_store,
+        pen_storage,
+        base_storage,
+        exp_storage,
+        vec_size,
+        mem_type,
+        data_type,
+        encode_n,
+        encode_max_int,
+    ), _pi_init_shape(res_shape, res_shape_tuple)
 
 
 ''' transfor PEN tensor from Python memory to C memory '''
@@ -1487,22 +1897,37 @@ def pi_p2c(target, src, data_type=FLOAT_TYPE):
     for i in range(vec_size):
         src_number = src[i].ciphertext(False).to_bytes(CIPHER_BYTE, 'little')
         GPU_LIB.c_memcpy(
-            c_void_p(res_pen + i * CIPHER_BYTE), c_char_p(src_number),
-            c_size_t(CIPHER_BYTE))
+            c_void_p(res_pen + i * CIPHER_BYTE),
+            c_char_p(src_number),
+            c_size_t(CIPHER_BYTE),
+        )
         base_temp.append(PEN_BASE)
         exp_temp.append(src[i].exponent)
-    # base and exp are deepcopyed in order to prevent potential double free here
-    base_array_pointer = np.asarray(base_temp, np.uint32).ctypes.data_as(c_void_p)
-    exp_array_pointer = np.asarray(exp_temp, np.uint32).ctypes.data_as(c_void_p)
+    # base and exp are deep-copied in order to prevent a potential double
+    # free here
+    base_array_pointer = np.asarray(
+        base_temp, np.uint32).ctypes.data_as(c_void_p)
+    exp_array_pointer = np.asarray(
+        exp_temp, np.uint32).ctypes.data_as(c_void_p)
     GPU_LIB.c_memcpy(
-        c_void_p(res_base), base_array_pointer,
-        c_size_t(vec_size * U_INT32_BYTE))
+        c_void_p(res_base),
+        base_array_pointer,
+        c_size_t(
+            vec_size *
+            U_INT32_BYTE))
     GPU_LIB.c_memcpy(
-        c_void_p(res_exp), exp_array_pointer,
-        c_size_t(vec_size * U_INT32_BYTE))
+        c_void_p(res_exp), exp_array_pointer, c_size_t(vec_size * U_INT32_BYTE)
+    )
     return _pi_init_store(
-        target, res_pen, res_base, res_exp,
-        vec_size, MEM_HOST, data_type, n, max_int)
+        target,
+        res_pen,
+        res_base,
+        res_exp,
+        vec_size,
+        MEM_HOST,
+        data_type,
+        n,
+        max_int)
 
 
 def _bi_init_store(res_store, data, count, elem_size, mem_type):
@@ -1520,24 +1945,46 @@ def _bi_init_store(res_store, data, count, elem_size, mem_type):
 _bi_init_shape = _te_init_shape
 
 
-def _bi_init_ss(res_store, res_data, vec_size, res_shape, res_shape_tuple,
-                elem_size, mem_type):
+def _bi_init_ss(
+        res_store,
+        res_data,
+        vec_size,
+        res_shape,
+        res_shape_tuple,
+        elem_size,
+        mem_type):
     '''Init BigIntStorage and the corresponding TensorShapeStorage'''
-    return _bi_init_store(res_store, res_data, vec_size, elem_size, mem_type), _bi_init_shape(res_shape,
-                                                                                              res_shape_tuple)
-
-
-def _fp_init_store(res_store, fpn_storage, base_storage, exp_storage,
-                   vec_size, n, max_int, mem_type,
-                   data_type):
+    return _bi_init_store(
+        res_store, res_data, vec_size, elem_size, mem_type
+    ), _bi_init_shape(res_shape, res_shape_tuple)
+
+
+def _fp_init_store(
+    res_store,
+    fpn_storage,
+    base_storage,
+    exp_storage,
+    vec_size,
+    n,
+    max_int,
+    mem_type,
+    data_type,
+):
     '''
     Init FixedPointStorage class,
     paras are identical to the elements in FixedPointStorage
     '''
     if res_store is None:
         res_store = FixedPointStorage(
-            fpn_storage, base_storage, exp_storage,
-            vec_size, n, max_int, mem_type, data_type)
+            fpn_storage,
+            base_storage,
+            exp_storage,
+            vec_size,
+            n,
+            max_int,
+            mem_type,
+            data_type,
+        )
     else:
         res_store.bigint_storage = fpn_storage
         res_store.base_storage = base_storage
@@ -1552,15 +1999,36 @@ def _fp_init_store(res_store, fpn_storage, base_storage, exp_storage,
     return res_store
 
 
-def _fp_init_ss(res_store, fpn_storage, base_storage, exp_storage,
-                vec_size, n, max_int,
-                res_shape, res_shape_tuple, mem_type, data_type):
+def _fp_init_ss(
+    res_store,
+    fpn_storage,
+    base_storage,
+    exp_storage,
+    vec_size,
+    n,
+    max_int,
+    res_shape,
+    res_shape_tuple,
+    mem_type,
+    data_type,
+):
     '''Init FixedPointStorage and the corresponding TensorShapeStorage'''
-    return _fp_init_store(res_store, fpn_storage, base_storage, exp_storage, vec_size, n, max_int, mem_type,
-                          data_type), _te_init_shape(res_shape, res_shape_tuple)
-
-
-def get_add_mul_size(left_shape: TensorShapeStorage, right_shape: TensorShapeStorage):
+    return _fp_init_store(
+        res_store,
+        fpn_storage,
+        base_storage,
+        exp_storage,
+        vec_size,
+        n,
+        max_int,
+        mem_type,
+        data_type,
+    ), _te_init_shape(res_shape, res_shape_tuple)
+
+
+def get_add_mul_size(
+        left_shape: TensorShapeStorage,
+        right_shape: TensorShapeStorage):
     '''
     Get the result size of pi_add, pi_mul, fp_mul calculators
     --------------------
@@ -1570,16 +2038,22 @@ def get_add_mul_size(left_shape: TensorShapeStorage, right_shape: TensorShapeSto
         res_size: int, the size of the return value
     '''
     if isinstance(left_shape, TensorShapeStorage) is False:
-        raise RuntimeError(f"Illegal shape type : {type(left_shape)}, params need type: {TensorShapeStorage}")
+        raise RuntimeError(
+            f"Illegal shape type : {type(left_shape)}, params need type: {TensorShapeStorage}"
+        )
     if isinstance(right_shape, TensorShapeStorage) is False:
-        raise RuntimeError(f"Illegal shape type : {type(right_shape)}, params need type: {TensorShapeStorage}")
+        raise RuntimeError(
+            f"Illegal shape type : {type(right_shape)}, params need type: {TensorShapeStorage}"
+        )
 
     P, Q, R, S, res_shape_tuple = __shape_resolve(left_shape, right_shape)
     res_size = max(P, R) * max(Q, S)
     return res_size
 
 
-def get_matmul_rmatmul_size(left_shape: TensorShapeStorage, right_shape: TensorShapeStorage):
+def get_matmul_rmatmul_size(
+    left_shape: TensorShapeStorage, right_shape: TensorShapeStorage
+):
     '''
     Get the result size of matmul, rmatmul calculators
     ----------------------
@@ -1589,9 +2063,13 @@ def get_matmul_rmatmul_size(left_shape: TensorShapeStorage, right_shape: TensorS
         res_size: int, the size of the result of corresponding calculators
     '''
     if isinstance(left_shape, TensorShapeStorage) is False:
-        raise RuntimeError(f"Illegal shape type : {type(left_shape)}, params need type: {TensorShapeStorage}")
+        raise RuntimeError(
+            f"Illegal shape type : {type(left_shape)}, params need type: {TensorShapeStorage}"
+        )
     if isinstance(right_shape, TensorShapeStorage) is False:
-        raise RuntimeError(f"Illegal shape type : {type(right_shape)}, params need type: {TensorShapeStorage}")
+        raise RuntimeError(
+            f"Illegal shape type : {type(right_shape)}, params need type: {TensorShapeStorage}"
+        )
     P, Q = __shape_decompose(left_shape)
     R, S = __shape_decompose(right_shape)
     res_size = P * S
@@ -1610,7 +2088,9 @@ def get_sum_size(shape: TensorShapeStorage, axis):
         int, the size of the result of pi_sum
     '''
     if isinstance(shape, TensorShapeStorage) is False:
-        raise RuntimeError(f"Illegal shape type : {type(shape)}, params need type: {TensorShapeStorage}")
+        raise RuntimeError(
+            f"Illegal shape type : {type(shape)}, params need type: {TensorShapeStorage}"
+        )
     if axis is None:
         return 1
     if len(shape.to_tuple()) < 2:
@@ -1636,11 +2116,17 @@ def get_slice_size(shape: TensorShapeStorage, start: int, stop: int, axis):
         int, the result size of corresponding calculators
     '''
     if isinstance(shape, TensorShapeStorage) is False:
-        raise RuntimeError(f"Illegal shape type : {type(shape)}, params need type: {TensorShapeStorage}")
+        raise RuntimeError(
+            f"Illegal shape type : {type(shape)}, params need type: {TensorShapeStorage}"
+        )
     if isinstance(start, int) is False:
-        raise RuntimeError(f"Illegal start type : {type(start)}, params need type : {int}")
+        raise RuntimeError(
+            f"Illegal start type : {type(start)}, params need type : {int}"
+        )
     if isinstance(stop, int) is False:
-        raise RuntimeError(f"Illegal stop type : {type(stop)}, params need type : {int}")
+        raise RuntimeError(
+            f"Illegal stop type : {type(stop)}, params need type : {int}"
+        )
     shape_tuple = shape.to_tuple()
     dim0, dim1 = 0, 0
     if len(shape_tuple) == 1:
@@ -1668,12 +2154,14 @@ def get_cat_size(shapes: list):
         int, the sum result of all shapes
     '''
     if isinstance(shapes, list) is False:
-        raise RuntimeError(f"Illegal shapes type : {type(shapes)}, params need type : {list}")
+        raise RuntimeError(
+            f"Illegal shapes type : {type(shapes)}, params need type : {list}"
+        )
     res_size = np.sum([v.size() for v in shapes])
     return res_size
 
 
-def pi_encrypt(pub_key, fps, res, stream):
+def pi_encrypt(pub_key, fps, res=None, stream=None):
     '''
     perform paillier encryption for FixedPointStorage,
     use raw encrypt with no obfuscation
@@ -1701,18 +2189,32 @@ def pi_encrypt(pub_key, fps, res, stream):
         res_exp = res.exp_storage
     '''call the encrypt function'''
     GPU_LIB.encrypt_paillier(
-        c_char_p(src_fpn), c_void_p(src_base), c_void_p(src_exp),
-        c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
+        c_char_p(src_fpn),
+        c_void_p(src_base),
+        c_void_p(src_exp),
+        c_char_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
         c_void_p(pub_key.pub_key_ptr),
-        c_size_t(PLAIN_BITS), c_size_t(CIPHER_BITS),
-        c_size_t(vec_size), c_uint32(device_type))
+        c_size_t(PLAIN_BITS),
+        c_size_t(CIPHER_BITS),
+        c_size_t(vec_size),
+        c_uint32(device_type),
+    )
     return _pi_init_store(
-        res, res_pen, res_base, res_exp, vec_size,
-        fps.mem_type, fps.data_type,
-        fps.encode_n, fps.max_int)
+        res,
+        res_pen,
+        res_base,
+        res_exp,
+        vec_size,
+        fps.mem_type,
+        fps.data_type,
+        fps.encode_n,
+        fps.max_int,
+    )
 
 
-def pi_decrypt(pub_key, priv_key, pes, res, stream, fps=None):
+def pi_decrypt(pub_key, priv_key, pes, res=None, stream=None, fps=None):
     '''
     perform decryption and decode as a whole
     ---------------------
@@ -1742,20 +2244,34 @@ def pi_decrypt(pub_key, priv_key, pes, res, stream, fps=None):
         res_exp = fps.exp_storage
     '''call the decrypt function'''
     GPU_LIB.decrypt_paillier(
-        c_char_p(src_pen), c_void_p(src_base), c_void_p(src_exp),
-        c_char_p(res_fpn), c_void_p(res_base), c_void_p(res_exp),
-        c_void_p(pub_key.pub_key_ptr), c_void_p(priv_key.priv_key_ptr),
-        c_size_t(PLAIN_BITS), c_size_t(CIPHER_BITS),
-        c_size_t(vec_size), c_uint32(device_type))
+        c_char_p(src_pen),
+        c_void_p(src_base),
+        c_void_p(src_exp),
+        c_char_p(res_fpn),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_void_p(pub_key.pub_key_ptr),
+        c_void_p(priv_key.priv_key_ptr),
+        c_size_t(PLAIN_BITS),
+        c_size_t(CIPHER_BITS),
+        c_size_t(vec_size),
+        c_uint32(device_type),
+    )
 
     decrypt_store = FixedPointStorage(
-        res_fpn, res_base, res_exp, vec_size,
-        pes.encode_n, pes.encode_max_int,
-        pes.mem_type, pes.data_type)
+        res_fpn,
+        res_base,
+        res_exp,
+        vec_size,
+        pes.encode_n,
+        pes.encode_max_int,
+        pes.mem_type,
+        pes.data_type,
+    )
     return fp_decode(decrypt_store, res, stream)
 
 
-def pi_obfuscate(pub_key, pes, obf_seeds, res, stream):
+def pi_obfuscate(pub_key, pes, obf_seeds, res=None, stream=None):
     '''
     apply obfuscation to a PaillierEncryptedStorage using the
     obfuscation seed given, actually a mulmod
@@ -1786,16 +2302,30 @@ def pi_obfuscate(pub_key, pes, obf_seeds, res, stream):
         res_exp = res.exp_storage
     '''run the modular mul function'''
     GPU_LIB.obf_modular_multiplication(
-        c_char_p(src_pen), c_void_p(src_base), c_void_p(src_exp),
+        c_char_p(src_pen),
+        c_void_p(src_base),
+        c_void_p(src_exp),
         c_char_p(obf_rand),
-        c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
+        c_char_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
         c_void_p(pub_key.pub_key_ptr),
-        c_size_t(CIPHER_BITS), c_size_t(CIPHER_BITS),
-        c_size_t(vec_size), c_uint32(device_type))
+        c_size_t(CIPHER_BITS),
+        c_size_t(CIPHER_BITS),
+        c_size_t(vec_size),
+        c_uint32(device_type),
+    )
     return _pi_init_store(
-        res, res_pen, res_base, res_exp, vec_size,
-        pes.mem_type, pes.data_type,
-        pes.encode_n, pes.encode_max_int)
+        res,
+        res_pen,
+        res_base,
+        res_exp,
+        vec_size,
+        pes.mem_type,
+        pes.data_type,
+        pes.encode_n,
+        pes.encode_max_int,
+    )
 
 
 def pi_gen_obf_seed(res_store, pub_key, count, elem_size, rand_seed, stream):
@@ -1819,10 +2349,14 @@ def pi_gen_obf_seed(res_store, pub_key, count, elem_size, rand_seed, stream):
     else:
         res_data = res_store.bigint_storage
     GPU_LIB.obf_modular_exponentiation(
-        c_char_p(rand_data), c_size_t(CIPHER_BITS),
+        c_char_p(rand_data),
+        c_size_t(CIPHER_BITS),
         c_void_p(pub_key.pub_key_ptr),
-        c_char_p(res_data), c_size_t(CIPHER_BITS),
-        c_size_t(count), c_uint32(device_type))
+        c_char_p(res_data),
+        c_size_t(CIPHER_BITS),
+        c_size_t(count),
+        c_uint32(device_type),
+    )
     return _bi_init_store(res_store, res_data, count, elem_size, MEM_DEVICE)
 
 
@@ -1865,8 +2399,16 @@ def check_func(a, b):
         raise PermissionError("shape cannot align", shape_1, shape_2)
 
 
-def pi_add(pub_key, left_store, right_store, left_shape, right_shape,
-           res_store, res_shape, stream):
+def pi_add(
+    pub_key,
+    left_store,
+    right_store,
+    left_shape,
+    right_shape,
+    res_store=None,
+    res_shape=None,
+    stream=None,
+):
     '''
     Perform element-wise encrypted add, support broadcast over cols or rows
     ---------------
@@ -1906,28 +2448,54 @@ def pi_add(pub_key, left_store, right_store, left_shape, right_shape,
         res_exp = res_store.exp_storage
     # perform calculation
     GPU_LIB.pen_matrix_add_pen_matrix(
-        c_char_p(l_pen), c_void_p(l_base), c_void_p(l_exp),
-        c_char_p(r_pen), c_void_p(r_base), c_void_p(r_exp),
-        c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
-        c_size_t(P), c_size_t(Q), c_size_t(R), c_size_t(S),
+        c_char_p(l_pen),
+        c_void_p(l_base),
+        c_void_p(l_exp),
+        c_char_p(r_pen),
+        c_void_p(r_base),
+        c_void_p(r_exp),
+        c_char_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(P),
+        c_size_t(Q),
+        c_size_t(R),
+        c_size_t(S),
         c_void_p(pub_key.pub_key_ptr),
-        c_size_t(CIPHER_BITS), c_uint32(device_type))
+        c_size_t(CIPHER_BITS),
+        c_uint32(device_type),
+    )
     # handle the result's data type
     data_type = 0
-    if left_store.data_type == INT64_TYPE and \
-            right_store.data_type == INT64_TYPE:
+    if left_store.data_type == INT64_TYPE and right_store.data_type == INT64_TYPE:
         data_type = INT64_TYPE
     else:
         data_type = FLOAT_TYPE
     return _pi_init_ss(
-        res_store, res_pen, res_base, res_exp, res_size,
-        res_shape, res_shape_tuple,
-        left_store.mem_type, data_type,
-        left_store.encode_n, left_store.encode_max_int)
-
-
-def pi_mul(pub_key, left_store, right_store, left_shape, right_shape,
-           res_store, res_shape, stream):
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        res_size,
+        res_shape,
+        res_shape_tuple,
+        left_store.mem_type,
+        data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
+
+
+def pi_mul(
+    pub_key,
+    left_store,
+    right_store,
+    left_shape,
+    right_shape,
+    res_store=None,
+    res_shape=None,
+    stream=None,
+):
     '''
     Perform element-wise encrypted muliply, support broadcast for cols and rows
     --------------------
@@ -1967,27 +2535,51 @@ def pi_mul(pub_key, left_store, right_store, left_shape, right_shape,
         res_exp = res_store.exp_storage
     # '''call the batch_mul function'''
     GPU_LIB.fpn_matrix_elementwise_multiply_pen_matrix(
-        c_char_p(r_fpn), c_void_p(r_base), c_void_p(r_exp),
-        c_char_p(l_pen), c_void_p(l_base), c_void_p(l_exp),
-        c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
-        c_size_t(R), c_size_t(S), c_size_t(P), c_size_t(Q),
+        c_char_p(r_fpn),
+        c_void_p(r_base),
+        c_void_p(r_exp),
+        c_char_p(l_pen),
+        c_void_p(l_base),
+        c_void_p(l_exp),
+        c_char_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(R),
+        c_size_t(S),
+        c_size_t(P),
+        c_size_t(Q),
         c_void_p(pub_key.pub_key_ptr),
-        c_size_t(PLAIN_BITS), c_size_t(CIPHER_BITS), c_uint32(device_type))
+        c_size_t(PLAIN_BITS),
+        c_size_t(CIPHER_BITS),
+        c_uint32(device_type),
+    )
     # handle the result's data type
     data_type = 0
-    if left_store.data_type == INT64_TYPE and \
-            right_store.data_type == INT64_TYPE:
+    if left_store.data_type == INT64_TYPE and right_store.data_type == INT64_TYPE:
         data_type = INT64_TYPE
     else:
         data_type = FLOAT_TYPE
     return _pi_init_ss(
-        res_store, res_pen, res_base, res_exp, res_size,
-        res_shape, res_shape_tuple,
-        left_store.mem_type, data_type,
-        left_store.encode_n, left_store.encode_max_int)
-
-
-def fp_transpose(left_store, left_shape, res_store, res_shape, stream):
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        res_size,
+        res_shape,
+        res_shape_tuple,
+        left_store.mem_type,
+        data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
+
+
+def fp_transpose(
+        left_store,
+        left_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None):
     '''
     transpose the C-memory stored matrix of FixedPointStorage,
     support at most 2-D matrix
@@ -2023,27 +2615,64 @@ def fp_transpose(left_store, left_shape, res_store, res_shape, stream):
         # the tuple is 0-D or 1-D,
         # transpose returns the same value as input in numpy
         # make the output same as numpy, so no need for transpose
-        GPU_LIB.c_memcpy(c_void_p(res_fpn), c_void_p(src_fpn), c_size_t(vec_size * PLAIN_BYTE))
-        GPU_LIB.c_memcpy(c_void_p(res_base), c_void_p(src_base), c_size_t(vec_size * U_INT32_BYTE))
-        GPU_LIB.c_memcpy(c_void_p(res_exp), c_void_p(src_exp), c_size_t(vec_size * U_INT32_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_fpn),
+            c_void_p(src_fpn),
+            c_size_t(
+                vec_size *
+                PLAIN_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_base),
+            c_void_p(src_base),
+            c_size_t(
+                vec_size *
+                U_INT32_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_exp),
+            c_void_p(src_exp),
+            c_size_t(
+                vec_size *
+                U_INT32_BYTE))
         return _fp_init_ss(
-            res_store, res_fpn, res_base, res_exp,
-            left_store.vec_size, left_store.encode_n, left_store.max_int,
-            left_shape, left_shape_tuple,
-            left_store.mem_type, left_store.data_type)
+            res_store,
+            res_fpn,
+            res_base,
+            res_exp,
+            left_store.vec_size,
+            left_store.encode_n,
+            left_store.max_int,
+            left_shape,
+            left_shape_tuple,
+            left_store.mem_type,
+            left_store.data_type,
+        )
     elif len(left_shape_tuple) == 2:
         # the tuple is 2-D
         # do a normal transpose
         res_shape_tuple = (left_shape_tuple[1], left_shape_tuple[0])
         GPU_LIB.transpose(
-            c_char_p(src_fpn), c_void_p(src_base), c_void_p(src_exp),
-            c_char_p(res_fpn), c_void_p(res_base), c_void_p(res_exp),
-            c_size_t(res_shape_tuple[1]), c_size_t(res_shape_tuple[0]))
+            c_char_p(src_fpn),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_fpn),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(res_shape_tuple[1]),
+            c_size_t(res_shape_tuple[0]),
+        )
         return _fp_init_ss(
-            res_store, res_fpn, res_base, res_exp,
-            vec_size, left_store.encode_n, left_store.max_int,
-            res_shape, res_shape_tuple,
-            left_store.mem_type, left_store.data_type)
+            res_store,
+            res_fpn,
+            res_base,
+            res_exp,
+            vec_size,
+            left_store.encode_n,
+            left_store.max_int,
+            res_shape,
+            res_shape_tuple,
+            left_store.mem_type,
+            left_store.data_type,
+        )
     else:
         raise PermissionError("Unsupported shape")
 
@@ -2063,8 +2692,16 @@ def fp_transpose(left_store, left_shape, res_store, res_shape, stream):
 '''
 
 
-def pi_matmul(pub_key, left_store, right_store, left_shape, right_shape,
-              res_store, res_shape, stream):
+def pi_matmul(
+    pub_key,
+    left_store,
+    right_store,
+    left_shape,
+    right_shape,
+    res_store=None,
+    res_shape=None,
+    stream=None,
+):
     '''
     Perform matrix multiply under encryption.
     Due to implementation of cuda code, right_store needs to be transposed
@@ -2087,8 +2724,12 @@ def pi_matmul(pub_key, left_store, right_store, left_shape, right_shape,
     # '''Pre-process shape'''
     left_tuple = left_shape.to_tuple()
     right_tuple = right_shape.to_tuple()
-    if len(left_tuple) == 0 or len(right_tuple) == 0 or \
-            len(left_tuple) > 2 or len(right_tuple) > 2:
+    if (
+        len(left_tuple) == 0
+        or len(right_tuple) == 0
+        or len(left_tuple) > 2
+        or len(right_tuple) > 2
+    ):
         raise PermissionError("Invalid shape")
     P, Q = __shape_decompose(left_shape)
     R, S = __shape_decompose(right_shape)
@@ -2105,11 +2746,14 @@ def pi_matmul(pub_key, left_store, right_store, left_shape, right_shape,
     elif len(left_tuple) == 2 and len(right_tuple) == 2:
         res_shape_tuple = (P, S)
     else:
-        raise RuntimeError("You should never ever see this error unless something VERY STRANGE occurs")
+        raise RuntimeError(
+            "You should never ever see this error unless something VERY STRANGE occurs"
+        )
     res_size = P * S
     '''A transpose is need to make the right matrix vertically flattened'''
     transpose_right_store, _ = fp_transpose(
-        right_store, right_shape, None, None, stream)
+        right_store, right_shape, None, None, stream
+    )
     # the left_store data
     l_pen = left_store.pen_storage
     l_base = left_store.base_storage
@@ -2129,16 +2773,26 @@ def pi_matmul(pub_key, left_store, right_store, left_shape, right_shape,
         res_exp = res_store.exp_storage
     '''call the batch_mul function'''
     GPU_LIB.pen_matrix_multiply_fpn_matrix(
-        c_char_p(l_pen), c_void_p(l_base), c_void_p(l_exp),
-        c_char_p(r_fpn), c_void_p(r_base), c_void_p(r_exp),
-        c_char_p(res_cipher), c_void_p(res_base), c_void_p(res_exp),
-        c_size_t(P), c_size_t(Q), c_size_t(S),
+        c_char_p(l_pen),
+        c_void_p(l_base),
+        c_void_p(l_exp),
+        c_char_p(r_fpn),
+        c_void_p(r_base),
+        c_void_p(r_exp),
+        c_char_p(res_cipher),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(P),
+        c_size_t(Q),
+        c_size_t(S),
         c_void_p(pub_key.pub_key_ptr),
-        c_size_t(PLAIN_BITS), c_size_t(CIPHER_BITS), c_uint32(device_type))
+        c_size_t(PLAIN_BITS),
+        c_size_t(CIPHER_BITS),
+        c_uint32(device_type),
+    )
 
     data_type = 0
-    if left_store.data_type == INT64_TYPE and \
-            right_store.data_type == INT64_TYPE:
+    if left_store.data_type == INT64_TYPE and right_store.data_type == INT64_TYPE:
         data_type = INT64_TYPE
     else:
         data_type = FLOAT_TYPE
@@ -2146,14 +2800,30 @@ def pi_matmul(pub_key, left_store, right_store, left_shape, right_shape,
     del transpose_right_store
 
     return _pi_init_ss(
-        res_store, res_cipher, res_base, res_exp, res_size,
-        res_shape, res_shape_tuple,
-        left_store.mem_type, data_type,
-        left_store.encode_n, left_store.encode_max_int)
-
-
-def pi_rmatmul(pub_key, left_store, right_store, left_shape, right_shape,
-               res_store, res_shape, stream):
+        res_store,
+        res_cipher,
+        res_base,
+        res_exp,
+        res_size,
+        res_shape,
+        res_shape_tuple,
+        left_store.mem_type,
+        data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
+
+
+def pi_rmatmul(
+    pub_key,
+    left_store,
+    right_store,
+    left_shape,
+    right_shape,
+    res_store=None,
+    res_shape=None,
+    stream=None,
+):
     '''
     Perform matrix multiply under encryption.
     rmatmul means right_op is PaillierEncryptedStorage, differ from pi_matmul
@@ -2177,7 +2847,12 @@ def pi_rmatmul(pub_key, left_store, right_store, left_shape, right_shape,
     # pre-process of shapes
     left_tuple = left_shape.to_tuple()
     right_tuple = right_shape.to_tuple()
-    if len(left_tuple) == 0 or len(right_tuple) == 0 or len(left_tuple) > 2 or len(right_tuple) > 2:
+    if (
+        len(left_tuple) == 0
+        or len(right_tuple) == 0
+        or len(left_tuple) > 2
+        or len(right_tuple) > 2
+    ):
         raise PermissionError("Invalid shape")
     P, Q = __shape_decompose(left_shape)
     R, S = __shape_decompose(right_shape)
@@ -2194,11 +2869,14 @@ def pi_rmatmul(pub_key, left_store, right_store, left_shape, right_shape,
     elif len(left_tuple) == 2 and len(right_tuple) == 2:
         res_shape_tuple = (P, S)
     else:
-        raise RuntimeError("You should never ever see this error unless something VERY STRANGE occurs")
+        raise RuntimeError(
+            "You should never ever see this error unless something VERY STRANGE occurs"
+        )
     res_size = P * S
     '''A transpose is needed to make the right matrix vertically flattened'''
     transpose_right_store, _ = pi_transpose(
-        right_store, right_shape, None, None, stream)
+        right_store, right_shape, None, None, stream
+    )
     # the left_store data
     l_fpn = left_store.bigint_storage
     l_base = left_store.base_storage
@@ -2218,16 +2896,26 @@ def pi_rmatmul(pub_key, left_store, right_store, left_shape, right_shape,
         res_exp = res_store.exp_storage
 
     GPU_LIB.fpn_matrix_multiply_pen_matrix(
-        c_char_p(l_fpn), c_void_p(l_base), c_void_p(l_exp),
-        c_char_p(r_pen), c_void_p(r_base), c_void_p(r_exp),
-        c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
-        c_size_t(P), c_size_t(Q), c_size_t(S),
+        c_char_p(l_fpn),
+        c_void_p(l_base),
+        c_void_p(l_exp),
+        c_char_p(r_pen),
+        c_void_p(r_base),
+        c_void_p(r_exp),
+        c_char_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(P),
+        c_size_t(Q),
+        c_size_t(S),
         c_void_p(pub_key.pub_key_ptr),
-        c_size_t(PLAIN_BITS), c_size_t(CIPHER_BITS), c_uint32(device_type))
+        c_size_t(PLAIN_BITS),
+        c_size_t(CIPHER_BITS),
+        c_uint32(device_type),
+    )
 
     data_type = 0
-    if left_store.data_type == INT64_TYPE and \
-            right_store.data_type == INT64_TYPE:
+    if left_store.data_type == INT64_TYPE and right_store.data_type == INT64_TYPE:
         data_type = INT64_TYPE
     else:
         data_type = FLOAT_TYPE
@@ -2235,13 +2923,26 @@ def pi_rmatmul(pub_key, left_store, right_store, left_shape, right_shape,
     del transpose_right_store
 
     return _pi_init_ss(
-        res_store, res_pen, res_base, res_exp, res_size,
-        res_shape, res_shape_tuple,
-        right_store.mem_type, data_type,
-        right_store.encode_n, right_store.encode_max_int)
-
-
-def pi_transpose(left_store, left_shape, res_store, res_shape, stream):
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        res_size,
+        res_shape,
+        res_shape_tuple,
+        right_store.mem_type,
+        data_type,
+        right_store.encode_n,
+        right_store.encode_max_int,
+    )
+
+
+def pi_transpose(
+        left_store,
+        left_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None):
     '''
     transpose the C-memory stored matrix of PaillierEncryptedStorage,
     support at most 2-D matrix
@@ -2274,32 +2975,77 @@ def pi_transpose(left_store, left_shape, res_store, res_shape, stream):
     '''Start handling different type of data '''
     if len(left_shape_tuple) < 2:
         # just a raw memcpy, no transpose needed for this scene
-        GPU_LIB.c_memcpy(c_void_p(res_pen), c_void_p(src_pen), c_size_t(vec_size * CIPHER_BYTE))
-        GPU_LIB.c_memcpy(c_void_p(res_base), c_void_p(src_base), c_size_t(vec_size * U_INT32_BYTE))
-        GPU_LIB.c_memcpy(c_void_p(res_exp), c_void_p(src_exp), c_size_t(vec_size * U_INT32_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_pen),
+            c_void_p(src_pen),
+            c_size_t(
+                vec_size *
+                CIPHER_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_base),
+            c_void_p(src_base),
+            c_size_t(
+                vec_size *
+                U_INT32_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_exp),
+            c_void_p(src_exp),
+            c_size_t(
+                vec_size *
+                U_INT32_BYTE))
         return _pi_init_ss(
-            res_store, res_pen, res_base, res_exp,
-            left_store.vec_size, left_shape, left_shape_tuple,
-            left_store.mem_type, left_store.data_type,
-            left_store.encode_n, left_store.encode_max_int)
+            res_store,
+            res_pen,
+            res_base,
+            res_exp,
+            left_store.vec_size,
+            left_shape,
+            left_shape_tuple,
+            left_store.mem_type,
+            left_store.data_type,
+            left_store.encode_n,
+            left_store.encode_max_int,
+        )
     elif len(left_shape_tuple) == 2:
         res_shape_tuple = (left_shape_tuple[1], left_shape_tuple[0])
         # call the C transpose functions
         GPU_LIB.transpose(
-            c_char_p(src_pen), c_void_p(src_base), c_void_p(src_exp),
-            c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
-            c_size_t(res_shape_tuple[1]), c_size_t(res_shape_tuple[0]))
+            c_char_p(src_pen),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(res_shape_tuple[1]),
+            c_size_t(res_shape_tuple[0]),
+        )
         return _pi_init_ss(
-            res_store, res_pen, res_base, res_exp, vec_size,
-            res_shape, res_shape_tuple,
-            left_store.mem_type, left_store.data_type,
-            left_store.encode_n, left_store.encode_max_int)
+            res_store,
+            res_pen,
+            res_base,
+            res_exp,
+            vec_size,
+            res_shape,
+            res_shape_tuple,
+            left_store.mem_type,
+            left_store.data_type,
+            left_store.encode_n,
+            left_store.encode_max_int,
+        )
     else:
         raise PermissionError("Invalid Shape")
 
 
 # WARNING:  NOW ALMOST ABANDONED DUE TO NOT IDEAL PERFORMANCE!
-def pi_sum_multi_stream(pub_key, left_store, left_shape, axis=None, res_store=None, res_shape=None, stream=None):
+def pi_sum_multi_stream(
+    pub_key,
+    left_store,
+    left_shape,
+    axis=None,
+    res_store=None,
+    res_shape=None,
+    stream=None,
+):
     '''Doing pi_sum using multi cuda stream'''
     src_pen = left_store.pen_storage
     src_base = left_store.base_storage
@@ -2318,23 +3064,45 @@ def pi_sum_multi_stream(pub_key, left_store, left_shape, axis=None, res_store=No
     shape_tuple = left_shape.to_tuple()
 
     GPU_LIB.pen_sum_multi_stream(
-        c_char_p(src_pen), c_void_p(src_base), c_void_p(src_exp),
-        c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
-        c_size_t(shape_tuple[0]), c_size_t(shape_tuple[1]),
+        c_char_p(src_pen),
+        c_void_p(src_base),
+        c_void_p(src_exp),
+        c_char_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(shape_tuple[0]),
+        c_size_t(shape_tuple[1]),
         c_void_p(pub_key.pub_key_ptr),
-        c_size_t(CIPHER_BITS), c_uint32(device_type))
+        c_size_t(CIPHER_BITS),
+        c_uint32(device_type),
+    )
 
     res_size = shape_tuple[0]
     res_shape_tuple = (res_size,)
     return _pi_init_ss(
-        res_store, res_pen, res_base, res_exp, res_size,
-        res_shape, res_shape_tuple,
-        left_store.mem_type, left_store.data_type,
-        left_store.encode_n, left_store.encode_max_int)
-
-
-def pi_sum(pub_key, left_store, left_shape, axis,
-           res_store, res_shape, stream):
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        res_size,
+        res_shape,
+        res_shape_tuple,
+        left_store.mem_type,
+        left_store.data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
+
+
+def pi_sum(
+    pub_key,
+    left_store,
+    left_shape,
+    axis=None,
+    res_store=None,
+    res_shape=None,
+    stream=None,
+):
     '''
     Perform sum according to the axis
     ----------------------
@@ -2365,7 +3133,9 @@ def pi_sum(pub_key, left_store, left_shape, axis,
     if len(left_shape_tuple) == 0:
         # handling shape (), meaning only one element in left_store
         if axis is not None and axis != 0:
-            raise PermissionError("Cannot set axis other than 0 or None for dimension 0")
+            raise PermissionError(
+                "Cannot set axis other than 0 or None for dimension 0"
+            )
         if res_store is None:
             res_pen = GPU_LIB.c_malloc(c_size_t(vec_size * CIPHER_BYTE))
             res_base = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
@@ -2374,19 +3144,43 @@ def pi_sum(pub_key, left_store, left_shape, axis,
             res_pen = res_store.pen_storage
             res_base = res_store.base_storage
             res_exp = res_store.exp_storage
-        GPU_LIB.c_memcpy(c_void_p(res_pen), c_void_p(src_pen), c_size_t(vec_size * CIPHER_BYTE))
-        GPU_LIB.c_memcpy(c_void_p(res_base), c_void_p(src_base), c_size_t(vec_size * U_INT32_BYTE))
-        GPU_LIB.c_memcpy(c_void_p(res_exp), c_void_p(src_exp), c_size_t(vec_size * U_INT32_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_pen),
+            c_void_p(src_pen),
+            c_size_t(
+                vec_size *
+                CIPHER_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_base),
+            c_void_p(src_base),
+            c_size_t(
+                vec_size *
+                U_INT32_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_exp),
+            c_void_p(src_exp),
+            c_size_t(
+                vec_size *
+                U_INT32_BYTE))
         return _pi_init_ss(
-            left_store, res_pen, res_base, res_exp, vec_size,
-            left_shape, left_shape_tuple,
-            left_store.mem_type, left_store.data_type,
-            left_store.encode_n, left_store.encode_max_int)
+            left_store,
+            res_pen,
+            res_base,
+            res_exp,
+            vec_size,
+            left_shape,
+            left_shape_tuple,
+            left_store.mem_type,
+            left_store.data_type,
+            left_store.encode_n,
+            left_store.encode_max_int,
+        )
     elif axis is None or len(left_shape_tuple) == 1:
         # handling shape (n,) or axis == None
         # both mean sum for all elements
         if len(left_shape_tuple) == 1 and axis is not None and axis >= 1:
-            raise PermissionError("axis is out of bounds for array of dimension 1")
+            raise PermissionError(
+                "axis is out of bounds for array of dimension 1")
         if res_store is None:
             res_pen = GPU_LIB.c_malloc(c_size_t(1 * CIPHER_BYTE))
             res_base = GPU_LIB.c_malloc(c_size_t(1 * U_INT32_BYTE))
@@ -2398,27 +3192,39 @@ def pi_sum(pub_key, left_store, left_shape, axis,
         res_size = 1
         res_shape_tuple = ()
         GPU_LIB.pen_sum(
-            c_char_p(src_pen), c_void_p(src_base), c_void_p(src_exp),
-            c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
-            c_size_t(1), c_size_t(vec_size),
+            c_char_p(src_pen),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(1),
+            c_size_t(vec_size),
             c_void_p(pub_key.pub_key_ptr),
-            c_size_t(CIPHER_BITS), c_uint32(device_type))
+            c_size_t(CIPHER_BITS),
+            c_uint32(device_type),
+        )
     elif axis == 0:
         # handling 2-D matrix, axis == 0 means sum vertically
         # since our gpu sum support only horizontal sum
         # aka batch sum over continuous memory space
         transpose_store, transpose_shape = pi_transpose(
-            left_store, left_shape, None, None, stream)
+            left_store, left_shape, None, None, stream
+        )
         src_pen = transpose_store.pen_storage
         src_base = transpose_store.base_storage
         src_exp = transpose_store.exp_storage
         transpose_tuple = transpose_shape.to_tuple()
         '''perform sum on the transposed matrix'''
         if res_store is None:
-            res_pen = GPU_LIB.c_malloc(c_size_t(transpose_tuple[0] * CIPHER_BYTE))
-            res_base = GPU_LIB.c_malloc(c_size_t(transpose_tuple[0] * U_INT32_BYTE))
-            res_exp = GPU_LIB.c_malloc(c_size_t(
-                transpose_tuple[0] * U_INT32_BYTE))
+            res_pen = GPU_LIB.c_malloc(
+                c_size_t(transpose_tuple[0] * CIPHER_BYTE))
+            res_base = GPU_LIB.c_malloc(
+                c_size_t(transpose_tuple[0] * U_INT32_BYTE))
+            res_exp = GPU_LIB.c_malloc(
+                c_size_t(
+                    transpose_tuple[0] *
+                    U_INT32_BYTE))
         else:
             res_pen = res_store.pen_storage
             res_base = res_store.base_storage
@@ -2426,18 +3232,29 @@ def pi_sum(pub_key, left_store, left_shape, axis,
         res_size = transpose_tuple[0]
         res_shape_tuple = (transpose_tuple[0],)
         GPU_LIB.pen_sum(
-            c_char_p(src_pen), c_void_p(src_base), c_void_p(src_exp),
-            c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
-            c_size_t(transpose_tuple[0]), c_size_t(transpose_tuple[1]),
+            c_char_p(src_pen),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(transpose_tuple[0]),
+            c_size_t(transpose_tuple[1]),
             c_void_p(pub_key.pub_key_ptr),
-            c_size_t(CIPHER_BITS), c_uint32(device_type))
+            c_size_t(CIPHER_BITS),
+            c_uint32(device_type),
+        )
     elif axis == 1:
         # handling 2-D matrix, axis == 1 means sum horizontally
         if res_store is None:
-            res_pen = GPU_LIB.c_malloc(c_size_t(left_shape_tuple[0] * CIPHER_BYTE))
-            res_base = GPU_LIB.c_malloc(c_size_t(left_shape_tuple[0] * U_INT32_BYTE))
-            res_exp = GPU_LIB.c_malloc(c_size_t(
-                left_shape_tuple[0] * U_INT32_BYTE))
+            res_pen = GPU_LIB.c_malloc(
+                c_size_t(left_shape_tuple[0] * CIPHER_BYTE))
+            res_base = GPU_LIB.c_malloc(
+                c_size_t(left_shape_tuple[0] * U_INT32_BYTE))
+            res_exp = GPU_LIB.c_malloc(
+                c_size_t(
+                    left_shape_tuple[0] *
+                    U_INT32_BYTE))
         else:
             res_pen = res_store.pen_storage
             res_base = res_store.base_storage
@@ -2446,19 +3263,34 @@ def pi_sum(pub_key, left_store, left_shape, axis,
         res_size = left_shape_tuple[0]
         res_shape_tuple = (left_shape_tuple[0],)
         GPU_LIB.pen_sum(
-            c_char_p(src_pen), c_void_p(src_base), c_void_p(src_exp),
-            c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
-            c_size_t(left_shape_tuple[0]), c_size_t(left_shape_tuple[1]),
+            c_char_p(src_pen),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(left_shape_tuple[0]),
+            c_size_t(left_shape_tuple[1]),
             c_void_p(pub_key.pub_key_ptr),
-            c_size_t(CIPHER_BITS), c_uint32(device_type))
+            c_size_t(CIPHER_BITS),
+            c_uint32(device_type),
+        )
     else:
         raise PermissionError("Invalid Axis or Shape")
 
     return _pi_init_ss(
-        res_store, res_pen, res_base, res_exp, res_size,
-        res_shape, res_shape_tuple,
-        left_store.mem_type, left_store.data_type,
-        left_store.encode_n, left_store.encode_max_int)
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        res_size,
+        res_shape,
+        res_shape_tuple,
+        left_store.mem_type,
+        left_store.data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
 
 
 # WARNING: ABANDONED BECAUSE OF NOT IDEAL PERFORMANCE
@@ -2480,14 +3312,37 @@ def pi_sum_with_index_v2(pub_key, left_store, left_shape, valid_index):
         res_pen = GPU_LIB.c_malloc(c_size_t(vec_size * CIPHER_BYTE))
         res_base = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
         res_exp = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
-        GPU_LIB.c_memcpy(c_void_p(res_pen), c_void_p(src_pen), c_size_t(vec_size * CIPHER_BYTE))
-        GPU_LIB.c_memcpy(c_void_p(res_base), c_void_p(src_base), c_size_t(vec_size * U_INT32_BYTE))
-        GPU_LIB.c_memcpy(c_void_p(res_exp), c_void_p(src_exp), c_size_t(vec_size * U_INT32_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_pen),
+            c_void_p(src_pen),
+            c_size_t(
+                vec_size *
+                CIPHER_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_base),
+            c_void_p(src_base),
+            c_size_t(
+                vec_size *
+                U_INT32_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_exp),
+            c_void_p(src_exp),
+            c_size_t(
+                vec_size *
+                U_INT32_BYTE))
         return _pi_init_ss(
-            left_store, res_pen, res_base, res_exp, vec_size,
-            left_shape, left_shape_tuple,
-            left_store.mem_type, left_store.data_type,
-            left_store.encode_n, left_store.encode_max_int)
+            left_store,
+            res_pen,
+            res_base,
+            res_exp,
+            vec_size,
+            left_shape,
+            left_shape_tuple,
+            left_store.mem_type,
+            left_store.data_type,
+            left_store.encode_n,
+            left_store.encode_max_int,
+        )
 
     res_pen = GPU_LIB.c_malloc(c_size_t(1 * CIPHER_BYTE))
     res_base = GPU_LIB.c_malloc(c_size_t(1 * U_INT32_BYTE))
@@ -2496,18 +3351,34 @@ def pi_sum_with_index_v2(pub_key, left_store, left_shape, valid_index):
     res_shape_tuple = ()
 
     GPU_LIB.pen_sum_with_index_v2(
-        c_void_p(src_pen), c_void_p(src_base), c_void_p(src_exp),
-        c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
-        c_size_t(1), c_size_t(vec_size),
-        c_size_t(valid_size), c_void_p(valid_store.data),
+        c_void_p(src_pen),
+        c_void_p(src_base),
+        c_void_p(src_exp),
+        c_char_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(1),
+        c_size_t(vec_size),
+        c_size_t(valid_size),
+        c_void_p(valid_store.data),
         c_void_p(pub_key.pub_key_ptr),
-        c_size_t(CIPHER_BITS), c_uint32(device_type))
+        c_size_t(CIPHER_BITS),
+        c_uint32(device_type),
+    )
 
     return _pi_init_ss(
-        None, res_pen, res_base, res_exp, res_size,
-        None, res_shape_tuple,
-        MEM_HOST, left_store.data_type,
-        left_store.encode_n, left_store.encode_max_int)
+        None,
+        res_pen,
+        res_base,
+        res_exp,
+        res_size,
+        None,
+        res_shape_tuple,
+        MEM_HOST,
+        left_store.data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
 
 
 def pi_sum_with_index(pub_key, left_store, left_shape, valid_index):
@@ -2543,14 +3414,37 @@ def pi_sum_with_index(pub_key, left_store, left_shape, valid_index):
         res_pen = GPU_LIB.c_malloc(c_size_t(vec_size * CIPHER_BYTE))
         res_base = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
         res_exp = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
-        GPU_LIB.c_memcpy(c_void_p(res_pen), c_void_p(src_pen), c_size_t(vec_size * CIPHER_BYTE))
-        GPU_LIB.c_memcpy(c_void_p(res_base), c_void_p(src_base), c_size_t(vec_size * U_INT32_BYTE))
-        GPU_LIB.c_memcpy(c_void_p(res_exp), c_void_p(src_exp), c_size_t(vec_size * U_INT32_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_pen),
+            c_void_p(src_pen),
+            c_size_t(
+                vec_size *
+                CIPHER_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_base),
+            c_void_p(src_base),
+            c_size_t(
+                vec_size *
+                U_INT32_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_exp),
+            c_void_p(src_exp),
+            c_size_t(
+                vec_size *
+                U_INT32_BYTE))
         return _pi_init_ss(
-            left_store, res_pen, res_base, res_exp, vec_size,
-            left_shape, left_shape_tuple,
-            left_store.mem_type, left_store.data_type,
-            left_store.encode_n, left_store.encode_max_int)
+            left_store,
+            res_pen,
+            res_base,
+            res_exp,
+            vec_size,
+            left_shape,
+            left_shape_tuple,
+            left_store.mem_type,
+            left_store.data_type,
+            left_store.encode_n,
+            left_store.encode_max_int,
+        )
 
     res_pen = GPU_LIB.c_malloc(c_size_t(1 * CIPHER_BYTE))
     res_base = GPU_LIB.c_malloc(c_size_t(1 * U_INT32_BYTE))
@@ -2559,20 +3453,37 @@ def pi_sum_with_index(pub_key, left_store, left_shape, valid_index):
     res_size = 1
     res_shape_tuple = ()
     GPU_LIB.pen_sum_with_index(
-        c_char_p(src_pen), c_void_p(src_base), c_void_p(src_exp),
-        c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
-        c_size_t(1), c_size_t(vec_size),
+        c_char_p(src_pen),
+        c_void_p(src_base),
+        c_void_p(src_exp),
+        c_char_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(1),
+        c_size_t(vec_size),
         c_void_p(valid_store.data),
         c_void_p(pub_key.pub_key_ptr),
-        c_size_t(CIPHER_BITS), c_uint32(device_type))
+        c_size_t(CIPHER_BITS),
+        c_uint32(device_type),
+    )
     return _pi_init_ss(
-        None, res_pen, res_base, res_exp, res_size,
-        None, res_shape_tuple,
-        MEM_HOST, left_store.data_type,
-        left_store.encode_n, left_store.encode_max_int)
-
-
-def pi_sum_multi_index(pub_key, left_store, left_shape, valid_index, min_value=0, max_value=None):
+        None,
+        res_pen,
+        res_base,
+        res_exp,
+        res_size,
+        None,
+        res_shape_tuple,
+        MEM_HOST,
+        left_store.data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
+
+
+def pi_sum_multi_index(
+    pub_key, left_store, left_shape, valid_index, min_value=0, max_value=None
+):
     '''
     Run sum for data with the same index indicated in the valid_index list
     Return: A PEN_Storage class with max_value-min_value+1 number of PEN values
@@ -2603,22 +3514,40 @@ def pi_sum_multi_index(pub_key, left_store, left_shape, valid_index, min_value=0
     res_exp = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
     res_shape_tuple = (res_size,)
     GPU_LIB.pen_sum_with_multi_index_v2(
-        c_void_p(src_pen), c_void_p(src_base), c_void_p(src_exp),
-        c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
-        c_size_t(res_size), c_size_t(vec_size), c_int64(min_value),
+        c_void_p(src_pen),
+        c_void_p(src_base),
+        c_void_p(src_exp),
+        c_char_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(res_size),
+        c_size_t(vec_size),
+        c_int64(min_value),
         c_void_p(valid_store.data),
         c_void_p(pub_key.pub_key_ptr),
-        c_size_t(CIPHER_BITS), c_uint32(device_type))
+        c_size_t(CIPHER_BITS),
+        c_uint32(device_type),
+    )
     return _pi_init_ss(
-        None, res_pen, res_base, res_exp, res_size,
-        None, res_shape_tuple,
-        MEM_HOST, left_store.data_type,
-        left_store.encode_n, left_store.encode_max_int)
+        None,
+        res_pen,
+        res_base,
+        res_exp,
+        res_size,
+        None,
+        res_shape_tuple,
+        MEM_HOST,
+        left_store.data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
 
 
 # WARNNIG: CURRENTLY NOT IN USE BECAUSE NO APPRENT IMPROVEMENT WHEN left_store.vec_size is very large
 # TODO: apply this to store with small size
-def pi_sum_batch_multi_index(pub_key, left_store, left_shape, valid_index, min_value=0, max_value=None):
+def pi_sum_batch_multi_index(
+    pub_key, left_store, left_shape, valid_index, min_value=0, max_value=None
+):
     '''
     Rum sum for data with the same index indicated in valid index
     Basic logic is same with pi_sum_multi_index,
@@ -2640,7 +3569,8 @@ def pi_sum_batch_multi_index(pub_key, left_store, left_shape, valid_index, min_v
     valid_index_num = max_value - min_value + 1
     batch_num = valid_index.shape[0]
     if valid_index.shape[1] != vec_size:
-        raise PermissionError("valid index shape and raw data shape cannot align!!!")
+        raise PermissionError(
+            "valid index shape and raw data shape cannot align!!!")
 
     res_size = batch_num * valid_index_num
     res_pen = GPU_LIB.c_malloc(c_size_t(res_size * CIPHER_BYTE))
@@ -2651,23 +3581,41 @@ def pi_sum_batch_multi_index(pub_key, left_store, left_shape, valid_index, min_v
     valid_store = te_p2c(valid_index, None)
 
     GPU_LIB.batch_pen_sum_with_multi_index(
-        c_void_p(pen_storage), c_void_p(base_storage), c_void_p(exp_storage),
-        c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
-        c_size_t(valid_index_num), c_size_t(vec_size),
-        c_size_t(min_value), c_size_t(batch_num),
+        c_void_p(pen_storage),
+        c_void_p(base_storage),
+        c_void_p(exp_storage),
+        c_char_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(valid_index_num),
+        c_size_t(vec_size),
+        c_size_t(min_value),
+        c_size_t(batch_num),
         c_void_p(valid_store.data),
-        c_void_p(pub_key.pub_key_ptr), c_size_t(CIPHER_BITS),
-        c_size_t(device_type))
+        c_void_p(pub_key.pub_key_ptr),
+        c_size_t(CIPHER_BITS),
+        c_size_t(device_type),
+    )
 
     return _pi_init_ss(
-        None, res_pen, res_base, res_exp, res_size,
-        None, res_shape_tuple,
-        MEM_HOST, left_store.data_type,
-        left_store.encode_n, left_store.encode_max_int)
+        None,
+        res_pen,
+        res_base,
+        res_exp,
+        res_size,
+        None,
+        res_shape_tuple,
+        MEM_HOST,
+        left_store.data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
 
 
 # WARNING: ABANDONED FOR THE SAME REASON AS pi_sum_batch_multi_index
-def pi_sum_batch_multi_index_v2(pub_key, left_store, left_shape, valid_index, min_value=0, max_value=None):
+def pi_sum_batch_multi_index_v2(
+    pub_key, left_store, left_shape, valid_index, min_value=0, max_value=None
+):
     '''
     Almost the same with pi_sum_batch_multi_index,
     differ in the C implementation
@@ -2692,23 +3640,41 @@ def pi_sum_batch_multi_index_v2(pub_key, left_store, left_shape, valid_index, mi
     valid_store = te_p2c(valid_index, None)
 
     GPU_LIB.batch_pen_sum_with_multi_index_v2(
-        c_void_p(src_pen), c_void_p(src_base), c_void_p(src_exp),
-        c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
-        c_size_t(valid_index_num), c_size_t(vec_size),
-        c_size_t(valid_index.shape[1]), c_size_t(batch_num),
-        c_size_t(min_value), c_void_p(valid_store.data),
-        c_void_p(pub_key.pub_key_ptr), c_size_t(CIPHER_BITS),
-        c_size_t(device_type))
+        c_void_p(src_pen),
+        c_void_p(src_base),
+        c_void_p(src_exp),
+        c_char_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(valid_index_num),
+        c_size_t(vec_size),
+        c_size_t(valid_index.shape[1]),
+        c_size_t(batch_num),
+        c_size_t(min_value),
+        c_void_p(valid_store.data),
+        c_void_p(pub_key.pub_key_ptr),
+        c_size_t(CIPHER_BITS),
+        c_size_t(device_type),
+    )
 
     return _pi_init_ss(
-        None, res_pen, res_base, res_exp, res_size,
-        None, res_shape_tuple,
-        MEM_HOST, left_store.data_type,
-        left_store.encode_n, left_store.encode_max_int)
-
-
-def fp_encode(store, n, max_int,
-              precision=None, max_exponent=None, res=None, stream=None):
+        None,
+        res_pen,
+        res_base,
+        res_exp,
+        res_size,
+        None,
+        res_shape_tuple,
+        MEM_HOST,
+        left_store.data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
+
+
+def fp_encode(
+    store, n, max_int, precision=None, max_exponent=None, res=None, stream=None
+):
     '''
     Perform encode to a TensorStorage
     -----------------
@@ -2744,30 +3710,48 @@ def fp_encode(store, n, max_int,
     if store.data_type == FLOAT_TYPE:
         GPU_LIB.encode_double(
             c_void_p(src_data),
-            c_void_p(res_fpn), c_void_p(res_base), c_void_p(res_exp),
+            c_void_p(res_fpn),
+            c_void_p(res_base),
+            c_void_p(res_exp),
             c_int32(precision),
             c_char_p(n.to_bytes(PLAIN_BYTE, 'little')),
             c_char_p(max_int.to_bytes(PLAIN_BYTE, 'little')),
-            c_size_t(PLAIN_BITS), c_size_t(vec_size), c_uint32(device_type))
+            c_size_t(PLAIN_BITS),
+            c_size_t(vec_size),
+            c_uint32(device_type),
+        )
     elif store.data_type == INT64_TYPE:
         GPU_LIB.encode_int(
             c_void_p(src_data),
-            c_void_p(res_fpn), c_void_p(res_base), c_void_p(res_exp),
+            c_void_p(res_fpn),
+            c_void_p(res_base),
+            c_void_p(res_exp),
             c_int32(precision),
             c_char_p(n.to_bytes(PLAIN_BYTE, 'little')),
             c_char_p(max_int.to_bytes(PLAIN_BYTE, 'little')),
-            c_size_t(PLAIN_BITS), c_size_t(vec_size), c_uint32(device_type))
+            c_size_t(PLAIN_BITS),
+            c_size_t(vec_size),
+            c_uint32(device_type),
+        )
     else:
         raise PermissionError("Invalid Data Type")
 
     '''get the three elements, store it in a FPNStorage'''
 
     return _fp_init_store(
-        res, res_fpn, res_base, res_exp, vec_size,
-        n, max_int, store.mem_type, store.data_type)
+        res,
+        res_fpn,
+        res_base,
+        res_exp,
+        vec_size,
+        n,
+        max_int,
+        store.mem_type,
+        store.data_type,
+    )
 
 
-def __fp_decode(store, res, stream):
+def __fp_decode(store, res=None, stream=None):
     '''
     Decode a FixedPointStorage in CPU, using fp_c2p to implement
     Currently not used, as a GPU version has been done
@@ -2783,14 +3767,14 @@ def __fp_decode(store, res, stream):
     res_exp = store.exp_storage
     vec_size = store.vec_size
     fpn_array = __get_c_fpn_storage(
-        res_fpn, res_base, res_exp, vec_size,
-        store.encode_n, store.max_int)
+        res_fpn, res_base, res_exp, vec_size, store.encode_n, store.max_int
+    )
 
     CPU_decode = []
-    if (store.data_type == INT64_TYPE):
+    if store.data_type == INT64_TYPE:
         for i in range(vec_size):
             CPU_decode.append(int(fpn_array[i].decode()))
-    elif (store.data_type == FLOAT_TYPE):
+    elif store.data_type == FLOAT_TYPE:
         for i in range(vec_size):
             CPU_decode.append(fpn_array[i].decode())
     else:
@@ -2800,11 +3784,15 @@ def __fp_decode(store, res, stream):
     decode_data = te_p2c(CPU_decode, None)
     res_data = decode_data.data
     decode_data.data = None
-    return _te_init_store(res, res_data, vec_size,
-                          store.mem_type, store.data_type)
+    return _te_init_store(
+        res,
+        res_data,
+        vec_size,
+        store.mem_type,
+        store.data_type)
 
 
-def fp_decode(store, res, stream):
+def fp_decode(store, res=None, stream=None):
     '''
     Decode a FixedPointStorage in GPU
     ------------------
@@ -2816,30 +3804,41 @@ def fp_decode(store, res, stream):
     '''
     if store.data_type == FLOAT_TYPE:
         if res is None:
-            res_store = GPU_LIB.c_malloc(c_size_t(store.vec_size * DOUBLE_BYTE))
+            res_store = GPU_LIB.c_malloc(
+                c_size_t(store.vec_size * DOUBLE_BYTE))
         else:
             res_store = res.data
         GPU_LIB.decode_double(
-            c_void_p(store.bigint_storage), c_void_p(store.base_storage),
+            c_void_p(store.bigint_storage),
+            c_void_p(store.base_storage),
             c_void_p(store.exp_storage),
             c_char_p(store.encode_n.to_bytes(PLAIN_BYTE, 'little')),
             c_char_p(store.max_int.to_bytes(PLAIN_BYTE, 'little')),
             c_size_t(PLAIN_BITS),
-            c_void_p(res_store), c_size_t(store.vec_size))
+            c_void_p(res_store),
+            c_size_t(store.vec_size),
+        )
     elif store.data_type == INT64_TYPE:
-        res_store = GPU_LIB.c_malloc(c_size_t(store.vec_size * INT64_BYTE)) \
-            if res is None else res.data
+        res_store = (
+            GPU_LIB.c_malloc(c_size_t(store.vec_size * INT64_BYTE))
+            if res is None
+            else res.data
+        )
         GPU_LIB.decode_int(
-            c_void_p(store.bigint_storage), c_void_p(store.base_storage),
+            c_void_p(store.bigint_storage),
+            c_void_p(store.base_storage),
             c_void_p(store.exp_storage),
             c_char_p(store.encode_n.to_bytes(PLAIN_BYTE, 'little')),
             c_char_p(store.max_int.to_bytes(PLAIN_BYTE, 'little')),
             c_size_t(PLAIN_BITS),
-            c_void_p(res_store), c_size_t(store.vec_size))
+            c_void_p(res_store),
+            c_size_t(store.vec_size),
+        )
     else:
         raise PermissionError("Invalid Data Type")
-    return _te_init_store(res, res_store, store.vec_size,
-                          store.mem_type, store.data_type)
+    return _te_init_store(
+        res, res_store, store.vec_size, store.mem_type, store.data_type
+    )
 
 
 def bi_free(src):
@@ -2872,8 +3871,8 @@ def fp_c2p(src):
     src_exp = src.exp_storage
     vec_size = src.vec_size
     return __get_c_fpn_storage(
-        src_fpn, src_base, src_exp,
-        vec_size, src.encode_n, src.max_int)
+        src_fpn, src_base, src_exp, vec_size, src.encode_n, src.max_int
+    )
 
 
 def pi_c2p_mp(src):
@@ -2890,7 +3889,11 @@ def pi_c2p_mp(src):
     src_exp = src.exp_storage
     vec_size = src.vec_size
     return __get_c_pen_storage_mp(
-        src_pen, src_base, src_exp, vec_size, src.encode_n)
+        src_pen,
+        src_base,
+        src_exp,
+        vec_size,
+        src.encode_n)
 
 
 def pi_c2p(src):
@@ -2903,8 +3906,15 @@ def pi_c2p(src):
         src_pen, src_base, src_exp, vec_size, src.encode_n)
 
 
-def fp_mul(left_store, right_store, left_shape, right_shape,
-           res_store, res_shape, stream):
+def fp_mul(
+    left_store,
+    right_store,
+    left_shape,
+    right_shape,
+    res_store=None,
+    res_shape=None,
+    stream=None,
+):
     '''
     Perform element-wise multiplication between two FixedPointStorage.
     This is a plaintext computation rather than an encrypted one.
@@ -2937,23 +3947,42 @@ def fp_mul(left_store, right_store, left_shape, right_shape,
         res_base = res_store.base_storage
         res_exp = res_store.exp_storage
     GPU_LIB.fpn_mul(
-        c_char_p(l_fpn), c_void_p(l_base), c_void_p(l_exp),
-        c_char_p(r_fpn), c_void_p(r_base), c_void_p(r_exp),
-        c_char_p(res_fpn), c_void_p(res_base), c_void_p(res_exp),
-        c_size_t(P), c_size_t(Q), c_size_t(R), c_size_t(S),
+        c_char_p(l_fpn),
+        c_void_p(l_base),
+        c_void_p(l_exp),
+        c_char_p(r_fpn),
+        c_void_p(r_base),
+        c_void_p(r_exp),
+        c_char_p(res_fpn),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(P),
+        c_size_t(Q),
+        c_size_t(R),
+        c_size_t(S),
         c_char_p(left_store.encode_n.to_bytes(PLAIN_BYTE, 'little')),
-        c_size_t(PLAIN_BITS), c_uint32(device_type))
+        c_size_t(PLAIN_BITS),
+        c_uint32(device_type),
+    )
     # handle the data_type according to left & right's data_type
     data_type = 0
-    if left_store.data_type == INT64_TYPE and \
-            right_store.data_type == INT64_TYPE:
+    if left_store.data_type == INT64_TYPE and right_store.data_type == INT64_TYPE:
         data_type = INT64_TYPE
     else:
         data_type = FLOAT_TYPE
     return _fp_init_ss(
-        res_store, res_fpn, res_base, res_exp, res_size,
-        left_store.encode_n, left_store.max_int, res_shape, res_shape_tuple,
-        left_store.mem_type, data_type)
+        res_store,
+        res_fpn,
+        res_base,
+        res_exp,
+        res_size,
+        left_store.encode_n,
+        left_store.max_int,
+        res_shape,
+        res_shape_tuple,
+        left_store.mem_type,
+        data_type,
+    )
 
 
 def fp_p2c(target, src, data_type=FLOAT_TYPE):
@@ -2982,22 +4011,38 @@ def fp_p2c(target, src, data_type=FLOAT_TYPE):
     max_int = src[0].max_int
     for i in range(vec_size):
         src_number = src[i].encoding.to_bytes(PLAIN_BYTE, 'little')
-        GPU_LIB.c_memcpy(c_void_p(res_fpn + i * PLAIN_BYTE),
-                         c_char_p(src_number), c_size_t(PLAIN_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_fpn + i * PLAIN_BYTE),
+            c_char_p(src_number),
+            c_size_t(PLAIN_BYTE),
+        )
         base_temp.append(src[i].BASE)
         exp_temp.append(src[i].exponent)
 
-    base_array_pointer = np.asarray(base_temp, np.uint32).ctypes.data_as(c_void_p)
-    exp_array_pointer = np.asarray(exp_temp, np.uint32).ctypes.data_as(c_void_p)
+    base_array_pointer = np.asarray(
+        base_temp, np.uint32).ctypes.data_as(c_void_p)
+    exp_array_pointer = np.asarray(
+        exp_temp, np.uint32).ctypes.data_as(c_void_p)
     GPU_LIB.c_memcpy(
-        c_void_p(res_base), base_array_pointer,
-        c_size_t(vec_size * U_INT32_BYTE))
+        c_void_p(res_base),
+        base_array_pointer,
+        c_size_t(
+            vec_size *
+            U_INT32_BYTE))
     GPU_LIB.c_memcpy(
-        c_void_p(res_exp), exp_array_pointer,
-        c_size_t(vec_size * U_INT32_BYTE))
+        c_void_p(res_exp), exp_array_pointer, c_size_t(vec_size * U_INT32_BYTE)
+    )
 
-    return _fp_init_store(target, res_fpn, res_base, res_exp,
-                          vec_size, n, max_int, MEM_HOST, data_type)
+    return _fp_init_store(
+        target,
+        res_fpn,
+        res_base,
+        res_exp,
+        vec_size,
+        n,
+        max_int,
+        MEM_HOST,
+        data_type)
 
 
 def _index_reset(index, dim_size):
@@ -3011,7 +4056,15 @@ def _index_reset(index, dim_size):
     return res_index
 
 
-def fp_slice(store, shape, start, stop, axis, res_store, res_shape, stream):
+def fp_slice(
+        store,
+        shape,
+        start,
+        stop,
+        axis,
+        res_store=None,
+        res_shape=None,
+        stream=None):
     '''
     slice a contiguous memory space, now support two directions.
     -----------------------------
@@ -3058,20 +4111,36 @@ def fp_slice(store, shape, start, stop, axis, res_store, res_shape, stream):
     if axis == 0 and start >= stop:
         res_fpn, res_base, res_exp = None, None, None
         return _fp_init_ss(
-            None, res_fpn, res_base, res_exp, 0,
-            store.encode_n, store.encode_max_int,
-            None, (0, dim1),
-            store.mem_type, store.data_type)
+            None,
+            res_fpn,
+            res_base,
+            res_exp,
+            0,
+            store.encode_n,
+            store.encode_max_int,
+            None,
+            (0, dim1),
+            store.mem_type,
+            store.data_type,
+        )
     # handle condition that a[:,k:l] k>=l for 2-d array
     # will cause the result shape to be (dim0, 0)
     if axis == 1 and start >= stop:
         res_fpn, res_base, res_exp = None, None, None
         res_shape_tuple = (dim0, 0) if len(fpn_shape_tuple) == 2 else (0,)
         return _fp_init_ss(
-            None, res_fpn, res_base, res_exp, 0,
-            store.encode_n, store.encode_max_int,
-            None, res_shape_tuple,
-            store.mem_type, store.data_type)
+            None,
+            res_fpn,
+            res_base,
+            res_exp,
+            0,
+            store.encode_n,
+            store.encode_max_int,
+            None,
+            res_shape_tuple,
+            store.mem_type,
+            store.data_type,
+        )
         # handle the normal slice
     res_shape_tuple, vec_size = (), 0
     '''useful paras'''
@@ -3085,17 +4154,27 @@ def fp_slice(store, shape, start, stop, axis, res_store, res_shape, stream):
         res_uint32_row_bytelen = gap_length * U_INT32_BYTE
         if res_store is None:
             res_fpn = GPU_LIB.c_malloc(c_size_t(res_bigint_row_bytelen * dim0))
-            res_base = GPU_LIB.c_malloc(c_size_t(res_uint32_row_bytelen * dim0))
+            res_base = GPU_LIB.c_malloc(
+                c_size_t(res_uint32_row_bytelen * dim0))
             res_exp = GPU_LIB.c_malloc(c_size_t(res_uint32_row_bytelen * dim0))
         else:
             res_fpn = res_store.bigint_storage
             res_base = res_store.base_storage
             res_exp = res_store.exp_storage
         GPU_LIB.slice_vertical(
-            c_char_p(src_fpn), c_void_p(src_base), c_void_p(src_exp),
-            c_char_p(res_fpn), c_void_p(res_base), c_void_p(res_exp),
-            c_size_t(dim0), c_size_t(dim1), c_size_t(start), c_size_t(stop),
-            c_size_t(PLAIN_BITS), c_uint32(device_type))
+            c_char_p(src_fpn),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_fpn),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(dim0),
+            c_size_t(dim1),
+            c_size_t(start),
+            c_size_t(stop),
+            c_size_t(PLAIN_BITS),
+            c_uint32(device_type),
+        )
         if len(fpn_shape_tuple) == 1:
             res_shape_tuple = (gap_length,)
             vec_size = res_shape_tuple[0]
@@ -3104,32 +4183,61 @@ def fp_slice(store, shape, start, stop, axis, res_store, res_shape, stream):
             vec_size = res_shape_tuple[0] * res_shape_tuple[1]
 
     elif axis == 0:
-        'axis == 0 means that we nned to cut the matrix horizontally '
+        'axis == 0 means that we need to cut the matrix horizontally'
         if res_store is None:
-            res_fpn = GPU_LIB.c_malloc(c_size_t(bigint_row_bytelen * gap_length))
-            res_base = GPU_LIB.c_malloc(c_size_t(uint32_row_bytelen * gap_length))
-            res_exp = GPU_LIB.c_malloc(c_size_t(uint32_row_bytelen * gap_length))
+            res_fpn = GPU_LIB.c_malloc(
+                c_size_t(bigint_row_bytelen * gap_length))
+            res_base = GPU_LIB.c_malloc(
+                c_size_t(uint32_row_bytelen * gap_length))
+            res_exp = GPU_LIB.c_malloc(
+                c_size_t(uint32_row_bytelen * gap_length))
         else:
             res_fpn = res_store.bigint_storage
             res_base = res_store.base_storage
             res_exp = res_store.exp_storage
         GPU_LIB.slice_horizontal(
-            c_char_p(src_fpn), c_void_p(src_base), c_void_p(src_exp),
-            c_char_p(res_fpn), c_void_p(res_base), c_void_p(res_exp),
-            c_size_t(dim0), c_size_t(dim1), c_size_t(start), c_size_t(stop),
-            c_size_t(PLAIN_BITS), c_uint32(device_type))
+            c_char_p(src_fpn),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_fpn),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(dim0),
+            c_size_t(dim1),
+            c_size_t(start),
+            c_size_t(stop),
+            c_size_t(PLAIN_BITS),
+            c_uint32(device_type),
+        )
         res_shape_tuple = (gap_length, dim1)
         vec_size = res_shape_tuple[0] * res_shape_tuple[1]
     else:
         raise NotImplementedError("Only support 2 dimensional slice")
 
     return _fp_init_ss(
-        res_store, res_fpn, res_base, res_exp,
-        vec_size, store.encode_n, store.max_int,
-        res_shape, res_shape_tuple, store.mem_type, store.data_type)
-
-
-def pi_slice(store, shape, start, stop, axis, res_store, res_shape, stream):
+        res_store,
+        res_fpn,
+        res_base,
+        res_exp,
+        vec_size,
+        store.encode_n,
+        store.max_int,
+        res_shape,
+        res_shape_tuple,
+        store.mem_type,
+        store.data_type,
+    )
+
+
+def pi_slice(
+        store,
+        shape,
+        start,
+        stop,
+        axis,
+        res_store=None,
+        res_shape=None,
+        stream=None):
     '''
     slice a contiguous memory space, now support two directions.
     -----------------------------
@@ -3175,18 +4283,36 @@ def pi_slice(store, shape, start, stop, axis, res_store, res_shape, stream):
     if axis == 0 and start >= stop:
         res_pen, res_base, res_exp = None, None, None
         return _pi_init_ss(
-            None, res_pen, res_base, res_exp, 0, None, (0, dim1),
-            store.mem_type, store.data_type,
-            store.encode_n, store.encode_max_int)
+            None,
+            res_pen,
+            res_base,
+            res_exp,
+            0,
+            None,
+            (0, dim1),
+            store.mem_type,
+            store.data_type,
+            store.encode_n,
+            store.encode_max_int,
+        )
     # handle condition that a[:, k, l] k>=l for 2-d array
     # will cause the result shape to be (dim0, 0)
     if axis == 1 and start >= stop:
         res_pen, res_base, res_exp = None, None, None
         res_shape_tuple = (dim0, 0) if len(pen_shape_tuple) == 2 else (0,)
         return _pi_init_ss(
-            None, res_pen, res_base, res_exp, 0, None, res_shape_tuple,
-            store.mem_type, store.data_type,
-            store.encode_n, store.encode_max_int)
+            None,
+            res_pen,
+            res_base,
+            res_exp,
+            0,
+            None,
+            res_shape_tuple,
+            store.mem_type,
+            store.data_type,
+            store.encode_n,
+            store.encode_max_int,
+        )
     # handle the normal slice
     res_shape_tuple = ()
     vec_size = 0
@@ -3202,7 +4328,8 @@ def pi_slice(store, shape, start, stop, axis, res_store, res_shape, stream):
         # malloc space for result
         if res_store is None:
             res_pen = GPU_LIB.c_malloc(c_size_t(res_bigint_row_bytelen * dim0))
-            res_base = GPU_LIB.c_malloc(c_size_t(res_uint32_row_bytelen * dim0))
+            res_base = GPU_LIB.c_malloc(
+                c_size_t(res_uint32_row_bytelen * dim0))
             res_exp = GPU_LIB.c_malloc(c_size_t(res_uint32_row_bytelen * dim0))
         else:
             res_pen = res_store.bigint_storage
@@ -3210,10 +4337,19 @@ def pi_slice(store, shape, start, stop, axis, res_store, res_shape, stream):
             res_exp = res_store.exp_storage
         # call the raw function
         GPU_LIB.slice_vertical(
-            c_char_p(src_pen), c_void_p(src_base), c_void_p(src_exp),
-            c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
-            c_size_t(dim0), c_size_t(dim1), c_size_t(start), c_size_t(stop),
-            c_size_t(CIPHER_BITS), c_uint32(device_type))
+            c_char_p(src_pen),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(dim0),
+            c_size_t(dim1),
+            c_size_t(start),
+            c_size_t(stop),
+            c_size_t(CIPHER_BITS),
+            c_uint32(device_type),
+        )
         if len(pen_shape_tuple) == 1:
             res_shape_tuple = (gap_length,)
             vec_size = res_shape_tuple[0]
@@ -3221,20 +4357,32 @@ def pi_slice(store, shape, start, stop, axis, res_store, res_shape, stream):
             res_shape_tuple = (dim0, gap_length)
             vec_size = res_shape_tuple[0] * res_shape_tuple[1]
     elif axis == 0:
-        'axis == 0 means that we nned to cut the matrix horizontally '
+        'axis == 0 means that we nned to cut the matrix horizontally'
         if res_store is None:
-            res_pen = GPU_LIB.c_malloc(c_size_t(bigint_row_bytelen * gap_length))
-            res_base = GPU_LIB.c_malloc(c_size_t(uint32_row_bytelen * gap_length))
-            res_exp = GPU_LIB.c_malloc(c_size_t(uint32_row_bytelen * gap_length))
+            res_pen = GPU_LIB.c_malloc(
+                c_size_t(bigint_row_bytelen * gap_length))
+            res_base = GPU_LIB.c_malloc(
+                c_size_t(uint32_row_bytelen * gap_length))
+            res_exp = GPU_LIB.c_malloc(
+                c_size_t(uint32_row_bytelen * gap_length))
         else:
             res_pen = res_store.bigint_storage
             res_base = res_store.base_storage
             res_exp = res_store.exp_storage
         GPU_LIB.slice_horizontal(
-            c_char_p(src_pen), c_void_p(src_base), c_void_p(src_exp),
-            c_char_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
-            c_size_t(dim0), c_size_t(dim1), c_size_t(start), c_size_t(stop),
-            c_size_t(CIPHER_BITS), c_uint32(device_type))
+            c_char_p(src_pen),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(dim0),
+            c_size_t(dim1),
+            c_size_t(start),
+            c_size_t(stop),
+            c_size_t(CIPHER_BITS),
+            c_uint32(device_type),
+        )
         # since 1-dim shape will not occur here, result shape is always 2-D
         res_shape_tuple = (gap_length, dim1)
         vec_size = res_shape_tuple[0] * res_shape_tuple[1]
@@ -3242,10 +4390,18 @@ def pi_slice(store, shape, start, stop, axis, res_store, res_shape, stream):
         raise NotImplementedError()
 
     return _pi_init_ss(
-        res_store, res_pen, res_base, res_exp, vec_size,
-        res_shape, res_shape_tuple,
-        store.mem_type, store.data_type,
-        store.encode_n, store.encode_max_int)
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        vec_size,
+        res_shape,
+        res_shape_tuple,
+        store.mem_type,
+        store.data_type,
+        store.encode_n,
+        store.encode_max_int,
+    )
 
 
 def fp_cat(stores, shapes, axis, res_store, res_shape):
@@ -3274,16 +4430,19 @@ def fp_cat(stores, shapes, axis, res_store, res_shape):
     if num_stores < 2:
         raise PermissionError("At least 2 Storages required for concatenation")
     if len(shapes) != num_stores:
-        raise PermissionError("The number of storages and that of shapes didn't match")
+        raise PermissionError(
+            "The number of storages and that of shapes didn't match")
     for v in stores:
         if v.data_type != stores[0].data_type:
-            raise PermissionError("All storages should have the same data type")
+            raise PermissionError(
+                "All storages should have the same data type")
         if v.encode_n != stores[0].encode_n:
             raise PermissionError("All storages should have the same n")
         if v.max_int != stores[0].max_int:
             raise PermissionError("All storages should have the same max_int")
         if v.mem_type != stores[0].mem_type:
-            raise PermissionError("All storages should have the same memory type")
+            raise PermissionError(
+                "All storages should have the same memory type")
     # num_rows, num_cols is the data demanded by C functions
     # res_rows, res_cols are return values that should be same as numpy's output
     # distinguish them so upper and lower level won't bother each other
@@ -3345,21 +4504,48 @@ def fp_cat(stores, shapes, axis, res_store, res_shape):
 
     if axis == 0:
         '''means that we should cat stores vertically'''
-        GPU_LIB.vstack(fpn_arr, base_arr, exp_arr,
-                       c_void_p(res_fpn), c_void_p(res_base), c_void_p(res_exp),
-                       c_size_t(num_stores), vec_sizes, c_size_t(num_cols), c_size_t(PLAIN_BITS))
+        GPU_LIB.vstack(
+            fpn_arr,
+            base_arr,
+            exp_arr,
+            c_void_p(res_fpn),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(num_stores),
+            vec_sizes,
+            c_size_t(num_cols),
+            c_size_t(PLAIN_BITS),
+        )
     elif axis == 1:
         '''means that we should cat stores horizontally'''
-        GPU_LIB.hstack(fpn_arr, base_arr, exp_arr,
-                       c_void_p(res_fpn), c_void_p(res_base), c_void_p(res_exp),
-                       c_size_t(num_stores), vec_sizes, c_size_t(num_rows), c_size_t(PLAIN_BITS))
+        GPU_LIB.hstack(
+            fpn_arr,
+            base_arr,
+            exp_arr,
+            c_void_p(res_fpn),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(num_stores),
+            vec_sizes,
+            c_size_t(num_rows),
+            c_size_t(PLAIN_BITS),
+        )
     else:
         raise NotImplementedError()
 
     return _fp_init_ss(
-        res_store, res_fpn, res_base, res_exp, int(round(res_vec_size)),
-        stores[0].encode_n, stores[0].max_int, res_shape, res_shape.to_tuple(),
-        stores[0].mem_type, stores[0].data_type)
+        res_store,
+        res_fpn,
+        res_base,
+        res_exp,
+        int(round(res_vec_size)),
+        stores[0].encode_n,
+        stores[0].max_int,
+        res_shape,
+        res_shape.to_tuple(),
+        stores[0].mem_type,
+        stores[0].data_type,
+    )
 
 
 def pi_cat(stores, shapes, axis, res_store, res_shape):
@@ -3389,16 +4575,19 @@ def pi_cat(stores, shapes, axis, res_store, res_shape):
     if num_stores < 2:
         raise PermissionError("At least 2 Storages required for concatenation")
     if len(shapes) != num_stores:
-        raise PermissionError("The number of storages and that of shapes didn't match")
+        raise PermissionError(
+            "The number of storages and that of shapes didn't match")
     for v in stores:
         if v.data_type != stores[0].data_type:
-            raise PermissionError("All storages should have the same data type")
+            raise PermissionError(
+                "All storages should have the same data type")
         if v.encode_n != stores[0].encode_n:
             raise PermissionError("All storages should have the same n")
         if v.encode_max_int != stores[0].encode_max_int:
             raise PermissionError("All storages should have the same max_int")
         if v.mem_type != stores[0].mem_type:
-            raise PermissionError("All storages should have the same memory type")
+            raise PermissionError(
+                "All storages should have the same memory type")
     # num_rows, num_cols is the data demanded by C functions
     # res_rows, res_cols are return values that should be same as numpy's output
     # distinguish them so upper and lower level won't bother each other
@@ -3463,21 +4652,47 @@ def pi_cat(stores, shapes, axis, res_store, res_shape):
     vec_sizes = (c_size_t * num_stores)(*[v.vec_size for v in stores])
 
     if axis == 0:
-        GPU_LIB.vstack(pen_arr, base_arr, exp_arr,
-                       c_void_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
-                       c_size_t(num_stores), vec_sizes, c_size_t(num_cols), c_size_t(CIPHER_BITS))
+        GPU_LIB.vstack(
+            pen_arr,
+            base_arr,
+            exp_arr,
+            c_void_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(num_stores),
+            vec_sizes,
+            c_size_t(num_cols),
+            c_size_t(CIPHER_BITS),
+        )
     elif axis == 1:
-        GPU_LIB.hstack(pen_arr, base_arr, exp_arr,
-                       c_void_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
-                       c_size_t(num_stores), vec_sizes, c_size_t(num_rows), c_size_t(CIPHER_BITS))
+        GPU_LIB.hstack(
+            pen_arr,
+            base_arr,
+            exp_arr,
+            c_void_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(num_stores),
+            vec_sizes,
+            c_size_t(num_rows),
+            c_size_t(CIPHER_BITS),
+        )
     else:
         raise NotImplementedError()
 
     return _pi_init_ss(
-        res_store, res_pen, res_base, res_exp,
-        int(round(res_vec_size)), res_shape, res_shape.to_tuple(),
-        stores[0].mem_type, stores[0].data_type,
-        stores[0].encode_n, stores[0].encode_max_int)
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        int(round(res_vec_size)),
+        res_shape,
+        res_shape.to_tuple(),
+        stores[0].mem_type,
+        stores[0].data_type,
+        stores[0].encode_n,
+        stores[0].encode_max_int,
+    )
 
 
 def bi_p2c(data, res):
@@ -3492,11 +4707,14 @@ def bi_p2c(data, res):
     '''
     vec_size = data.size
     for i in range(vec_size):
-        GPU_LIB.c_memcpy(c_void_p(res + i * CIPHER_BYTE),
-                         c_char_p(data[i].to_bytes(CIPHER_BYTE, 'little')), c_size_t(CIPHER_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res + i * CIPHER_BYTE),
+            c_char_p(data[i].to_bytes(CIPHER_BYTE, 'little')),
+            c_size_t(CIPHER_BYTE),
+        )
 
 
-def bi_gen_rand(elem_size, count, res, rand_seed, stream):
+def bi_gen_rand(elem_size, count, res, rand_seed, stream=None):
     '''
     generate random bigint for pi_obfuscation
     ------------------
@@ -3511,16 +4729,15 @@ def bi_gen_rand(elem_size, count, res, rand_seed, stream):
     # Didn't use vectorize since that we need to_bytes()
     # But ndarray_float64 has no to_bytes method
     random.seed(rand_seed)
-    rands = np.asarray([random.randrange(1, 8 ** elem_size)
-                        for i in range(count)])
+    rands = np.asarray([random.randrange(1, 8**elem_size)
+                       for i in range(count)])
     if res is None:
         data_storage = GPU_LIB.c_malloc(c_size_t(count * CIPHER_BYTE))
     else:
         data_storage = res.bigint_storage
     bi_p2c(rands, data_storage)
     # CIPHER_BYTE is the upper bound of the length of the rand number
-    return _bi_init_store(
-        res, data_storage, count, CIPHER_BYTE, MEM_DEVICE)
+    return _bi_init_store(res, data_storage, count, CIPHER_BYTE, MEM_DEVICE)
 
 
 def __get_shape_size(shape_tuple):
@@ -3536,7 +4753,13 @@ def __get_shape_size(shape_tuple):
     return shape_size
 
 
-def pi_reshape(store, shape, new_shape, res_store, res_shape, stream):
+def pi_reshape(
+        store,
+        shape,
+        new_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None):
     '''
     Change a PaillierEcnryptedStorage's shape.
     No need for change the continuous storage, only change the shape.
@@ -3566,15 +4789,35 @@ def pi_reshape(store, shape, new_shape, res_store, res_shape, stream):
         res_base = res_store.base_storage
         res_exp = res_store.exp_storage
 
-    GPU_LIB.c_memcpy(c_void_p(res_pen), c_void_p(store.pen_storage), c_size_t(CIPHER_BYTE * res_vec_size))
-    GPU_LIB.c_memcpy(c_void_p(res_base), c_void_p(store.base_storage), c_size_t(U_INT32_BYTE * res_vec_size))
-    GPU_LIB.c_memcpy(c_void_p(res_exp), c_void_p(store.exp_storage), c_size_t(U_INT32_BYTE * res_vec_size))
+    GPU_LIB.c_memcpy(
+        c_void_p(res_pen),
+        c_void_p(store.pen_storage),
+        c_size_t(CIPHER_BYTE * res_vec_size),
+    )
+    GPU_LIB.c_memcpy(
+        c_void_p(res_base),
+        c_void_p(store.base_storage),
+        c_size_t(U_INT32_BYTE * res_vec_size),
+    )
+    GPU_LIB.c_memcpy(
+        c_void_p(res_exp),
+        c_void_p(store.exp_storage),
+        c_size_t(U_INT32_BYTE * res_vec_size),
+    )
 
     return _pi_init_ss(
-        res_store, res_pen, res_base, res_exp,
-        store.vec_size, res_shape, res_shape_tuple,
-        store.mem_type, store.data_type,
-        store.encode_n, store.encode_max_int)
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        store.vec_size,
+        res_shape,
+        res_shape_tuple,
+        store.mem_type,
+        store.data_type,
+        store.encode_n,
+        store.encode_max_int,
+    )
 
 
 def pi_accumulate(gpu_pubkey, pubkey_n, left_store, left_shape):
@@ -3602,19 +4845,36 @@ def pi_accumulate(gpu_pubkey, pubkey_n, left_store, left_shape):
     c_pubkey_n = c_char_p(pubkey_n.to_bytes(CIPHER_BYTE, "little"))
 
     GPU_LIB.gmp_accumulate(
-        c_char_p(src_pen), c_void_p(src_base), c_void_p(src_exp),
-        c_void_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
-        c_size_t(vec_size), c_size_t(CIPHER_BITS),
-        c_void_p(gpu_pubkey.pub_key_ptr), c_pubkey_n)
+        c_char_p(src_pen),
+        c_void_p(src_base),
+        c_void_p(src_exp),
+        c_void_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(vec_size),
+        c_size_t(CIPHER_BITS),
+        c_void_p(gpu_pubkey.pub_key_ptr),
+        c_pubkey_n,
+    )
 
     return _pi_init_ss(
-        None, res_pen, res_base, res_exp,
-        vec_size, None, res_shape_tuple,
-        left_store.mem_type, left_store.data_type,
-        left_store.encode_n, left_store.encode_max_int)
-
-
-def pi_add_with_index(gpu_pubkey, pubkey_n, l_store, l_shape, r_store, r_shape, valid_index):
+        None,
+        res_pen,
+        res_base,
+        res_exp,
+        vec_size,
+        None,
+        res_shape_tuple,
+        left_store.mem_type,
+        left_store.data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
+
+
+def pi_add_with_index(
+    gpu_pubkey, pubkey_n, l_store, l_shape, r_store, r_shape, valid_index
+):
     '''
     Add a single PaillierEncryptedNumber to the designated index in a vector
     ----------------------------
@@ -3632,7 +4892,8 @@ def pi_add_with_index(gpu_pubkey, pubkey_n, l_store, l_shape, r_store, r_shape,
     '''
     # check for data format
     if r_store.vec_size != 1:
-        raise NotImplementedError("Now only support r_store with only one vector size")
+        raise NotImplementedError(
+            "Now only support r_store with only one vector size")
     # transform data format
     vec_size = l_store.vec_size
     c_pubkey_n = c_char_p(pubkey_n.to_bytes(CIPHER_BYTE, "little"))
@@ -3650,16 +4911,34 @@ def pi_add_with_index(gpu_pubkey, pubkey_n, l_store, l_shape, r_store, r_shape,
     res_exp = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * vec_size))
     # call the C functions
     GPU_LIB.pen_add_with_index(
-        c_char_p(l_pen), c_void_p(l_base), c_void_p(l_exp),
-        c_char_p(r_pen), c_void_p(r_base), c_void_p(r_exp),
-        c_void_p(res_pen), c_void_p(res_base), c_void_p(res_exp),
-        c_size_t(vec_size), c_size_t(valid_index), c_size_t(CIPHER_BITS),
-        c_void_p(gpu_pubkey.pub_key_ptr), c_pubkey_n)
+        c_char_p(l_pen),
+        c_void_p(l_base),
+        c_void_p(l_exp),
+        c_char_p(r_pen),
+        c_void_p(r_base),
+        c_void_p(r_exp),
+        c_void_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(vec_size),
+        c_size_t(valid_index),
+        c_size_t(CIPHER_BITS),
+        c_void_p(gpu_pubkey.pub_key_ptr),
+        c_pubkey_n,
+    )
     return _pi_init_ss(
-        None, res_pen, res_base, res_exp,
-        vec_size, None, res_shape_tuple,
-        l_store.mem_type, l_store.data_type,
-        l_store.encode_n, l_store.encode_max_int)
+        None,
+        res_pen,
+        res_base,
+        res_exp,
+        vec_size,
+        None,
+        res_shape_tuple,
+        l_store.mem_type,
+        l_store.data_type,
+        l_store.encode_n,
+        l_store.encode_max_int,
+    )
 
 
 def pi_partition_by_index(l_store, valid_index, valid_cnt=None):
@@ -3694,15 +4973,23 @@ def pi_partition_by_index(l_store, valid_index, valid_cnt=None):
     res_pen_list, res_base_list, res_exp_list = [], [], []
     for i in range(bin_cnt):
         if valid_cnt[i] > 0:
-            res_pen_list.append(GPU_LIB.cuda_malloc(c_size_t(CIPHER_BYTE * valid_cnt[i])))
+            res_pen_list.append(
+                GPU_LIB.cuda_malloc(c_size_t(CIPHER_BYTE * valid_cnt[i]))
+            )
             # Assume that this data has already been aligned to max_exp
             # which is done in h2d
             base_ptr = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * valid_cnt[i]))
             exp_ptr = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * valid_cnt[i]))
-            GPU_LIB.c_memcpy(c_void_p(base_ptr), c_void_p(src_base),
-                             c_size_t(U_INT32_BYTE * valid_cnt[i]))
-            GPU_LIB.c_memcpy(c_void_p(exp_ptr), c_void_p(src_exp),
-                             c_size_t(U_INT32_BYTE * valid_cnt[i]))
+            GPU_LIB.c_memcpy(
+                c_void_p(base_ptr),
+                c_void_p(src_base),
+                c_size_t(U_INT32_BYTE * valid_cnt[i]),
+            )
+            GPU_LIB.c_memcpy(
+                c_void_p(exp_ptr),
+                c_void_p(src_exp),
+                c_size_t(U_INT32_BYTE * valid_cnt[i]),
+            )
             res_base_list.append(base_ptr)
             res_exp_list.append(exp_ptr)
         else:
@@ -3712,15 +4999,28 @@ def pi_partition_by_index(l_store, valid_index, valid_cnt=None):
     pen_ptr_list = [c_void_p(x) for x in res_pen_list]
     cipher_arr = (c_void_p * bin_cnt)(*pen_ptr_list)
     GPU_LIB.partition_by_index(
-        c_char_p(src_pen), cipher_arr,
+        c_char_p(src_pen),
+        cipher_arr,
         c_void_p(valid_store.data),
-        c_uint32(vec_size), c_uint32(bin_cnt))
+        c_uint32(vec_size),
+        c_uint32(bin_cnt),
+    )
     # construct return list
     res_list = []
     for i in range(bin_cnt):
-        res_list.append(_pi_init_ss(
-            None, res_pen_list[i], res_base_list[i], res_exp_list[i], valid_cnt[i],
-            None, (valid_cnt[i],),
-            l_store.mem_type, l_store.data_type,
-            l_store.encode_n, l_store.encode_max_int))
+        res_list.append(
+            _pi_init_ss(
+                None,
+                res_pen_list[i],
+                res_base_list[i],
+                res_exp_list[i],
+                valid_cnt[i],
+                None,
+                (valid_cnt[i],),
+                l_store.mem_type,
+                l_store.data_type,
+                l_store.encode_n,
+                l_store.encode_max_int,
+            )
+        )
     return res_list
diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/gpu_tensor.py b/gpu/fate-tensor-gpu/fate_tensor_gpu/gpu_tensor.py
index 98cea5f5e8..82e82865f5 100644
--- a/gpu/fate-tensor-gpu/fate_tensor_gpu/gpu_tensor.py
+++ b/gpu/fate-tensor-gpu/fate_tensor_gpu/gpu_tensor.py
@@ -17,16 +17,37 @@
 
 import numpy as np
 
-from .gpu_engine import PaillierEncryptedStorage, \
-    TensorShapeStorage, pi_add, te_p2c, fp_encode, pi_encrypt, pi_mul, pi_matmul, pi_rmatmul, pi_sum, pi_h2d_pub_key, \
-    pi_p2c_pub_key, pi_decrypt, te_c2p, pi_h2d_priv_key, pi_p2c_priv_key
-from .secureprotol.fate_paillier import PaillierPublicKey, PaillierPrivateKey, PaillierKeypair
+from .gpu_engine import (
+    PaillierEncryptedStorage,
+    TensorShapeStorage,
+    pi_add,
+    te_p2c,
+    fp_encode,
+    pi_encrypt,
+    pi_mul,
+    pi_matmul,
+    pi_rmatmul,
+    pi_sum,
+    pi_h2d_pub_key,
+    pi_p2c_pub_key,
+    pi_decrypt,
+    te_c2p,
+    pi_h2d_priv_key,
+    pi_p2c_priv_key,
+)
+from .secureprotol.fate_paillier import (
+    PaillierPublicKey,
+    PaillierPrivateKey,
+    PaillierKeypair,
+)
 
 
 class Cipherblock:
-    def __init__(self, store: PaillierEncryptedStorage,
-                 shape: TensorShapeStorage,
-                 pk: "PK"):
+    def __init__(
+            self,
+            store: PaillierEncryptedStorage,
+            shape: TensorShapeStorage,
+            pk: "PK"):
         self.store = store
         self.shape = shape
         self.pk = pk
@@ -42,33 +63,45 @@ def gen_shape(other):
         return TensorShapeStorage().from_tuple(other.shape)
 
     def _add_plaintext(self, other) -> "Cipherblock":
-        fp_store = fp_encode(te_p2c(other, None), self.pk.pub_key.n, self.pk.pub_key.max_int)
-        pi_store = pi_encrypt(self.pk.gpu_pub_key, fp_store, None, None)
-        res_store, res_shape = pi_add(self.pk.gpu_pub_key, self.store, pi_store, self.shape, self.gen_shape(other),
-                                      None, None, None)
+        fp_store = fp_encode(
+            te_p2c(other),
+            self.pk.pub_key.n,
+            self.pk.pub_key.max_int)
+        pi_store = pi_encrypt(self.pk.gpu_pub_key, fp_store)
+        res_store, res_shape = pi_add(
+            self.pk.gpu_pub_key, self.store, pi_store, self.shape, self.gen_shape(other))
         return Cipherblock(res_store, res_shape, self.pk)
 
     def _mul_plaintext(self, other) -> "Cipherblock":
-        fp_store = fp_encode(te_p2c(other, None), self.pk.pub_key.n, self.pk.pub_key.max_int)
-        res_store, res_shape = pi_mul(self.pk.gpu_pub_key, self.store, fp_store, self.shape, self.gen_shape(other),
-                                      None, None, None)
+        fp_store = fp_encode(
+            te_p2c(other),
+            self.pk.pub_key.n,
+            self.pk.pub_key.max_int)
+        res_store, res_shape = pi_mul(
+            self.pk.gpu_pub_key, self.store, fp_store, self.shape, self.gen_shape(other))
         return Cipherblock(res_store, res_shape, self.pk)
 
     def _matmul_plaintext(self, other) -> "Cipherblock":
-        fp_store = fp_encode(te_p2c(other, None), self.pk.pub_key.n, self.pk.pub_key.max_int)
-        res_store, res_shape = pi_matmul(self.pk.gpu_pub_key, self.store, fp_store, self.shape, self.gen_shape(other),
-                                         None, None, None)
+        fp_store = fp_encode(
+            te_p2c(other),
+            self.pk.pub_key.n,
+            self.pk.pub_key.max_int)
+        res_store, res_shape = pi_matmul(
+            self.pk.gpu_pub_key, self.store, fp_store, self.shape, self.gen_shape(other))
         return Cipherblock(res_store, res_shape, self.pk)
 
     def _rmatmul_plaintext(self, other) -> "Cipherblock":
-        fp_store = fp_encode(te_p2c(other, None), self.pk.pub_key.n, self.pk.pub_key.max_int)
-        res_store, res_shape = pi_rmatmul(self.pk.gpu_pub_key, fp_store, self.store, self.gen_shape(other), self.shape,
-                                          None, None, None)
+        fp_store = fp_encode(
+            te_p2c(other),
+            self.pk.pub_key.n,
+            self.pk.pub_key.max_int)
+        res_store, res_shape = pi_rmatmul(
+            self.pk.gpu_pub_key, fp_store, self.store, self.gen_shape(other), self.shape)
         return Cipherblock(res_store, res_shape, self.pk)
 
     def add_cipherblock(self, other: "Cipherblock") -> "Cipherblock":
-        res_store, res_shape = pi_add(self.pk.gpu_pub_key, self.store, other.store, self.shape, other.shape, None,
-                                      None, None)
+        res_store, res_shape = pi_add(
+            self.pk.gpu_pub_key, self.store, other.store, self.shape, other.shape)
         return Cipherblock(res_store, res_shape, self.pk)
 
     def add_plaintext_f64(self, other) -> "Cipherblock":
@@ -83,19 +116,27 @@ def add_plaintext_i64(self, other) -> "Cipherblock":
     def add_plaintext_i32(self, other) -> "Cipherblock":
         return self._add_plaintext(other)
 
-    def add_plaintext_scalar_f64(self, other: typing.Union[float, np.float64]) -> "Cipherblock":
+    def add_plaintext_scalar_f64(
+        self, other: typing.Union[float, np.float64]
+    ) -> "Cipherblock":
         other_array = np.asarray([other], dtype=np.float64)
         return self._add_plaintext(other_array)
 
-    def add_plaintext_scalar_f32(self, other: typing.Union[float, np.float32]) -> "Cipherblock":
+    def add_plaintext_scalar_f32(
+        self, other: typing.Union[float, np.float32]
+    ) -> "Cipherblock":
         other_array = np.asarray([other], dtype=np.float32)
         return self._add_plaintext(other_array)
 
-    def add_plaintext_scalar_i64(self, other: typing.Union[int, np.int64]) -> "Cipherblock":
+    def add_plaintext_scalar_i64(
+        self, other: typing.Union[int, np.int64]
+    ) -> "Cipherblock":
         other_array = np.asarray([other], dtype=np.int64)
         return self._add_plaintext(other_array)
 
-    def add_plaintext_scalar_i32(self, other: typing.Union[int, np.int32]) -> "Cipherblock":
+    def add_plaintext_scalar_i32(
+        self, other: typing.Union[int, np.int32]
+    ) -> "Cipherblock":
         other_array = np.asarray([other], dtype=np.int32)
         return self._add_plaintext(other_array)
 
@@ -114,16 +155,24 @@ def sub_plaintext_i64(self, other) -> "Cipherblock":
     def sub_plaintext_i32(self, other) -> "Cipherblock":
         return self.add_plaintext_i32(other * -1)
 
-    def sub_plaintext_scalar_f64(self, other: typing.Union[float, np.float64]) -> "Cipherblock":
+    def sub_plaintext_scalar_f64(
+        self, other: typing.Union[float, np.float64]
+    ) -> "Cipherblock":
         return self.add_plaintext_scalar_f64(other * -1)
 
-    def sub_plaintext_scalar_f32(self, other: typing.Union[float, np.float32]) -> "Cipherblock":
+    def sub_plaintext_scalar_f32(
+        self, other: typing.Union[float, np.float32]
+    ) -> "Cipherblock":
         return self.add_plaintext_scalar_f32(other * -1)
 
-    def sub_plaintext_scalar_i64(self, other: typing.Union[int, np.int64]) -> "Cipherblock":
+    def sub_plaintext_scalar_i64(
+        self, other: typing.Union[int, np.int64]
+    ) -> "Cipherblock":
         return self.add_plaintext_scalar_i64(other * -1)
 
-    def sub_plaintext_scalar_i32(self, other: typing.Union[int, np.int32]) -> "Cipherblock":
+    def sub_plaintext_scalar_i32(
+        self, other: typing.Union[int, np.int32]
+    ) -> "Cipherblock":
         return self.add_plaintext_scalar_i32(other * -1)
 
     def mul_plaintext_f64(self, other) -> "Cipherblock":
@@ -138,19 +187,27 @@ def mul_plaintext_i64(self, other) -> "Cipherblock":
     def mul_plaintext_i32(self, other) -> "Cipherblock":
         return self._mul_plaintext(other)
 
-    def mul_plaintext_scalar_f64(self, other: typing.Union[float, np.float64]) -> "Cipherblock":
+    def mul_plaintext_scalar_f64(
+        self, other: typing.Union[float, np.float64]
+    ) -> "Cipherblock":
         other_array = np.asarray([other], dtype=np.float64)
         return self._mul_plaintext(other_array)
 
-    def mul_plaintext_scalar_f32(self, other: typing.Union[float, np.float32]) -> "Cipherblock":
+    def mul_plaintext_scalar_f32(
+        self, other: typing.Union[float, np.float32]
+    ) -> "Cipherblock":
         other_array = np.asarray([other], dtype=np.float32)
         return self._mul_plaintext(other_array)
 
-    def mul_plaintext_scalar_i64(self, other: typing.Union[int, np.int64]) -> "Cipherblock":
+    def mul_plaintext_scalar_i64(
+        self, other: typing.Union[int, np.int64]
+    ) -> "Cipherblock":
         other_array = np.asarray([other], dtype=np.int64)
         return self._mul_plaintext(other_array)
 
-    def mul_plaintext_scalar_i32(self, other: typing.Union[int, np.int32]) -> "Cipherblock":
+    def mul_plaintext_scalar_i32(
+        self, other: typing.Union[int, np.int32]
+    ) -> "Cipherblock":
         other_array = np.asarray([other], dtype=np.int32)
         return self._mul_plaintext(other_array)
 
@@ -203,11 +260,13 @@ def rmatmul_plaintext_ix1_i32(self, other) -> "Cipherblock":
         return self._rmatmul_plaintext(other)
 
     def sum(self) -> "Cipherblock":
-        res_store, res_shape = pi_sum(self.pk.gpu_pub_key, self.store, self.shape, None, None, None, None)
+        res_store, res_shape = pi_sum(
+            self.pk.gpu_pub_key, self.store, self.shape)
         return Cipherblock(res_store, res_shape, self.pk)
 
     def sum_axis(self, axis=None):
-        res_store, res_shape = pi_sum(self.pk.gpu_pub_key, self.store, self.shape, axis, None, None, None)
+        res_store, res_shape = pi_sum(
+            self.pk.gpu_pub_key, self.store, self.shape, axis)
         return Cipherblock(res_store, res_shape, self.pk)
 
     def mean(self) -> "Cipherblock":
@@ -227,16 +286,24 @@ def add_plaintext_f32_par(self, other) -> "Cipherblock":
     def add_plaintext_i64_par(self, other) -> "Cipherblock":
         return self.add_plaintext_i64(other)
 
-    def add_plaintext_scalar_f64_par(self, other: typing.Union[float, np.float64]) -> "Cipherblock":
+    def add_plaintext_scalar_f64_par(
+        self, other: typing.Union[float, np.float64]
+    ) -> "Cipherblock":
         return self.add_plaintext_scalar_f64(other)
 
-    def add_plaintext_scalar_f32_par(self, other: typing.Union[float, np.float32]) -> "Cipherblock":
+    def add_plaintext_scalar_f32_par(
+        self, other: typing.Union[float, np.float32]
+    ) -> "Cipherblock":
         return self.add_plaintext_scalar_f32(other)
 
-    def add_plaintext_scalar_i64_par(self, other: typing.Union[int, np.int64]) -> "Cipherblock":
+    def add_plaintext_scalar_i64_par(
+        self, other: typing.Union[int, np.int64]
+    ) -> "Cipherblock":
         return self.add_plaintext_scalar_i64(other)
 
-    def add_plaintext_scalar_i32_par(self, other: typing.Union[int, np.int32]) -> "Cipherblock":
+    def add_plaintext_scalar_i32_par(
+        self, other: typing.Union[int, np.int32]
+    ) -> "Cipherblock":
         return self.add_plaintext_scalar_i32(other)
 
     def add_plaintext_i32_par(self, other) -> "Cipherblock":
@@ -257,16 +324,24 @@ def sub_plaintext_i64_par(self, other) -> "Cipherblock":
     def sub_plaintext_i32_par(self, other) -> "Cipherblock":
         return self.sub_plaintext_i32(other)
 
-    def sub_plaintext_scalar_f64_par(self, other: typing.Union[float, np.float64]) -> "Cipherblock":
+    def sub_plaintext_scalar_f64_par(
+        self, other: typing.Union[float, np.float64]
+    ) -> "Cipherblock":
         return self.sub_plaintext_scalar_f64(other)
 
-    def sub_plaintext_scalar_f32_par(self, other: typing.Union[float, np.float32]) -> "Cipherblock":
+    def sub_plaintext_scalar_f32_par(
+        self, other: typing.Union[float, np.float32]
+    ) -> "Cipherblock":
         return self.sub_plaintext_scalar_f32(other)
 
-    def sub_plaintext_scalar_i64_par(self, other: typing.Union[int, np.int64]) -> "Cipherblock":
+    def sub_plaintext_scalar_i64_par(
+        self, other: typing.Union[int, np.int64]
+    ) -> "Cipherblock":
         return self.sub_plaintext_scalar_i64(other)
 
-    def sub_plaintext_scalar_i32_par(self, other: typing.Union[int, np.int32]) -> "Cipherblock":
+    def sub_plaintext_scalar_i32_par(
+        self, other: typing.Union[int, np.int32]
+    ) -> "Cipherblock":
         return self.sub_plaintext_scalar_i32(other)
 
     def mul_plaintext_f64_par(self, other) -> "Cipherblock":
@@ -281,16 +356,24 @@ def mul_plaintext_i64_par(self, other) -> "Cipherblock":
     def mul_plaintext_i32_par(self, other) -> "Cipherblock":
         return self.mul_plaintext_i32(other)
 
-    def mul_plaintext_scalar_f64_par(self, other: typing.Union[float, np.float64]) -> "Cipherblock":
+    def mul_plaintext_scalar_f64_par(
+        self, other: typing.Union[float, np.float64]
+    ) -> "Cipherblock":
         return self.mul_plaintext_scalar_f64(other)
 
-    def mul_plaintext_scalar_f32_par(self, other: typing.Union[float, np.float32]) -> "Cipherblock":
+    def mul_plaintext_scalar_f32_par(
+        self, other: typing.Union[float, np.float32]
+    ) -> "Cipherblock":
         return self.mul_plaintext_scalar_f32(other)
 
-    def mul_plaintext_scalar_i64_par(self, other: typing.Union[int, np.int64]) -> "Cipherblock":
+    def mul_plaintext_scalar_i64_par(
+        self, other: typing.Union[int, np.int64]
+    ) -> "Cipherblock":
         return self.mul_plaintext_scalar_i64(other)
 
-    def mul_plaintext_scalar_i32_par(self, other: typing.Union[int, np.int32]) -> "Cipherblock":
+    def mul_plaintext_scalar_i32_par(
+        self, other: typing.Union[int, np.int32]
+    ) -> "Cipherblock":
         return self.mul_plaintext_scalar_i32(other)
 
     def matmul_plaintext_ix2_f64_par(self, other) -> "Cipherblock":
@@ -351,12 +434,12 @@ def mean_par(self) -> "Cipherblock":
 class PK:
     def __init__(self, pub_key: PaillierPublicKey):
         self.pub_key = pub_key
-        self.gpu_pub_key = pi_h2d_pub_key(None, pi_p2c_pub_key(None, self.pub_key))
+        self.gpu_pub_key = pi_h2d_pub_key(pi_p2c_pub_key(self.pub_key))
 
     def _encrypt(self, a) -> Cipherblock:
         shape = TensorShapeStorage().from_tuple(a.shape)
-        fp_store = fp_encode(te_p2c(a, None), self.pub_key.n, self.pub_key.max_int)
-        pi_store = pi_encrypt(self.gpu_pub_key, fp_store, None, None)
+        fp_store = fp_encode(te_p2c(a), self.pub_key.n, self.pub_key.max_int)
+        pi_store = pi_encrypt(self.gpu_pub_key, fp_store)
         return Cipherblock(pi_store, shape, self)
 
     def encrypt_f64(self, a) -> Cipherblock:
@@ -387,13 +470,13 @@ def encrypt_i32_par(self, a) -> Cipherblock:
 class SK:
     def __init__(self, priv_key: PaillierPrivateKey, pk: PK):
         self.priv_key = priv_key
-        self.gpu_priv_key = pi_h2d_priv_key(None, pi_p2c_priv_key(None, priv_key))
+        self.gpu_priv_key = pi_h2d_priv_key(pi_p2c_priv_key(priv_key))
         self.pk = pk
 
     def _decrypt(self, a: Cipherblock):
         if a.store.vec_size == 0:
             return np.asarray([])
-        te_res = pi_decrypt(a.pk.gpu_pub_key, self.gpu_priv_key, a.store, None, None, None)
+        te_res = pi_decrypt(a.pk.gpu_pub_key, self.gpu_priv_key, a.store)
         return te_c2p(te_res).reshape(a.get_shape())
 
     def decrypt_f64(self, a: Cipherblock):
diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/__init__.py b/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/__init__.py
index ef471ba686..e69de29bb2 100644
--- a/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/__init__.py
+++ b/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/__init__.py
@@ -1,15 +0,0 @@
-#
-#  Copyright 2019 The FATE Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
\ No newline at end of file
diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fate_paillier.py b/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fate_paillier.py
index adbf383e24..9851e8df36 100644
--- a/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fate_paillier.py
+++ b/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fate_paillier.py
@@ -28,8 +28,7 @@ def __init__(self):
 
     @staticmethod
     def generate_keypair(n_length=1024):
-        """return a new :class:`PaillierPublicKey` and :class:`PaillierPrivateKey`.
-        """
+        """return a new :class:`PaillierPublicKey` and :class:`PaillierPrivateKey`."""
         p = q = n = None
         n_len = 0
 
@@ -48,8 +47,7 @@ def generate_keypair(n_length=1024):
 
 
 class PaillierPublicKey(object):
-    """Contains a public key and associated encryption methods.
-    """
+    """Contains a public key and associated encryption methods."""
 
     def __init__(self, n):
         self.g = n + 1
@@ -68,19 +66,18 @@ def __hash__(self):
         return hash(self.n)
 
     def apply_obfuscator(self, ciphertext, random_value=None):
-        """
-        """
+        """Return ciphertext * r**n mod n**2; r is random_value or a random int in [1, n)."""
         r = random_value or random.SystemRandom().randrange(1, self.n)
         obfuscator = gmpy_math.powmod(r, self.n, self.nsquare)
 
         return (ciphertext * obfuscator) % self.nsquare
 
     def raw_encrypt(self, plaintext, random_value=None):
-        """
-        """
+        """ """
         if not isinstance(plaintext, int):
-            raise TypeError("plaintext should be int, but got: %s" %
-                            type(plaintext))
+            raise TypeError(
+                "plaintext should be int, but got: %s" %
+                type(plaintext))
 
         if plaintext >= (self.n - self.max_int) and plaintext < self.n:
             # Very large plaintext, take a sneaky shortcut using inverses
@@ -95,14 +92,16 @@ def raw_encrypt(self, plaintext, random_value=None):
         return ciphertext
 
     def encrypt(self, value, precision=None, random_value=None):
-        """Encode and Paillier encrypt a real number value.
-        """
+        """Encode and Paillier encrypt a real number value."""
         if isinstance(value, FixedPointNumber):
             value = value.decode()
-        encoding = FixedPointNumber.encode(value, self.n, self.max_int, precision)
+        encoding = FixedPointNumber.encode(
+            value, self.n, self.max_int, precision)
         obfuscator = random_value or 1
-        ciphertext = self.raw_encrypt(encoding.encoding, random_value=obfuscator)
-        encryptednumber = PaillierEncryptedNumber(self, ciphertext, encoding.exponent)
+        ciphertext = self.raw_encrypt(
+            encoding.encoding, random_value=obfuscator)
+        encryptednumber = PaillierEncryptedNumber(
+            self, ciphertext, encoding.exponent)
         if random_value is None:
             encryptednumber.apply_obfuscator()
 
@@ -110,12 +109,12 @@ def encrypt(self, value, precision=None, random_value=None):
 
 
 class PaillierPrivateKey(object):
-    """Contains a private key and associated decryption method.
-    """
+    """Contains a private key and associated decryption method."""
 
     def __init__(self, public_key, p, q):
         if not p * q == public_key.n:
-            raise ValueError("given public key does not match the given p and q")
+            raise ValueError(
+                "given public key does not match the given p and q")
         if p == q:
             raise ValueError("p and q have to be different")
         self.public_key = public_key
@@ -143,66 +142,90 @@ def __repr__(self):
         return "<PaillierPrivateKey {}>".format(hashcode[:10])
 
     def h_func(self, x, xsquare):
-        """Computes the h-function as defined in Paillier's paper page.
-        """
-        return gmpy_math.invert(self.l_func(gmpy_math.powmod(self.public_key.g,
-                                                             x - 1, xsquare), x), x)
+        """Compute the h-function as defined in Paillier's paper."""
+        return gmpy_math.invert(
+            self.l_func(
+                gmpy_math.powmod(
+                    self.public_key.g,
+                    x - 1,
+                    xsquare),
+                x),
+            x)
 
     def l_func(self, x, p):
-        """computes the L function as defined in Paillier's paper.
-        """
+        """computes the L function as defined in Paillier's paper."""
 
         return (x - 1) // p
 
     def crt(self, mp, mq):
         """the Chinese Remainder Theorem as needed for decryption.
-           return the solution modulo n=pq.
-       """
+        return the solution modulo n=pq.
+        """
         u = (mp - mq) * self.q_inverse % self.p
         x = (mq + (u * self.q)) % self.public_key.n
 
         return x
 
     def raw_decrypt(self, ciphertext):
-        """return raw plaintext.
-        """
+        """return raw plaintext."""
         if not isinstance(ciphertext, int):
-            raise TypeError("ciphertext should be an int, not: %s" %
-                            type(ciphertext))
-
-        mp = self.l_func(gmpy_math.powmod(ciphertext,
-                                          self.p - 1, self.psquare),
-                         self.p) * self.hp % self.p
-
-        mq = self.l_func(gmpy_math.powmod(ciphertext,
-                                          self.q - 1, self.qsquare),
-                         self.q) * self.hq % self.q
+            raise TypeError(
+                "ciphertext should be an int, not: %s" %
+                type(ciphertext))
+
+        mp = (
+            self.l_func(
+                gmpy_math.powmod(
+                    ciphertext,
+                    self.p -
+                    1,
+                    self.psquare),
+                self.p) *
+            self.hp %
+            self.p)
+
+        mq = (
+            self.l_func(
+                gmpy_math.powmod(
+                    ciphertext,
+                    self.q -
+                    1,
+                    self.qsquare),
+                self.q) *
+            self.hq %
+            self.q)
 
         return self.crt(mp, mq)
 
     def decrypt(self, encrypted_number):
-        """return the decrypted & decoded plaintext of encrypted_number.
-        """
+        """return the decrypted & decoded plaintext of encrypted_number."""
         if not isinstance(encrypted_number, PaillierEncryptedNumber):
-            raise TypeError("encrypted_number should be an PaillierEncryptedNumber, \
-                             not: %s" % type(encrypted_number))
+            raise TypeError(
+                "encrypted_number should be an PaillierEncryptedNumber, \
+                             not: %s"
+                % type(encrypted_number)
+            )
 
         if self.public_key != encrypted_number.public_key:
-            raise ValueError("encrypted_number was encrypted against a different key!")
-
-        encoded = self.raw_decrypt(encrypted_number.ciphertext(be_secure=False))
-        encoded = FixedPointNumber(encoded,
-                                   encrypted_number.exponent,
-                                   self.public_key.n,
-                                   self.public_key.max_int)
+            raise ValueError(
+                "encrypted_number was encrypted against a different key!")
+
+        encoded = self.raw_decrypt(
+            encrypted_number.ciphertext(
+                be_secure=False))
+        encoded = FixedPointNumber(
+            encoded,
+            encrypted_number.exponent,
+            self.public_key.n,
+            self.public_key.max_int,
+        )
         decrypt_value = encoded.decode()
 
         return decrypt_value
 
 
 class PaillierEncryptedNumber(object):
-    """Represents the Paillier encryption of a float or int.
-    """
+    """Represents the Paillier encryption of a float or int."""
 
     def __init__(self, public_key, ciphertext, exponent=0):
         self.public_key = public_key
@@ -211,22 +234,26 @@ def __init__(self, public_key, ciphertext, exponent=0):
         self.__is_obfuscator = False
 
         if not isinstance(self.__ciphertext, int):
-            raise TypeError("ciphertext should be an int, not: %s" % type(self.__ciphertext))
+            raise TypeError(
+                "ciphertext should be an int, not: %s" %
+                type(
+                    self.__ciphertext))
 
         if not isinstance(self.public_key, PaillierPublicKey):
-            raise TypeError("public_key should be a PaillierPublicKey, not: %s" % type(self.public_key))
+            raise TypeError(
+                "public_key should be a PaillierPublicKey, not: %s"
+                % type(self.public_key)
+            )
 
     def ciphertext(self, be_secure=True):
-        """return the ciphertext of the PaillierEncryptedNumber.
-        """
+        """return the ciphertext of the PaillierEncryptedNumber."""
         if be_secure and not self.__is_obfuscator:
             self.apply_obfuscator()
 
         return self.__ciphertext
 
     def apply_obfuscator(self):
-        """ciphertext by multiplying by r ** n with random r
-        """
+        """Obfuscate the ciphertext by multiplying it by r ** n with random r."""
         self.__ciphertext = self.public_key.apply_obfuscator(self.__ciphertext)
         self.__is_obfuscator = True
 
@@ -252,11 +279,12 @@ def __truediv__(self, scalar):
         return self.__mul__(1 / scalar)
 
     def __mul__(self, scalar):
-        """return Multiply by an scalar(such as int, float)
-        """
+        """Return the result of multiplying by a scalar (such as int or float)."""
         if isinstance(scalar, FixedPointNumber):
             scalar = scalar.decode()
-        encode = FixedPointNumber.encode(scalar, self.public_key.n, self.public_key.max_int)
+        encode = FixedPointNumber.encode(
+            scalar, self.public_key.n, self.public_key.max_int
+        )
         plaintext = encode.encoding
 
         if plaintext < 0 or plaintext >= self.public_key.n:
@@ -264,11 +292,16 @@ def __mul__(self, scalar):
 
         if plaintext >= self.public_key.n - self.public_key.max_int:
             # Very large plaintext, play a sneaky trick using inverses
-            neg_c = gmpy_math.invert(self.ciphertext(False), self.public_key.nsquare)
+            neg_c = gmpy_math.invert(
+                self.ciphertext(False),
+                self.public_key.nsquare)
             neg_scalar = self.public_key.n - plaintext
-            ciphertext = gmpy_math.powmod(neg_c, neg_scalar, self.public_key.nsquare)
+            ciphertext = gmpy_math.powmod(
+                neg_c, neg_scalar, self.public_key.nsquare)
         else:
-            ciphertext = gmpy_math.powmod(self.ciphertext(False), plaintext, self.public_key.nsquare)
+            ciphertext = gmpy_math.powmod(
+                self.ciphertext(False), plaintext, self.public_key.nsquare
+            )
 
         exponent = self.exponent + encode.exponent
 
@@ -276,10 +309,13 @@ def __mul__(self, scalar):
 
     def increase_exponent_to(self, new_exponent):
         """return PaillierEncryptedNumber:
-           new PaillierEncryptedNumber with same value but having great exponent.
+        new PaillierEncryptedNumber with same value but a greater (or equal) exponent.
         """
         if new_exponent < self.exponent:
-            raise ValueError("New exponent %i should be great than old exponent %i" % (new_exponent, self.exponent))
+            raise ValueError(
+                "New exponent %i should be great than old exponent %i"
+                % (new_exponent, self.exponent)
+            )
 
         factor = pow(FixedPointNumber.BASE, new_exponent - self.exponent)
         new_encryptednumber = self.__mul__(factor)
@@ -288,8 +324,7 @@ def increase_exponent_to(self, new_exponent):
         return new_encryptednumber
 
     def __align_exponent(self, x, y):
-        """return x,y with same exponet
-        """
+        """return x,y with same exponent"""
         if x.exponent < y.exponent:
             x = x.increase_exponent_to(y.exponent)
         elif x.exponent > y.exponent:
@@ -298,46 +333,52 @@ def __align_exponent(self, x, y):
         return x, y
 
     def __add_scalar(self, scalar):
-        """return PaillierEncryptedNumber: z = E(x) + y
-        """
+        """return PaillierEncryptedNumber: z = E(x) + y"""
         if isinstance(scalar, FixedPointNumber):
             scalar = scalar.decode()
-        encoded = FixedPointNumber.encode(scalar,
-                                          self.public_key.n,
-                                          self.public_key.max_int,
-                                          max_exponent=self.exponent)
+        encoded = FixedPointNumber.encode(
+            scalar,
+            self.public_key.n,
+            self.public_key.max_int,
+            max_exponent=self.exponent,
+        )
         return self.__add_fixpointnumber(encoded)
 
     def __add_fixpointnumber(self, encoded):
-        """return PaillierEncryptedNumber: z = E(x) + FixedPointNumber(y)
-        """
+        """return PaillierEncryptedNumber: z = E(x) + FixedPointNumber(y)"""
         if self.public_key.n != encoded.n:
-            raise ValueError("Attempted to add numbers encoded against different public keys!")
+            raise ValueError(
+                "Attempted to add numbers encoded against different public keys!"
+            )
 
         # their exponents must match, and align.
         x, y = self.__align_exponent(self, encoded)
 
         encrypted_scalar = x.public_key.raw_encrypt(y.encoding, 1)
-        encryptednumber = self.__raw_add(x.ciphertext(False), encrypted_scalar, x.exponent)
+        encryptednumber = self.__raw_add(
+            x.ciphertext(False), encrypted_scalar, x.exponent
+        )
 
         return encryptednumber
 
     def __add_encryptednumber(self, other):
-        """return PaillierEncryptedNumber: z = E(x) + E(y)
-        """
+        """return PaillierEncryptedNumber: z = E(x) + E(y)"""
         if self.public_key != other.public_key:
             raise ValueError("add two numbers have different public key!")
 
         # their exponents must match, and align.
         x, y = self.__align_exponent(self, other)
 
-        encryptednumber = self.__raw_add(x.ciphertext(False), y.ciphertext(False), x.exponent)
+        encryptednumber = self.__raw_add(
+            x.ciphertext(False), y.ciphertext(False), x.exponent
+        )
 
         return encryptednumber
 
     def __raw_add(self, e_x, e_y, exponent):
-        """return the integer E(x + y) given ints E(x) and E(y).
-        """
-        ciphertext = gmpy_math.mpz(e_x) * gmpy_math.mpz(e_y) % self.public_key.nsquare
+        """return the integer E(x + y) given ints E(x) and E(y)."""
+        ciphertext = gmpy_math.mpz(
+            e_x) * gmpy_math.mpz(e_y) % self.public_key.nsquare
 
-        return PaillierEncryptedNumber(self.public_key, int(ciphertext), exponent)
+        return PaillierEncryptedNumber(
+            self.public_key, int(ciphertext), exponent)
diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fixedpoint.py b/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fixedpoint.py
index af3ae2a754..350b6e06f6 100644
--- a/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fixedpoint.py
+++ b/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fixedpoint.py
@@ -21,13 +21,13 @@
 
 
 class FixedPointNumber(object):
-    """Represents a float or int fixedpoint encoding;.
-    """
+    """Represents a fixed-point encoding of a float or int."""
+
     BASE = 16
     LOG2_BASE = math.log(BASE, 2)
     FLOAT_MANTISSA_BITS = sys.float_info.mant_dig
 
-    Q = 293973345475167247070445277780365744413 ** 2
+    Q = 293973345475167247070445277780365744413**2
 
     def __init__(self, encoding, exponent, n=None, max_int=None):
         if n is None:
@@ -49,9 +49,14 @@ def calculate_exponent_from_precision(cls, precision):
         return exponent
 
     @classmethod
-    def encode(cls, scalar, n=None, max_int=None, precision=None, max_exponent=None):
-        """return an encoding of an int or float.
-        """
+    def encode(
+            cls,
+            scalar,
+            n=None,
+            max_int=None,
+            precision=None,
+            max_exponent=None):
+        """return an encoding of an int or float."""
         # Calculate the maximum exponent for desired precision
         exponent = None
 
@@ -66,17 +71,26 @@ def encode(cls, scalar, n=None, max_int=None, precision=None, max_exponent=None)
             max_int = n // 2
 
         if precision is None:
-            if isinstance(scalar, int) or isinstance(scalar, np.int16) or \
-                    isinstance(scalar, np.int32) or isinstance(scalar, np.int64):
+            if (
+                isinstance(scalar, int)
+                or isinstance(scalar, np.int16)
+                or isinstance(scalar, np.int32)
+                or isinstance(scalar, np.int64)
+            ):
                 exponent = 0
-            elif isinstance(scalar, float) or isinstance(scalar, np.float16) \
-                    or isinstance(scalar, np.float32) or isinstance(scalar, np.float64):
+            elif (
+                isinstance(scalar, float)
+                or isinstance(scalar, np.float16)
+                or isinstance(scalar, np.float32)
+                or isinstance(scalar, np.float64)
+            ):
                 flt_exponent = math.frexp(scalar)[1]
                 lsb_exponent = cls.FLOAT_MANTISSA_BITS - flt_exponent
                 exponent = math.floor(lsb_exponent / cls.LOG2_BASE)
             else:
-                raise TypeError("Don't know the precision of type %s."
-                                % type(scalar))
+                raise TypeError(
+                    "Don't know the precision of type %s." %
+                    type(scalar))
         else:
             exponent = cls.calculate_exponent_from_precision(precision)
 
@@ -86,15 +100,14 @@ def encode(cls, scalar, n=None, max_int=None, precision=None, max_exponent=None)
         int_fixpoint = int(round(scalar * pow(cls.BASE, exponent)))
 
         if abs(int_fixpoint) > max_int:
-            raise ValueError(f"Integer needs to be within +/- {max_int},but got {int_fixpoint},"
-                             f"basic info, scalar={scalar}, base={cls.BASE}, exponent={exponent}"
-                             )
+            raise ValueError(
+                f"Integer needs to be within +/- {max_int},but got {int_fixpoint},"
+                f"basic info, scalar={scalar}, base={cls.BASE}, exponent={exponent}")
 
         return cls(int_fixpoint % n, exponent, n, max_int)
 
     def decode(self):
-        """return decode plaintext.
-        """
+        """return decode plaintext."""
         if self.encoding >= self.n:
             # Should be mod n
             raise ValueError('Attempted to decode corrupted number')
@@ -105,27 +118,32 @@ def decode(self):
             # Negative
             mantissa = self.encoding - self.n
         else:
-            raise OverflowError(f'Overflow detected in decode number, encoding: {self.encoding}，'
-                                f'{self.exponent}'
-                                f' {self.n}')
+            raise OverflowError(
+                f'Overflow detected in decode number, encoding: {self.encoding}，'
+                f'{self.exponent}'
+                f' {self.n}')
 
         return mantissa * pow(self.BASE, -self.exponent)
 
     def increase_exponent_to(self, new_exponent):
-        """return FixedPointNumber: new encoding with same value but having great exponent.
-        """
+        """Return a FixedPointNumber with the same value re-encoded at a greater (or equal) exponent."""
         if new_exponent < self.exponent:
-            raise ValueError('New exponent %i should be greater than'
-                             'old exponent %i' % (new_exponent, self.exponent))
+            raise ValueError(
+                'New exponent %i should be greater than '
+                'old exponent %i' % (new_exponent, self.exponent)
+            )
 
         factor = pow(self.BASE, new_exponent - self.exponent)
         new_encoding = self.encoding * factor % self.n
 
-        return FixedPointNumber(new_encoding, new_exponent, self.n, self.max_int)
+        return FixedPointNumber(
+            new_encoding,
+            new_exponent,
+            self.n,
+            self.max_int)
 
     def __align_exponent(self, x, y):
-        """return x,y with same exponent
-        """
+        """return x,y with same exponent"""
         if x.exponent < y.exponent:
             x = x.increase_exponent_to(y.exponent)
         elif x.exponent > y.exponent:
@@ -259,7 +277,11 @@ def __add_fixedpointnumber(self, other):
             other = self.encode(other.decode(), n=self.n, max_int=self.max_int)
         x, y = self.__align_exponent(self, other)
         encoding = (x.encoding + y.encoding) % self.n
-        return FixedPointNumber(encoding, x.exponent, n=self.n, max_int=self.max_int)
+        return FixedPointNumber(
+            encoding,
+            x.exponent,
+            n=self.n,
+            max_int=self.max_int)
 
     def __add_scalar(self, scalar):
         encoded = self.encode(scalar, n=self.n, max_int=self.max_int)
@@ -271,7 +293,11 @@ def __sub_fixedpointnumber(self, other):
         x, y = self.__align_exponent(self, other)
         encoding = (x.encoding - y.encoding) % self.n
 
-        return FixedPointNumber(encoding, x.exponent, n=self.n, max_int=self.max_int)
+        return FixedPointNumber(
+            encoding,
+            x.exponent,
+            n=self.n,
+            max_int=self.max_int)
 
     def __sub_scalar(self, scalar):
         scalar = -1 * scalar
@@ -295,4 +321,9 @@ def __abs__(self):
             return self * -1
 
     def __mod__(self, other):
-        return FixedPointNumber(self.encoding % other, self.exponent, n=self.n, max_int=self.max_int)
+        return FixedPointNumber(
+            self.encoding %
+            other,
+            self.exponent,
+            n=self.n,
+            max_int=self.max_int)
diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/gmpy_math.py b/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/gmpy_math.py
index c56c574df6..a316ead0ff 100644
--- a/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/gmpy_math.py
+++ b/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/gmpy_math.py
@@ -73,7 +73,7 @@ def getprimeover(n):
 
 
 def isqrt(n):
-    """ return the integer square root of N """
+    """return the integer square root of N"""
 
     return int(gmpy2.isqrt(n))
 
diff --git a/gpu/fate-tensor-gpu/tests/__init__.py b/gpu/fate-tensor-gpu/fate_tensor_gpu/tests/__init__.py
similarity index 100%
rename from gpu/fate-tensor-gpu/tests/__init__.py
rename to gpu/fate-tensor-gpu/fate_tensor_gpu/tests/__init__.py
diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/tests/test_gpu_engine.py b/gpu/fate-tensor-gpu/fate_tensor_gpu/tests/test_gpu_engine.py
new file mode 100755
index 0000000000..0cd461f260
--- /dev/null
+++ b/gpu/fate-tensor-gpu/fate_tensor_gpu/tests/test_gpu_engine.py
@@ -0,0 +1,712 @@
+#
+#  Copyright 2022 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import random
+
+import numpy as np
+import unittest
+import functools
+import time
+
+from fate_tensor_gpu.secureprotol.fixedpoint import FixedPointNumber
+from fate_tensor_gpu.secureprotol import gmpy_math
+from fate_tensor_gpu.secureprotol.fate_paillier import (
+    PaillierKeypair,
+    PaillierEncryptedNumber,
+)
+
+from fate_tensor_gpu.gpu_engine import (
+    FLOAT_TYPE,
+    INT64_TYPE,
+    pi_p2c_pub_key,
+    pi_p2c_priv_key,
+    pi_h2d_pub_key,
+    pi_h2d_priv_key,
+    TensorShapeStorage,
+    bi_alloc,
+    PLAIN_BYTE,
+    MEM_HOST,
+    te_alloc,
+    fp_alloc,
+    pi_alloc,
+    te_p2c,
+    fp_encode,
+    fp_decode,
+    te_c2p,
+    pi_encrypt,
+    pi_gen_obf_seed,
+    CIPHER_BITS,
+    pi_obfuscate,
+    pi_c2p,
+    pi_decrypt,
+    fp_mul,
+    fp_c2p,
+    pi_add,
+    pi_mul,
+    pi_sum,
+    bi_free,
+    te_free,
+    fp_free,
+    pi_free,
+)
+
+RAND_TYPE = FLOAT_TYPE  # SWITCH DATA TYPE HERE: EITHER INT64_TYPE OR FLOAT_TYPE
+NUM_ROWS = 200
+NUM_COLS = 200
+TEST_SIZE = NUM_ROWS * NUM_COLS  # number of elements in each test array
+KEY_LEN = 1024  # reported as "Bit Length" in the performance tables
+DATA_SIZE = TEST_SIZE * KEY_LEN * 2 // 8  # reported as "Data Size (Bytes)"
+ERROR_TOLERANCE = 1e-10  # maximum relative error accepted by assert_diff
+
+
+class TestCaseReport:
+    """Collect and print a fixed-width GPU-vs-CPU performance table."""
+
+    def __init__(self, name, batch_size, bit_len, data_size):
+        """Store the title and data description; throughputs start at 0."""
+        self.name = name
+        self.batch_size = batch_size
+        self.bit_len = bit_len
+        self.data_size = int(data_size)
+        self.content = {}  # {report_name: {item_name: {time, ops, bw}}}
+        self.width = 100
+        self.column = [30, 20, 25, 24]  # column widths, 99 in total
+        self.cpu_throughput = 0.0
+        self.gpu_throughput = 0.0
+
+    def add_perf_report(self, name):
+        """Start a new, empty sub-report called ``name``."""
+        self.content[name] = {}
+
+    def add_item(self, report_name, item_name, time, ops, bw):
+        """Record elapsed time, operations/s and bandwidth of one item."""
+        self.content[report_name][item_name] = {
+            'time': time, 'ops': ops, 'bw': bw}
+
+    def gen_line(self, *args):
+        """Render ``args`` as one '|'-separated row of ``self.width``."""
+        size = 0
+        res = ''
+        for i, v in enumerate(args):
+            text = str(v)
+            res += '|' + text + ' ' * (self.column[i] - len(text) - 1)
+            size += self.column[i]
+        # Pad unconditionally: the former `i < 3` test left three-column
+        # rows short, and for a full four-column row the pad width is 0.
+        res += " " * (self.width - size - 1)
+        return res + '|'
+
+    def dump_header(self):
+        """Return the title banner and the data-information section."""
+        inner = self.width - len(self.name)
+        title = ('|' + ' ' * ((inner - 2) // 2) + self.name
+                 + ' ' * ((inner - 1) // 2) + '|')
+        return "\n".join([
+            '=' * self.width,
+            title,
+            '=' * self.width,
+            self.gen_line("Data Information"),
+            '-' * self.width,
+            self.gen_line("Batch Size", self.batch_size),
+            self.gen_line("Bit Length", self.bit_len),
+            self.gen_line("Data Size (Bytes)", self.data_size),
+        ])
+
+    def dump_item(self, report_name, item_name):
+        """Return the table row of one recorded item."""
+        item = self.content[report_name][item_name]
+        # 'bw' is divided by 2 ** 20 for the "Bandwidth (MB/s)" column.
+        return self.gen_line(
+            item_name,
+            "{0:.4f}".format(item['time']),
+            "{0:.4f}".format(item['ops']),
+            "{0:.4f}".format(item['bw'] / (2 ** 20)),
+        )
+
+    def dump_perf_report(self, report_name):
+        """Return the table (title, column header, rows) of a sub-report."""
+        res = [
+            "=" * self.width,
+            self.gen_line(report_name),
+            "-" * self.width,
+            self.gen_line("Item", "Time Elapsed(s)",
+                          "Operations Per Second", "Bandwidth (MB/s)"),
+            "-" * self.width,
+        ]
+        res.extend(self.dump_item(report_name, v)
+                   for v in self.content[report_name])
+        return "\n".join(res)
+
+    def dump_summary(self):
+        """Return the GPU/CPU speedup section and store ``self.ratio``.
+
+        The ratio is NaN when no CPU throughput was recorded, instead of
+        raising ZeroDivisionError.
+        """
+        cpu = self.cpu_throughput
+        self.ratio = self.gpu_throughput / cpu if cpu else float("nan")
+        return "\n".join([
+            "=" * self.width,
+            self.gen_line("Performance of GPU/CPU"),
+            '-' * self.width,
+            self.gen_line("GPU/CPU Ratio (Speedup)",
+                          "{0:.4f}".format(self.ratio)),
+            "=" * self.width,
+            '\n',
+        ])
+
+    def dump_result(self):
+        """Print the header, every sub-report and the summary to stdout."""
+        res = [self.dump_header()]
+        res.extend(self.dump_perf_report(v) for v in self.content)
+        res.append(self.dump_summary())
+        print("\n".join(res))
+
+
+def generate_rand(test_size):
+    """Return `test_size` random samples of the type chosen by RAND_TYPE."""
+    if RAND_TYPE == FLOAT_TYPE:
+        return np.random.normal(0, 5, test_size)
+    if RAND_TYPE != INT64_TYPE:
+        raise TypeError("Invalid data type")
+    return np.random.randint(-(2 ** 10), 2 ** 10, test_size)
+
+
+def assert_diff(res, ref):
+    """Assert res and ref are both zero or differ by a tiny relative error."""
+    if res == 0 or ref == 0:
+        assert res == 0 and ref == 0
+        return
+    delta = res - ref
+    for base in (res, ref):  # relative to each operand in turn
+        assert abs(delta / base) < ERROR_TOLERANCE
+
+
+def assert_ndarray_diff(res, ref):
+    """Assert two arrays match element-wise, printing every mismatch."""
+    assert res.shape == ref.shape
+    res, ref = res.flatten(), ref.flatten()
+    mismatches = 0
+    for i, (got, expected) in enumerate(zip(res, ref)):
+        try:
+            assert_diff(got, expected)
+        except AssertionError:
+            mismatches += 1
+            print("Assertion Error at location", i,
+                  ", GPU result:", got,
+                  ", reference result:", expected)
+    # Fail once after reporting everything: swallowing the error, as this
+    # helper used to do, let a wrong GPU result pass the test silently.
+    assert mismatches == 0, "{} element(s) mismatch".format(mismatches)
+
+
+def profile(func):
+    """Wrap func to return (result, elapsed seconds on a monotonic clock)."""
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        start_time = time.perf_counter()  # unaffected by wall-clock jumps
+        res = func(*args, **kwargs)
+        return res, time.perf_counter() - start_time
+    return wrapper
+
+
+def compare_time(gpu_time, cpu_time, num_instances=TEST_SIZE):
+    """Print a timing comparison of one operation.
+
+    Shows both elapsed times, instances per second and the GPU speedup.
+    """
+    timings = (("GPU", gpu_time), ("CPU", cpu_time))
+    for label, elapsed in timings:
+        print(label + " time:", elapsed, "second(s)")
+    for label, elapsed in timings:
+        print(
+            label + " throughput:",
+            num_instances / elapsed,
+            "instance(s) per second")
+    print("Speedup:", cpu_time / gpu_time)
+
+
+def cpu_pi_gen_obf_seed(
+        res_store,
+        public_key,
+        count,
+        elem_size,
+        rand_seed,
+        stream):
+    """Seeded CPU counterpart of pi_gen_obf_seed; res_store/stream unused."""
+    random.seed(rand_seed)
+    rand_vals = []
+    for _ in range(count):
+        rand_vals.append(random.randrange(1, 8 ** elem_size))
+    return [gmpy_math.powmod(v, public_key.n, public_key.nsquare)
+            for v in rand_vals]
+
+
+def cpu_pi_obfuscate(
+        public_key, encrypted_numbers, obf_seeds, exponents, res_store, stream
+):
+    """CPU counterpart of pi_obfuscate: (cipher * seed) % nsquare per item."""
+    # res_store and stream are accepted but not used by this function.
+    obfuscated = []
+    for i, cipher in enumerate(encrypted_numbers):
+        masked = (cipher * obf_seeds[i]) % public_key.nsquare
+        obfuscated.append(
+            PaillierEncryptedNumber(public_key, masked, exponents[i]))
+    return obfuscated
+
+
+def cpu_fp_mul(left, right):
+    """CPU counterpart of fp_mul: multiply encodings mod n, add exponents."""
+    products = []
+    for i, lhs in enumerate(left):
+        rhs = right[i]  # indexed so a shorter `right` still raises
+        encoding = (lhs.encoding * rhs.encoding) % lhs.n
+        exponent = lhs.exponent + rhs.exponent
+        products.append(
+            FixedPointNumber(encoding, exponent, lhs.n, lhs.max_int))
+    return products
+
+
+def add_to_perf_reports(_perf_reports, name, gpu_time, cpu_time, data_size):
+    """Build the GPU/CPU report of one operation and append it to the list.
+
+    Throughput is TEST_SIZE / elapsed time; bandwidth is data_size / elapsed
+    time.
+    """
+    report = TestCaseReport(name, TEST_SIZE, KEY_LEN, data_size)
+    # The same bookkeeping is done for both devices, GPU first.
+    for device, elapsed in (("GPU", gpu_time), ("CPU", cpu_time)):
+        throughput = TEST_SIZE / elapsed
+        if device == "GPU":
+            report.gpu_throughput = throughput
+        else:
+            report.cpu_throughput = throughput
+        section = device + " Performance"
+        report.add_perf_report(section)
+        report.add_item(
+            section, "Computation on " + device,
+            elapsed, throughput, data_size / elapsed,
+        )
+    _perf_reports.append(report)
+
+
+class TestOperators(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        """Generate one Paillier key pair and its cpu/gpu engine copies."""
+        pub_key, priv_key = PaillierKeypair.generate_keypair()
+        cls._pub_key, cls._priv_key = pub_key, priv_key
+        cls.n, cls.max_int = pub_key.n, pub_key.max_int
+        # The gpu keys are derived from the cpu keys, not the Python keys.
+        cls._cpu_pub_key = pi_p2c_pub_key(pub_key)
+        cls._cpu_priv_key = pi_p2c_priv_key(priv_key)
+        cls._gpu_pub_key = pi_h2d_pub_key(cls._cpu_pub_key)
+        cls._gpu_priv_key = pi_h2d_priv_key(cls._cpu_priv_key)
+        cls._perf_reports = []  # filled by test_performance
+        print("\n\n", "*" * 100, "\n\nInitialization complete\nTest Size:",
+              TEST_SIZE)
+
+    def test_performance(self):
+        print("\n\n", "*" * 100, "\n\nTest performance begins")
+        # Benchmark each GPU operator against its CPU reference, step by step.
+        print("\n>>>>> generate data and allocate memory spaces")
+        raw, raw2 = generate_rand(TEST_SIZE), generate_rand(TEST_SIZE)
+        shape_tuple, shape_tuple_T = (NUM_ROWS, NUM_COLS), (NUM_COLS, NUM_ROWS)
+        shape_store, _ = TensorShapeStorage(*shape_tuple), TensorShapeStorage(
+            *shape_tuple_T
+        )
+        gpu_bi_store, gpu_bi_store2 = bi_alloc(
+            None, TEST_SIZE, PLAIN_BYTE, MEM_HOST
+        ), bi_alloc(None, TEST_SIZE, PLAIN_BYTE, MEM_HOST)
+        gpu_te_store, gpu_te_store2 = te_alloc(
+            None, TEST_SIZE, MEM_HOST), te_alloc(
+            None, TEST_SIZE, MEM_HOST)
+        gpu_fp_store, gpu_fp_store2 = fp_alloc(
+            None, TEST_SIZE, MEM_HOST), fp_alloc(
+            None, TEST_SIZE, MEM_HOST)
+        gpu_pi_store, gpu_pi_store2 = pi_alloc(
+            None, TEST_SIZE, MEM_HOST), pi_alloc(
+            None, TEST_SIZE, MEM_HOST)
+        gpu_te_store, gpu_te_store2 = te_p2c(raw, gpu_te_store), te_p2c(
+            raw2, gpu_te_store2
+        )
+
+        print("\n>>>>> fp_encode profiling begins")
+        gpu_encoded, gpu_encode_time = profile(fp_encode)(
+            gpu_te_store, self.n, self.max_int, res=gpu_fp_store
+        )
+        print(
+            "GPU computation completed in {} second(s), waiting for CPU".format(
+                gpu_encode_time
+            )
+        )
+        cpu_encoded, cpu_encode_time = profile(
+            lambda l: [
+                FixedPointNumber.encode(
+                    v, self.n, self.max_int) for v in l])(raw)
+        compare_time(gpu_encode_time, cpu_encode_time)
+        add_to_perf_reports(
+            self._perf_reports,
+            "Encode",
+            gpu_encode_time,
+            cpu_encode_time,
+            DATA_SIZE)
+
+        print("\n>>>>> fp_decode profiling begins")
+        gpu_decoded, gpu_decode_time = profile(fp_decode)(
+            gpu_encoded, gpu_te_store, None
+        )
+        print(
+            "GPU computation completed in {} second(s), waiting for CPU".format(
+                gpu_decode_time
+            )
+        )
+        cpu_decoded, cpu_decode_time = profile(
+            lambda l: [v.decode() for v in l])(cpu_encoded)
+        compare_time(gpu_decode_time, cpu_decode_time)
+        add_to_perf_reports(
+            self._perf_reports,
+            "Decode",
+            gpu_decode_time,
+            cpu_decode_time,
+            DATA_SIZE)
+
+        # check decoded results: GPU values against the CPU-decoded values
+        assert_ndarray_diff(te_c2p(gpu_decoded), np.asarray(cpu_decoded))
+
+        print("\n>>>>> pi_encrypt profiling begins")
+        print("This function calculates (encoding * n + 1) % nsquare")
+        gpu_encrypted, gpu_encrypt_time = profile(pi_encrypt)(
+            self._gpu_pub_key, gpu_encoded, gpu_pi_store, None
+        )
+        print(
+            "GPU computation completed in {} second(s), waiting for CPU".format(
+                gpu_encrypt_time
+            )
+        )
+        cpu_encrypted, cpu_encrypt_time = profile(
+            lambda l: [self._pub_key.raw_encrypt(v.encoding, 1) for v in l]
+        )(cpu_encoded)
+        compare_time(gpu_encrypt_time, cpu_encrypt_time)
+        add_to_perf_reports(
+            self._perf_reports,
+            "Encrypt",
+            gpu_encrypt_time,
+            cpu_encrypt_time,
+            DATA_SIZE)
+        # NOTE(review): "cpu_gen_obf_seefs_time" below is a typo of "..._seeds_time"
+        print("\n>>>>> pi_gen_obf_seed profiling begins")
+        print("This function calculates (rand() ^ n) % nsquare")
+        gpu_obf_seeds, gpu_gen_obf_seeds_time = profile(pi_gen_obf_seed)(
+            gpu_bi_store, self._gpu_pub_key, TEST_SIZE, CIPHER_BITS // 6, 0, None)
+        print(
+            "GPU computation completed in {} second(s), waiting for CPU".format(
+                gpu_gen_obf_seeds_time
+            )
+        )
+        cpu_obf_seeds, cpu_gen_obf_seefs_time = profile(cpu_pi_gen_obf_seed)(
+            None, self._pub_key, TEST_SIZE, CIPHER_BITS // 6, 0, None
+        )
+        compare_time(gpu_gen_obf_seeds_time, cpu_gen_obf_seefs_time)
+        add_to_perf_reports(
+            self._perf_reports,
+            "Generate Obfuscators",
+            gpu_gen_obf_seeds_time,
+            cpu_gen_obf_seefs_time,
+            DATA_SIZE,
+        )
+
+        print("\n>>>>> pi_obfuscate profiling begins")
+        print("This function calculates (raw_cipher * obf_seed) % nsquare,")
+        print(
+            "\twhere raw_cipher and obf_seed are calculated in pi_encrypt and pi_gen_obf_seeds, respectively"
+        )
+        gpu_obfuscated, gpu_obfuscate_time = profile(pi_obfuscate)(
+            self._gpu_pub_key, gpu_encrypted, gpu_obf_seeds, gpu_pi_store, None
+        )
+        print(
+            "GPU computation completed in {} second(s), waiting for CPU".format(
+                gpu_obfuscate_time
+            )
+        )
+        cpu_obfuscated, cpu_obfuscate_time = profile(cpu_pi_obfuscate)(
+            self._pub_key,
+            cpu_encrypted,
+            cpu_obf_seeds,
+            [v.exponent for v in cpu_encoded],
+            None,
+            None,
+        )
+        compare_time(gpu_obfuscate_time, cpu_obfuscate_time)
+        add_to_perf_reports(
+            self._perf_reports,
+            "Obfuscate",
+            gpu_obfuscate_time,
+            cpu_obfuscate_time,
+            DATA_SIZE,
+        )
+
+        # check intermediate result: compare the GPU and CPU ciphertexts
+        assert_ndarray_diff(
+            np.asarray(pi_c2p(gpu_obfuscated)[0]),
+            np.asarray([v.ciphertext(False) for v in cpu_obfuscated]),
+        )
+
+        print("\n>>>>> pi_decrypt profiling begins")
+        print(
+            "This function calculates L(cipher ^ lambda % nsquare) * L(g ^ lambda % nsquare) ^ -1 % n"
+        )
+        print("fp_decode is by default included in pi_decrypt")
+        fps_buffer = fp_alloc(None, TEST_SIZE, MEM_HOST)
+        gpu_decrypted, gpu_decrypt_time = profile(pi_decrypt)(
+            self._gpu_pub_key,
+            self._gpu_priv_key,
+            gpu_obfuscated,
+            gpu_te_store,
+            fps_buffer,
+        )
+        print(
+            "GPU computation completed in {} second(s), waiting for CPU".format(
+                gpu_decrypt_time
+            )
+        )
+        cpu_decrypted, cpu_decrypt_time = profile(
+            lambda l: [self._priv_key.decrypt(v) for v in l]
+        )(cpu_obfuscated)
+        compare_time(gpu_decrypt_time, cpu_decrypt_time)
+        add_to_perf_reports(
+            self._perf_reports,
+            "Decrypt",
+            gpu_decrypt_time,
+            cpu_decrypt_time,
+            DATA_SIZE)
+
+        # check decrypted results against the CPU decryption
+        assert_ndarray_diff(te_c2p(gpu_decrypted), np.asarray(cpu_decrypted))
+
+        print("\n>>>>> generating the other array")
+        # encode the other array (raw2), on the GPU and on the CPU
+        gpu_encoded2 = fp_encode(
+            gpu_te_store2,
+            self.n,
+            self.max_int,
+            res=gpu_fp_store2)
+        cpu_encoded2 = [
+            FixedPointNumber.encode(
+                v, self.n, self.max_int) for v in raw2]
+        # encrypt the other array
+        gpu_encrypted2 = pi_encrypt(
+            self._gpu_pub_key, gpu_encoded2, gpu_pi_store2, None
+        )
+        cpu_encrypted2 = [
+            self._pub_key.raw_encrypt(v.encoding, 1) for v in cpu_encoded2
+        ]
+        # generate obfuscation seeds (obfuscators) for the other array using a
+        # different random seed (1 instead of 0)
+        gpu_obf_seeds2 = pi_gen_obf_seed(
+            gpu_bi_store2,
+            self._gpu_pub_key,
+            TEST_SIZE,
+            CIPHER_BITS // 6,
+            1,
+            None)
+        cpu_obf_seeds2 = cpu_pi_gen_obf_seed(
+            None, self._pub_key, TEST_SIZE, CIPHER_BITS // 6, 1, None
+        )
+        # obfuscate the other array
+        gpu_obfuscated2 = pi_obfuscate(
+            self._gpu_pub_key,
+            gpu_encrypted2,
+            gpu_obf_seeds2,
+            gpu_pi_store2,
+            None)
+        cpu_obfuscated2 = cpu_pi_obfuscate(
+            self._pub_key,
+            cpu_encrypted2,
+            cpu_obf_seeds2,
+            [v.exponent for v in cpu_encoded2],
+            None,
+            None,
+        )
+        # check intermediate result: compare the GPU and CPU ciphertexts
+        assert_ndarray_diff(
+            np.asarray(pi_c2p(gpu_obfuscated2)[0]),
+            np.asarray([v.ciphertext(False) for v in cpu_obfuscated2]),
+        )
+
+        print("\n>>>>> fp_mul profiling begins")
+        gpu_fp_mul_store = fp_alloc(None, TEST_SIZE, MEM_HOST)
+        (gpu_fp_mul_res, _), gpu_fp_mul_time = profile(fp_mul)(
+            gpu_encoded,
+            gpu_encoded2,
+            shape_store,
+            shape_store,
+            gpu_fp_mul_store,
+            shape_store,
+            None,
+        )
+        print(
+            "GPU computation completed in {} second(s), waiting for CPU".format(
+                gpu_fp_mul_time
+            )
+        )
+        cpu_fp_mul_res, cpu_fp_mul_time = profile(
+            cpu_fp_mul)(cpu_encoded, cpu_encoded2)
+        compare_time(gpu_fp_mul_time, cpu_fp_mul_time)
+        add_to_perf_reports(
+            self._perf_reports,
+            "Fixed-point Number Multiply",
+            gpu_fp_mul_time,
+            cpu_fp_mul_time,
+            DATA_SIZE * 2,
+        )
+
+        # Compare results element by element (encoding, BASE and exponent)
+        received_fp_mul_res = fp_c2p(gpu_fp_mul_res)
+        for i in range(TEST_SIZE):
+            assert_diff(
+                received_fp_mul_res[i].encoding,
+                cpu_fp_mul_res[i].encoding)
+            assert received_fp_mul_res[i].BASE == cpu_fp_mul_res[i].BASE
+            assert received_fp_mul_res[i].exponent == cpu_fp_mul_res[i].exponent
+        # pi_add/pi_mul are only timed; their output is verified after pi_sum.
+        print("\n>>>>> pi_add profiling begins")
+        (gpu_add_res, _), gpu_add_time = profile(pi_add)(
+            self._gpu_pub_key,
+            gpu_obfuscated,
+            gpu_obfuscated2,
+            shape_store,
+            shape_store,
+            gpu_pi_store,
+            shape_store,
+            None,
+        )
+        print(
+            "GPU computation completed in {} second(s), waiting for CPU".format(
+                gpu_add_time
+            )
+        )
+        cpu_add_res, cpu_add_time = profile(
+            lambda a, b: [a[i] + b[i] for i in range(TEST_SIZE)]
+        )(cpu_obfuscated, cpu_obfuscated2)
+        compare_time(gpu_add_time, cpu_add_time)
+        add_to_perf_reports(
+            self._perf_reports,
+            "Add",
+            gpu_add_time,
+            cpu_add_time,
+            DATA_SIZE * 2)
+
+        print("\n>>>>> pi_mul profiling begins")
+        (gpu_mul_res, _), gpu_mul_time = profile(pi_mul)(
+            self._gpu_pub_key,
+            gpu_add_res,
+            gpu_encoded2,
+            shape_store,
+            shape_store,
+            gpu_pi_store,
+            shape_store,
+            None,
+        )
+        print(
+            "GPU computation completed in {} second(s), waiting for CPU".format(
+                gpu_mul_time
+            )
+        )
+        cpu_mul_res, cpu_mul_time = profile(
+            lambda a, b: [a[i] * b[i] for i in range(TEST_SIZE)]
+        )(cpu_add_res, cpu_encoded2)
+        compare_time(gpu_mul_time, cpu_mul_time)
+        add_to_perf_reports(
+            self._perf_reports,
+            "Multiply",
+            gpu_mul_time,
+            cpu_mul_time,
+            DATA_SIZE * 2)
+        # No matmul is run: the pi_mul output is reused as the pi_sum input.
+        gpu_pi_matmul_store = pi_alloc(None, NUM_ROWS * NUM_ROWS, MEM_HOST)  # unused below, only freed
+        gpu_matmul_res, gpu_matmul_shape = gpu_mul_res, shape_store
+        cpu_matmul_res = np.asarray(cpu_mul_res).reshape(shape_tuple)
+
+        print("\n>>>>> pi_sum profiling begins")
+        print("shape is", gpu_matmul_shape.to_tuple())
+        gpu_pi_sum_store = pi_alloc(None, max(NUM_ROWS, NUM_COLS), MEM_HOST)
+        for axis in [0, 1, None]:
+            print(">>> axis:", axis)
+            (gpu_sum_res, _), gpu_sum_time = profile(pi_sum)(
+                self._gpu_pub_key,
+                gpu_matmul_res,
+                gpu_matmul_shape,
+                axis,
+                gpu_pi_sum_store,
+                None,
+                None,
+            )
+            print(
+                "GPU computation completed in {} second(s), waiting for CPU".format(
+                    gpu_sum_time
+                )
+            )
+            cpu_sum_res, cpu_sum_time = profile(lambda a: np.sum(a, axis))(
+                cpu_matmul_res
+            )
+            compare_time(gpu_sum_time, cpu_sum_time)
+            add_to_perf_reports(
+                self._perf_reports,
+                "Sum (axis={})".format(axis),
+                gpu_sum_time,
+                cpu_sum_time,
+                DATA_SIZE,
+            )
+
+            # check result: decrypt both sums and compare them
+            gpu_decrypted = te_c2p(
+                pi_decrypt(
+                    self._gpu_pub_key,
+                    self._gpu_priv_key,
+                    gpu_sum_res,
+                    None,
+                    None,
+                    None))
+            cpu_decrypted = np.asarray(
+                [self._priv_key.decrypt(v) for v in cpu_sum_res.flat]
+                if axis is not None
+                else [self._priv_key.decrypt(cpu_sum_res)]
+            )
+            assert_ndarray_diff(gpu_decrypted, cpu_decrypted)
+
+        print("\n>>>>> free all allocated spaces")
+        bi_free(gpu_bi_store)
+        bi_free(gpu_bi_store2)
+        te_free(gpu_te_store)
+        te_free(gpu_te_store2)
+        fp_free(gpu_fp_store)
+        fp_free(gpu_fp_store2)
+        fp_free(fps_buffer)
+        fp_free(gpu_fp_mul_store)
+        pi_free(gpu_pi_store)
+        pi_free(gpu_pi_store2)
+        pi_free(gpu_pi_matmul_store)
+        pi_free(gpu_pi_sum_store)
+
+    @classmethod
+    def tearDownClass(cls):
+        for report in cls._perf_reports:  # one report per profiled operation
+            report.dump_result()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/gpu/fate-tensor-gpu/tests/test_fate_tensor_gpu.py b/gpu/fate-tensor-gpu/tests/test_fate_tensor_gpu.py
deleted file mode 100644
index 8701c384b9..0000000000
--- a/gpu/fate-tensor-gpu/tests/test_fate_tensor_gpu.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from fate_tensor_gpu import __version__
-
-
-def test_version():
-    assert __version__ == '0.1.0'

From b35095a797cfde0c033eed2fb82bf3babf190135 Mon Sep 17 00:00:00 2001
From: "Xiaolong.Gao" <1506957902@qq.com>
Date: Wed, 20 Jul 2022 17:24:54 +0800
Subject: [PATCH 4/8] feat: pep8 format of gpu

Signed-off-by: Xiaolong.Gao <1506957902@qq.com>
---
 .../fate_tensor_gpu/gpu_engine.py             | 474 ++++++++----------
 .../secureprotol/fate_paillier.py             |  24 +-
 .../secureprotol/fixedpoint.py                |  19 +-
 .../fate_tensor_gpu/tests/test_gpu_engine.py  |  13 +-
 4 files changed, 215 insertions(+), 315 deletions(-)

diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/gpu_engine.py b/gpu/fate-tensor-gpu/fate_tensor_gpu/gpu_engine.py
index 077c528ec1..3fa53ac0bc 100644
--- a/gpu/fate-tensor-gpu/fate_tensor_gpu/gpu_engine.py
+++ b/gpu/fate-tensor-gpu/fate_tensor_gpu/gpu_engine.py
@@ -218,9 +218,7 @@ def __get_C_uint32(uint32_space, size):
     GPU_LIB.c_memcpy(
         uint32_list,
         c_void_p(uint32_space),
-        c_size_t(
-            size *
-            U_INT32_BYTE))
+        c_size_t(size * U_INT32_BYTE))
     return np.asarray(uint32_list)
 
 
@@ -230,9 +228,7 @@ def __get_C_double(double_space, size):
     GPU_LIB.c_memcpy(
         double_list,
         c_void_p(double_space),
-        c_size_t(
-            size *
-            DOUBLE_BYTE))
+        c_size_t(size * DOUBLE_BYTE))
     # convert all the data in one step, no loop
     return np.asarray(double_list)
 
@@ -243,9 +239,7 @@ def __get_C_int64(int64_space, size):
     GPU_LIB.c_memcpy(
         int64_list,
         c_void_p(int64_space),
-        c_size_t(
-            size *
-            INT64_BYTE))
+        c_size_t(size * INT64_BYTE))
     # convert all the data in one step, no loop
     return np.asarray(int64_list)
 
@@ -459,15 +453,15 @@ class FixedPointStorage:
     '''
 
     def __init__(
-        self,
-        bigint_storage,
-        base_storage,
-        exp_storage,
-        vec_size,
-        n,
-        max_int,
-        mem_type: int,
-        data_type,
+            self,
+            bigint_storage,
+            base_storage,
+            exp_storage,
+            vec_size,
+            n,
+            max_int,
+            mem_type: int,
+            data_type,
     ):
         # 1:cpu/host  2:gpu/device
         self.mem_type = mem_type
@@ -513,15 +507,15 @@ class PaillierEncryptedStorage:
     '''
 
     def __init__(
-        self,
-        pen_storage,
-        base_storage,
-        exp_storage,
-        vec_size,
-        mem_type: int,
-        data_type,
-        fpn_encode_n,
-        fpn_encode_max_int,
+            self,
+            pen_storage,
+            base_storage,
+            exp_storage,
+            vec_size,
+            mem_type: int,
+            data_type,
+            fpn_encode_n,
+            fpn_encode_max_int,
     ):
         self.mem_type = mem_type
         '''Actual data and length for pen'''
@@ -757,18 +751,14 @@ def te_p2c(data, res=None):
         GPU_LIB.c_memcpy(
             c_void_p(storage_pointer),
             data_pointer,
-            c_size_t(
-                vec_size *
-                INT64_BYTE))
+            c_size_t(vec_size * INT64_BYTE))
     elif data.dtype == 'int64':
         data_pointer = data.ctypes.data_as(c_void_p)
         data_type = INT64_TYPE
         GPU_LIB.c_memcpy(
             c_void_p(storage_pointer),
             data_pointer,
-            c_size_t(
-                vec_size *
-                INT64_BYTE))
+            c_size_t(vec_size * INT64_BYTE))
     elif data.dtype == 'float32':
         new_data = data.astype(np.float64)
         data_pointer = new_data.ctypes.data_as(c_void_p)
@@ -776,18 +766,14 @@ def te_p2c(data, res=None):
         GPU_LIB.c_memcpy(
             c_void_p(storage_pointer),
             data_pointer,
-            c_size_t(
-                vec_size *
-                DOUBLE_BYTE))
+            c_size_t(vec_size * DOUBLE_BYTE))
     elif data.dtype == 'float64':
         data_pointer = data.ctypes.data_as(c_void_p)
         data_type = FLOAT_TYPE
         GPU_LIB.c_memcpy(
             c_void_p(storage_pointer),
             data_pointer,
-            c_size_t(
-                vec_size *
-                DOUBLE_BYTE))
+            c_size_t(vec_size * DOUBLE_BYTE))
     else:
         raise PermissionError("Invalid Data Type")
     return _te_init_store(res, storage_pointer, vec_size, MEM_HOST, data_type)
@@ -860,9 +846,7 @@ def fp_c2bytes(store, res=None):
     max_int = store.max_int
     # C memory storage
     bytes_res = c_buffer(
-        (PLAIN_BYTE + U_INT32_BYTE * 2) * store.vec_size
-        + U_INT32_BYTE * 2
-        + PLAIN_BYTE * 2
+        (PLAIN_BYTE + U_INT32_BYTE * 2) * store.vec_size + U_INT32_BYTE * 2 + PLAIN_BYTE * 2
     )
     GPU_LIB.fp_get_bytes(
         cast(bytes_res, c_void_p),
@@ -897,9 +881,7 @@ def pi_c2bytes(store, res=None):
     max_int = store.encode_max_int
     # C memory storage
     bytes_res = c_buffer(
-        (CIPHER_BYTE + U_INT32_BYTE * 2) * store.vec_size
-        + U_INT32_BYTE * 2
-        + CIPHER_BYTE * 2
+        (CIPHER_BYTE + U_INT32_BYTE * 2) * store.vec_size + U_INT32_BYTE * 2 + CIPHER_BYTE * 2
     )
     GPU_LIB.pi_get_bytes(
         cast(bytes_res, c_void_p),
@@ -979,9 +961,7 @@ def fp_bytes2c(data, res=None):
         res:  FixedPointStorage, the restored struct from para.data.
     '''
     # caculate vec_size
-    vec_size = (len(data) - 2 * (U_INT32_BYTE + PLAIN_BYTE)) // (
-        U_INT32_BYTE * 2 + PLAIN_BYTE
-    )
+    vec_size = (len(data) - 2 * (U_INT32_BYTE + PLAIN_BYTE)) // (U_INT32_BYTE * 2 + PLAIN_BYTE)
     # uint32
     data_type = c_buffer(U_INT32_BYTE)
     mem_type = c_buffer(U_INT32_BYTE)
@@ -1033,9 +1013,7 @@ def pi_bytes2c(data, res=None):
         res:  PaillierEncryptedStorage, the restored struct from para.data
     '''
     # caculate vec_size
-    vec_size = (len(data) - 2 * (U_INT32_BYTE + CIPHER_BYTE)) // (
-        U_INT32_BYTE * 2 + CIPHER_BYTE
-    )
+    vec_size = (len(data) - 2 * (U_INT32_BYTE + CIPHER_BYTE)) // (U_INT32_BYTE * 2 + CIPHER_BYTE)
     # uint32
     data_type = c_buffer(U_INT32_BYTE)
     mem_type = c_buffer(U_INT32_BYTE)
@@ -1093,7 +1071,7 @@ def _te_init_shape(shape_store, shape_tuple):
 
 
 def _te_init_ss(
-    res_store, res_data, vec_size, res_shape, shape_tuple, mem_type, data_type
+        res_store, res_data, vec_size, res_shape, shape_tuple, mem_type, data_type
 ):
     '''
     Init TensorStorage and TensorShapeStorage at the same time
@@ -1178,7 +1156,7 @@ def te_pow(
         res_store=None,
         res_shape=None,
         stream=None):
-    res_data = left_store.data**right
+    res_data = left_store.data ** right
     return _te_init_ss(
         res_store,
         res_data,
@@ -1194,13 +1172,13 @@ def te_pow(
 
 
 def te_add(
-    left_store,
-    right_store,
-    left_shape,
-    right_shape,
-    res_store=None,
-    res_shape=None,
-    stream=None,
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
 ):
     res_data = left_store.data + right_store.data
     return _te_init_ss(
@@ -1218,13 +1196,13 @@ def te_add(
 
 
 def te_mul(
-    left_store,
-    right_store,
-    left_shape,
-    right_shape,
-    res_store=None,
-    res_shape=None,
-    stream=None,
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
 ):
     res_data = left_store.data * right_store.data
     return _te_init_ss(
@@ -1242,13 +1220,13 @@ def te_mul(
 
 
 def te_truediv(
-    left_store,
-    right_store,
-    left_shape,
-    right_shape,
-    res_store=None,
-    res_shape=None,
-    stream=None,
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
 ):
     res_data = left_store.data / right_store.data
     return _te_init_ss(
@@ -1263,13 +1241,13 @@ def te_truediv(
 
 
 def te_floordiv(
-    left_store,
-    right_store,
-    left_shape,
-    right_shape,
-    res_store=None,
-    res_shape=None,
-    stream=None,
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
 ):
     res_data = left_store.data // right_store.data
     return _te_init_ss(
@@ -1284,13 +1262,13 @@ def te_floordiv(
 
 
 def te_sub(
-    left_store,
-    right_store,
-    left_shape,
-    right_shape,
-    res_store=None,
-    res_shape=None,
-    stream=None,
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
 ):
     res_data = left_store.data - right_store.data
     return _te_init_ss(
@@ -1308,13 +1286,13 @@ def te_sub(
 
 
 def te_matmul(
-    left_store,
-    right_store,
-    left_shape,
-    right_shape,
-    res_store=None,
-    res_shape=None,
-    stream=None,
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
 ):
     res_data = left_store.data @ right_store.data
     return _te_init_ss(
@@ -1425,13 +1403,13 @@ def te_exp(store, shape, res_store=None, res_shape=None, stream=None):
 
 
 def te_hstack(
-    left_store,
-    right_store,
-    left_shape,
-    right_shape,
-    res_store=None,
-    res_shape=None,
-    stream=None,
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
 ):
     _store, _shape = te_cat([left_store, right_store], 1, res_store, res_shape)
     # avoid naming collision
@@ -1783,15 +1761,15 @@ def pi_p2c_priv_key(src):
 
 # ###########PaillierEncrypted STORAGE INITIALIZE#################
 def _pi_init_store(
-    res_store,
-    pen_storage,
-    base_storage,
-    exp_storage,
-    vec_size,
-    mem_type,
-    data_type,
-    encode_n,
-    encode_max_int,
+        res_store,
+        pen_storage,
+        base_storage,
+        exp_storage,
+        vec_size,
+        mem_type,
+        data_type,
+        encode_n,
+        encode_max_int,
 ):
     '''
     init a new PaillierEncryptedStorage
@@ -1829,17 +1807,17 @@ def _pi_init_store(
 
 
 def _pi_init_ss(
-    res_store,
-    pen_storage,
-    base_storage,
-    exp_storage,
-    vec_size,
-    res_shape,
-    res_shape_tuple,
-    mem_type,
-    data_type,
-    encode_n,
-    encode_max_int,
+        res_store,
+        pen_storage,
+        base_storage,
+        exp_storage,
+        vec_size,
+        res_shape,
+        res_shape_tuple,
+        mem_type,
+        data_type,
+        encode_n,
+        encode_max_int,
 ):
     '''
     init new PaillierEncryptedStorage and corresponding TensorShapeStorage at same time
@@ -1912,9 +1890,7 @@ def pi_p2c(target, src, data_type=FLOAT_TYPE):
     GPU_LIB.c_memcpy(
         c_void_p(res_base),
         base_array_pointer,
-        c_size_t(
-            vec_size *
-            U_INT32_BYTE))
+        c_size_t(vec_size * U_INT32_BYTE))
     GPU_LIB.c_memcpy(
         c_void_p(res_exp), exp_array_pointer, c_size_t(vec_size * U_INT32_BYTE)
     )
@@ -1960,15 +1936,15 @@ def _bi_init_ss(
 
 
 def _fp_init_store(
-    res_store,
-    fpn_storage,
-    base_storage,
-    exp_storage,
-    vec_size,
-    n,
-    max_int,
-    mem_type,
-    data_type,
+        res_store,
+        fpn_storage,
+        base_storage,
+        exp_storage,
+        vec_size,
+        n,
+        max_int,
+        mem_type,
+        data_type,
 ):
     '''
     Init FixedPointStorage class,
@@ -2000,17 +1976,17 @@ def _fp_init_store(
 
 
 def _fp_init_ss(
-    res_store,
-    fpn_storage,
-    base_storage,
-    exp_storage,
-    vec_size,
-    n,
-    max_int,
-    res_shape,
-    res_shape_tuple,
-    mem_type,
-    data_type,
+        res_store,
+        fpn_storage,
+        base_storage,
+        exp_storage,
+        vec_size,
+        n,
+        max_int,
+        res_shape,
+        res_shape_tuple,
+        mem_type,
+        data_type,
 ):
     '''Init FixedPointStorage and the corresponding TensorShapeStorage'''
     return _fp_init_store(
@@ -2052,7 +2028,7 @@ def get_add_mul_size(
 
 
 def get_matmul_rmatmul_size(
-    left_shape: TensorShapeStorage, right_shape: TensorShapeStorage
+        left_shape: TensorShapeStorage, right_shape: TensorShapeStorage
 ):
     '''
     Get the result size of matmul, rmatmul calculators
@@ -2400,14 +2376,14 @@ def check_func(a, b):
 
 
 def pi_add(
-    pub_key,
-    left_store,
-    right_store,
-    left_shape,
-    right_shape,
-    res_store=None,
-    res_shape=None,
-    stream=None,
+        pub_key,
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
 ):
     '''
     Perform element-wise encrypted add, support broadcast over cols or rows
@@ -2487,14 +2463,14 @@ def pi_add(
 
 
 def pi_mul(
-    pub_key,
-    left_store,
-    right_store,
-    left_shape,
-    right_shape,
-    res_store=None,
-    res_shape=None,
-    stream=None,
+        pub_key,
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
 ):
     '''
     Perform element-wise encrypted muliply, support broadcast for cols and rows
@@ -2618,21 +2594,15 @@ def fp_transpose(
         GPU_LIB.c_memcpy(
             c_void_p(res_fpn),
             c_void_p(src_fpn),
-            c_size_t(
-                vec_size *
-                PLAIN_BYTE))
+            c_size_t(vec_size * PLAIN_BYTE))
         GPU_LIB.c_memcpy(
             c_void_p(res_base),
             c_void_p(src_base),
-            c_size_t(
-                vec_size *
-                U_INT32_BYTE))
+            c_size_t(vec_size * U_INT32_BYTE))
         GPU_LIB.c_memcpy(
             c_void_p(res_exp),
             c_void_p(src_exp),
-            c_size_t(
-                vec_size *
-                U_INT32_BYTE))
+            c_size_t(vec_size * U_INT32_BYTE))
         return _fp_init_ss(
             res_store,
             res_fpn,
@@ -2693,14 +2663,14 @@ def fp_transpose(
 
 
 def pi_matmul(
-    pub_key,
-    left_store,
-    right_store,
-    left_shape,
-    right_shape,
-    res_store=None,
-    res_shape=None,
-    stream=None,
+        pub_key,
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
 ):
     '''
     Perform matrix multiply under encryption.
@@ -2724,12 +2694,7 @@ def pi_matmul(
     # '''Pre-process shape'''
     left_tuple = left_shape.to_tuple()
     right_tuple = right_shape.to_tuple()
-    if (
-        len(left_tuple) == 0
-        or len(right_tuple) == 0
-        or len(left_tuple) > 2
-        or len(right_tuple) > 2
-    ):
+    if len(left_tuple) == 0 or len(right_tuple) == 0 or len(left_tuple) > 2 or len(right_tuple) > 2:
         raise PermissionError("Invalid shape")
     P, Q = __shape_decompose(left_shape)
     R, S = __shape_decompose(right_shape)
@@ -2815,14 +2780,14 @@ def pi_matmul(
 
 
 def pi_rmatmul(
-    pub_key,
-    left_store,
-    right_store,
-    left_shape,
-    right_shape,
-    res_store=None,
-    res_shape=None,
-    stream=None,
+        pub_key,
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
 ):
     '''
     Perform matrix multiply under encryption.
@@ -2847,12 +2812,7 @@ def pi_rmatmul(
     # pre-process of shapes
     left_tuple = left_shape.to_tuple()
     right_tuple = right_shape.to_tuple()
-    if (
-        len(left_tuple) == 0
-        or len(right_tuple) == 0
-        or len(left_tuple) > 2
-        or len(right_tuple) > 2
-    ):
+    if len(left_tuple) == 0 or len(right_tuple) == 0 or len(left_tuple) > 2 or len(right_tuple) > 2:
         raise PermissionError("Invalid shape")
     P, Q = __shape_decompose(left_shape)
     R, S = __shape_decompose(right_shape)
@@ -2978,21 +2938,15 @@ def pi_transpose(
         GPU_LIB.c_memcpy(
             c_void_p(res_pen),
             c_void_p(src_pen),
-            c_size_t(
-                vec_size *
-                CIPHER_BYTE))
+            c_size_t(vec_size * CIPHER_BYTE))
         GPU_LIB.c_memcpy(
             c_void_p(res_base),
             c_void_p(src_base),
-            c_size_t(
-                vec_size *
-                U_INT32_BYTE))
+            c_size_t(vec_size * U_INT32_BYTE))
         GPU_LIB.c_memcpy(
             c_void_p(res_exp),
             c_void_p(src_exp),
-            c_size_t(
-                vec_size *
-                U_INT32_BYTE))
+            c_size_t(vec_size * U_INT32_BYTE))
         return _pi_init_ss(
             res_store,
             res_pen,
@@ -3038,13 +2992,13 @@ def pi_transpose(
 
 # WARNING:  NOW ALMOST ABANDONED DUE TO NOT IDEAL PERFORMANCE!
 def pi_sum_multi_stream(
-    pub_key,
-    left_store,
-    left_shape,
-    axis=None,
-    res_store=None,
-    res_shape=None,
-    stream=None,
+        pub_key,
+        left_store,
+        left_shape,
+        axis=None,
+        res_store=None,
+        res_shape=None,
+        stream=None,
 ):
     '''Doing pi_sum using multi cuda stream'''
     src_pen = left_store.pen_storage
@@ -3095,13 +3049,13 @@ def pi_sum_multi_stream(
 
 
 def pi_sum(
-    pub_key,
-    left_store,
-    left_shape,
-    axis=None,
-    res_store=None,
-    res_shape=None,
-    stream=None,
+        pub_key,
+        left_store,
+        left_shape,
+        axis=None,
+        res_store=None,
+        res_shape=None,
+        stream=None,
 ):
     '''
     Perform sum according to the axis
@@ -3147,21 +3101,15 @@ def pi_sum(
         GPU_LIB.c_memcpy(
             c_void_p(res_pen),
             c_void_p(src_pen),
-            c_size_t(
-                vec_size *
-                CIPHER_BYTE))
+            c_size_t(vec_size * CIPHER_BYTE))
         GPU_LIB.c_memcpy(
             c_void_p(res_base),
             c_void_p(src_base),
-            c_size_t(
-                vec_size *
-                U_INT32_BYTE))
+            c_size_t(vec_size * U_INT32_BYTE))
         GPU_LIB.c_memcpy(
             c_void_p(res_exp),
             c_void_p(src_exp),
-            c_size_t(
-                vec_size *
-                U_INT32_BYTE))
+            c_size_t(vec_size * U_INT32_BYTE))
         return _pi_init_ss(
             left_store,
             res_pen,
@@ -3222,9 +3170,7 @@ def pi_sum(
             res_base = GPU_LIB.c_malloc(
                 c_size_t(transpose_tuple[0] * U_INT32_BYTE))
             res_exp = GPU_LIB.c_malloc(
-                c_size_t(
-                    transpose_tuple[0] *
-                    U_INT32_BYTE))
+                c_size_t(transpose_tuple[0] * U_INT32_BYTE))
         else:
             res_pen = res_store.pen_storage
             res_base = res_store.base_storage
@@ -3252,9 +3198,7 @@ def pi_sum(
             res_base = GPU_LIB.c_malloc(
                 c_size_t(left_shape_tuple[0] * U_INT32_BYTE))
             res_exp = GPU_LIB.c_malloc(
-                c_size_t(
-                    left_shape_tuple[0] *
-                    U_INT32_BYTE))
+                c_size_t(left_shape_tuple[0] * U_INT32_BYTE))
         else:
             res_pen = res_store.pen_storage
             res_base = res_store.base_storage
@@ -3315,21 +3259,15 @@ def pi_sum_with_index_v2(pub_key, left_store, left_shape, valid_index):
         GPU_LIB.c_memcpy(
             c_void_p(res_pen),
             c_void_p(src_pen),
-            c_size_t(
-                vec_size *
-                CIPHER_BYTE))
+            c_size_t(vec_size * CIPHER_BYTE))
         GPU_LIB.c_memcpy(
             c_void_p(res_base),
             c_void_p(src_base),
-            c_size_t(
-                vec_size *
-                U_INT32_BYTE))
+            c_size_t(vec_size * U_INT32_BYTE))
         GPU_LIB.c_memcpy(
             c_void_p(res_exp),
             c_void_p(src_exp),
-            c_size_t(
-                vec_size *
-                U_INT32_BYTE))
+            c_size_t(vec_size * U_INT32_BYTE))
         return _pi_init_ss(
             left_store,
             res_pen,
@@ -3417,21 +3355,15 @@ def pi_sum_with_index(pub_key, left_store, left_shape, valid_index):
         GPU_LIB.c_memcpy(
             c_void_p(res_pen),
             c_void_p(src_pen),
-            c_size_t(
-                vec_size *
-                CIPHER_BYTE))
+            c_size_t(vec_size * CIPHER_BYTE))
         GPU_LIB.c_memcpy(
             c_void_p(res_base),
             c_void_p(src_base),
-            c_size_t(
-                vec_size *
-                U_INT32_BYTE))
+            c_size_t(vec_size * U_INT32_BYTE))
         GPU_LIB.c_memcpy(
             c_void_p(res_exp),
             c_void_p(src_exp),
-            c_size_t(
-                vec_size *
-                U_INT32_BYTE))
+            c_size_t(vec_size * U_INT32_BYTE))
         return _pi_init_ss(
             left_store,
             res_pen,
@@ -3482,7 +3414,7 @@ def pi_sum_with_index(pub_key, left_store, left_shape, valid_index):
 
 
 def pi_sum_multi_index(
-    pub_key, left_store, left_shape, valid_index, min_value=0, max_value=None
+        pub_key, left_store, left_shape, valid_index, min_value=0, max_value=None
 ):
     '''
     Run sum for data with the same index indicated in the valid_index list
@@ -3546,7 +3478,7 @@ def pi_sum_multi_index(
 # WARNNIG: CURRENTLY NOT IN USE BECAUSE NO APPRENT IMPROVEMENT WHEN left_store.vec_size is very large
 # TODO: apply this to store with small size
 def pi_sum_batch_multi_index(
-    pub_key, left_store, left_shape, valid_index, min_value=0, max_value=None
+        pub_key, left_store, left_shape, valid_index, min_value=0, max_value=None
 ):
     '''
     Rum sum for data with the same index indicated in valid index
@@ -3614,7 +3546,7 @@ def pi_sum_batch_multi_index(
 
 # WARNING: ABANDONED FOR THE SAME REASON AS pi_sum_batch_multi_index
 def pi_sum_batch_multi_index_v2(
-    pub_key, left_store, left_shape, valid_index, min_value=0, max_value=None
+        pub_key, left_store, left_shape, valid_index, min_value=0, max_value=None
 ):
     '''
     Almost the same with pi_sum_batch_multi_index,
@@ -3673,7 +3605,7 @@ def pi_sum_batch_multi_index_v2(
 
 
 def fp_encode(
-    store, n, max_int, precision=None, max_exponent=None, res=None, stream=None
+        store, n, max_int, precision=None, max_exponent=None, res=None, stream=None
 ):
     '''
     Perform encode to a TensorStorage
@@ -3907,13 +3839,13 @@ def pi_c2p(src):
 
 
 def fp_mul(
-    left_store,
-    right_store,
-    left_shape,
-    right_shape,
-    res_store=None,
-    res_shape=None,
-    stream=None,
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
 ):
     '''
     Perform element-wise multiplication between two FixedPointStorage.
@@ -4026,9 +3958,7 @@ def fp_p2c(target, src, data_type=FLOAT_TYPE):
     GPU_LIB.c_memcpy(
         c_void_p(res_base),
         base_array_pointer,
-        c_size_t(
-            vec_size *
-            U_INT32_BYTE))
+        c_size_t(vec_size * U_INT32_BYTE))
     GPU_LIB.c_memcpy(
         c_void_p(res_exp), exp_array_pointer, c_size_t(vec_size * U_INT32_BYTE)
     )
@@ -4729,8 +4659,8 @@ def bi_gen_rand(elem_size, count, res, rand_seed, stream=None):
     # Didn't use vectorize since that we need to_bytes()
     # But ndarray_float64 has no to_bytes method
     random.seed(rand_seed)
-    rands = np.asarray([random.randrange(1, 8**elem_size)
-                       for i in range(count)])
+    rands = np.asarray([random.randrange(1, 8 ** elem_size)
+                        for i in range(count)])
     if res is None:
         data_storage = GPU_LIB.c_malloc(c_size_t(count * CIPHER_BYTE))
     else:
@@ -4873,7 +4803,7 @@ def pi_accumulate(gpu_pubkey, pubkey_n, left_store, left_shape):
 
 
 def pi_add_with_index(
-    gpu_pubkey, pubkey_n, l_store, l_shape, r_store, r_shape, valid_index
+        gpu_pubkey, pubkey_n, l_store, l_shape, r_store, r_shape, valid_index
 ):
     '''
     Add a single PaillierEncryptedNumber to the designated index in a vector
diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fate_paillier.py b/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fate_paillier.py
index 9851e8df36..fbb9e57abf 100644
--- a/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fate_paillier.py
+++ b/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fate_paillier.py
@@ -173,27 +173,9 @@ def raw_decrypt(self, ciphertext):
                 "ciphertext should be an int, not: %s" %
                 type(ciphertext))
 
-        mp = (
-            self.l_func(
-                gmpy_math.powmod(
-                    ciphertext,
-                    self.p -
-                    1,
-                    self.psquare),
-                self.p) *
-            self.hp %
-            self.p)
-
-        mq = (
-            self.l_func(
-                gmpy_math.powmod(
-                    ciphertext,
-                    self.q -
-                    1,
-                    self.qsquare),
-                self.q) *
-            self.hq %
-            self.q)
+        mp = self.l_func(gmpy_math.powmod(ciphertext, self.p - 1, self.psquare), self.p) * self.hp % self.p
+
+        mq = self.l_func(gmpy_math.powmod(ciphertext, self.q - 1, self.qsquare), self.q) * self.hq % self.q
 
         return self.crt(mp, mq)
 
diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fixedpoint.py b/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fixedpoint.py
index 350b6e06f6..dca6d0fcda 100644
--- a/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fixedpoint.py
+++ b/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fixedpoint.py
@@ -27,7 +27,7 @@ class FixedPointNumber(object):
     LOG2_BASE = math.log(BASE, 2)
     FLOAT_MANTISSA_BITS = sys.float_info.mant_dig
 
-    Q = 293973345475167247070445277780365744413**2
+    Q = 293973345475167247070445277780365744413 ** 2
 
     def __init__(self, encoding, exponent, n=None, max_int=None):
         if n is None:
@@ -71,19 +71,12 @@ def encode(
             max_int = n // 2
 
         if precision is None:
-            if (
-                isinstance(scalar, int)
-                or isinstance(scalar, np.int16)
-                or isinstance(scalar, np.int32)
-                or isinstance(scalar, np.int64)
-            ):
+            if isinstance(scalar, int) or isinstance(scalar, np.int16) or isinstance(scalar, np.int32) or isinstance(
+                    scalar, np.int64):
                 exponent = 0
-            elif (
-                isinstance(scalar, float)
-                or isinstance(scalar, np.float16)
-                or isinstance(scalar, np.float32)
-                or isinstance(scalar, np.float64)
-            ):
+            elif isinstance(scalar, float) or isinstance(scalar, np.float16) or isinstance(scalar,
+                                                                                           np.float32) or isinstance(
+                    scalar, np.float64):
                 flt_exponent = math.frexp(scalar)[1]
                 lsb_exponent = cls.FLOAT_MANTISSA_BITS - flt_exponent
                 exponent = math.floor(lsb_exponent / cls.LOG2_BASE)
diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/tests/test_gpu_engine.py b/gpu/fate-tensor-gpu/fate_tensor_gpu/tests/test_gpu_engine.py
index 0cd461f260..48e95d7ab1 100755
--- a/gpu/fate-tensor-gpu/fate_tensor_gpu/tests/test_gpu_engine.py
+++ b/gpu/fate-tensor-gpu/fate_tensor_gpu/tests/test_gpu_engine.py
@@ -109,11 +109,8 @@ def dump_header(self):
         res = []
         res.append('=' * self.width)
         res.append(
-            '|'
-            + ' ' * (int(self.width - len(self.name) - 2) // 2)
-            + self.name
-            + ' ' * (int(self.width - len(self.name) - 1) // 2)
-            + '|'
+            '|' + ' ' * (int(self.width - len(self.name) - 2) // 2) + self.name + ' ' * (
+                int(self.width - len(self.name) - 1) // 2) + '|'
         )
         res.append('=' * self.width)
         res.append(self.gen_line("Data Information"))
@@ -228,13 +225,11 @@ def compare_time(gpu_time, cpu_time, num_instances=TEST_SIZE):
     print("CPU time:", cpu_time, "second(s)")
     print(
         "GPU throughput:",
-        num_instances /
-        gpu_time,
+        num_instances / gpu_time,
         "instance(s) per second")
     print(
         "CPU throughput:",
-        num_instances /
-        cpu_time,
+        num_instances / cpu_time,
         "instance(s) per second")
     print("Speedup:", cpu_time / gpu_time)
 

From ef54a88c91326ba7cf28c1f337abef189edb786c Mon Sep 17 00:00:00 2001
From: "Xiaolong.Gao" <1506957902@qq.com>
Date: Wed, 20 Jul 2022 17:26:25 +0800
Subject: [PATCH 5/8] feat: impl FPGA tensor

Signed-off-by: Xiaolong.Gao <1506957902@qq.com>
---
 .../fate_tensor_fpga/__init__.py              |   19 +
 .../fate_tensor_fpga/fpga_engine.py           | 4434 +++++++++++++++++
 .../fate_tensor_fpga/fpga_tensor.py           |  511 ++
 .../fate_tensor_fpga/secureprotol/__init__.py |    0
 .../secureprotol/fate_paillier.py             |  364 ++
 .../secureprotol/fixedpoint.py                |  322 ++
 .../secureprotol/gmpy_math.py                 |  133 +
 .../fate_tensor_fpga/tests/__init__.py        |    0
 gpu/fate-tensor-fpga/pyproject.toml           |   17 +
 9 files changed, 5800 insertions(+)
 create mode 100644 gpu/fate-tensor-fpga/fate_tensor_fpga/__init__.py
 create mode 100644 gpu/fate-tensor-fpga/fate_tensor_fpga/fpga_engine.py
 create mode 100644 gpu/fate-tensor-fpga/fate_tensor_fpga/fpga_tensor.py
 create mode 100644 gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/__init__.py
 create mode 100644 gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/fate_paillier.py
 create mode 100644 gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/fixedpoint.py
 create mode 100644 gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/gmpy_math.py
 create mode 100644 gpu/fate-tensor-fpga/fate_tensor_fpga/tests/__init__.py
 create mode 100644 gpu/fate-tensor-fpga/pyproject.toml

diff --git a/gpu/fate-tensor-fpga/fate_tensor_fpga/__init__.py b/gpu/fate-tensor-fpga/fate_tensor_fpga/__init__.py
new file mode 100644
index 0000000000..47fa86a8b9
--- /dev/null
+++ b/gpu/fate-tensor-fpga/fate_tensor_fpga/__init__.py
@@ -0,0 +1,19 @@
+#
+#  Copyright 2022 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+from .fpga_tensor import keygen, SK, PK, Cipherblock
+
+__version__ = '0.1.0'
+__all__ = ['keygen', "SK", "PK", "Cipherblock"]
diff --git a/gpu/fate-tensor-fpga/fate_tensor_fpga/fpga_engine.py b/gpu/fate-tensor-fpga/fate_tensor_fpga/fpga_engine.py
new file mode 100644
index 0000000000..1fb0c2f872
--- /dev/null
+++ b/gpu/fate-tensor-fpga/fate_tensor_fpga/fpga_engine.py
@@ -0,0 +1,4434 @@
+#
+#  Copyright 2022 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# from ctypes.util import find_library
+import os
+import random
+import math
+import numpy as np
+
+from ctypes import cdll, c_buffer, cast
+from ctypes import c_char_p, c_void_p
+from ctypes import (
+    c_int32,
+    c_uint8,
+    c_bool,
+    c_uint32,
+    c_double,
+    c_int64,
+    c_uint64,
+    c_size_t,
+)
+from .secureprotol.fate_paillier import (
+    PaillierPublicKey,
+    PaillierPrivateKey,
+    PaillierEncryptedNumber
+)
+from .secureprotol.fixedpoint import FixedPointNumber
+
+from concurrent.futures import ProcessPoolExecutor as Executor
+
+# memory type flags
+MEM_HOST = 1
+MEM_DEVICE = 2
+# the extended memory types; they correspond to the device types defined below
+MEM_FPGA_NUM_0 = 20
+MEM_FPGA_NUM_1 = 21
+
+# aliases defined by WeBank
+PaillierPublicKeyStorage = PaillierPublicKey
+PaillierPrivateKeyStorage = PaillierPrivateKey
+
+'''##############import ctypes to implement py2c and c2py#################'''
+'''############## load the .so library written in C     ##################'''
+
+# note the .so library hasn't been named yet, FPGA_1024 is used as an example
+# we made 3 libraries, each one indicating a different .so library
+# the number indicates the CIPHER_BITS length
+FPGA_LIB = cdll.LoadLibrary(os.path.dirname(__file__) + "/FPGA_LIB.so")
+# FPGA_4096 = cdll.LoadLibrary("FPGA_4096.so")
+
+# set CIPHER_BITS according to the library chosen.
+CIPHER_BITS = 2048
+PLAIN_BITS = 2048
+BYTE_LEN = 8  # bits per byte
+CIPHER_BYTE = CIPHER_BITS // BYTE_LEN  # 256 bytes per ciphertext
+PLAIN_BYTE = PLAIN_BITS // BYTE_LEN  # 256 bytes per plaintext encoding
+
+# ### DEFINE THE BYTE_LENGTHS OF DATA TYPES ####
+CHAR_BYTE = 1
+U_INT32_BYTE = 4
+DOUBLE_BYTE = 8
+INT64_BYTE = 8
+
+# c_malloc returns a pointer; without restype ctypes would treat it as a C int
+FPGA_LIB.c_malloc.restype = c_void_p
+
+# DEFINE TWO DIFFERENT TYPES OF DATA ####
+INT64_TYPE = 1  # datatype flag for int32 and int64
+FLOAT_TYPE = 2  # datatype flag for float and double
+
+# define base for Paillier encrypted numbers
+PEN_BASE = 16
+# as there's no BASE defined in Python PaillierEncryptedNumber,
+# and we need this in CUDA, we define PEN_BASE as 16
+
+
+# ############################################################################
+# ######################Useful independent functions##########################
+# ###################Reconstruct ndarray from C memory type###################
+# ############################################################################
+''' Device Initializer '''
+
+
+def initialize_device():
+    '''Call the init() entry point of the loaded FPGA shared library.'''
+    FPGA_LIB.init()
+
+
+'''reset FPGA functions'''
+
+
+def reset_device(dev_num):  # forwards to the C library's reset_device
+    FPGA_LIB.reset_device(c_uint8(dev_num))  # dev_num crosses into C as a uint8
+
+
+def init_dev_reg(dev_num):  # forwards to the C library's init_regs
+    FPGA_LIB.init_regs(c_uint8(dev_num))  # dev_num crosses into C as a uint8
+
+
+def check_FPGA_status(dev_num):  # forwards to status_check; its result is discarded
+    FPGA_LIB.status_check(c_uint8(dev_num))  # dev_num crosses into C as a uint8
+
+
+def __get_C_fpn(fpn_space, size):
+    '''
+    Read `size` big integers of PLAIN_BITS bits each out of a C memory
+    space and return them to the upper python level as a ndarray
+    --------------------
+    Para:
+    fpn_space: int, indicating the start address of a c_memory space
+    size: int, the number of big integers to read
+    Return:
+    A ndarray built from the decoded Python ints
+    '''
+    scratch = c_buffer(PLAIN_BYTE)  # one buffer, reused for every element
+    scratch_ptr, src_ptr = cast(scratch, c_void_p), c_void_p(fpn_space)
+    elem_bits = c_size_t(PLAIN_BITS)
+
+    def fetch(idx):
+        # after bigint_get returns, the scratch buffer is read back as a
+        # little-endian integer
+        FPGA_LIB.bigint_get(scratch_ptr, src_ptr, elem_bits, c_size_t(idx))
+        return int.from_bytes(scratch.raw, 'little')
+
+    return np.asarray([fetch(idx) for idx in range(size)])
+
+
+def __get_C_pen(pen_space, index, size):
+    '''
+    Read `size` big integers of CIPHER_BITS bits each out of a C memory
+    space, starting `index` elements past its start address
+    ------------------
+    Para:
+    pen_space: int, indicating the start address of a continuous C memory space
+    index: int, element offset (in units of CIPHER_BYTE) to start reading from
+    size: int, the number of big integers to read
+    Return:
+    A ndarray built from the decoded Python ints
+    '''
+    # skip the first `index` elements, CIPHER_BYTE bytes apiece
+    first_elem = c_void_p(pen_space + index * CIPHER_BYTE)
+    scratch = c_buffer(CIPHER_BYTE)
+    scratch_ptr = cast(scratch, c_void_p)
+    ciphers = []
+    for idx in range(size):
+        FPGA_LIB.bigint_get(
+            scratch_ptr, first_elem, c_size_t(CIPHER_BITS), c_size_t(idx)
+        )
+        ciphers.append(int.from_bytes(scratch.raw, 'little'))
+    return np.asarray(ciphers)
+
+
+bi_c2p = __get_C_pen  # alias: same callable, takes (pen_space, index, size)
+
+
+def __get_C_uint32(uint32_space, size):
+    '''
+    Copy `size` uint32 values out of a C memory space into a ndarray.
+    The values land in a ctypes array first and are then converted
+    to Python ints before the ndarray is built.
+    ------------------------
+    Para:
+    uint32_space: int, indicating the start address of a continuous C memory space
+    size: int, the number of uint32 ought to get
+    '''
+    # a ctypes array created without initializers starts zero-filled
+    landing = (c_uint32 * size)()
+    src, count = c_void_p(uint32_space), c_size_t(size)
+    # NOTE(review): meaning of the trailing False flag is set by the C library
+    FPGA_LIB.unsigned_get(landing, src, count, c_bool(False))
+    # going through a list keeps numpy's default dtype inference
+    return np.asarray(list(landing))
+
+
+def __get_C_double(double_space, size):
+    '''Copy `size` doubles out of a C memory space and form a ndarray.'''
+    # created without initializers, so every slot starts at 0.0
+    landing = (c_double * size)()
+    FPGA_LIB.double_get(
+        landing, c_void_p(double_space), c_size_t(size), c_bool(False)
+    )
+    # TODO: convert all the data in one step, no loop
+    values = list(landing)
+    return np.asarray(values)
+
+
+def __get_C_int64(int64_space, size):
+    '''Copy `size` int64 values out of a C memory space and form a ndarray.'''
+    # a ctypes array created without initializers starts zero-filled
+    landing = (c_int64 * size)()
+    src = c_void_p(int64_space)
+    count = c_size_t(size)
+    FPGA_LIB.int64_get(landing, src, count, c_bool(False))
+    # TODO: convert all the data in one step, no loop
+    # iterating the ctypes array yields plain Python ints
+    values = list(landing)
+    converted = np.asarray(values)
+    return converted
+
+
+def __get_c_fpn_storage(fpn, base, exp, vec_size, n, max_int):
+    '''
+    Rebuild FixedPointNumber objects from the given C memory spaces
+    -------------------
+    Para:
+    fpn:  int, start address of a C memory space,
+               inside which stores FPN's encodings(bigint, PLAIN_BITS long)
+    base: int, start address of a C memory space,
+               inside which stores FPN's base(uint32); not read here
+    exp:  int, start address of a C memory space,
+               inside which stores FPN's exp(uint32)
+    vec_size:   int, the number of bigint
+    n, max_int: int, forwarded unchanged to every FixedPointNumber
+
+    Return:
+    A Python list of vec_size FixedPointNumber, in storage order
+    '''
+    encodings = __get_C_fpn(fpn, vec_size)
+    exponents = __get_C_uint32(exp, vec_size)
+
+    def rebuild(pos):
+        # the exponent is handed over as a Python float, the encoding as is
+        return FixedPointNumber(encodings[pos], float(exponents[pos]), n, max_int)
+
+    numbers = [rebuild(pos) for pos in range(vec_size)]
+    return numbers
+
+
+def __get_c_pen_storage_raw(pen, base, exp, vec_size, n):
+    '''Return the (cipher, base, exp) arrays of a PEN storage; n is unused.'''
+    # ciphertexts are read from element 0; evaluation order matches the tuple
+    return (__get_C_pen(pen, 0, vec_size),
+            __get_C_uint32(base, vec_size), __get_C_uint32(exp, vec_size))
+
+
+def __get_c_pen_storage_mp(pen, base, exp, vec_size, n, thread_num=4):
+    '''
+    Use multi-process to accelerate __get_C_pen process.
+
+    Since on Linux, python use fork to create sub-process,
+    thus the C memory space is shared between father and child processes.
+    And the whole process concerns no CUDA and cuda-context,
+    even the return result is in python object form.
+    So we can use multi-process for acceleration here safely
+    ---------------------------------
+    Para:
+        pen, base, exp: int, start addresses of the C memory spaces to read
+        vec_size: int, the number of elements to read from each space
+        thread_num: number of jobs the read of pen is split into
+    Return:
+        tuple, (ndarray, ndarray, ndarray)
+    '''
+    # Floor division keeps the first (thread_num - 1) jobs inside the
+    # storage; round() could hand out more than vec_size elements in total
+    # and leave the last job with a negative count.
+    job_cnt = vec_size // thread_num
+    job_idx_list = [i * job_cnt for i in range(thread_num)]
+    # the last job additionally takes the remainder
+    job_cnt_list = [job_cnt] * (thread_num - 1)
+    job_cnt_list.append(vec_size - job_cnt * (thread_num - 1))
+    res_pen = []
+    # the with-block shuts the worker pool down once all results are in
+    with Executor() as executor:
+        futures = [
+            executor.submit(__get_C_pen, pen, idx, cnt)
+            for idx, cnt in zip(job_idx_list, job_cnt_list)
+        ]
+        for future in futures:
+            res_pen.extend(future.result())
+    # for uint32, no special demand for multiprocess
+    res_base = __get_C_uint32(base, vec_size)
+    res_exp = __get_C_uint32(exp, vec_size)
+    return np.asarray(res_pen), res_base, res_exp
+
+
+def __get_c_pen_storage(pen, base, exp, vec_size, n):
+    '''
+    Rebuild an array of PaillierEncryptedNumber (PEN) from C memory spaces
+    ------------------
+    Para:
+    pen:  int, start address of the C memory holding the ciphertexts,
+               read through __get_C_pen starting at index 0
+    base: int, start address of the C memory holding the bases;
+               accepted for a uniform signature but never read here
+    exp:  int, start address of the C memory holding the exponents,
+               read through __get_C_uint32
+    vec_size:   int, the number of elements to read
+    n:    the value handed to PaillierPublicKey to build the shared key
+
+    Return:
+    np.asarray() of the vec_size PaillierEncryptedNumber objects built
+    '''
+    ciphers = __get_C_pen(pen, 0, vec_size)
+    exponents = __get_C_uint32(exp, vec_size)
+    # a single public key object is shared by every element
+    shared_key = PaillierPublicKey(n)
+    pen_list = [
+        PaillierEncryptedNumber(
+            shared_key, ciphers[idx], int(round(exponents[idx]))
+        )
+        for idx in range(vec_size)
+    ]
+    # hand the python objects back wrapped by np.asarray
+    return np.asarray(pen_list)
+
+
+#######################################################################
+# #########################DEFINITION OF CLASSES#######################
+#######################################################################
+'''#############  the definition of functions and classes #################'''
+
+'''
+    TensorStorage.data contains the address pointing to a double type
+    All the int32/int64 values have been transformed to the int64_t type
+    All the float32/float64 values have been transformed to the double type
+    We assume that TensorStorage comes in 2 forms:
+    1. data is a ndarray, calculation can be performed directly on the ndarray.
+    2. data is a C memory pointer, used for performing further encoding for
+       the lower bound
+'''
+
+
+class TensorStorage:
+    '''
+    Container for plaintext tensor data.
+    Currently supported plaintext types
+    1. int32, int64 (all transformed to int64_t type)
+    2. float32, float64 (all transformed to double type)
+
+    Attributes:
+        data: ndarray or int,
+            1. ndarray means data is a python object
+            2. int means data lives in C memory, the int value being the
+               start address of that memory
+        vec_size: int, the number of elements stored in this object,
+                       kept here since it is lost once data moves to C memory
+        mem_type: int, value is MEM_HOST or MEM_DEVICE, where the data is stored
+        data_type: int, value is INT_TYPE or FLOAT_TYPE, the plaintext data type,
+                        kept here since it is lost once data moves to C memory
+    '''
+
+    def __init__(self, data, vec_size, mem_type: int, data_type: int):
+        # ndarray views may be non-contiguous in memory, so force a
+        # contiguous layout; every other kind of data is stored untouched
+        if isinstance(data, np.ndarray):
+            data = np.ascontiguousarray(data)
+        self.data, self.vec_size = data, vec_size
+        self.mem_type, self.data_type = mem_type, data_type
+
+    def __str__(self):
+        '''Render the class followed by ":" and the data'''
+        return f"{self.__class__}:{self.data}"
+
+    def __del__(self):
+        '''Release the C memory (if any) through te_free'''
+        te_free(self)
+
+
+class BigIntStorage:
+    '''
+    Used for store bigint objects:
+
+    Attributes:
+        bigint_storage: int, the start address of the C memory storing bigint
+        elem_size:      int, the size of the bigint,
+                            useless since we unified into CIPHER_BITS
+        vec_size:       int, the number of bigint stored in this class
+        mem_type:       int, MEM_HOST or MEM_DEVICE, where data is stored, default MEM_HOST
+    '''
+
+    def __init__(self, data, vec_size, mem_type: int, elem_size: int):
+        # 1:cpu/host  2:FPGA/device
+        self.mem_type = mem_type
+        # the C pointer is stored as bigint_storage; no `data` attribute exists
+        self.bigint_storage = data
+        self.elem_size = elem_size
+        self.vec_size = vec_size
+
+    def __len__(self):
+        # vec_size is the element count (len(self.data) raised AttributeError)
+        return self.vec_size
+
+    def __del__(self):
+        bi_free(self)
+
+
+class FixedPointStorage:
+    '''
+    Bundle of 3 C memory start addresses describing an array of fixed
+    point numbers (fpn), which can be handed to the C functions directly
+    ------------------
+    Attributes:
+        bigint_storage: int, start address of C memory,
+                                in which stores the mantissa of a fpn array
+        base_storage:   int, start address of C memory,
+                                in which stores the base array of the fpn array
+        exp_storage:    int, start address of C memory,
+                                in which stores the exponent array of fpn array
+        vec_size:       int, the number of elements stored in this object,
+                                kept here since it is lost once data moves to C memory
+        mem_type:       int, value is MEM_HOST or MEM_DEVICE, where the data is stored
+        data_type:      int, the data type of the plaintext,
+                                kept here since it is lost once data moves to C memory
+        encode_n:       python int, set from the constructor argument n
+        max_int:        python int, used together with encode_n for en/decoding
+
+    len() of an instance is vec_size;
+    fp_free(self) is called when the instance is deleted.
+    '''
+
+    def __init__(
+            self, bigint_storage, base_storage, exp_storage,
+            vec_size, n, max_int, mem_type: int, data_type,
+    ):
+        # --- the three C pointers holding the actual fpn data ---
+        self.bigint_storage = bigint_storage
+        self.base_storage = base_storage
+        self.exp_storage = exp_storage
+        # --- bookkeeping ---
+        # number of elements behind each pointer
+        self.vec_size = vec_size
+        # 1:cpu/host  2:FPGA/device
+        self.mem_type = mem_type
+        # plaintext data type, same meaning as in TensorStorage
+        self.data_type = data_type
+        # --- parameters needed for en/decoding ---
+        # these 2 are just python int, not BigintStorage nor C_types
+        self.encode_n = n
+        self.max_int = max_int
+
+    def __len__(self):
+        '''Number of elements, i.e. vec_size'''
+        return self.vec_size
+
+    def __del__(self):
+        '''Release the C memory through fp_free'''
+        fp_free(self)
+
+
+class PaillierEncryptedStorage:
+    '''
+    Bundle of 3 C memory start addresses describing an array of Paillier
+    encrypted numbers (pen), which can be handed to the C functions directly
+    --------------------
+    Attributes:
+        pen_storage:    int, start address of C memory,
+                                in which stores the mantissa of the pen array
+        base_storage:   int, start address of C memory,
+                                in which stores the bases of the pen array
+        exp_storage:    int, start address of C memory,
+                                in which stores the exponents of the pen array
+        vec_size:       int, the number of elements stored in this object,
+                                kept here since it is lost once data moves to C memory
+        mem_type:       int, value is MEM_HOST or MEM_DEVICE, where the data is stored
+        data_type:      int, the data type of the plaintext,
+                                kept here since it is lost once data moves to C memory
+        encode_n:       set from the constructor argument fpn_encode_n
+        encode_max_int: set from the constructor argument fpn_encode_max_int
+
+    len() of an instance is vec_size;
+    pi_free(self) is called when the instance is deleted.
+    '''
+
+    def __init__(
+            self, pen_storage, base_storage, exp_storage, vec_size,
+            mem_type: int, data_type, fpn_encode_n, fpn_encode_max_int,
+    ):
+        # --- the three C pointers holding the actual pen data ---
+        self.pen_storage = pen_storage
+        self.base_storage = base_storage
+        self.exp_storage = exp_storage
+        # --- bookkeeping ---
+        # number of elements behind each pointer
+        self.vec_size = vec_size
+        # 1:cpu/host  2:FPGA/device
+        self.mem_type = mem_type
+        # plaintext data type, same meaning as in TensorStorage
+        self.data_type = data_type
+        # --- parameters needed for en/decoding ---
+        # no public key material is stored on this object
+        self.encode_n = fpn_encode_n
+        self.encode_max_int = fpn_encode_max_int
+
+    def __len__(self):
+        '''Number of elements, i.e. vec_size'''
+        return self.vec_size
+
+    def __del__(self):
+        '''Release the C memory through pi_free'''
+        pi_free(self)
+
+
+class TensorShapeStorage:
+    '''
+    Used for store the shape, currently support at most 2 dim
+    The behavior is meant to mirror numpy shape tuples
+    -------------------
+    Attributes:
+        dim1: the 1st dim, aka the row; None for a 0-dim shape
+        dim2: the 2nd dim, aka the col; None for a 0-dim or 1-dim shape
+    '''
+
+    def __init__(self, dim1=None, dim2=None):
+        for dim in (dim1, dim2):
+            if dim is not None and not isinstance(dim, int):
+                raise TypeError("invalid dimension")
+        self.dim1 = dim1
+        self.dim2 = dim2
+
+    def size(self):
+        '''Number of elements: product of the dims, None counting as 1'''
+        dim1 = 1 if self.dim1 is None else self.dim1
+        dim2 = 1 if self.dim2 is None else self.dim2
+        return dim1 * dim2
+
+    def __getitem__(self, item):
+        return self.to_tuple()[item]
+
+    def __len__(self):
+        return len(self.to_tuple())
+
+    def to_tuple(self):
+        '''Return (), (dim1,) or (dim1, dim2) depending on which dims are set'''
+        if self.dim1 is None:
+            return ()
+        if self.dim2 is None:
+            return (self.dim1,)
+        return (self.dim1, self.dim2)
+
+    def from_tuple(self, v):
+        '''Set the dims in place from v and return self'''
+        count = len(v)
+        # any length other than 1 or 2 clears both dims
+        self.dim1 = v[0] if count in (1, 2) else None
+        self.dim2 = v[1] if count == 2 else None
+        return self
+
+    def transpose(self):
+        '''Return the transposed shape; shapes below 2 dims stay unchanged'''
+        if self.dim2 is None:
+            # swapping would give dim1=None, which to_tuple() reports as ()
+            return TensorShapeStorage(self.dim1, None)
+        return TensorShapeStorage(self.dim2, self.dim1)
+
+    def matmul(self, other):
+        '''Return the shape (self.dim1, other.dim2); 1-dim operands get no special case'''
+        return TensorShapeStorage(self.dim1, other.dim2)
+
+
+class PubKeyStorage:
+    '''
+    Holds the PaillierPublicKey fields in a C-acceptable form
+    -------------
+    Attributes:
+       n, g, nsquare, max_int:
+            c_char_p wrapping the CIPHER_BYTE-long little-endian bytes
+            of the python int passed under the same name
+    '''
+
+    def __init__(self, n, g, nsquare, max_int):
+        fields = (('n', n), ('g', g), ('nsquare', nsquare), ('max_int', max_int))
+        for attr, value in fields:
+            raw = value.to_bytes(CIPHER_BYTE, 'little')
+            setattr(self, attr, c_char_p(raw))
+
+
+class PrivKeyStorage:
+    '''
+    Holds the PaillierPrivateKey fields in a C-acceptable form
+    ------------
+    Attributes: p, q, psquare, qsquare, q_inverse, hp, hq, each a c_char_p
+    wrapping the CIPHER_BYTE-long little-endian bytes of the given python int
+    '''
+
+    def __init__(self, p, q, psquare, qsquare, q_inverse, hp, hq):
+        names = ('p', 'q', 'psquare', 'qsquare', 'q_inverse', 'hp', 'hq')
+        values = (p, q, psquare, qsquare, q_inverse, hp, hq)
+        for attr, value in zip(names, values):
+            # fixed width of CIPHER_BYTE bytes, little-endian
+            raw = value.to_bytes(CIPHER_BYTE, 'little')
+            setattr(self, attr, c_char_p(raw))
+
+
+##########################################################################
+# ###############FUNCTION DEFINITION START################################
+##########################################################################
+
+
+def te_p2c_shape(shape, res):
+    '''
+    Load a shape tuple into a TensorShapeStorage object
+    -------------
+    Para:
+        shape:   tuple, with no more than 2 elements
+        res:     TensorShapeStorage to overwrite, or None to create one
+    Return:
+        res,     TensorShapeStorage
+    '''
+    target = TensorShapeStorage() if res is None else res
+    # from_tuple overwrites the dims of target in place
+    target.from_tuple(shape)
+    return target
+
+
+def te_c2p_shape(shape):
+    '''
+    Convert a TensorShapeStorage back into a plain shape tuple
+    --------------
+    Para: shape, TensorShapeStorage;  Return: tuple from shape.to_tuple()
+    '''
+    shape_tuple = shape.to_tuple()
+    return shape_tuple
+
+
+def te_free(tes):
+    '''
+    Release the C memory owned by a TensorStorage, if it owns any
+    ------------
+    Para:   tes: TensorStorage; python-side data (list or ndarray) is left alone
+    Return: None
+    '''
+    if not isinstance(tes.data, int):
+        return
+    # an int is a C memory pointer: free it, then drop the stale address
+    FPGA_LIB.c_free(c_void_p(tes.data))
+    tes.data = None
+
+
+def te_p2c(data, res=None):
+    '''
+    transmit the data storage form from Python to C
+    we assume data's structure has already been preserved by the upper layer
+    using the TensorShapeStorage class
+    ------------------
+    Args:
+        data, list or ndarray, the original data array; supported dtypes
+              are int32/int64 (stored as int64) and float32/float64
+              (stored as float64)
+        res,  TensorStorage or None; when given, its existing C buffer
+              (res.data) is reused as the destination instead of
+              allocating a new one, so it must be large enough
+    Return:
+        TensorStorage, and data is a C pointer
+    Raise:
+        TypeError,       data is neither a list nor a ndarray
+        PermissionError, the dtype is not one of the supported ones
+    '''
+    if isinstance(data, list):
+        data = np.asarray(data)
+    if not isinstance(data, np.ndarray):
+        raise TypeError("Unsupported Data Structure")
+    # only the element count is kept, the shape is tracked by the caller
+    vec_size = data.size
+
+    # Resolve the dtype BEFORE allocating, so that an unsupported dtype
+    # does not leak a freshly malloc-ed buffer when the error is raised.
+    if data.dtype == 'int32' or data.dtype == 'int64':
+        c_dtype = np.int64
+        data_type = INT64_TYPE
+        setter = FPGA_LIB.int64_set
+    elif data.dtype == 'float32' or data.dtype == 'float64':
+        c_dtype = np.float64
+        data_type = FLOAT_TYPE
+        setter = FPGA_LIB.float64_set
+    else:
+        raise PermissionError("Invalid Data Type")
+
+    # Only a pointer and an element count are handed to C, so no stride
+    # information reaches it: the source has to be C-contiguous. Views such
+    # as slices or transposes are copied here; arrays that already have the
+    # right dtype and layout are used as-is.
+    source = np.ascontiguousarray(data, dtype=c_dtype)
+
+    # malloc c memory space, unless the caller supplied one through res
+    if res is None:
+        storage_pointer = FPGA_LIB.c_malloc(c_size_t(vec_size * DOUBLE_BYTE))
+    else:
+        storage_pointer = res.data
+    # `source` stays referenced until the call returns, keeping its buffer alive
+    setter(
+        c_void_p(storage_pointer),
+        source.ctypes.data_as(c_void_p),
+        c_size_t(vec_size))
+    return _te_init_store(
+        res,
+        storage_pointer,
+        vec_size,
+        MEM_FPGA_NUM_0,
+        data_type)
+
+
+def te_c2p(store):
+    '''
+    Copy a TensorStorage whose data lives in C memory back into Python
+    -----------
+    Para:
+        store: TensorStorage, store.data is a C memory pointer
+    Return:
+        res_array: np.ndarray, float64 for FLOAT_TYPE, int64 for INT64_TYPE
+    Raise:
+        PermissionError, store.data_type is neither of the two
+    '''
+    kind = store.data_type
+    if kind != FLOAT_TYPE and kind != INT64_TYPE:
+        raise PermissionError("Invalid Data Type")
+    if kind == FLOAT_TYPE:
+        values = __get_C_double(store.data, store.vec_size)
+        return values.astype(np.float64)
+    # only INT64_TYPE is left at this point
+    values = __get_C_int64(store.data, store.vec_size)
+    return values.astype(np.int64)
+
+
+def te_c2bytes(data, res):
+    '''
+    Serialize a TensorStorage held in C memory into a bytes string.
+    Used for communication between sites, since C memory is not shared
+    --------------------
+    Para:
+        data: TensorStorage, data.data is a C memory ptr
+        res:  unused
+    Return:
+        bytes, U_INT32_BYTE + DOUBLE_BYTE * vec_size bytes long
+    '''
+    count = data.vec_size
+    type_tag = data.data_type.to_bytes(U_INT32_BYTE, 'little')
+    # first 4 bytes: the data_type info; remaining bytes: the data
+    out_buffer = c_buffer(U_INT32_BYTE + DOUBLE_BYTE * count)
+    FPGA_LIB.get_bytes(
+        cast(out_buffer, c_void_p),
+        c_char_p(type_tag),
+        c_void_p(data.data),
+        c_size_t(count),
+    )
+    return out_buffer.raw
+
+
+def fp_c2bytes(store, res):
+    '''
+    Serialize a FixedPointStorage into a bytes string.
+    Used for communication between sites, since C memory is not shared
+    Other info besides the C memory, including data_type, mem_type,
+    encode_n and max_int, is handed to the C serializer as well
+    -----------------
+    Para:
+        store: FixedPointStorage
+        res:   unused
+    Return:
+        bytes, the raw content of the filled buffer
+    '''
+    count = store.vec_size
+    # header fields: two uint32 tags followed by two PLAIN_BYTE-long bigints
+    header = (
+        store.data_type.to_bytes(U_INT32_BYTE, 'little'),
+        store.mem_type.to_bytes(U_INT32_BYTE, 'little'),
+        store.encode_n.to_bytes(PLAIN_BYTE, 'little'),
+        store.max_int.to_bytes(PLAIN_BYTE, 'little'),
+    )
+    header_len = U_INT32_BYTE * 2 + PLAIN_BYTE * 2
+    # per element: one PLAIN_BYTE mantissa plus a uint32 base and exponent
+    elem_len = PLAIN_BYTE + U_INT32_BYTE * 2
+    out_buffer = c_buffer(elem_len * count + header_len)
+    # argument order: output buffer, header fields, the 3 C arrays, count
+    FPGA_LIB.fp_get_bytes(
+        cast(out_buffer, c_void_p),
+        *(c_char_p(field) for field in header),
+        c_void_p(store.bigint_storage),
+        c_void_p(store.base_storage),
+        c_void_p(store.exp_storage),
+        c_size_t(count),
+    )
+    return out_buffer.raw
+
+
+def pi_c2bytes(store, res):
+    '''
+    Serialize a PaillierEncryptedStorage into a bytes string.
+    Used for communication between sites, since C memory is not shared
+    ----------------
+    Para:
+        store: PaillierEncryptedStorage
+        res:   unused
+    Return:
+        bytes, the raw content of the filled buffer
+    '''
+    count = store.vec_size
+    # header fields: two uint32 tags followed by two CIPHER_BYTE-long bigints
+    header = (
+        store.data_type.to_bytes(U_INT32_BYTE, 'little'),
+        store.mem_type.to_bytes(U_INT32_BYTE, 'little'),
+        store.encode_n.to_bytes(CIPHER_BYTE, 'little'),
+        store.encode_max_int.to_bytes(CIPHER_BYTE, 'little'),
+    )
+    header_len = U_INT32_BYTE * 2 + CIPHER_BYTE * 2
+    # per element: one CIPHER_BYTE ciphertext plus a uint32 base and exponent
+    elem_len = CIPHER_BYTE + U_INT32_BYTE * 2
+    out_buffer = c_buffer(elem_len * count + header_len)
+    # argument order: output buffer, header fields, the 3 C arrays, count
+    FPGA_LIB.pi_get_bytes(
+        cast(out_buffer, c_void_p),
+        *(c_char_p(field) for field in header),
+        c_void_p(store.pen_storage),
+        c_void_p(store.base_storage),
+        c_void_p(store.exp_storage),
+        c_size_t(count),
+    )
+    return out_buffer.raw
+
+
+def _te_init_store(store, data, vec_size, mem_type, data_type):
+    '''
+    Create a TensorStorage, or refill an existing one in place
+    -----------
+    Para:
+        store: TensorStorage to refill, or None to create a new one
+        mem_type: when refilling, None leaves store.mem_type untouched
+        Other paras' definition are equals to the one in TensorStorage
+    Return:
+        TensorStorage, the created or refilled object
+    '''
+    if store is None:
+        return TensorStorage(data, vec_size, mem_type, data_type)
+    # refill path: plain attribute assignment, no TensorStorage.__init__
+    store.data, store.vec_size = data, vec_size
+    store.data_type = data_type
+    if mem_type is not None:
+        store.mem_type = mem_type
+    return store
+
+
+def te_bytes2c(data, res):
+    '''
+    Restore TensorStorage from bytes buffer,
+    TensorStorage.data is a ptr pointing to the restored C memory space.
+    -------------
+    Para:
+        data: the bytes string, U_INT32_BYTE bytes of data_type info
+              followed by the payload
+        res:  TensorStorage or None; when given, its C buffer (res.data)
+              is reused as the destination instead of allocating one
+    Return:
+        TensorStorage, the restored struct from para.data
+    '''
+    payload_len = len(data) - U_INT32_BYTE
+    type_buffer = c_buffer(U_INT32_BYTE)
+    if res is not None:
+        storage_pointer = res.data
+    else:
+        storage_pointer = FPGA_LIB.c_malloc(c_size_t(payload_len))
+    FPGA_LIB.from_bytes_get_c(
+        cast(type_buffer, c_void_p),
+        c_void_p(storage_pointer),
+        c_char_p(data),
+        c_size_t(payload_len),
+    )
+    # TODO: change according to different data_types,
+    # now just use DOUBLE BYTE because we have only INT64 and DOUBLE,
+    # all of them are 8 bytes(Equal to DOUBLE_BYTE)
+    element_count = payload_len // DOUBLE_BYTE
+    # type_buffer is presumably filled with the data_type by the C call above
+    restored_type = int.from_bytes(type_buffer, 'little')
+    return _te_init_store(
+        res, storage_pointer, element_count, MEM_FPGA_NUM_0, restored_type
+    )
+
+
+def fp_bytes2c(data, res):
+    '''
+    Restore FixedPointStorage from bytes buffer.
+    ---------------
+    Para:
+        data: the bytes string
+        res:  the return value, FixedPointStorage (or None), forwarded
+              to _fp_init_store
+    Return:
+        res:  FixedPointStorage, the restored struct from para.data.
+    '''
+    # calculate vec_size: strip the header, divide by the per-element size
+    header_len = 2 * (U_INT32_BYTE + PLAIN_BYTE)
+    elem_len = U_INT32_BYTE * 2 + PLAIN_BYTE
+    vec_size = (len(data) - header_len) // elem_len
+    # receivers for the header: 2 uint32 tags, then 2 bigints
+    data_type, mem_type = c_buffer(U_INT32_BYTE), c_buffer(U_INT32_BYTE)
+    encode_n, max_int = c_buffer(PLAIN_BYTE), c_buffer(PLAIN_BYTE)
+    # receivers for the per-element payload, freshly allocated in C memory
+    fpn = FPGA_LIB.c_malloc(c_size_t(PLAIN_BYTE * vec_size))
+    base = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * vec_size))
+    exp = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * vec_size))
+
+    receivers = (data_type, mem_type, encode_n, max_int, fpn, base, exp)
+    FPGA_LIB.fp_from_bytes_get_c(
+        *(cast(slot, c_void_p) for slot in receivers),
+        c_char_p(data),
+        c_size_t(vec_size),
+    )
+
+    def as_int(buffer):
+        # header values are decoded as little-endian integers
+        return int.from_bytes(buffer, 'little')
+
+    return _fp_init_store(
+        res,
+        fpn,
+        base,
+        exp,
+        vec_size,
+        as_int(encode_n),
+        as_int(max_int),
+        as_int(mem_type),
+        as_int(data_type),
+    )
+
+
+def pi_bytes2c(data, res):
+    '''
+    Restore PaillierEncryptedStorage from bytes buffer
+    --------------
+    Para:
+        data: the bytes string
+        res:  the return value, PaillierEncryptedStorage (or None),
+              forwarded to _pi_init_store
+    Return:
+        res:  PaillierEncryptedStorage, the restored struct from para.data
+    '''
+    # calculate vec_size: strip the header, divide by the per-element size
+    header_len = 2 * (U_INT32_BYTE + CIPHER_BYTE)
+    elem_len = U_INT32_BYTE * 2 + CIPHER_BYTE
+    vec_size = (len(data) - header_len) // elem_len
+    # receivers for the header: 2 uint32 tags, then 2 bigints
+    data_type, mem_type = c_buffer(U_INT32_BYTE), c_buffer(U_INT32_BYTE)
+    encode_n, max_int = c_buffer(CIPHER_BYTE), c_buffer(CIPHER_BYTE)
+    # receivers for the per-element payload, freshly allocated in C memory
+    pen = FPGA_LIB.c_malloc(c_size_t(CIPHER_BYTE * vec_size))
+    base = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * vec_size))
+    exp = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * vec_size))
+
+    receivers = (data_type, mem_type, encode_n, max_int, pen, base, exp)
+    FPGA_LIB.pi_from_bytes_get_c(
+        *(cast(slot, c_void_p) for slot in receivers),
+        c_char_p(data),
+        c_size_t(vec_size),
+    )
+
+    def as_int(buffer):
+        # header values are decoded as little-endian integers
+        return int.from_bytes(buffer, 'little')
+
+    return _pi_init_store(
+        res,
+        pen,
+        base,
+        exp,
+        vec_size,
+        as_int(mem_type),
+        as_int(data_type),
+        as_int(encode_n),
+        as_int(max_int),
+    )
+
+
+def _te_init_shape(shape_store, shape_tuple):
+    '''
+    Create a TensorShapeStorage, or refill an existing one in place
+    ----------
+    Para:
+        shape_store: TensorShapeStorage to refill, or None to create one
+        shape_tuple: tuple, at most 2 dim, source data of TensorShapeStorage
+    Return:
+        TensorShapeStorage
+    '''
+    target = shape_store if shape_store is not None else TensorShapeStorage()
+    # from_tuple overwrites the dims of target in place
+    target.from_tuple(shape_tuple)
+    return target
+
+
+def _te_init_ss(
+        res_store, res_data, vec_size, res_shape, shape_tuple, mem_type, data_type
+):
+    '''
+    Init TensorStorage and TensorShapeStorage in one call
+    ------------
+    Para:
+        res_store: TensorStorage to refill, or None to create a new one
+        res_data:  int or ndarray
+        vec_size:  int
+        res_shape: TensorShapeStorage to refill, or None to create a new one
+        shape_tuple, tuple, at most 2 dim
+        mem_type:  int
+        data_type: int
+    Return:
+        tuple, (TensorStorage, TensorShapeStorage)
+    '''
+    storage = _te_init_store(res_store, res_data, vec_size, mem_type, data_type)
+    shape = _te_init_shape(res_shape, shape_tuple)
+    return storage, shape
+
+
+'''''' '''
+The following calculators are done on TensorStorage
+Definition and output are the same with numpy
+TensorStorage.data should all be ndarray datatype in order to support numpy
+
+NOT USED IN OUR FATE IMPLEMENTATION,
+but Webank's implementation seems to have used them
+''' ''''''
+
+
+def te_slice(store, shape, start, stop, axis, res_store, res_shape, stream):
+    '''
+    Slice store.data (a ndarray) by [start:stop] along axis 0 or 1.
+    shape and stream are unused; any other axis raises NotImplementedError.
+    Return: tuple, (TensorStorage, TensorShapeStorage)
+    '''
+    if axis not in (0, 1):
+        raise NotImplementedError()
+    window = slice(start, stop)
+    # axis 1 selects columns and keeps every row
+    picked = store.data[:, window] if axis == 1 else store.data[window]
+    mem_type, data_type = store.mem_type, store.data_type
+    return _te_init_ss(
+        res_store, picked, picked.size, res_shape, picked.shape,
+        mem_type, data_type,
+    )
+
+
+def te_cat(stores, axis, res_store, res_shape):
+    '''
+    Concatenate the ndarray data of several TensorStorage objects:
+    np.vstack for axis 0, np.hstack for axis 1, else NotImplementedError.
+    mem_type and data_type are taken from stores[0].
+    '''
+    if axis == 0:
+        stack = np.vstack
+    elif axis == 1:
+        stack = np.hstack
+    else:
+        raise NotImplementedError()
+    joined = stack([item.data for item in stores])
+    head = stores[0]
+    return _te_init_ss(res_store, joined, joined.size, res_shape,
+                       joined.shape, head.mem_type, head.data_type)
+
+
+# TODO: precise data_type
+
+
+def te_pow(left_store, right, left_shape, res_store, res_shape, stream):
+    '''
+    Raise left_store.data to the power `right` (the ** operator).
+    left_shape and stream are unused; the data_type tag is copied from
+    left_store. Return: tuple, (TensorStorage, TensorShapeStorage)
+    '''
+    powered = left_store.data ** right
+    return _te_init_ss(
+        res_store, powered, powered.size, res_shape, powered.shape,
+        left_store.mem_type, left_store.data_type,
+    )
+
+
+# TODO: precise data_type
+
+
+def te_add(
+        left_store, right_store, left_shape, right_shape,
+        res_store, res_shape, stream):
+    '''
+    Add the data of two TensorStorage objects with the + operator.
+    left_shape, right_shape and stream are unused; mem_type and data_type
+    are copied from left_store.
+    Return: tuple, (TensorStorage, TensorShapeStorage)
+    '''
+    total = left_store.data + right_store.data
+    return _te_init_ss(
+        res_store,
+        total,
+        total.size,
+        res_shape,
+        total.shape,
+        left_store.mem_type,
+        left_store.data_type)
+
+
+# TODO: precise data_type
+
+
+def te_mul(
+        left_store, right_store, left_shape, right_shape,
+        res_store, res_shape, stream):
+    '''
+    Multiply the data of two TensorStorage objects with the * operator.
+    left_shape, right_shape and stream are unused; mem_type and data_type
+    are copied from left_store.
+    Return: tuple, (TensorStorage, TensorShapeStorage)
+    '''
+    product = left_store.data * right_store.data
+    return _te_init_ss(
+        res_store,
+        product,
+        product.size,
+        res_shape,
+        product.shape,
+        left_store.mem_type,
+        left_store.data_type)
+
+
+def te_truediv(
+        left_store, right_store, left_shape, right_shape,
+        res_store, res_shape, stream):
+    '''
+    Divide the data of two TensorStorage objects with the / operator.
+    left_shape, right_shape and stream are unused; mem_type is copied
+    from left_store and the result is always tagged FLOAT_TYPE.
+    Return: tuple, (TensorStorage, TensorShapeStorage)
+    '''
+    quotient = left_store.data / right_store.data
+    return _te_init_ss(
+        res_store,
+        quotient,
+        quotient.size,
+        res_shape,
+        quotient.shape,
+        left_store.mem_type,
+        FLOAT_TYPE)
+
+
+def te_floordiv(
+        left_store, right_store, left_shape, right_shape,
+        res_store, res_shape, stream):
+    '''
+    Floor-divide the data of two TensorStorage objects with the // operator.
+    left_shape, right_shape and stream are unused; mem_type is copied from
+    left_store and the result is always tagged INT64_TYPE.
+    Return: tuple, (TensorStorage, TensorShapeStorage)
+    '''
+    quotient = left_store.data // right_store.data
+    return _te_init_ss(
+        res_store,
+        quotient,
+        quotient.size,
+        res_shape,
+        quotient.shape,
+        left_store.mem_type,
+        INT64_TYPE)
+
+
+# TODO: precise data_type
+
+
+def te_sub(
+        left_store, right_store, left_shape, right_shape,
+        res_store, res_shape, stream):
+    '''
+    Subtract right_store.data from left_store.data with the - operator.
+    left_shape, right_shape and stream are unused; mem_type and data_type
+    are copied from left_store.
+    Return: tuple, (TensorStorage, TensorShapeStorage)
+    '''
+    difference = left_store.data - right_store.data
+    return _te_init_ss(
+        res_store,
+        difference,
+        difference.size,
+        res_shape,
+        difference.shape,
+        left_store.mem_type,
+        left_store.data_type)
+
+
+# TODO: precise data_type
+
+
+def te_matmul(
+        left_store, right_store, left_shape, right_shape,
+        res_store, res_shape, stream):
+    '''
+    Matrix-multiply the data of two TensorStorage objects with the @ operator.
+    left_shape, right_shape and stream are unused; mem_type and data_type
+    are copied from left_store.
+    Return: tuple, (TensorStorage, TensorShapeStorage)
+    '''
+    product = left_store.data @ right_store.data
+    return _te_init_ss(
+        res_store,
+        product,
+        product.size,
+        res_shape,
+        product.shape,
+        left_store.mem_type,
+        left_store.data_type)
+
+
+def te_abs(left_store, left_shape, res_store, res_shape, stream):
+    '''Apply abs() to left_store.data; stream is unused'''
+    magnitude = abs(left_store.data)
+    # vec_size and the shape tuple are reused from the inputs, not recomputed
+    return _te_init_ss(
+        res_store, magnitude, left_store.vec_size,
+        res_shape, left_shape.to_tuple(),
+        left_store.mem_type,
+        left_store.data_type,
+    )
+
+
+def te_neg(left_store, left_shape, res_store, res_shape, stream):
+    '''Negate left_store.data with the unary - operator; stream is unused'''
+    negated = -left_store.data
+    # vec_size and the shape tuple are reused from the inputs, not recomputed
+    return _te_init_ss(
+        res_store, negated, left_store.vec_size,
+        res_shape, left_shape.to_tuple(),
+        left_store.mem_type,
+        left_store.data_type,
+    )
+
+
+def te_transpose(left_store, left_shape, res_store, res_shape, stream):
+    '''
+    Transpose left_store.data with its transpose() method.
+    left_shape and stream are unused; size and shape are read back from
+    the transposed data. Return: tuple, (TensorStorage, TensorShapeStorage)
+    '''
+    flipped = left_store.data.transpose()
+    return _te_init_ss(
+        res_store, flipped, flipped.size, res_shape, flipped.shape,
+        left_store.mem_type, left_store.data_type,
+    )
+
+
+def te_sum(left_store, left_shape, axis, res_store, res_shape, stream):
+    '''
+    Sum left_store.data along `axis` with its sum(axis=axis) method.
+    left_shape and stream are unused; size and shape are read back from
+    the summed data. Return: tuple, (TensorStorage, TensorShapeStorage)
+    '''
+    summed = left_store.data.sum(axis=axis)
+    return _te_init_ss(
+        res_store, summed, summed.size, res_shape, summed.shape,
+        left_store.mem_type, left_store.data_type,
+    )
+
+
+def te_reshape(store, shape, new_shape, res_store, res_shape, stream):
+    '''Reshape store.data to new_shape (must offer to_tuple()); shape, stream unused'''
+    reshaped = store.data.reshape(new_shape)
+    # vec_size is carried over from the input store, not recomputed
+    return _te_init_ss(
+        res_store, reshaped, store.vec_size,
+        res_shape, new_shape.to_tuple(),
+        store.mem_type,
+        store.data_type,
+    )
+
+
+def te_exp(store, shape, res_store, res_shape, stream):
+    '''Apply np.exp to store.data; the result is always tagged FLOAT_TYPE'''
+    exponentiated = np.exp(store.data)
+    # vec_size and the shape tuple are reused from the inputs, not recomputed
+    return _te_init_ss(
+        res_store, exponentiated, store.vec_size,
+        res_shape, shape.to_tuple(),
+        store.mem_type,
+        FLOAT_TYPE,
+    )
+
+
+def te_hstack(
+        left_store, right_store, left_shape, right_shape,
+        res_store, res_shape, stream):
+    '''
+    Horizontally stack two TensorStorage objects, i.e. te_cat along axis 1.
+    left_shape, right_shape and stream are unused.
+    Return: tuple, (TensorStorage, TensorShapeStorage)
+    '''
+    pair = [left_store, right_store]
+    cat_store, cat_shape = te_cat(pair, 1, res_store, res_shape)
+    # re-wrap the concatenated result, tagged with left_store's types
+    return _te_init_ss(
+        res_store,
+        cat_store.data,
+        cat_store.vec_size,
+        cat_shape,
+        cat_shape.to_tuple(),
+        left_store.mem_type,
+        left_store.data_type)
+
+
+def te_c2p_first(store):
+    '''
+    Read the first element of a TensorStorage held in C memory
+    ---------------
+    Para:
+        store: TensorStorage, store.data must be a pointer to C memory
+    Return:
+        np.float64 for FLOAT_TYPE, np.int64 for INT64_TYPE
+    Raise:
+        PermissionError, if store.data_type is neither of the two
+    '''
+    kind = store.data_type
+    # all vec_size elements are fetched, then element 0 is picked
+    if kind == FLOAT_TYPE:
+        values = __get_C_double(store.data, store.vec_size)
+        return values.astype(np.float64)[0]
+    if kind == INT64_TYPE:
+        values = __get_C_int64(store.data, store.vec_size)
+        return values.astype(np.int64)[0]
+    raise PermissionError("Invalid Data Type")
+
+
+def bi_alloc(res, vec_size, elem_size, mem_type):
+    '''
+    Allocate a buffer of vec_size * elem_size bytes and wrap it
+    through _bi_init_store
+    ---------------
+    Para:
+        res:       None or an existing store to fill, see _bi_init_store
+        vec_size:  int, number of elements
+        elem_size: int, size of one element
+        mem_type:  stored unchanged on the result
+    Return:
+        the store returned by _bi_init_store
+    '''
+    byte_count = c_size_t(vec_size * elem_size)
+    buffer_ptr = FPGA_LIB.c_malloc(byte_count)
+    return _bi_init_store(res, buffer_ptr, vec_size, elem_size, mem_type)
+
+
+'''################malloc a space with size elements############### '''
+'''
+    function: allocate space and form a new PaillierEncryptedStorage Class
+    res:    split into 3 different parts, indicating the 3 parts
+            that are needed for the PaillierEncryptedStorage
+    size:   is the number of elements that need to be allocated
+    return: A PaillierEncryptedStorage class, wrapping res as a class
+'''
+
+
+def pi_alloc(res, size, mem_type):
+    '''
+    Allocate the three buffers of a PaillierEncryptedStorage
+    ---------------
+    Para:
+        res:      None or an existing store to fill, see _pi_init_store
+        size:     int, number of elements to allocate room for
+        mem_type: stored unchanged on the result
+    Return:
+        the store returned by _pi_init_store, with data_type, encode_n
+        and encode_max_int all set to 0
+    '''
+    cipher_ptr = FPGA_LIB.c_malloc(c_size_t(size * CIPHER_BYTE))
+    # base and exponent arrays hold one U_INT32_BYTE-sized entry per element
+    base_ptr = FPGA_LIB.c_malloc(c_size_t(size * U_INT32_BYTE))
+    exp_ptr = FPGA_LIB.c_malloc(c_size_t(size * U_INT32_BYTE))
+    data_type = encode_n = encode_max_int = 0
+    return _pi_init_store(
+        res, cipher_ptr, base_ptr, exp_ptr, size, mem_type,
+        data_type, encode_n, encode_max_int,
+    )
+
+
+def fp_alloc(res, size, mem_type):
+    '''
+    Allocate the three buffers of a FixedPointStorage
+    ---------------
+    Para:
+        res:      None or an existing store to fill, see _fp_init_store
+        size:     int, number of elements to allocate room for
+        mem_type: stored unchanged on the result
+    Return:
+        the store returned by _fp_init_store, with n, max_int and
+        data_type all set to 0
+    '''
+    plain_ptr = FPGA_LIB.c_malloc(c_size_t(size * PLAIN_BYTE))
+    # base and exponent arrays hold one U_INT32_BYTE-sized entry per element
+    base_ptr = FPGA_LIB.c_malloc(c_size_t(size * U_INT32_BYTE))
+    exp_ptr = FPGA_LIB.c_malloc(c_size_t(size * U_INT32_BYTE))
+    n = max_int = data_type = 0
+    return _fp_init_store(
+        res, plain_ptr, base_ptr, exp_ptr, size,
+        n, max_int, mem_type, data_type,
+    )
+
+
+def te_alloc(res, size, mem_type):
+    '''
+    Allocate size * DOUBLE_BYTE bytes and wrap them through
+    _te_init_store with data type 0
+    '''
+    byte_count = c_size_t(size * DOUBLE_BYTE)
+    buffer_ptr = FPGA_LIB.c_malloc(byte_count)
+    return _te_init_store(res, buffer_ptr, size, mem_type, 0)
+
+
+def pi_free(ptr):
+    '''
+    Release the three C buffers of a PaillierEncryptedStorage
+    and clear the references held by the storage object
+    --------------
+    Para:
+        ptr: PaillierEncryptedStorage
+    '''
+    buffers = (ptr.pen_storage, ptr.base_storage, ptr.exp_storage)
+    for buffer_ptr in buffers:
+        FPGA_LIB.c_free(c_void_p(buffer_ptr))
+    # drop the stale addresses so they are not reused after the free
+    ptr.pen_storage = None
+    ptr.base_storage = None
+    ptr.exp_storage = None
+
+
+# Host2Device and Device2Host calculators are not implemented on FPGA currently
+def pi_d2h(target, src, size, stream):
+    '''Device-to-host placeholder: hands `src` back untouched'''
+    unchanged = src
+    return unchanged
+
+
+def pi_h2d(target, src, size, stream):
+    '''Host-to-device placeholder: hands `src` back untouched'''
+    unchanged = src
+    return unchanged
+
+
+def pi_h2d_pub_key(target, src):
+    '''Public-key host-to-device placeholder: returns `src` untouched'''
+    unchanged = src
+    return unchanged
+
+
+def pi_h2d_priv_key(target, src):
+    '''Private-key host-to-device placeholder: returns `src` untouched'''
+    unchanged = src
+    return unchanged
+
+
+def pi_p2c_pub_key(target, src):
+    '''
+    Build a PubKeyStorage from a Python-side public key
+    ---------------
+    Para:
+        target: ignored, a new PubKeyStorage is always created
+        src:    key object exposing n, g, nsquare and max_int
+    Return:
+        PubKeyStorage built from those four attributes
+    '''
+    return PubKeyStorage(src.n, src.g, src.nsquare, src.max_int)
+
+
+def pi_p2c_priv_key(target, src):
+    '''
+    Build a PrivKeyStorage from a Python-side private key
+    ---------------
+    Para:
+        target: ignored, a new PrivKeyStorage is always created
+        src:    key object exposing p, q, psquare, qsquare, q_inverse,
+                hp and hq
+    Return:
+        PrivKeyStorage built from those seven attributes
+    '''
+    key_parts = (
+        src.p, src.q, src.psquare, src.qsquare,
+        src.q_inverse, src.hp, src.hq,
+    )
+    return PrivKeyStorage(*key_parts)
+
+
+# ###########PaillierEncrypted STORAGE INITIALIZE#################
+def _pi_init_store(
+        res_store,
+        pen_storage,
+        base_storage,
+        exp_storage,
+        vec_size,
+        mem_type,
+        data_type,
+        encode_n,
+        encode_max_int,
+):
+    '''
+    Create a PaillierEncryptedStorage, or refill an existing one
+    ---------------
+    Para:
+        res_store: PaillierEncryptedStorage or None; when None a new
+                   object is constructed, otherwise res_store is updated
+                   in place
+        The remaining paras are stored as the attributes of the same name
+    Return:
+        the new object, or res_store itself after the update
+    '''
+    if res_store is None:
+        return PaillierEncryptedStorage(
+            pen_storage, base_storage, exp_storage, vec_size,
+            mem_type, data_type, encode_n, encode_max_int,
+        )
+    # overwrite every field of the caller's object, in declaration order
+    updates = (
+        ("pen_storage", pen_storage),
+        ("base_storage", base_storage),
+        ("exp_storage", exp_storage),
+        ("vec_size", vec_size),
+        ("mem_type", mem_type),
+        ("data_type", data_type),
+        ("encode_n", encode_n),
+        ("encode_max_int", encode_max_int),
+    )
+    for attr_name, attr_value in updates:
+        setattr(res_store, attr_name, attr_value)
+    return res_store
+
+
+# Shape initialisation for Paillier storages reuses the tensor helper as-is
+_pi_init_shape = _te_init_shape
+
+
+def _pi_init_ss(
+        res_store,
+        pen_storage,
+        base_storage,
+        exp_storage,
+        vec_size,
+        res_shape,
+        res_shape_tuple,
+        mem_type,
+        data_type,
+        encode_n,
+        encode_max_int,
+):
+    '''
+    Initialise a PaillierEncryptedStorage and its shape in one call
+    ---------------
+    Para:
+        res_shape, res_shape_tuple: handed to _pi_init_shape
+        every other para:           handed to _pi_init_store
+    Return:
+        tuple: (result of _pi_init_store, result of _pi_init_shape)
+    '''
+    # the store is initialised first, then the shape
+    store = _pi_init_store(
+        res_store, pen_storage, base_storage, exp_storage, vec_size,
+        mem_type, data_type, encode_n, encode_max_int,
+    )
+    shape = _pi_init_shape(res_shape, res_shape_tuple)
+    return store, shape
+
+
+''' transfer PEN tensor from Python memory to C memory '''
+
+
+def pi_p2c(target, src, data_type=FLOAT_TYPE):
+    '''
+    Transform list of PaillierEncryptedNumber to
+    C-memory style PaillierEncryptedStorage
+    --------------------
+    Para:
+        target:     PaillierEncryptedStorage or None, return value; when
+                    None new buffers are c_malloc'ed, otherwise target's
+                    existing pen/base/exp buffers are written to
+        src:        List or ndarray, each element is a PaillierEncryptedNumber;
+                    it is flattened, and src[0] is read for the key, so it
+                    must contain at least one element
+        data_type:  int, src's original datatype, default FLOAT_TYPE
+    Return:
+        the store returned by _pi_init_store (target itself when given),
+        with mem_type set to MEM_FPGA_NUM_0
+    Raise:
+        TypeError, if src is neither a list nor an ndarray
+    '''
+    if isinstance(src, list):
+        src = np.array(src)
+    if not isinstance(src, np.ndarray):
+        raise TypeError("Unsupported Data Structure")
+    src = src.flatten()
+    vec_size = src.size
+    # malloc the space for the type
+    # (an existing target is reused as-is; its buffers are not resized here)
+    if target is None:
+        res_pen = FPGA_LIB.c_malloc(c_size_t(vec_size * CIPHER_BYTE))
+        res_base = FPGA_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        res_exp = FPGA_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+    else:
+        res_pen = target.pen_storage
+        res_base = target.base_storage
+        res_exp = target.exp_storage
+    # get the two encoding parameters
+    # (taken from the first element's public key only)
+    n = src[0].public_key.n
+    max_int = src[0].public_key.max_int
+    base_temp = []
+    exp_temp = []
+    # Big ints inside an ndarray are not stored contiguously:
+    # the array has object dtype and holds references, not the values.
+    # So each big int is serialised and copied into C memory one by one.
+    for i in range(vec_size):
+        # raw ciphertext as CIPHER_BYTE little-endian bytes
+        src_number = src[i].ciphertext(False).to_bytes(CIPHER_BYTE, 'little')
+        FPGA_LIB.bigint_set(
+            c_char_p(res_pen),
+            c_char_p(src_number),
+            c_size_t(CIPHER_BITS),
+            c_size_t(i))
+        base_temp.append(PEN_BASE)
+        exp_temp.append(src[i].exponent)
+    # base and exp are copied into the C buffers in order to prevent
+    # a potential double free
+    # NOTE(review): np.asarray on Python ints yields numpy's default integer
+    # dtype, while res_base/res_exp are sized with U_INT32_BYTE per element
+    # - confirm unsigned_set expects that source layout.
+    base_arr_ptr = np.asarray(base_temp).ctypes.data_as(c_void_p)
+    exp_arr_ptr = np.asarray(exp_temp).ctypes.data_as(c_void_p)
+    FPGA_LIB.unsigned_set(c_void_p(res_base), base_arr_ptr, c_size_t(vec_size))
+    FPGA_LIB.unsigned_set(c_void_p(res_exp), exp_arr_ptr, c_size_t(vec_size))
+    return _pi_init_store(
+        target,
+        res_pen,
+        res_base,
+        res_exp,
+        vec_size,
+        MEM_FPGA_NUM_0,
+        data_type,
+        n,
+        max_int,
+    )
+
+
+def _bi_init_store(res_store, data, count, elem_size, mem_type):
+    '''
+    Create a BigIntStorage, or refill an existing one
+    ---------------
+    Para:
+        res_store: BigIntStorage or None; when None a new object is
+                   constructed, otherwise res_store is updated in place
+        data:      stored as bigint_storage
+        count:     stored as vec_size
+        elem_size: stored as elem_size
+        mem_type:  stored as mem_type
+    Return:
+        the new object, or res_store itself after the update
+    '''
+    if res_store is None:
+        # note the constructor takes mem_type before elem_size
+        return BigIntStorage(data, count, mem_type, elem_size)
+    res_store.bigint_storage, res_store.vec_size = data, count
+    res_store.mem_type, res_store.elem_size = mem_type, elem_size
+    return res_store
+
+
+# Shape initialisation for big-int storages reuses the tensor helper as-is
+_bi_init_shape = _te_init_shape
+
+
+def _bi_init_ss(
+        res_store,
+        res_data,
+        vec_size,
+        res_shape,
+        res_shape_tuple,
+        elem_size,
+        mem_type):
+    '''
+    Initialise a BigIntStorage and its shape in one call
+    ---------------
+    Return:
+        tuple: (result of _bi_init_store, result of _bi_init_shape)
+    '''
+    # the store is initialised first, then the shape
+    store = _bi_init_store(res_store, res_data, vec_size, elem_size, mem_type)
+    shape = _bi_init_shape(res_shape, res_shape_tuple)
+    return store, shape
+
+
+def _fp_init_store(
+        res_store,
+        fpn_storage,
+        base_storage,
+        exp_storage,
+        vec_size,
+        n,
+        max_int,
+        mem_type,
+        data_type,
+):
+    '''
+    Create a FixedPointStorage, or refill an existing one
+    ---------------
+    Para:
+        res_store:   FixedPointStorage or None; when None a new object is
+                     constructed, otherwise res_store is updated in place
+        fpn_storage: stored as bigint_storage on an existing object
+        n:           stored as encode_n on an existing object
+        max_int:     stored as max_int on an existing object
+        The remaining paras are stored as the attributes of the same name
+    Return:
+        the new object, or res_store itself after the update
+    '''
+    if res_store is None:
+        return FixedPointStorage(
+            fpn_storage, base_storage, exp_storage, vec_size,
+            n, max_int, mem_type, data_type,
+        )
+    # overwrite every field of the caller's object
+    updates = (
+        ("bigint_storage", fpn_storage),
+        ("base_storage", base_storage),
+        ("exp_storage", exp_storage),
+        ("vec_size", vec_size),
+        ("mem_type", mem_type),
+        ("data_type", data_type),
+        ("encode_n", n),
+        ("max_int", max_int),
+    )
+    for attr_name, attr_value in updates:
+        setattr(res_store, attr_name, attr_value)
+    return res_store
+
+
+def _fp_init_ss(
+        res_store,
+        fpn_storage,
+        base_storage,
+        exp_storage,
+        vec_size,
+        n,
+        max_int,
+        res_shape,
+        res_shape_tuple,
+        mem_type,
+        data_type,
+):
+    '''
+    Initialise a FixedPointStorage and its shape in one call
+    ---------------
+    Para:
+        res_shape, res_shape_tuple: handed to _te_init_shape
+        every other para:           handed to _fp_init_store
+    Return:
+        tuple: (result of _fp_init_store, result of _te_init_shape)
+    '''
+    # the store is initialised first, then the shape
+    store = _fp_init_store(
+        res_store, fpn_storage, base_storage, exp_storage, vec_size,
+        n, max_int, mem_type, data_type,
+    )
+    shape = _te_init_shape(res_shape, res_shape_tuple)
+    return store, shape
+
+
+def __get_FPGA_device_num(device_type: int):
+    '''
+    Map a storage mem_type to the physical device index
+    ----------------
+    Para:
+        device_type: the mem_type stored in a Storage object
+    Return:
+        int, currently always 0
+    '''
+    # The range-checked mapping is disabled for now:
+    # if device_type >= MIN_FPGA and device_type <= MAX_FPGA:
+    #     FPGA_dev_num = device_type % 10
+    # else:
+    #     raise PermissionError("DEVICE TYPE IS NOT FPGA!")
+    device_index = 0
+    return device_index
+
+
+def pi_encrypt(pub_key, fps, res=None, stream=None):
+    '''
+    Raw Paillier encryption (no obfuscation) of a FixedPointStorage
+    ----------------
+    Para:
+        pub_key: key object exposing n, g, nsquare and max_int, which are
+                 passed unchanged to the native library
+        fps:     FixedPointStorage, fpn values waiting to be encrypted
+        res:     None or PaillierEncryptedStorage whose buffers receive the
+                 output; when None new buffers are c_malloc'ed
+        stream:  not used
+    Return:
+        the store returned by _pi_init_store, carrying fps's mem_type,
+        data_type, encode_n and max_int
+    '''
+    count = fps.vec_size
+    plain_ptr = fps.bigint_storage
+    plain_base = fps.base_storage
+    plain_exp = fps.exp_storage
+
+    # reuse the caller's buffers when given, otherwise allocate fresh ones
+    if res is not None:
+        cipher_ptr = res.pen_storage
+        cipher_base = res.base_storage
+        cipher_exp = res.exp_storage
+    else:
+        cipher_ptr = FPGA_LIB.c_malloc(c_size_t(count * CIPHER_BYTE))
+        cipher_base = FPGA_LIB.c_malloc(c_size_t(count * U_INT32_BYTE))
+        cipher_exp = FPGA_LIB.c_malloc(c_size_t(count * U_INT32_BYTE))
+
+    # the device index is resolved from the source storage's mem_type
+    device_index = __get_FPGA_device_num(fps.mem_type)
+    FPGA_LIB.encrypt_without_obf(
+        # source fixed-point numbers
+        c_char_p(plain_ptr), c_void_p(plain_base), c_void_p(plain_exp),
+        # destination ciphertexts
+        c_char_p(cipher_ptr), c_void_p(cipher_base), c_void_p(cipher_exp),
+        # public key
+        pub_key.n, pub_key.g, pub_key.nsquare, pub_key.max_int,
+        # bit widths, element count, device
+        c_size_t(PLAIN_BITS), c_size_t(CIPHER_BITS),
+        c_size_t(count), c_size_t(device_index),
+    )
+    return _pi_init_store(
+        res, cipher_ptr, cipher_base, cipher_exp, count,
+        fps.mem_type, fps.data_type, fps.encode_n, fps.max_int,
+    )
+
+
+def pi_decrypt(pub_key, priv_key, pes, res=None, stream=None):
+    '''
+    Decrypt a PaillierEncryptedStorage and decode the result
+    ---------------------
+    Para:
+        pub_key:   key object exposing n, g, nsquare and max_int
+        priv_key:  key object exposing p, q, psquare, qsquare, q_inverse,
+                   hp and hq
+        pes:       PaillierEncryptedStorage, pens waiting to be decrypted
+        res:       forwarded to fp_decode as its output argument
+        stream:    forwarded to fp_decode
+    Return:
+        whatever fp_decode returns for the decrypted FixedPointStorage
+    '''
+    count = pes.vec_size
+    cipher_ptr = pes.pen_storage
+    cipher_base = pes.base_storage
+    cipher_exp = pes.exp_storage
+
+    # intermediate buffers for the decrypted, still encoded, numbers
+    plain_ptr = FPGA_LIB.c_malloc(c_size_t(count * PLAIN_BYTE))
+    plain_base = FPGA_LIB.c_malloc(c_size_t(count * U_INT32_BYTE))
+    plain_exp = FPGA_LIB.c_malloc(c_size_t(count * U_INT32_BYTE))
+
+    device_index = __get_FPGA_device_num(pes.mem_type)
+    FPGA_LIB.decrypt(
+        # source ciphertexts
+        c_char_p(cipher_ptr), c_void_p(cipher_base), c_void_p(cipher_exp),
+        # destination fixed-point numbers
+        c_char_p(plain_ptr), c_void_p(plain_base), c_void_p(plain_exp),
+        # public key
+        pub_key.n, pub_key.g, pub_key.nsquare, pub_key.max_int,
+        # private key
+        priv_key.p, priv_key.q, priv_key.psquare, priv_key.qsquare,
+        priv_key.q_inverse, priv_key.hp, priv_key.hq,
+        # bit widths, element count, device
+        c_size_t(PLAIN_BITS), c_size_t(CIPHER_BITS),
+        c_size_t(count), c_size_t(device_index),
+    )
+
+    # wrap the intermediate buffers so that fp_decode can consume them
+    intermediate = FixedPointStorage(
+        plain_ptr, plain_base, plain_exp, count,
+        pes.encode_n, pes.encode_max_int, pes.mem_type, pes.data_type,
+    )
+    return fp_decode(intermediate, res, stream)
+
+
+def pi_obfuscate(pub_key, pes, obf_seeds, res, stream):
+    '''
+    Obfuscate a PaillierEncryptedStorage with the given seeds
+    through the native modular multiplication
+    ----------------------
+    Para:
+        pub_key:   key object exposing n, g, nsquare and max_int
+        pes:       PaillierEncryptedStorage, raw pens not yet obfuscated
+        obf_seeds: BigIntStorage, its bigint_storage supplies the seeds
+        res:       None or PaillierEncryptedStorage whose buffers receive
+                   the output; when None new buffers are c_malloc'ed
+        stream:    not used
+    Return:
+        the store returned by _pi_init_store, carrying pes's mem_type,
+        data_type, encode_n and encode_max_int
+    '''
+    count = pes.vec_size
+    cipher_ptr = pes.pen_storage
+    cipher_base = pes.base_storage
+    cipher_exp = pes.exp_storage
+    seed_ptr = obf_seeds.bigint_storage
+
+    # reuse the caller's buffers when given, otherwise allocate fresh ones
+    if res is not None:
+        out_ptr = res.pen_storage
+        out_base = res.base_storage
+        out_exp = res.exp_storage
+    else:
+        out_ptr = FPGA_LIB.c_malloc(c_size_t(count * CIPHER_BYTE))
+        out_base = FPGA_LIB.c_malloc(c_size_t(count * U_INT32_BYTE))
+        out_exp = FPGA_LIB.c_malloc(c_size_t(count * U_INT32_BYTE))
+
+    # The device is chosen from pes, even if the seeds were generated on
+    # another device; all data live in CPU memory so this is not a problem.
+    device_index = __get_FPGA_device_num(pes.mem_type)
+    FPGA_LIB.obf_modular_multiplication(
+        # source ciphertexts
+        c_char_p(cipher_ptr), c_void_p(cipher_base), c_void_p(cipher_exp),
+        # obfuscation seeds
+        c_char_p(seed_ptr),
+        # destination ciphertexts
+        c_char_p(out_ptr), c_void_p(out_base), c_void_p(out_exp),
+        # public key
+        pub_key.n, pub_key.g, pub_key.nsquare, pub_key.max_int,
+        # CIPHER_BITS is passed twice, then element count and device
+        c_size_t(CIPHER_BITS), c_size_t(CIPHER_BITS),
+        c_size_t(count), c_size_t(device_index),
+    )
+    return _pi_init_store(
+        res, out_ptr, out_base, out_exp, count,
+        pes.mem_type, pes.data_type, pes.encode_n, pes.encode_max_int,
+    )
+
+
+def pi_gen_obf_seed(res_store, pub_key, count, elem_size, rand_seed, stream):
+    '''
+    generate random bigint and perform expmod based on the given public key.
+    The calculation result is then used as obfuscation seed for further encrypt.
+    --------------
+    Para:
+        res_store:   BigIntStorage or None, the return value; when None a
+                     new buffer of count * CIPHER_BYTE bytes is c_malloc'ed
+        pub_key:     key object exposing n, g, nsquare and max_int
+        count:       int, the number of random numbers need to be generated
+        elem_size:   int, the length of the random bigint, forwarded to
+                     bi_gen_rand and recorded on the result
+        rand_seed:   the seed used for generating random number
+        stream:      forwarded to bi_gen_rand
+    Return:
+        BigIntStorage, same as res_store when one was given
+    '''
+    rand_storage = bi_gen_rand(elem_size, count, None, rand_seed, stream)
+    rand_data = rand_storage.bigint_storage
+    if res_store is None:
+        res_data = FPGA_LIB.c_malloc(c_size_t(count * CIPHER_BYTE))
+    else:
+        res_data = res_store.bigint_storage
+    FPGA_dev_num = __get_FPGA_device_num(rand_storage.mem_type)
+    FPGA_LIB.obf_modular_exponentiation(
+        c_char_p(rand_data),
+        c_size_t(1024),
+        pub_key.n,
+        pub_key.g,
+        pub_key.nsquare,
+        pub_key.max_int,
+        c_char_p(res_data),
+        c_size_t(CIPHER_BITS),
+        c_size_t(count),
+        c_size_t(FPGA_dev_num),
+    )
+    # _bi_init_store takes (res_store, data, count, elem_size, mem_type);
+    # the last two used to be passed in the opposite order, which recorded
+    # MEM_DEVICE as the element size and elem_size as the memory type.
+    return _bi_init_store(res_store, res_data, count, elem_size, MEM_DEVICE)
+
+
+def pi_gen_obf_seed_gmp(res_store, pub_key, count, elem_size, stream):
+    '''
+    generate random bigint through the native gmp_random routine and perform
+    expmod based on the given public key.
+    The calculation result is then used as obfuscation seed for further encrypt.
+    --------------
+    Para:
+        res_store:   BigIntStorage or None, the return value; when None a
+                     new buffer of count * CIPHER_BYTE bytes is c_malloc'ed
+        pub_key:     key object exposing n, g, nsquare and max_int
+        count:       int, the number of random numbers need to be generated
+        elem_size:   not used, the random numbers are always 1024 bits and
+                     the result is recorded with element size 2048 // 8
+        stream:      not used
+    Return:
+        BigIntStorage, same as res_store when one was given
+    '''
+    # scratch buffer for `count` random numbers of 1024 bits each
+    res_rand = FPGA_LIB.c_malloc(c_size_t(count * 1024 // 8))
+    try:
+        FPGA_LIB.gmp_random(
+            c_char_p(res_rand),
+            c_size_t(1024),
+            c_size_t(1024),
+            c_size_t(1024),
+            c_size_t(count),
+            pub_key.n,
+        )
+        if res_store is None:
+            res_data = FPGA_LIB.c_malloc(c_size_t(count * CIPHER_BYTE))
+        else:
+            res_data = res_store.bigint_storage
+        FPGA_dev_num = 0
+        FPGA_LIB.obf_modular_exponentiation(
+            c_char_p(res_rand),
+            c_size_t(1024),
+            pub_key.n,
+            pub_key.g,
+            pub_key.nsquare,
+            pub_key.max_int,
+            c_char_p(res_data),
+            c_size_t(CIPHER_BITS),
+            c_size_t(count),
+            c_size_t(FPGA_dev_num),
+        )
+    finally:
+        # The scratch buffer is not referenced by the returned storage, so
+        # it is released here instead of being leaked on every call.
+        # NOTE(review): assumes obf_modular_exponentiation has finished
+        # reading res_rand when it returns - confirm against the native lib.
+        FPGA_LIB.c_free(c_void_p(res_rand))
+    # _bi_init_store takes (res_store, data, count, elem_size, mem_type);
+    # the last two used to be passed in the opposite order.
+    return _bi_init_store(res_store, res_data, count, 2048 // 8, MEM_DEVICE)
+
+
+def __shape_decompose(shape):
+    '''
+    Reduce a TensorShapeStorage to a (rows, cols) pair
+    as required by the native 2-D routines
+
+    WARNING:
+    the pair is not the numpy shape; callers have to convert back
+    after computing to match numpy's shape output
+    ---------------
+    Return:
+        (1, 1) for a 0-D shape, (1, n) for a 1-D shape,
+        (rows, cols) for a 2-D shape
+    Raise:
+        PermissionError, if the shape has more than two dimensions
+    '''
+    dims = shape.to_tuple()
+    rank = len(dims)
+    if rank == 0:
+        return 1, 1
+    if rank == 1:
+        # a vector is treated as a single row
+        return 1, dims[0]
+    if rank == 2:
+        return dims[0], dims[1]
+    raise PermissionError("Invalid Shape")
+
+
+def __shape_resolve(shape_1, shape_2):
+    '''
+    Check that shape_1 and shape_2 can be broadcast against each other
+    ---------------
+    Return:
+        tuple: (P, Q, R, S, result_shape_tuple), where (P, Q) and (R, S)
+        are the decomposed 2-D sizes of shape_1 and shape_2
+    Raise:
+        PermissionError, if the shapes cannot be aligned or are invalid
+    '''
+
+    def _compatible(a, b):
+        # two extents align when they are equal or one of them is 1
+        return a == b or a == 1 or b == 1
+
+    P, Q = __shape_decompose(shape_1)
+    R, S = __shape_decompose(shape_2)
+    rank = max(len(shape_1.to_tuple()), len(shape_2.to_tuple()))
+
+    if not _compatible(P, R) or not _compatible(Q, S):
+        raise PermissionError("shape cannot align", shape_1, shape_2)
+
+    # the result keeps the rank of the higher-ranked operand
+    if rank == 0:
+        return P, Q, R, S, ()
+    if rank == 1:
+        return P, Q, R, S, (max(Q, S),)
+    if rank == 2:
+        return P, Q, R, S, (max(P, R), max(Q, S))
+    raise PermissionError(f"Invalid shape, {shape_1}, {shape_2}")
+
+
+def pi_add(
+        pub_key,
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
+):
+    '''
+    Element-wise addition of two encrypted tensors,
+    with broadcast over rows or columns
+    ---------------
+    Paras:
+        pub_key:     key object exposing n, g, nsquare and max_int
+        left_store:  PaillierEncryptedStorage, left_operator
+        right_store: PaillierEncryptedStorage, right_operator
+        left_shape:  TensorShapeStorage, left_operator's  shape
+        right_shape: TensorShapeStorage, right_operator's shape
+        res_store:   None or PaillierEncryptedStorage whose buffers receive
+                     the output; when None new buffers are c_malloc'ed
+        res_shape:   TensorShapeStorage, return value's shape
+        stream:      not used
+    Return:
+        tuple: (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        PermissionError, if left/right operators cannot be aligned for
+                         compute, even if broadcast is supported
+    '''
+    l_rows, l_cols, r_rows, r_cols, out_dims = __shape_resolve(
+        left_shape, right_shape)
+    out_size = max(l_rows, r_rows) * max(l_cols, r_cols)
+
+    left_ptr = left_store.pen_storage
+    left_base = left_store.base_storage
+    left_exp = left_store.exp_storage
+    right_ptr = right_store.pen_storage
+    right_base = right_store.base_storage
+    right_exp = right_store.exp_storage
+
+    # reuse the caller's buffers when given, otherwise allocate fresh ones
+    if res_store is not None:
+        out_ptr = res_store.pen_storage
+        out_base = res_store.base_storage
+        out_exp = res_store.exp_storage
+    else:
+        out_ptr = FPGA_LIB.c_malloc(c_size_t(out_size * CIPHER_BYTE))
+        out_base = FPGA_LIB.c_malloc(c_size_t(out_size * U_INT32_BYTE))
+        out_exp = FPGA_LIB.c_malloc(c_size_t(out_size * U_INT32_BYTE))
+
+    device_index = __get_FPGA_device_num(left_store.mem_type)
+    FPGA_LIB.pen_matrix_add_pen_matrix(
+        # left operand
+        c_char_p(left_ptr), c_void_p(left_base), c_void_p(left_exp),
+        # right operand
+        c_char_p(right_ptr), c_void_p(right_base), c_void_p(right_exp),
+        # destination
+        c_char_p(out_ptr), c_void_p(out_base), c_void_p(out_exp),
+        # left dims, then right dims
+        c_size_t(l_rows), c_size_t(l_cols),
+        c_size_t(r_rows), c_size_t(r_cols),
+        # public key
+        pub_key.n, pub_key.g, pub_key.nsquare, pub_key.max_int,
+        # bit width and device
+        c_size_t(CIPHER_BITS), c_size_t(device_index),
+    )
+
+    # the result stays integer only when both operands are integer
+    both_int = (left_store.data_type == INT64_TYPE
+                and right_store.data_type == INT64_TYPE)
+    out_type = INT64_TYPE if both_int else FLOAT_TYPE
+    return _pi_init_ss(
+        res_store, out_ptr, out_base, out_exp, out_size,
+        res_shape, out_dims,
+        left_store.mem_type, out_type,
+        left_store.encode_n, left_store.encode_max_int,
+    )
+
+
+def pi_mul(
+        pub_key,
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
+):
+    '''
+    Element-wise multiplication of an encrypted tensor by a fixed-point
+    tensor, with broadcast over rows or columns
+    --------------------
+    Paras:
+        pub_key:     key object exposing n, g, nsquare and max_int
+        left_store:  PaillierEncryptedStorage, left_operator
+        right_store: FixedPointStorage, right_operator
+        left_shape:  TensorShapeStorage, left_operator's  shape
+        right_shape: TensorShapeStorage, right_operator's shape
+        res_store:   None or PaillierEncryptedStorage whose buffers receive
+                     the output; when None new buffers are c_malloc'ed
+        res_shape:   TensorShapeStorage, return value's shape
+        stream:      not used
+    Return:
+        tuple: (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        PermissionError, if left/right operators cannot be aligned for
+                         compute, even if broadcast is supported
+    '''
+    # (pen_rows, pen_cols) belong to left_store, (fpn_rows, fpn_cols) to
+    # right_store
+    pen_rows, pen_cols, fpn_rows, fpn_cols, out_dims = __shape_resolve(
+        left_shape, right_shape)
+    out_size = max(pen_rows, fpn_rows) * max(pen_cols, fpn_cols)
+
+    pen_ptr = left_store.pen_storage
+    pen_base = left_store.base_storage
+    pen_exp = left_store.exp_storage
+    fpn_ptr = right_store.bigint_storage
+    fpn_base = right_store.base_storage
+    fpn_exp = right_store.exp_storage
+
+    # reuse the caller's buffers when given, otherwise allocate fresh ones
+    if res_store is not None:
+        out_ptr = res_store.pen_storage
+        out_base = res_store.base_storage
+        out_exp = res_store.exp_storage
+    else:
+        out_ptr = FPGA_LIB.c_malloc(c_size_t(out_size * CIPHER_BYTE))
+        out_base = FPGA_LIB.c_malloc(c_size_t(out_size * U_INT32_BYTE))
+        out_exp = FPGA_LIB.c_malloc(c_size_t(out_size * U_INT32_BYTE))
+
+    device_index = __get_FPGA_device_num(left_store.mem_type)
+    # the native routine takes the fixed-point operand first
+    FPGA_LIB.fpn_matrix_elementwise_multiply_pen_matrix(
+        # fixed-point operand (right_store)
+        c_char_p(fpn_ptr), c_void_p(fpn_base), c_void_p(fpn_exp),
+        # encrypted operand (left_store)
+        c_char_p(pen_ptr), c_void_p(pen_base), c_void_p(pen_exp),
+        # destination
+        c_char_p(out_ptr), c_void_p(out_base), c_void_p(out_exp),
+        # fixed-point dims, then encrypted dims
+        c_size_t(fpn_rows), c_size_t(fpn_cols),
+        c_size_t(pen_rows), c_size_t(pen_cols),
+        # public key
+        pub_key.n, pub_key.g, pub_key.nsquare, pub_key.max_int,
+        # bit widths and device
+        c_size_t(PLAIN_BITS), c_size_t(CIPHER_BITS),
+        c_size_t(device_index),
+    )
+
+    # the result stays integer only when both operands are integer
+    both_int = (left_store.data_type == INT64_TYPE
+                and right_store.data_type == INT64_TYPE)
+    out_type = INT64_TYPE if both_int else FLOAT_TYPE
+    return _pi_init_ss(
+        res_store, out_ptr, out_base, out_exp, out_size,
+        res_shape, out_dims,
+        left_store.mem_type, out_type,
+        left_store.encode_n, left_store.encode_max_int,
+    )
+
+
+def fp_transpose(left_store, left_shape, res_store, res_shape, stream):
+    '''
+    transpose the C-memory stored matrix of FixedPointStorage,
+    support at most 2-D matrix
+    -----------------
+    Para:
+        left_store:  FixedPointStorage, left_operator
+        left_shape:  TensorShapeStorage, left_operator's  shape
+        res_store:   None or FixedPointStorage whose buffers receive the
+                     output; when None new buffers are c_malloc'ed
+        res_shape:   TensorShapeStorage, return value's shape
+        stream:      not used
+    Return:
+        tuple: (FixedPointStorage, TensorShapeStorage)
+    Raise:
+        PermissionError, if dimension is higher than 2-D, not supported
+    '''
+    # didn't use FPGA driver, no need for check for mem_type
+    left_shape_tuple = left_shape.to_tuple()
+    rank = len(left_shape_tuple)
+    # reject unsupported ranks before any buffer is allocated, so that
+    # nothing is leaked on the error path
+    if rank > 2:
+        raise PermissionError("Unsupported shape")
+    # get the left_store parameters
+    src_fpn = left_store.bigint_storage
+    src_base = left_store.base_storage
+    src_exp = left_store.exp_storage
+    vec_size = left_store.vec_size
+    # malloc space for the res value
+    if res_store is None:
+        res_fpn = FPGA_LIB.c_malloc(c_size_t(vec_size * PLAIN_BYTE))
+        res_base = FPGA_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        res_exp = FPGA_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+    else:
+        res_fpn = res_store.bigint_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+    if rank < 2:
+        # the tuple is 0-D or 1-D
+        # transpose returns the same value as input in numpy;
+        # the data is copied rather than shared to prevent a potential
+        # double free
+        res_shape_tuple = left_shape_tuple
+        FPGA_LIB.c_memcpy(
+            c_void_p(res_fpn),
+            c_void_p(src_fpn),
+            c_size_t(vec_size * PLAIN_BYTE))
+        FPGA_LIB.c_memcpy(
+            c_void_p(res_base),
+            c_void_p(src_base),
+            c_size_t(vec_size * U_INT32_BYTE))
+        FPGA_LIB.c_memcpy(
+            c_void_p(res_exp),
+            c_void_p(src_exp),
+            c_size_t(vec_size * U_INT32_BYTE))
+    else:
+        # the tuple is 2-D
+        # do a normal transpose; the native call receives the source
+        # matrix's two extents in their original order
+        res_shape_tuple = (left_shape_tuple[1], left_shape_tuple[0])
+        FPGA_LIB.transpose(
+            c_char_p(src_fpn),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_fpn),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(res_shape_tuple[1]),
+            c_size_t(res_shape_tuple[0]),
+            c_size_t(PLAIN_BITS),
+        )
+    # Both branches write into res_shape. The 0-D/1-D branch used to pass
+    # left_shape here, which ignored the caller's res_shape and returned
+    # the input's own shape object as the result shape.
+    return _fp_init_ss(
+        res_store,
+        res_fpn,
+        res_base,
+        res_exp,
+        vec_size,
+        left_store.encode_n,
+        left_store.max_int,
+        res_shape,
+        res_shape_tuple,
+        left_store.mem_type,
+        left_store.data_type,
+    )
+
+
+def pi_matmul(
+        pub_key,
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
+):
+    '''
+    Multiply an encrypted matrix (left) by a plaintext matrix (right).
+    ------------------
+    Paras:
+        pub_key:     Dev_PubKeyStorage, PaillierPublicKey stored in GPU mem
+        left_store:  PaillierEncryptedStorage, left operand
+        right_store: FixedPointStorage, right operand
+        left_shape:  TensorShapeStorage, shape of the left operand
+        right_shape: TensorShapeStorage, shape of the right operand
+        res_store:   PaillierEncryptedStorage or None; when given, its
+                     buffers are written instead of newly allocated ones
+        res_shape:   TensorShapeStorage or None, shape holder for the result
+        stream:      not used by this function
+    Return:
+        tuple: (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        PermissionError, if either operand is not 1-D or 2-D
+        ValueError, if the operands' inner dimensions differ
+    '''
+
+    # only 1-D and 2-D operands are accepted
+    left_rank = len(left_shape.to_tuple())
+    right_rank = len(right_shape.to_tuple())
+    if left_rank not in (1, 2) or right_rank not in (1, 2):
+        raise PermissionError("Invalid shape")
+    rows, inner = __shape_decompose(left_shape)
+    r_rows, cols = __shape_decompose(right_shape)
+    if right_rank == 1:
+        # 1-D right operand: swap its decomposed dims before aligning
+        r_rows, cols = cols, r_rows
+    if inner != r_rows:
+        raise ValueError("shape not aligned")
+    # the result's rank depends only on the ranks of the two operands
+    res_shape_tuple = {
+        (1, 1): (),
+        (1, 2): (cols,),
+        (2, 1): (rows,),
+        (2, 2): (rows, cols),
+    }[(left_rank, right_rank)]
+    res_size = rows * cols
+
+    # input buffers of both operands
+    lhs_pen = left_store.pen_storage
+    lhs_base = left_store.base_storage
+    lhs_exp = left_store.exp_storage
+    rhs_fpn = right_store.bigint_storage
+    rhs_base = right_store.base_storage
+    rhs_exp = right_store.exp_storage
+
+    # output buffers: reuse the caller's store if any, else allocate
+    if res_store is not None:
+        res_pen = res_store.pen_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+    else:
+        res_pen = FPGA_LIB.c_malloc(c_size_t(res_size * CIPHER_BYTE))
+        res_base = FPGA_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+        res_exp = FPGA_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+
+    # hand everything to the native matrix-multiply routine
+    device_num = __get_FPGA_device_num(left_store.mem_type)
+    FPGA_LIB.pen_matrix_multiply_fpn_matrix(
+        c_char_p(lhs_pen),
+        c_void_p(lhs_base),
+        c_void_p(lhs_exp),
+        c_char_p(rhs_fpn),
+        c_void_p(rhs_base),
+        c_void_p(rhs_exp),
+        c_char_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(rows),
+        c_size_t(inner),
+        c_size_t(cols),
+        pub_key.n,
+        pub_key.g,
+        pub_key.nsquare,
+        pub_key.max_int,
+        c_size_t(PLAIN_BITS),
+        c_size_t(CIPHER_BITS),
+        c_size_t(device_num),
+    )
+
+    # the result is INT64 only when both operands are INT64, else FLOAT
+    both_int = (
+        left_store.data_type == INT64_TYPE
+        and right_store.data_type == INT64_TYPE
+    )
+    data_type = INT64_TYPE if both_int else FLOAT_TYPE
+    return _pi_init_ss(
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        res_size,
+        res_shape,
+        res_shape_tuple,
+        left_store.mem_type,
+        data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
+
+
+def pi_rmatmul(
+        pub_key,
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
+):
+    '''
+    Multiply a plaintext matrix (left) by an encrypted matrix (right).
+    The encrypted operand is on the right here, unlike pi_matmul.
+    NOTE(review): right_store is reportedly expected in transposed layout
+    by the native code; nothing here transposes it - confirm with callers.
+    -------------------------
+    Paras:
+        pub_key:     Dev_PubKeyStorage, PaillierPublicKey stored in GPU mem
+        left_store:  FixedPointStorage, left operand
+        right_store: PaillierEncryptedStorage, right operand
+        left_shape:  TensorShapeStorage, shape of the left operand
+        right_shape: TensorShapeStorage, shape of the right operand
+        res_store:   PaillierEncryptedStorage or None; when given, its
+                     buffers are written instead of newly allocated ones
+        res_shape:   TensorShapeStorage or None, shape holder for the result
+        stream:      not used by this function
+    Return:
+        tuple: (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        PermissionError, if either operand is not 1-D or 2-D
+        ValueError, if the operands' inner dimensions differ
+    '''
+    left_rank = len(left_shape.to_tuple())
+    right_rank = len(right_shape.to_tuple())
+    if left_rank not in (1, 2) or right_rank not in (1, 2):
+        raise PermissionError("Invalid shape")
+    rows, inner = __shape_decompose(left_shape)
+    r_rows, cols = __shape_decompose(right_shape)
+    if right_rank == 1:
+        # 1-D right operand: swap its decomposed dims before aligning
+        r_rows, cols = cols, r_rows
+    if inner != r_rows:
+        raise ValueError("shape not aligned")
+    # the result's rank depends only on the ranks of the two operands
+    res_shape_tuple = {
+        (1, 1): (),
+        (1, 2): (cols,),
+        (2, 1): (rows,),
+        (2, 2): (rows, cols),
+    }[(left_rank, right_rank)]
+    res_size = rows * cols
+
+    # input buffers: plaintext on the left, ciphertext on the right
+    lhs_fpn = left_store.bigint_storage
+    lhs_base = left_store.base_storage
+    lhs_exp = left_store.exp_storage
+    rhs_pen = right_store.pen_storage
+    rhs_base = right_store.base_storage
+    rhs_exp = right_store.exp_storage
+
+    # output buffers: reuse the caller's store if any, else allocate
+    if res_store is not None:
+        res_pen = res_store.pen_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+    else:
+        res_pen = FPGA_LIB.c_malloc(c_size_t(res_size * CIPHER_BYTE))
+        res_base = FPGA_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+        res_exp = FPGA_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+
+    # the device number is derived from the LEFT (plaintext) operand
+    device_num = __get_FPGA_device_num(left_store.mem_type)
+    FPGA_LIB.fpn_matrix_multiply_pen_matrix(
+        c_char_p(lhs_fpn),
+        c_void_p(lhs_base),
+        c_void_p(lhs_exp),
+        c_char_p(rhs_pen),
+        c_void_p(rhs_base),
+        c_void_p(rhs_exp),
+        c_char_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(rows),
+        c_size_t(inner),
+        c_size_t(cols),
+        pub_key.n,
+        pub_key.g,
+        pub_key.nsquare,
+        pub_key.max_int,
+        c_size_t(PLAIN_BITS),
+        c_size_t(CIPHER_BITS),
+        c_uint32(device_num),
+    )
+
+    # result metadata comes from the encrypted (right) operand
+    both_int = (
+        left_store.data_type == INT64_TYPE
+        and right_store.data_type == INT64_TYPE
+    )
+    data_type = INT64_TYPE if both_int else FLOAT_TYPE
+    return _pi_init_ss(
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        res_size,
+        res_shape,
+        res_shape_tuple,
+        right_store.mem_type,
+        data_type,
+        right_store.encode_n,
+        right_store.encode_max_int,
+    )
+
+
+def pi_transpose(left_store, left_shape, res_store, res_shape, stream):
+    '''
+    Transpose a PaillierEncryptedStorage held in C memory (at most 2-D).
+    0-D and 1-D inputs are copied as they are; 2-D inputs go through the
+    native transpose routine.
+    The rank is checked before any buffer is allocated.
+    -----------------
+    Para:
+        left_store:  PaillierEncryptedStorage, the operand
+        left_shape:  TensorShapeStorage, shape of the operand
+        res_store:   PaillierEncryptedStorage or None; when given, its
+                     buffers are written instead of newly allocated ones
+        res_shape:   TensorShapeStorage or None, shape holder for the result
+        stream:      not used by this function
+    Return:
+        tuple: (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        PermissionError, if dimension is higher than 2-D, not supported
+    '''
+    left_shape_tuple = left_shape.to_tuple()
+    # Reject unsupported ranks before allocating, so that no freshly
+    # malloc'ed result buffers are left behind when the error is raised.
+    if len(left_shape_tuple) > 2:
+        raise PermissionError("Invalid Shape")
+
+    # source buffers
+    src_pen = left_store.pen_storage
+    src_base = left_store.base_storage
+    src_exp = left_store.exp_storage
+    vec_size = left_store.vec_size
+
+    # result buffers: reuse the caller's store if any, else allocate
+    if res_store is None:
+        res_pen = FPGA_LIB.c_malloc(c_size_t(vec_size * CIPHER_BYTE))
+        res_base = FPGA_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        res_exp = FPGA_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+    else:
+        res_pen = res_store.pen_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+
+    if len(left_shape_tuple) == 2:
+        # 2-D: the result shape is the reversed source shape
+        res_shape_tuple = (left_shape_tuple[1], left_shape_tuple[0])
+        # the two size arguments are the source dims, in source order
+        FPGA_LIB.transpose(
+            c_char_p(src_pen),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(left_shape_tuple[0]),
+            c_size_t(left_shape_tuple[1]),
+            c_size_t(CIPHER_BITS),
+        )
+    else:
+        # 0-D / 1-D: nothing to transpose, copy the three buffers verbatim
+        res_shape_tuple = left_shape_tuple
+        FPGA_LIB.c_memcpy(
+            c_void_p(res_pen),
+            c_void_p(src_pen),
+            c_size_t(vec_size * CIPHER_BYTE))
+        FPGA_LIB.c_memcpy(
+            c_void_p(res_base),
+            c_void_p(src_base),
+            c_size_t(vec_size * U_INT32_BYTE))
+        FPGA_LIB.c_memcpy(
+            c_void_p(res_exp),
+            c_void_p(src_exp),
+            c_size_t(vec_size * U_INT32_BYTE))
+
+    # Pass res_shape (not left_shape) in every case, so that a shape holder
+    # supplied by the caller is the one used for the result.
+    return _pi_init_ss(
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        vec_size,
+        res_shape,
+        res_shape_tuple,
+        left_store.mem_type,
+        left_store.data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
+
+
+def pi_sum(
+        pub_key,
+        left_store,
+        left_shape,
+        axis,
+        res_store=None,
+        res_shape=None,
+        stream=None):
+    '''
+    Sum the elements of an encrypted tensor along the given axis
+    ----------------------
+    Para:
+        pub_key:     Dev_PubKeyStorage, PaillierPublicKey stored in GPU mem
+        left_store:  PaillierEncryptedStorage, left_operator
+        left_shape:  TensorShapeStorage, left_operator's  shape
+        axis:        int or None, the dimension which sum is performed
+                        None: sum over all elements
+                        0:    sum vertically, over the 1st dimension
+                        1:    sum horizontally, over the 2nd dimension
+        res_store:   PaillierEncryptedStorage or None, holder for the result
+        res_shape:   TensorShapeStorage or None, holder for the result shape
+    Return:
+        tuple, (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        PermissionError: when the input axis is not aligned to input shape
+    '''
+    # source buffers of the operand
+    src_pen = left_store.pen_storage
+    src_base = left_store.base_storage
+    src_exp = left_store.exp_storage
+    vec_size = left_store.vec_size
+    # Every branch below must hand res_store / res_shape (never left_store /
+    # left_shape) to _pi_init_ss; otherwise the input operand itself would be
+    # bound to the copied buffers and the caller's holders would be ignored.
+    # get the original data's tuple
+    left_shape_tuple = left_shape.to_tuple()
+
+    if len(left_shape_tuple) == 0:
+        # handling shape (), meaning only one element in left_store
+        if axis is not None and axis != 0:
+            raise PermissionError(
+                "Cannot set axis other than 0 or None for dimension 0"
+            )
+        if res_store is None:
+            res_pen = FPGA_LIB.c_malloc(c_size_t(vec_size * CIPHER_BYTE))
+            res_base = FPGA_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+            res_exp = FPGA_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        else:
+            res_pen = res_store.pen_storage
+            res_base = res_store.base_storage
+            res_exp = res_store.exp_storage
+        FPGA_LIB.c_memcpy(
+            c_void_p(res_pen),
+            c_void_p(src_pen),
+            c_size_t(vec_size * CIPHER_BYTE))
+        FPGA_LIB.c_memcpy(
+            c_void_p(res_base),
+            c_void_p(src_base),
+            c_size_t(vec_size * U_INT32_BYTE))
+        FPGA_LIB.c_memcpy(
+            c_void_p(res_exp),
+            c_void_p(src_exp),
+            c_size_t(vec_size * U_INT32_BYTE))
+        return _pi_init_ss(
+            res_store,
+            res_pen,
+            res_base,
+            res_exp,
+            vec_size,
+            res_shape,
+            left_shape_tuple,
+            left_store.mem_type,
+            left_store.data_type,
+            left_store.encode_n,
+            left_store.encode_max_int,
+        )
+    elif axis is None or len(left_shape_tuple) == 1:
+        # handling shape (n, ) or axis == None
+        # the axis is validated before any result buffer is allocated
+        if len(left_shape_tuple) == 1 and axis is not None and axis >= 1:
+            raise PermissionError(
+                "axis is out of bounds for array of dimension 1")
+        if res_store is None:
+            res_pen = FPGA_LIB.c_malloc(c_size_t(1 * CIPHER_BYTE))
+            res_base = FPGA_LIB.c_malloc(c_size_t(1 * U_INT32_BYTE))
+            res_exp = FPGA_LIB.c_malloc(c_size_t(1 * U_INT32_BYTE))
+        else:
+            res_pen = res_store.pen_storage
+            res_base = res_store.base_storage
+            res_exp = res_store.exp_storage
+        # the result is a single element with shape ()
+        result_size = 1
+        res_shape_tuple = ()
+        # native pen_sum over one row holding all vec_size elements
+        FPGA_dev_num = __get_FPGA_device_num(left_store.mem_type)
+        FPGA_LIB.pen_sum(
+            c_char_p(src_pen),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(1),
+            c_size_t(vec_size),
+            pub_key.n,
+            pub_key.g,
+            pub_key.nsquare,
+            pub_key.max_int,
+            c_size_t(CIPHER_BITS),
+            c_size_t(FPGA_dev_num),
+        )
+    elif axis == 0:
+        # handling 2-D matrix, axis == 0 means sum vertically
+        # since current sum only support horizontal sum
+        # aka batch sum over continuous memory space
+        transpose_store, transpose_shape = pi_transpose(
+            left_store, left_shape, None, None, stream
+        )
+        src_pen = transpose_store.pen_storage
+        src_base = transpose_store.base_storage
+        src_exp = transpose_store.exp_storage
+        transpose_tuple = transpose_shape.to_tuple()
+        # perform the (horizontal) sum on the transposed matrix
+        if res_store is None:
+            res_pen = FPGA_LIB.c_malloc(
+                c_size_t(transpose_tuple[0] * CIPHER_BYTE))
+            res_base = FPGA_LIB.c_malloc(
+                c_size_t(transpose_tuple[0] * U_INT32_BYTE))
+            res_exp = FPGA_LIB.c_malloc(
+                c_size_t(transpose_tuple[0] * U_INT32_BYTE))
+        else:
+            res_pen = res_store.pen_storage
+            res_base = res_store.base_storage
+            res_exp = res_store.exp_storage
+        result_size = transpose_tuple[0]
+        res_shape_tuple = (transpose_tuple[0],)
+        # native pen_sum, one result per row of the transposed matrix
+        # NOTE(review): transpose_store's buffers are not freed here - confirm
+        FPGA_dev_num = __get_FPGA_device_num(left_store.mem_type)
+        FPGA_LIB.pen_sum(
+            c_char_p(src_pen),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(transpose_tuple[0]),
+            c_size_t(transpose_tuple[1]),
+            pub_key.n,
+            pub_key.g,
+            pub_key.nsquare,
+            pub_key.max_int,
+            c_size_t(CIPHER_BITS),
+            c_size_t(FPGA_dev_num),
+        )
+    elif axis == 1:
+        # handling 2-D matrix, axis == 1 means sum horizontally
+        if res_store is None:
+            res_pen = FPGA_LIB.c_malloc(
+                c_size_t(left_shape_tuple[0] * CIPHER_BYTE))
+            res_base = FPGA_LIB.c_malloc(
+                c_size_t(left_shape_tuple[0] * U_INT32_BYTE))
+            res_exp = FPGA_LIB.c_malloc(
+                c_size_t(left_shape_tuple[0] * U_INT32_BYTE))
+        else:
+            res_pen = res_store.pen_storage
+            res_base = res_store.base_storage
+            res_exp = res_store.exp_storage
+        # one result element per row of the operand,
+        # so the result shape is 1-D with left_shape_tuple[0] entries
+        result_size = left_shape_tuple[0]
+        res_shape_tuple = (left_shape_tuple[0],)
+        # native pen_sum, one result per row of the operand
+        FPGA_dev_num = __get_FPGA_device_num(left_store.mem_type)
+        FPGA_LIB.pen_sum(
+            c_char_p(src_pen),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(left_shape_tuple[0]),
+            c_size_t(left_shape_tuple[1]),
+            pub_key.n,
+            pub_key.g,
+            pub_key.nsquare,
+            pub_key.max_int,
+            c_size_t(CIPHER_BITS),
+            c_size_t(FPGA_dev_num),
+        )
+    else:
+        raise PermissionError("Invalid Axis or Shape")
+
+    return _pi_init_ss(
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        result_size,
+        res_shape,
+        res_shape_tuple,
+        left_store.mem_type,
+        left_store.data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
+
+
+def fp_encode(
+        store, n, max_int, precision=None, max_exponent=None, res=None, stream=None
+):
+    '''
+    Encode a TensorStorage into a FixedPointStorage via the native encoder
+    (encode_double for FLOAT_TYPE data, encode_int for INT64_TYPE data)
+    -----------------
+    Paras:
+        store:        TensorStorage, raw data to be encoded
+        n:            big int, the same n in pubkey used for encryption
+        max_int:      big int, same max_int in pubkey.
+        precision:    int or None; None is passed to the encoder as -1
+        max_exponent: must be None, any other value is rejected
+        res:          FixedPointStorage or None; when given, its buffers
+                      are written instead of newly allocated ones
+        stream:       not used by this function
+    Return:
+        FixedPointStorage, same as res
+    Raise:
+        PermissionError: For unsupported data type or encoding style
+        OverflowError: if n or max_int does not fit in PLAIN_BYTE bytes
+    '''
+    if max_exponent is not None:
+        raise PermissionError("max_exponent not supported")
+
+    # Everything that can fail is done before the buffers are allocated, so
+    # an unsupported data type or oversized n cannot leak fresh allocations.
+    if store.data_type == FLOAT_TYPE:
+        encode = FPGA_LIB.encode_double
+    elif store.data_type == INT64_TYPE:
+        encode = FPGA_LIB.encode_int
+    else:
+        raise PermissionError("Invalid Data Type")
+    n_bytes = n.to_bytes(PLAIN_BYTE, 'little')
+    max_int_bytes = max_int.to_bytes(PLAIN_BYTE, 'little')
+    # None is forwarded as -1 (presumably the encoder's "unset" marker)
+    if precision is None:
+        precision = -1
+    data_storage = store.data
+    vec_size = store.vec_size
+
+    # result buffers: reuse res' buffers if given, else allocate
+    if res is None:
+        res_fpn = FPGA_LIB.c_malloc(c_size_t(PLAIN_BYTE * vec_size))
+        res_base = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * vec_size))
+        res_exp = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * vec_size))
+    else:
+        res_fpn = res.bigint_storage
+        res_base = res.base_storage
+        res_exp = res.exp_storage
+
+    # both encoders are called with the same argument list
+    FPGA_dev_num = __get_FPGA_device_num(store.mem_type)
+    encode(
+        c_void_p(data_storage),
+        c_void_p(res_fpn),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_int32(precision),
+        c_char_p(n_bytes),
+        c_char_p(max_int_bytes),
+        c_size_t(PLAIN_BITS),
+        c_size_t(vec_size),
+        c_size_t(FPGA_dev_num),
+    )
+
+    # wrap the three buffers in a FixedPointStorage
+    return _fp_init_store(
+        res,
+        res_fpn,
+        res_base,
+        res_exp,
+        vec_size,
+        n,
+        max_int,
+        store.mem_type,
+        store.data_type,
+    )
+
+
+def __fp_decode(store, res, stream):
+    '''
+    Decode a FixedPointStorage on the CPU, with the reader fp_c2p uses
+    Currently not used, as a GPU version has been done
+    ------------------
+    Paras:
+        store:   FixedPointStorage, the raw data to be decoded
+        res:     TensorStorage, the decoded result
+        stream:  not used by this function
+    Return:
+        TensorStorage, same as res
+    Raise:
+        PermissionError: For unsupported data type
+    '''
+    vec_size = store.vec_size
+    # read the stored numbers back as Python-side objects
+    fpn_array = __get_c_fpn_storage(
+        store.bigint_storage,
+        store.base_storage,
+        store.exp_storage,
+        vec_size,
+        store.encode_n,
+        store.max_int,
+    )
+
+    # decode element by element; INT64 results are cast to int
+    if store.data_type == INT64_TYPE:
+        decoded = [int(fpn_array[i].decode()) for i in range(vec_size)]
+    elif store.data_type == FLOAT_TYPE:
+        decoded = [fpn_array[i].decode() for i in range(vec_size)]
+    else:
+        raise PermissionError("Invalid Data Type")
+
+    # move the values into C memory, then take over that buffer; the handle
+    # on the temporary is cleared (presumably so it will not free it)
+    packed = te_p2c(decoded, None)
+    res_data = packed.data
+    packed.data = None
+    return _te_init_store(
+        res, res_data, vec_size, store.mem_type, store.data_type
+    )
+
+
+def fp_decode(store, res, stream):
+    '''
+    Decode a FixedPointStorage through the native decoder
+    ------------------
+    Paras:
+        store:   FixedPointStorage, the raw data to be decoded
+        res:     TensorStorage or None; when None a result buffer is
+                 allocated, otherwise res.data receives the values
+        stream:  not used by this function
+    Return:
+        TensorStorage, same as res
+    Raise:
+        PermissionError: For unsupported data type
+    '''
+    # pick the decoder and the per-element byte width for the data type
+    if store.data_type == FLOAT_TYPE:
+        decoder, item_bytes = FPGA_LIB.decode_double, DOUBLE_BYTE
+    elif store.data_type == INT64_TYPE:
+        decoder, item_bytes = FPGA_LIB.decode_int, INT64_BYTE
+    else:
+        raise PermissionError("Invalid Data Type")
+
+    count = store.vec_size
+    if res is None:
+        res_store = FPGA_LIB.c_malloc(c_size_t(count * item_bytes))
+    else:
+        res_store = res.data
+
+    # both decoders are called with the same argument list
+    decoder(
+        c_void_p(store.bigint_storage),
+        c_void_p(store.base_storage),
+        c_void_p(store.exp_storage),
+        c_char_p(store.encode_n.to_bytes(PLAIN_BYTE, 'little')),
+        c_char_p(store.max_int.to_bytes(PLAIN_BYTE, 'little')),
+        c_size_t(PLAIN_BITS),
+        c_void_p(res_store),
+        c_size_t(count),
+    )
+
+    return _te_init_store(
+        res,
+        res_store,
+        count,
+        store.mem_type,
+        store.data_type,
+    )
+
+
+def fp_d2h(target, src, stream):  # pass-through: target and stream unused
+    return src  # no copy is made, the very same object is handed back
+
+
+def bi_free(src):  # release the big-int buffer held by src
+    FPGA_LIB.c_free(c_void_p(src.bigint_storage))  # free the C allocation
+    src.bigint_storage = None  # clear the handle once it has been freed
+
+
+def fp_free(src):
+    # free the three C buffers first, then clear all three handles
+    for buf in (src.bigint_storage, src.base_storage, src.exp_storage):
+        FPGA_LIB.c_free(c_void_p(buf))
+    src.bigint_storage = src.base_storage = src.exp_storage = None
+
+
+'''
+    function: convert the FixedPointStorage's data from C memory into
+    Python objects. As there is no shape involved in the function,
+    we cannot know the return shape of the function
+    input:
+            src: FixedPointStorage, containing the data that needs to be converted
+    output:
+            return value: result of __get_c_fpn_storage; noted here as 3 ndarray
+                            (fpn_array,base_array,exp_array) - TODO confirm
+'''
+
+
+def fp_c2p(src):
+    '''Read a FixedPointStorage out of C memory (c2p: C to Python).'''
+    # thin wrapper: forward the buffers and encoding fields unchanged
+    fields = (
+        src.bigint_storage, src.base_storage, src.exp_storage,
+        src.vec_size,
+        src.encode_n, src.max_int,
+    )
+    return __get_c_fpn_storage(*fields)
+
+
+def pi_c2p_mp(src):
+    '''
+    Convert a PaillierEncryptedStorage from C mem type to Python one,
+    delegating to the multiprocess reader to accelerate
+    --------------
+    Para:    src, PaillierEncryptedStorage
+    Return:  tuple, each element is a ndarray,
+                    identical to sequence of encoding, base, exponent
+    '''
+    fields = (
+        src.pen_storage, src.base_storage, src.exp_storage,
+        src.vec_size, src.encode_n,
+    )
+    # same fields as pi_c2p forwards, only the reader helper differs
+    return __get_c_pen_storage_mp(*fields)
+
+
+def pi_c2p(src):
+    '''convert PaillierEncryptedStorage from C mem type to Python one'''
+    # raw reader; pi_c2p_mp forwards the same fields to the _mp reader
+    fields = (
+        src.pen_storage, src.base_storage, src.exp_storage,
+        src.vec_size, src.encode_n,
+    )
+    return __get_c_pen_storage_raw(*fields)
+
+
+def fp_mul(
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store,
+        res_shape,
+        stream):
+    '''
+    Element-wise product of two plaintext FixedPointStorage operands.
+    ------------------
+    Paras:
+        left_store, right_store: FixedPointStorage
+        left_shape, right_shape: TensorShapeStorage
+        res_store, res_shape:    optional holders for the result (or None)
+        stream:                  not used by this function
+    Return:
+        tuple, (FixedPointStorage, TensorShapeStorage)
+    '''
+    l_rows, l_cols, r_rows, r_cols, res_shape_tuple = __shape_resolve(
+        left_shape, right_shape)
+    # each result dim is the larger of the two operands' dims
+    res_size = max(l_rows, r_rows) * max(l_cols, r_cols)
+    # operand buffers (both sides are fixed-point encodings)
+    lhs_fpn = left_store.bigint_storage
+    lhs_base = left_store.base_storage
+    lhs_exp = left_store.exp_storage
+    rhs_fpn = right_store.bigint_storage
+    rhs_base = right_store.base_storage
+    rhs_exp = right_store.exp_storage
+    # result buffers: reuse the caller's store if any, else allocate
+    if res_store is not None:
+        res_fpn = res_store.bigint_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+    else:
+        res_fpn = FPGA_LIB.c_malloc(c_size_t(res_size * PLAIN_BYTE))
+        res_base = FPGA_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+        res_exp = FPGA_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    device_num = __get_FPGA_device_num(left_store.mem_type)
+    FPGA_LIB.fpn_mul(
+        c_char_p(lhs_fpn),
+        c_void_p(lhs_base),
+        c_void_p(lhs_exp),
+        c_char_p(rhs_fpn),
+        c_void_p(rhs_base),
+        c_void_p(rhs_exp),
+        c_char_p(res_fpn),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(l_rows),
+        c_size_t(l_cols),
+        c_size_t(r_rows),
+        c_size_t(r_cols),
+        c_char_p(left_store.encode_n.to_bytes(PLAIN_BYTE, 'little')),
+        c_size_t(PLAIN_BITS),
+        c_size_t(device_num),
+    )
+    # the result is INT64 only when both operands are INT64, else FLOAT
+    both_int = (
+        left_store.data_type == INT64_TYPE
+        and right_store.data_type == INT64_TYPE
+    )
+    data_type = INT64_TYPE if both_int else FLOAT_TYPE
+    return _fp_init_ss(
+        res_store,
+        res_fpn,
+        res_base,
+        res_exp,
+        res_size,
+        left_store.encode_n,
+        left_store.max_int,
+        res_shape,
+        res_shape_tuple,
+        left_store.mem_type,
+        data_type,
+    )
+
+
+def fp_p2c(target, src, data_type=FLOAT_TYPE):
+    '''
+    Pack a list / ndarray of FixedPointNumber into a FixedPointStorage.
+    n and max_int of the result are taken from the first element.
+    Raise:
+        TypeError for other containers, IndexError for an empty src
+    '''
+    if isinstance(src, list):
+        vec_size = len(src)
+    elif isinstance(src, np.ndarray):
+        vec_size = src.size
+        src = src.flat
+    else:
+        raise TypeError("Unsupported Data Structure")
+    # Read the encoding parameters before allocating: an empty src fails
+    # here (IndexError) without leaving malloc'ed buffers behind.
+    n = src[0].n
+    max_int = src[0].max_int
+    # result buffers: reuse the target's buffers if given, else allocate
+    if target is None:
+        res_fpn = FPGA_LIB.c_malloc(c_size_t(vec_size * PLAIN_BYTE))
+        res_base = FPGA_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        res_exp = FPGA_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+    else:
+        res_fpn = target.bigint_storage
+        res_base = target.base_storage
+        res_exp = target.exp_storage
+    # copy every encoding into slot i of the big-int buffer
+    for i in range(vec_size):
+        src_number = src[i].encoding.to_bytes(PLAIN_BYTE, 'little')
+        FPGA_LIB.bigint_set(
+            c_char_p(res_fpn),
+            c_char_p(src_number),
+            c_size_t(PLAIN_BITS),
+            c_size_t(i))
+    # NOTE(review): the arrays use numpy's default integer dtype while the
+    # destination buffers are sized in U_INT32_BYTE - confirm unsigned_set
+    # expects that source layout.
+    base_arr = np.asarray([src[i].BASE for i in range(vec_size)])
+    exp_arr = np.asarray([src[i].exponent for i in range(vec_size)])
+    base_arr_ptr = base_arr.ctypes.data_as(c_void_p)
+    exp_arr_ptr = exp_arr.ctypes.data_as(c_void_p)
+    FPGA_LIB.unsigned_set(c_void_p(res_base), base_arr_ptr, c_size_t(vec_size))
+    FPGA_LIB.unsigned_set(c_void_p(res_exp), exp_arr_ptr, c_size_t(vec_size))
+
+    return _fp_init_store(
+        target, res_fpn, res_base, res_exp, vec_size,
+        n, max_int, MEM_FPGA_NUM_0, data_type,
+    )
+
+
+def fp_h2d(target, src):  # pass-through: target is unused
+    return src  # no copy is made, the very same object is handed back
+
+
+def _index_reset(index, dim_size):
+    '''
+    Normalise a slice bound: negative values count back from dim_size
+    (floored at 0), values above dim_size are cut down to dim_size.
+    '''
+    if index < 0:
+        return max(0, index + dim_size)
+    # non-negative bound: keep it unless it exceeds the dimension size
+    return min(index, dim_size)
+
+
+def fp_slice(store, shape, start, stop, axis, res_store, res_shape, stream):
+    '''
+    slice a contiguous memory space, now support two directions.
+    -----------------------------
+    Para:
+    store: FixedPointStorage, the data to be sliced
+    shape: TensorShapeStorage, the original shape of the storage
+    start: int, the start index of the slice (included)
+    end:   int, the end index of the slice(not included),
+           if larger than the last index, concatencate it into the dim size
+    axis:  0 or 1, 0 means cut it horizontally, 1 means cut it vertically
+    stream: the current stream of the task, not used now
+    -----------------------------
+    Return:
+    res_store, res_shape, FixedPointStorage, TensorShapeStorage
+    Raise:
+        PermissionError: if the input start/stop/axis is not valid
+    '''
+    src_fpn = store.bigint_storage
+    src_base = store.base_storage
+    src_exp = store.exp_storage
+    fpn_shape_tuple = shape.to_tuple()
+    dim0, dim1 = 0, 0
+    '''handle shape and index'''
+    if len(fpn_shape_tuple) == 0:
+        raise PermissionError("Cannot slice 0 dim!")
+    elif len(fpn_shape_tuple) == 1:
+        dim0, dim1 = 1, fpn_shape_tuple[0]
+        if axis == 0:
+            raise PermissionError("Cannot slice 1 dim horizontally!")
+        start = _index_reset(start, dim1)
+        stop = _index_reset(stop, dim1)
+    elif len(fpn_shape_tuple) == 2:
+        dim0, dim1 = fpn_shape_tuple[0], fpn_shape_tuple[1]
+        if axis == 0:
+            start = _index_reset(start, dim0)
+            stop = _index_reset(stop, dim0)
+        if axis == 1:
+            start = _index_reset(start, dim1)
+            stop = _index_reset(stop, dim1)
+    else:
+        raise PermissionError("Invalid shape")
+    # handle condition that a[k: l] k>=l for 2-d array
+    # will cause the result shape to be (0, dim1)
+    if axis == 0 and start >= stop:
+        res_fpn, res_base, res_exp = None, None, None
+        return _fp_init_ss(
+            None,
+            res_fpn,
+            res_base,
+            res_exp,
+            0,
+            store.encode_n,
+            store.encode_max_int,
+            None,
+            (0, dim1),
+            store.mem_type,
+            store.data_type,
+        )
+    # handle condition that a[:,k:l] k>=l for 2-d array
+    # will cause the result shape to be (dim0, 0)
+    if axis == 1 and start >= stop:
+        res_fpn, res_base, res_exp = None, None, None
+        res_shape_tuple = (dim0, 0) if len(fpn_shape_tuple) == 2 else (0,)
+        return _fp_init_ss(
+            None,
+            res_fpn,
+            res_base,
+            res_exp,
+            0,
+            store.encode_n,
+            store.encode_max_int,
+            None,
+            res_shape_tuple,
+            store.mem_type,
+            store.data_type,
+        )
+        # handle the normal slice
+    res_shape_tuple, vec_size = (), 0
+    '''useful paras'''
+    bigint_row_bytelen = dim1 * PLAIN_BYTE
+    uint32_row_bytelen = dim1 * U_INT32_BYTE
+    gap_length = stop - start
+    # start slice
+    if axis == 1:
+        'axis == 1 means that we need to cut the matrix vertically'
+        res_bigint_row_bytelen = gap_length * PLAIN_BYTE
+        res_uint32_row_bytelen = gap_length * U_INT32_BYTE
+        if res_store is None:
+            res_fpn = FPGA_LIB.c_malloc(
+                c_size_t(res_bigint_row_bytelen * dim0))
+            res_base = FPGA_LIB.c_malloc(
+                c_size_t(res_uint32_row_bytelen * dim0))
+            res_exp = FPGA_LIB.c_malloc(
+                c_size_t(res_uint32_row_bytelen * dim0))
+        else:
+            res_fpn = res_store.bigint_storage
+            res_base = res_store.base_storage
+            res_exp = res_store.exp_storage
+        # call the raw function
+        FPGA_LIB.slice_vertical(
+            c_char_p(src_fpn),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_fpn),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(dim0),
+            c_size_t(dim1),
+            c_size_t(start),
+            c_size_t(stop),
+            c_size_t(PLAIN_BITS),
+            c_uint32(0),
+        )
+        if len(fpn_shape_tuple) == 1:
+            res_shape_tuple = (gap_length,)
+            vec_size = res_shape_tuple[0]
+        else:
+            res_shape_tuple = (dim0, gap_length)
+            vec_size = res_shape_tuple[0] * res_shape_tuple[1]
+
+    elif axis == 0:
+        'axis == 0 means that we nned to cut the matrix horizontally'
+        if res_store is None:
+            res_fpn = FPGA_LIB.c_malloc(
+                c_size_t(bigint_row_bytelen * gap_length))
+            res_base = FPGA_LIB.c_malloc(
+                c_size_t(uint32_row_bytelen * gap_length))
+            res_exp = FPGA_LIB.c_malloc(
+                c_size_t(uint32_row_bytelen * gap_length))
+        else:
+            res_fpn = res_store.bigint_storage
+            res_base = res_store.base_storage
+            res_exp = res_store.exp_storage
+        FPGA_LIB.slice_horizontal(
+            c_char_p(src_fpn),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_fpn),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(dim0),
+            c_size_t(dim1),
+            c_size_t(start),
+            c_size_t(stop),
+            c_size_t(PLAIN_BITS),
+            c_uint32(0),
+        )
+        res_shape_tuple = (gap_length, dim1)
+        vec_size = res_shape_tuple[0] * res_shape_tuple[1]
+    else:
+        raise NotImplementedError("Only support 2 dimensional slice")
+
+    return _fp_init_ss(
+        res_store,
+        res_fpn,
+        res_base,
+        res_exp,
+        vec_size,
+        store.encode_n,
+        store.max_int,
+        res_shape,
+        res_shape_tuple,
+        store.mem_type,
+        store.data_type,
+    )
+
+
+def pi_slice(store, shape, start, stop, axis, res_store, res_shape, stream):
+    '''
+    Slice a PaillierEncryptedStorage along one axis of an at most 2-D shape.
+    -----------------------------
+    Para:
+    store: PaillierEncryptedStorage, the data to be sliced
+    shape: TensorShapeStorage, the original shape of the storage
+    start: int, the start index of the slice (included)
+    stop:  int, the end index of the slice (not included);
+           start and stop are both normalised with _index_reset against the
+           size of the dimension being sliced
+    axis:  0 or 1, 0 means cut it horizontally, 1 means cut it vertically
+    res_store: PaillierEncryptedStorage or None; when given, its pen/base/exp
+           buffers receive the result instead of newly malloc'ed ones
+    res_shape: TensorShapeStorage or None, forwarded to _pi_init_ss
+    stream: the current stream of the task, not used now
+    -----------------------------
+    Return:
+    res_store, res_shape, PaillierEncryptedStorage, TensorShapeStorage
+    (an empty slice, start >= stop, is built without touching res_store
+    or res_shape and owns no C buffers)
+    Raise:
+        PermissionError: 0-dim or more than 2-dim shape, or axis 0 on 1-dim
+        NotImplementedError: axis is neither 0 nor 1
+    '''
+    src_pen = store.pen_storage
+    src_base = store.base_storage
+    src_exp = store.exp_storage
+    # get the two dims and check for illegal status
+    pen_shape_tuple = shape.to_tuple()
+    rank = len(pen_shape_tuple)
+    if rank == 0:
+        raise PermissionError("Cannot slice 0 dim!")
+    if rank > 2:
+        raise PermissionError("Invalid shape")
+    if rank == 1:
+        if axis == 0:
+            raise PermissionError("Cannot slice 1 dim horizontally!")
+        # a vector is handled by the C side as a single-row matrix
+        dim0, dim1 = 1, pen_shape_tuple[0]
+    else:
+        dim0, dim1 = pen_shape_tuple[0], pen_shape_tuple[1]
+    if axis not in (0, 1):
+        raise NotImplementedError()
+
+    sliced_dim = dim0 if axis == 0 else dim1
+    start = _index_reset(start, sliced_dim)
+    stop = _index_reset(stop, sliced_dim)
+
+    # a[k:l] or a[:, k:l] with k >= l gives an empty result, shaped
+    # (0, dim1), (dim0, 0) or (0,) the same way numpy would shape it
+    if start >= stop:
+        if axis == 0:
+            empty_shape_tuple = (0, dim1)
+        elif rank == 2:
+            empty_shape_tuple = (dim0, 0)
+        else:
+            empty_shape_tuple = (0,)
+        return _pi_init_ss(
+            None,
+            None,
+            None,
+            None,
+            0,
+            None,
+            empty_shape_tuple,
+            store.mem_type,
+            store.data_type,
+            store.encode_n,
+            store.encode_max_int,
+        )
+
+    # handle the normal slice
+    gap_length = stop - start
+    if axis == 1:
+        # cut the matrix vertically: keep every row, a range of columns
+        res_rows, res_cols = dim0, gap_length
+        raw_slice = FPGA_LIB.slice_vertical
+    else:
+        # cut the matrix horizontally: keep a range of rows, every column
+        res_rows, res_cols = gap_length, dim1
+        raw_slice = FPGA_LIB.slice_horizontal
+    vec_size = res_rows * res_cols
+
+    if res_store is None:
+        # ciphertext elements are CIPHER_BITS wide (the width handed to the
+        # raw slice call below), so the buffer is sized with CIPHER_BYTE
+        res_pen = FPGA_LIB.c_malloc(c_size_t(CIPHER_BYTE * vec_size))
+        res_base = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * vec_size))
+        res_exp = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * vec_size))
+    else:
+        # a PaillierEncryptedStorage keeps its ciphertext in pen_storage
+        res_pen = res_store.pen_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+
+    # call the raw function
+    raw_slice(
+        c_char_p(src_pen),
+        c_void_p(src_base),
+        c_void_p(src_exp),
+        c_char_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(dim0),
+        c_size_t(dim1),
+        c_size_t(start),
+        c_size_t(stop),
+        c_size_t(CIPHER_BITS),
+        c_uint32(0),
+    )
+
+    # a 1-dim input can only reach here with axis == 1 and stays 1-dim
+    if rank == 1:
+        res_shape_tuple = (gap_length,)
+    else:
+        res_shape_tuple = (res_rows, res_cols)
+
+    return _pi_init_ss(
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        vec_size,
+        res_shape,
+        res_shape_tuple,
+        store.mem_type,
+        store.data_type,
+        store.encode_n,
+        store.encode_max_int,
+    )
+
+
+def bi_p2c(data, res, bytelen=CIPHER_BYTE):
+    '''
+    Write a sequence of Python big integers into C memory, one slot each.
+    -------------------
+    Para:
+        data:    List[object], each object is a bigint that fits in bytelen
+        res:     int, actually a pointer pointing to C memory
+        bytelen: int, bytes used for the little-endian form of each bigint
+    Return:
+        None, but the contents in c_void_p(res) has been changed
+    '''
+    width = c_size_t(bytelen)
+    target = c_char_p(res)
+    for slot, number in enumerate(data):
+        # little-endian, fixed-width image of the integer
+        raw = number.to_bytes(bytelen, 'little')
+        FPGA_LIB.bigint_set(target, c_char_p(raw), width, c_size_t(slot))
+
+
+def bi_gen_rand(elem_size, count, res, rand_seed, stream):
+    '''
+    generate random bigint for pi_obfuscation
+    ------------------
+    Para:
+        elem_size: int, controls the magnitude of each random bigint:
+                   values are drawn from [1, 8 ** elem_size)
+        count:     int, number of random bigint to be generated
+        res:       BigintStorage or None, the return value; when None a new
+                   buffer of count * CIPHER_BYTE bytes is malloc'ed
+        rand_seed: seed used for generating random data
+        stream:    not used in this function
+    Return:
+        BigintStorage, same as res
+    '''
+    # NOTE(review): the `random` module is a seeded, non-cryptographic
+    # generator; confirm that is acceptable for obfuscation seeds.
+    random.seed(rand_seed)
+    upper_bound = 8 ** elem_size
+    # Keep the draws in a plain list of Python ints. Wrapping them in
+    # np.array turns small values into numpy scalars, which have no
+    # to_bytes() and make bi_p2c fail.
+    rands = [random.randrange(1, upper_bound) for _ in range(count)]
+    if res is None:
+        data_storage = FPGA_LIB.c_malloc(c_size_t(count * CIPHER_BYTE))
+    else:
+        data_storage = res.bigint_storage
+    # CIPHER_BYTE is the upper bound of the length of the rand number
+    '''
+    We assume that the store of random bigint is on FPGA device_0
+    TODO: Add configuration for choosing device
+    '''
+    bi_p2c(rands, data_storage)
+    return _bi_init_store(
+        res,
+        data_storage,
+        count,
+        mem_type=MEM_FPGA_NUM_0,
+        elem_size=CIPHER_BYTE)
+
+
+def __get_shape_size(shape_tuple):
+    '''
+    Return the number of elements described by a 1-D or 2-D shape tuple.
+    Raise:
+        PermissionError: the tuple is neither 1-D nor 2-D
+    '''
+    rank = len(shape_tuple)
+    if rank == 1:
+        return shape_tuple[0]
+    if rank == 2:
+        return shape_tuple[0] * shape_tuple[1]
+    raise PermissionError("Invalid Shape Tuple")
+
+
+def pi_reshape(store, shape, new_shape, res_store, res_shape, stream):
+    '''
+    Give a PaillierEncryptedStorage a new shape with the same element count.
+    The element order is untouched; the three buffers are copied as-is.
+    -------------------
+    Paras:
+        store, shape:  PaillierEncryptedStorage, TensorShapeStorage
+        new_shape:     TensorShapeStorage, the new shape for the pi_storage
+        res_store:     PaillierEncryptedStorage or None, destination buffers
+        res_shape:     TensorShapeStorage or None, forwarded to _pi_init_ss
+        stream:        not used in this function
+    Returns:
+        tuple: (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        ValueError:    If shape and new_shape's size is unequal
+    '''
+    res_shape_tuple = new_shape.to_tuple()
+    wanted_size = __get_shape_size(res_shape_tuple)
+    current_size = __get_shape_size(shape.to_tuple())
+    if wanted_size != current_size:
+        raise ValueError("total size of new array must be unchanged!")
+
+    elem_count = store.vec_size
+    pen_bytes = CIPHER_BYTE * elem_count
+    meta_bytes = U_INT32_BYTE * elem_count
+
+    # The data is duplicated rather than aliased so that the source and the
+    # result never share a pointer (avoids a double free in python).
+    if res_store is not None:
+        res_pen = res_store.pen_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+    else:
+        res_pen = FPGA_LIB.c_malloc(c_size_t(pen_bytes))
+        res_base = FPGA_LIB.c_malloc(c_size_t(meta_bytes))
+        res_exp = FPGA_LIB.c_malloc(c_size_t(meta_bytes))
+
+    copy_plan = (
+        (res_pen, store.pen_storage, pen_bytes),
+        (res_base, store.base_storage, meta_bytes),
+        (res_exp, store.exp_storage, meta_bytes),
+    )
+    for target, source, nbytes in copy_plan:
+        FPGA_LIB.c_memcpy(
+            c_void_p(target), c_void_p(source), c_size_t(nbytes))
+
+    return _pi_init_ss(
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        elem_count,
+        res_shape,
+        res_shape_tuple,
+        store.mem_type,
+        store.data_type,
+        store.encode_n,
+        store.encode_max_int,
+    )
+
+
+def fp_cat(stores, shapes, axis, res_store, res_shape):
+    '''
+    concat several FixedPointStorage according to axis
+    --------------------
+    Para:
+        stores: List or ndarray, elements are FixedPointStorage
+        shapes: List or ndarray, elements are TensorShapeStorage
+        axis:   int, how stores will be stacked
+                    0 means a vertical stack, stack along 1st dim
+                    1 means a horizontal stack, stack along 2nd dim
+        res_store: FixedPointStorage or None, the stacked result; when None
+                   new buffers are malloc'ed
+        res_shape: ignored on input; the result shape is always rebuilt
+                   from the input shapes
+    Return:
+        tuple, (FixedPointStorage, TensorShapeStorage)
+    Raise:
+        PermissionError: Invalid input data or invalid shape
+        NotImplementedError: Current only support at most 2-D matrix
+    '''
+    stores = list(stores)
+    shapes = list(shapes)
+    num_stores = len(stores)
+    # Abnormal checks
+    if num_stores < 2:
+        raise PermissionError("At least 2 Storages required for concatenation")
+    if len(shapes) != num_stores:
+        raise PermissionError(
+            "The number of storages and that of shapes didn't match")
+    head = stores[0]
+    for v in stores:
+        if v.data_type != head.data_type:
+            raise PermissionError(
+                "All storages should have the same data type")
+        if v.encode_n != head.encode_n:
+            raise PermissionError("All storages should have the same n")
+        if v.max_int != head.max_int:
+            raise PermissionError("All storages should have the same max_int")
+        if v.mem_type != head.mem_type:
+            raise PermissionError(
+                "All storages should have the same memory type")
+    # a plain Python int, so it can be fed to c_size_t without numpy types
+    res_vec_size = int(sum(v.vec_size for v in stores))
+
+    # num_rows, num_cols is the data demanded by C functions
+    # res_rows, res_cols are return values that should be same as numpy's output
+    # distinguish them such that upper and lower level won't bother each other
+    if axis == 0:
+        num_rows, num_cols = 0, __shape_decompose(shapes[0])[1]
+        for v in shapes:
+            shape_tuple = __shape_decompose(v)
+            if shape_tuple[1] != num_cols:
+                raise PermissionError("Shapes didn't align")
+            num_rows += shape_tuple[0]
+        res_rows, res_cols = num_rows, num_cols
+    elif axis == 1:
+        first_shape = shapes[0].to_tuple()
+        if len(first_shape) <= 1:
+            # scalars and vectors are joined into one long vector, which
+            # the C side sees as a single row
+            num_rows, num_cols = 1, 0
+            for v in shapes:
+                v_shape = v.to_tuple()
+                if len(v_shape) >= 2:
+                    raise PermissionError("Shape cannot align!!!")
+                num_cols += v_shape[0] if v_shape else 1
+            res_rows, res_cols = num_cols, None
+        elif len(first_shape) == 2:
+            num_rows, num_cols = first_shape[0], 0
+            for v in shapes:
+                v_shape = v.to_tuple()
+                if len(v_shape) != 2 or num_rows != v_shape[0]:
+                    raise PermissionError("Shape cannot align!")
+                num_cols += v_shape[1]
+            res_rows, res_cols = num_rows, num_cols
+        else:
+            raise NotImplementedError("Now only support up to 2-D array")
+    else:
+        raise PermissionError("Invalid Axis")
+    res_shape = TensorShapeStorage(res_rows, res_cols)
+
+    if res_store is None:
+        res_fpn = FPGA_LIB.c_malloc(c_size_t(PLAIN_BYTE * res_vec_size))
+        res_base = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * res_vec_size))
+        res_exp = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * res_vec_size))
+    else:
+        res_fpn = res_store.bigint_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+
+    # per-store pointer tables and element counts for the C side
+    pointer_table = c_void_p * num_stores
+    fpn_arr = pointer_table(*[c_void_p(v.bigint_storage) for v in stores])
+    base_arr = pointer_table(*[c_void_p(v.base_storage) for v in stores])
+    exp_arr = pointer_table(*[c_void_p(v.exp_storage) for v in stores])
+    vec_sizes = (c_uint32 * num_stores)(*[v.vec_size for v in stores])
+
+    if axis == 0:
+        '''means that we should cat stores vertically'''
+        FPGA_LIB.vstack(
+            fpn_arr,
+            base_arr,
+            exp_arr,
+            c_void_p(res_fpn),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_uint64(num_stores),
+            vec_sizes,
+            c_uint64(num_cols),
+            c_size_t(PLAIN_BITS),
+        )
+    else:
+        '''means that we should cat stores horizontally'''
+        # Pass the C-level row count (num_rows), not res_rows: for 1-dim
+        # inputs res_rows is the total length while the data is one row.
+        FPGA_LIB.hstack(
+            fpn_arr,
+            base_arr,
+            exp_arr,
+            c_void_p(res_fpn),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_uint64(num_stores),
+            vec_sizes,
+            c_uint64(num_rows),
+            c_size_t(PLAIN_BITS),
+        )
+
+    return _fp_init_ss(
+        res_store,
+        res_fpn,
+        res_base,
+        res_exp,
+        res_vec_size,
+        head.encode_n,
+        head.max_int,
+        res_shape,
+        res_shape.to_tuple(),
+        head.mem_type,
+        head.data_type,
+    )
+
+
+def pi_cat(stores, shapes, axis, res_store, res_shape):
+    '''
+    concat several PaillierEncryptedStorage according to axis
+    --------------------
+    Para:
+        stores: List or ndarray, elements are PaillierEncryptedStorage
+        shapes: List or ndarray, elements are TensorShapeStorage
+        axis:   int, how stores will be stacked
+                    0 means a vertical stack, stack along 1st dim
+                    1 means a horizontal stack, stack along 2nd dim
+        res_store: PaillierEncryptedStorage or None, the stacked result;
+                   when None new buffers are malloc'ed
+        res_shape: ignored on input; the result shape is always rebuilt
+                   from the input shapes
+    Return:
+        tuple, (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        PermissionError: Invalid input data or invalid shape
+        NotImplementedError: Current only support at most 2-D matrix
+    '''
+    stores = list(stores)
+    shapes = list(shapes)
+    num_stores = len(stores)
+
+    # Anomaly checks
+    if num_stores < 2:
+        raise PermissionError("At least 2 Storages required for concatenation")
+    if len(shapes) != num_stores:
+        raise PermissionError(
+            "The number of storages and that of shapes didn't match")
+    head = stores[0]
+    for v in stores:
+        if v.data_type != head.data_type:
+            raise PermissionError(
+                "All storages should have the same data type")
+        if v.encode_n != head.encode_n:
+            raise PermissionError("All storages should have the same n")
+        if v.encode_max_int != head.encode_max_int:
+            raise PermissionError("All storages should have the same max_int")
+        if v.mem_type != head.mem_type:
+            raise PermissionError(
+                "All storages should have the same memory type")
+    # a plain Python int, so it can be fed to c_size_t without numpy types
+    res_vec_size = int(sum(v.vec_size for v in stores))
+
+    # num_rows, num_cols is the data demanded by C functions
+    # res_rows, res_cols are return values that should be same as numpy's output
+    # distinguish them so upper and lower level won't bother each other
+    if axis == 0:
+        num_rows, num_cols = 0, __shape_decompose(shapes[0])[1]
+        for v in shapes:
+            shape_tuple = __shape_decompose(v)
+            if shape_tuple[1] != num_cols:
+                raise PermissionError("Shapes didn't align")
+            num_rows += shape_tuple[0]
+        res_rows, res_cols = num_rows, num_cols
+    elif axis == 1:
+        '''the horizontal cat'''
+        first_shape = shapes[0].to_tuple()
+        if len(first_shape) <= 1:
+            # scalars and vectors are joined into one long vector, which
+            # the C side sees as a single row
+            num_rows, num_cols = 1, 0
+            for v in shapes:
+                v_shape = v.to_tuple()
+                if len(v_shape) >= 2:
+                    raise PermissionError("Shape cannot align!!!")
+                num_cols += v_shape[0] if v_shape else 1
+            res_rows, res_cols = num_cols, None
+        elif len(first_shape) == 2:
+            num_rows, num_cols = first_shape[0], 0
+            for v in shapes:
+                v_shape = v.to_tuple()
+                if len(v_shape) != 2 or num_rows != v_shape[0]:
+                    raise PermissionError("Shape cannot align!")
+                num_cols += v_shape[1]
+            res_rows, res_cols = num_rows, num_cols
+        else:
+            raise NotImplementedError("Now only support up to 2-D array")
+    else:
+        raise PermissionError("Invalid Axis")
+    res_shape = TensorShapeStorage(res_rows, res_cols)
+
+    if res_store is None:
+        res_pen = FPGA_LIB.c_malloc(c_size_t(CIPHER_BYTE * res_vec_size))
+        res_base = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * res_vec_size))
+        res_exp = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * res_vec_size))
+    else:
+        res_pen = res_store.pen_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+
+    # per-store pointer tables and element counts for the C side
+    pointer_table = c_void_p * num_stores
+    pen_arr = pointer_table(*[c_void_p(v.pen_storage) for v in stores])
+    base_arr = pointer_table(*[c_void_p(v.base_storage) for v in stores])
+    exp_arr = pointer_table(*[c_void_p(v.exp_storage) for v in stores])
+    vec_sizes = (c_uint32 * num_stores)(*[v.vec_size for v in stores])
+
+    if axis == 0:
+        FPGA_LIB.vstack(
+            pen_arr,
+            base_arr,
+            exp_arr,
+            c_void_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_uint64(num_stores),
+            vec_sizes,
+            c_uint64(num_cols),
+            c_size_t(CIPHER_BITS),
+        )
+    else:
+        FPGA_LIB.hstack(
+            pen_arr,
+            base_arr,
+            exp_arr,
+            c_void_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_uint64(num_stores),
+            vec_sizes,
+            c_uint64(num_rows),
+            c_size_t(CIPHER_BITS),
+        )
+
+    return _pi_init_ss(
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        res_vec_size,
+        res_shape,
+        res_shape.to_tuple(),
+        head.mem_type,
+        head.data_type,
+        head.encode_n,
+        head.encode_max_int,
+    )
+
+
+def random_p2c(rands, bitlen, size):
+    '''
+    Copy Python random integers into a newly malloc'ed BigIntStorage.
+    -------------------
+    Para:
+        rands:  List[int], the random numbers to copy
+        bitlen: int, bit length of each number; rounded up by bit_change
+        size:   int, number of elements the buffer is allocated for
+    Return:
+        BigIntStorage whose elem_size is the rounded byte length
+    '''
+    bytelen = bit_change(bitlen, 1) // 8
+    data_storage = FPGA_LIB.c_malloc(c_size_t(size * bytelen))
+    # Write with the same per-element width the buffer was allocated with.
+    # The bi_p2c default (CIPHER_BYTE) would not match the elem_size
+    # recorded below whenever bytelen differs from CIPHER_BYTE.
+    bi_p2c(rands, data_storage, bytelen)
+    return _bi_init_store(
+        None, data_storage, size, mem_type=MEM_FPGA_NUM_0, elem_size=bytelen
+    )
+
+
+def random_c2p(random_store: BigIntStorage, size):
+    '''
+    Read `size` little-endian big integers out of a BigIntStorage.
+    -------------------
+    Para:
+        random_store: BigIntStorage, elements are elem_size bytes each
+        size:         int, number of elements to read
+    Return:
+        List[int], the decoded values in storage order
+    '''
+    elem_bytes = random_store.elem_size
+    scratch = c_buffer(elem_bytes)
+    values = []
+    for position in range(size):
+        source_address = random_store.bigint_storage + position * elem_bytes
+        # stage one element in the scratch buffer, then decode it
+        FPGA_LIB.c_memcpy(
+            cast(scratch, c_void_p),
+            c_void_p(source_address),
+            c_size_t(elem_bytes),
+        )
+        values.append(int.from_bytes(scratch.raw, 'little'))
+    return values
+
+
+class Hash_key_storage:
+    '''
+    Holder of one C buffer address; the buffer is released through
+    hash_free when the object is destroyed.
+    -----------------------------
+    Para:
+    hash_storage: int, address of C memory storing the big integer
+    '''
+
+    def __init__(self, hash_storage):
+        # raw C address, handed to hash_free in __del__
+        self.hash_storage = hash_storage
+
+    def __del__(self):
+        address = self.hash_storage
+        hash_free(address)
+        # drop the stale address once the buffer is gone
+        self.hash_storage = None
+
+
+def hash_free(hash_key):
+    '''Release the C buffer whose address is hash_key via FPGA_LIB.c_free.'''
+    buffer_ptr = c_void_p(hash_key)
+    FPGA_LIB.c_free(buffer_ptr)
+
+
+def hash_p2c(data, bitlen):
+    '''
+    convert the data into Hash_key_storage,
+    since all data is identically bitlen, no value/index is needed
+    -------------------
+    Para:
+        data:   list or ndarray of non-negative ints, each fits in bitlen bits
+        bitlen: int, bit length of every element (bitlen // 8 bytes are used)
+    Return:
+        Hash_key_storage owning a buffer of data.size * (bitlen // 8) bytes
+    Raise:
+        TypeError:      data is neither a list nor an ndarray
+        AttributeError: an element is not an int (has no to_bytes)
+        RuntimeError:   encoding or copying the data failed
+    '''
+    if isinstance(data, list):
+        # dtype=object keeps the Python ints untouched (no int64 / float
+        # coercion by numpy)
+        data = np.asarray(data, dtype=object)
+    if not isinstance(data, np.ndarray):
+        raise TypeError("Unsupported Data Structure")
+    bytelen = bitlen // 8
+    # ravel + tolist yields plain Python scalars in storage order, so
+    # integer ndarrays work too and every one of data.size elements is used
+    values = data.ravel().tolist()
+
+    # Encode before allocating, so bad input cannot leak a C buffer
+    try:
+        payload = b"".join(
+            value.to_bytes(bytelen, "little") for value in values)
+    except AttributeError as err:
+        raise AttributeError("Only support int type!!!") from err
+    except Exception as err:
+        raise RuntimeError("Running c memory copy failed!!") from err
+
+    data_ptr = FPGA_LIB.c_malloc(c_size_t(len(payload)))
+    try:
+        FPGA_LIB.c_memcpy(
+            c_void_p(data_ptr),
+            c_char_p(payload),
+            c_size_t(len(payload)),
+        )
+    except Exception as err:
+        # do not leak the buffer when the copy cannot be issued
+        FPGA_LIB.c_free(c_void_p(data_ptr))
+        raise RuntimeError("Running c memory copy failed!!") from err
+
+    return Hash_key_storage(data_ptr)
+
+
+def hash_c2p(store: Hash_key_storage, size, bitlength):
+    '''
+    Read `size` little-endian integers of bitlength bits out of a
+    Hash_key_storage and return them as a python list.
+    Kept separate from the sha_c2p because of the big endian / small endian
+    difference, especially when there is zero.
+    '''
+    elem_bytes = bitlength // 8
+    scratch = c_buffer(elem_bytes)
+    decoded = []
+    for position in range(size):
+        source_address = store.hash_storage + position * elem_bytes
+        # stage one element in the scratch buffer, then decode it
+        FPGA_LIB.c_memcpy(
+            cast(scratch, c_void_p),
+            c_void_p(source_address),
+            c_size_t(elem_bytes),
+        )
+        decoded.append(int.from_bytes(scratch.raw, 'little'))
+    return decoded
+
+
+def rsa_c2bytes(storage: Hash_key_storage, size, bitlength):
+    '''
+    Copy the whole buffer of a Hash_key_storage into a Python bytes object.
+    The copied length is (bitlength // 8) * size bytes.
+    '''
+    total_bytes = bitlength // 8 * size
+    host_buffer = c_buffer(total_bytes)
+    FPGA_LIB.c_memcpy(
+        cast(host_buffer, c_void_p),
+        c_void_p(storage.hash_storage),
+        c_size_t(total_bytes),
+    )
+    return host_buffer.raw
+
+
+def rsa_bytes2c(bytes, size, bitlength):
+    '''
+    Copy a Python bytes object into a newly malloc'ed Hash_key_storage.
+    -------------------
+    Para:
+        bytes:     bytes, at least (bitlength // 8) * size bytes long
+        size:      int, number of elements
+        bitlength: int, bit length of each element
+    Return:
+        Hash_key_storage owning the new buffer
+    Raise:
+        ValueError: bytes is shorter than the amount that would be copied
+    '''
+    store_size = bitlength // 8 * size
+    # memcpy reads exactly store_size bytes; refuse input that is too short
+    # instead of reading past the end of the Python buffer
+    if len(bytes) < store_size:
+        raise ValueError(
+            f"expected at least {store_size} bytes, got {len(bytes)}")
+    hash_key = FPGA_LIB.c_malloc(c_size_t(store_size))
+    FPGA_LIB.c_memcpy(
+        c_void_p(hash_key),
+        c_char_p(bytes),
+        c_size_t(store_size))
+    return Hash_key_storage(hash_key)
+
+
+def hash_bit_inquiry(hash_method):
+    '''
+    Return the bit width (256 or 512) associated with a hash method name.
+    Raise:
+        KeyError: hash_method is not one of the known names
+    '''
+    narrow_methods = ("md5", "sha1", "sha224", "sha256", "sm3", "none")
+    wide_methods = ("sha384", "sha512")
+    width_by_method = dict.fromkeys(narrow_methods, 256)
+    width_by_method.update(dict.fromkeys(wide_methods, 512))
+    return width_by_method[hash_method]
+
+
+def gmp_gen_rand(bit_len, vec_size, n):
+    '''
+    Ask FPGA_LIB.gmp_random for vec_size random big integers.
+    -------------------
+    Para:
+        bit_len:  int, requested bit length of each random number
+        vec_size: int, how many numbers to generate
+        n:        int, modulus handed to the library
+    Return:
+        BigIntStorage, elem_size is the rounded-up byte length of bit_len
+    Raise:
+        PermissionError: the rounded bit_len exceeds the rounded size of n
+    '''
+    modulus_bits = bit_change(n, 0)
+    padded_bits = bit_change(bit_len, 1)
+    elem_bytes = padded_bits // 8
+    if padded_bits > modulus_bits:
+        raise PermissionError(
+            f"bitlength should be smaller than the given size {modulus_bits}"
+        )
+
+    rand_buffer = FPGA_LIB.c_malloc(c_size_t(vec_size * elem_bytes))
+    # little-endian image of n, padded to the rounded modulus width
+    modulus_bytes = n.to_bytes(modulus_bits // 8, 'little')
+    FPGA_LIB.gmp_random(
+        c_char_p(rand_buffer),
+        c_size_t(bit_len),
+        c_size_t(padded_bits),
+        c_size_t(modulus_bits),
+        c_size_t(vec_size),
+        c_char_p(modulus_bytes),
+    )
+
+    return _bi_init_store(
+        None,
+        rand_buffer,
+        vec_size,
+        mem_type=MEM_FPGA_NUM_0,
+        elem_size=elem_bytes)
+
+
+def compute_hash(key_number, hash_method, hash_bitlength, size, salt):
+    '''
+    Hash a batch of keys with FPGA_LIB.computeSHA256_index.
+    -------------------
+    Para:
+        key_number:     Hash_key_storage, or a list / ndarray of ids that is
+                        converted with keyid_p2c
+        hash_method:    not read by this function
+        hash_bitlength: int, bit length of one hash; sizes the result buffer
+        size:           int, number of keys
+        salt:           passed to keyid_p2c for list / ndarray input
+    Return:
+        Hash_key_storage wrapping the result buffer
+    '''
+    hash_bytelength = hash_bitlength // 8
+    hash_storage = FPGA_LIB.c_malloc(c_size_t(hash_bytelength * size))
+    if isinstance(key_number, Hash_key_storage):
+        key_length_storage = FPGA_LIB.c_malloc(c_size_t(INT64_BYTE * size))
+        # NOTE(review): this branch never assigns key_storage, so the
+        # computeSHA256_index call below fails with an unbound local.
+        # hex_to_int is also given hash_storage (the result buffer) as its
+        # output - presumably a dedicated key buffer was intended; confirm
+        # against the C signature before fixing.
+        FPGA_LIB.hex_to_int(
+            c_void_p(key_number.hash_storage),
+            c_uint32(hash_bytelength),
+            c_size_t(size),
+            c_void_p(hash_storage),
+            c_void_p(key_length_storage),
+        )
+    else:
+        key_storage, key_length_storage = keyid_p2c(key_number, salt)
+
+    # NOTE(review): key_storage / key_length_storage are not released in
+    # this function - verify whether the library frees them, otherwise
+    # they leak on every call.
+    FPGA_LIB.computeSHA256_index(
+        c_void_p(key_storage),
+        c_void_p(key_length_storage),
+        c_size_t(size),
+        c_void_p(hash_storage),
+    )
+    return Hash_key_storage(hash_storage)
+
+
+def keyid_p2c(data, salt):
+    '''
+    Pack a batch of salted ids into two C buffers.
+
+    Parameters:
+    ------------------
+    data, list or ndarray, contains a batch of id
+        each id is turned into text with str() before use
+    salt, str, appended to every id before it is encoded
+
+    Return:
+    ------------------
+    (hash_key, length_storage_ptr): addresses of two malloc'ed buffers,
+        the concatenated UTF-8 bytes of every salted id, and one int64
+        byte length per id
+    '''
+    # preprocess
+    if isinstance(data, list):
+        data = np.asarray(data)
+    if not isinstance(data, np.ndarray):
+        raise TypeError("Unsupported Data Structure")
+    # Encode every salted id exactly once. The recorded lengths and the
+    # copied bytes then describe the same UTF-8 data (salt included), and
+    # lengths are byte counts rather than character counts.
+    encoded = [(str(x) + salt).encode("utf-8") for x in data]
+    payload = b"".join(encoded)
+    # malloc the space and feed all the strings into it in one copy
+    hash_key = FPGA_LIB.c_malloc(c_size_t(len(payload)))
+    FPGA_LIB.c_memcpy(
+        c_void_p(hash_key),
+        c_char_p(payload),
+        c_size_t(len(payload)),
+    )
+    # then the lengths are copied into C memory as int64 values
+    vec_len = np.asarray([len(item) for item in encoded], dtype=np.int64)
+    vec_size = vec_len.size
+    length_storage_ptr = FPGA_LIB.c_malloc(c_size_t(vec_size * INT64_BYTE))
+    len_ptr = vec_len.ctypes.data_as(c_void_p)
+    FPGA_LIB.c_memcpy(
+        c_void_p(length_storage_ptr), len_ptr, c_size_t(vec_size * INT64_BYTE)
+    )
+
+    return hash_key, length_storage_ptr
+
+
+def bit_change(raw_number, type):
+    '''
+    Round a bit length up to the next supported width.
+    -------------------
+    Para:
+        raw_number: when type == 0, the number whose bit length is wanted;
+                    otherwise it already is a bit length
+        type:       0 to measure raw_number, anything else to use it as-is
+    Return:
+        int, one of 256, 512, 1024, 2048, 4096
+    Raise:
+        PermissionError: the bit length is larger than 4096
+    '''
+    if type == 0:
+        if isinstance(raw_number, int) and raw_number > 0:
+            # Exact for any size. A float log2 under-counts exact powers
+            # of two and values just above them (2 ** 1024 + 1 would
+            # measure as 1024.0 bits).
+            bitlength = raw_number.bit_length()
+        else:
+            # non-int or non-positive input keeps the previous behaviour
+            bitlength = math.log(raw_number, 2)
+    else:
+        bitlength = raw_number
+    if bitlength > 4096:
+        raise PermissionError("Invalid Data range for FPGA")
+    for width in (256, 512, 1024, 2048):
+        if bitlength <= width:
+            return width
+    return 4096
+
+
+def rsa_pubkey_id_process(
+        random: BigIntStorage,
+        exponent,
+        modulus,
+        hash: Hash_key_storage,
+        hash_length,
+        size):
+    '''
+    Invoke FPGA_LIB.rsa_pubkey_id_process for size elements of random and hash.
+    Paras:
+        random:      BigIntStorage, must hold exactly size elements (else PermissionError)
+        exponent, modulus: integers, handed over as little-endian byte strings
+        hash:        Hash_key_storage
+        hash_length: forwarded to the library unchanged
+    Return:
+        Hash_key_storage wrapping a newly malloc'ed result buffer
+    '''
+    if size != random.vec_size:
+        raise PermissionError(
+            f"The size of random vector {random.vec_size} does not equal to size of hash {size}"
+        )
+    e_bits, n_bits = bit_change(exponent, 0), bit_change(modulus, 0)
+    out_buf = FPGA_LIB.c_malloc(c_size_t(size * n_bits // 8))
+    e_ptr = c_char_p(exponent.to_bytes(e_bits // 8, 'little'))
+    n_ptr = c_char_p(modulus.to_bytes(n_bits // 8, 'little'))
+    FPGA_LIB.rsa_pubkey_id_process(
+        c_char_p(random.bigint_storage), c_char_p(hash.hash_storage),
+        n_ptr, e_ptr, c_char_p(out_buf),
+        c_size_t(size), c_size_t(n_bits), c_size_t(e_bits),
+        c_size_t(hash_length), c_size_t(8 * random.elem_size), c_size_t(0),
+    )
+    return Hash_key_storage(out_buf)
+
+
+def rsa_powmod(hash: Hash_key_storage, exponent, modulus, hash_length, size):
+    '''
+    Invoke FPGA_LIB.rsa_powmod for size elements of hash.
+    exponent and modulus are integers handed over as little-endian bytes;
+    hash_length is forwarded to the library unchanged.
+    Return: Hash_key_storage wrapping a newly malloc'ed result buffer
+    '''
+    e_bits, n_bits = bit_change(exponent, 0), bit_change(modulus, 0)
+    out_buf = FPGA_LIB.c_malloc(c_size_t(size * n_bits // 8))
+    e_ptr = c_char_p(exponent.to_bytes(e_bits // 8, 'little'))
+    n_ptr = c_char_p(modulus.to_bytes(n_bits // 8, 'little'))
+    FPGA_LIB.rsa_powmod(
+        c_char_p(hash.hash_storage), e_ptr, n_ptr, c_char_p(out_buf),
+        c_size_t(size),
+        c_size_t(hash_length),
+        c_size_t(e_bits),
+        c_size_t(n_bits),
+        c_size_t(0),
+    )
+    return Hash_key_storage(out_buf)
+
+
+def rsa_divm(
+        hash: Hash_key_storage,
+        random: BigIntStorage,
+        rsa_n,
+        hash_bit,
+        size):
+    '''
+    Invoke FPGA_LIB.RSA_divmod for size elements of hash and random.
+    random must hold exactly size elements, and the rounded bit width of
+    the integer rsa_n (see bit_change) must equal hash_bit.
+    Return: Hash_key_storage wrapping a newly malloc'ed result buffer
+    Raises: PermissionError when either of the two checks above fails
+    '''
+    if size != random.vec_size:
+        raise PermissionError(
+            f"The size of random vector {random.vec_size} does not equal to that of hash {size}"
+        )
+    modulus_length = bit_change(rsa_n, 0)
+    if modulus_length != hash_bit:
+        raise PermissionError(
+            f"The biglength of hash value from host {hash_bit} does not equal to that of key {modulus_length}"
+        )
+    n_ptr = c_char_p(rsa_n.to_bytes(modulus_length // 8, 'little'))
+    out_buf = FPGA_LIB.c_malloc(c_size_t(size * modulus_length // 8))
+    FPGA_LIB.RSA_divmod(
+        c_char_p(hash.hash_storage), c_char_p(random.bigint_storage), n_ptr,
+        c_char_p(out_buf), c_size_t(size), c_size_t(hash_bit),
+        c_size_t(8 * random.elem_size),
+    )
+    return Hash_key_storage(out_buf)
+
+
+def fp_align(store, res_store, stream):
+    '''
+    Align the elements of a FixedPointStorage through FPGA_LIB.fpn_align.
+    ------------------
+    Paras:
+        store:     FixedPointStorage, the input; its encode_n, max_int,
+                   mem_type and data_type are carried over to the result
+        res_store: FixedPointStorage or None; when given, its buffers
+                   receive the output, otherwise new buffers are malloc'ed
+        stream:    not used by this implementation
+    Return:
+        FixedPointStorage, as produced by _fp_init_store
+    '''
+    count = store.vec_size
+    in_fpn = store.bigint_storage
+    in_base = store.base_storage
+    in_exp = store.exp_storage
+    if res_store is not None:
+        # write into the buffers of the supplied destination
+        out_fpn = res_store.bigint_storage
+        out_base = res_store.base_storage
+        out_exp = res_store.exp_storage
+    else:
+        # no destination supplied: allocate one buffer per component
+        out_fpn = FPGA_LIB.c_malloc(c_size_t(count * PLAIN_BYTE))
+        out_base = FPGA_LIB.c_malloc(c_size_t(count * U_INT32_BYTE))
+        out_exp = FPGA_LIB.c_malloc(c_size_t(count * U_INT32_BYTE))
+    # the device number is derived from the input's mem_type
+    device = __get_FPGA_device_num(store.mem_type)
+    FPGA_LIB.fpn_align(
+        c_char_p(in_fpn),
+        c_void_p(in_base),
+        c_void_p(in_exp),
+        c_char_p(out_fpn),
+        c_void_p(out_base),
+        c_void_p(out_exp),
+        c_size_t(count),
+        c_char_p(store.encode_n.to_bytes(PLAIN_BYTE, 'little')),
+        c_size_t(CIPHER_BITS),
+        c_size_t(device),
+    )
+    # wrap the output buffers; res_store is passed through (may be None)
+    return _fp_init_store(
+        res_store,
+        out_fpn,
+        out_base,
+        out_exp,
+        count,
+        store.encode_n,
+        store.max_int,
+        store.mem_type,
+        store.data_type,
+    )
+
+
+def pi_sum_multi_index(
+        pub_key, store, valid_index, node_id, node_num, min_value=0, max_value=None
+):
+    '''
+    Sum ciphertexts that share a node and an index via
+    FPGA_LIB.pen_sum_with_multi_index.
+    ------------
+    Parameters:
+        pub_key:      PubKeyStorage
+        store:        PaillierEncryptedStorage, the values to be summed
+        valid_index:  ndarray, one group index per instance, for example
+                        [-1, 1, 2, 1, 3, 3, 2, -1]; an index smaller than
+                        min_value marks an instance that is left out
+        node_id:      ndarray, one node id per instance, e.g. [3, 1, 0, 2, 3, 1]
+        node_num:     int, number of nodes
+        min_value:    int, the smallest valid index, default 0
+        max_value:    int, the largest valid index;
+                        max(valid_index) is used when it is None
+    Return:
+        tuple   (PaillierEncryptedStorage, TensorShapeStorage) built by
+                _pi_init_ss: node_num * (max_value - min_value + 1) values
+                with shape (node_num, max_value - min_value + 1), on MEM_HOST
+    '''
+    in_pen = store.pen_storage
+    in_base = store.base_storage
+    in_exp = store.exp_storage
+    count = store.vec_size
+    valid_store = te_p2c(valid_index, None)
+    node_id_store = te_p2c(node_id, None)
+    if max_value is None:
+        # not designated: use the largest index that occurs
+        max_value = max(valid_index)
+    index_num = max_value - min_value + 1
+    res_size = index_num * node_num
+    out_shape = (node_num, index_num)
+    out_pen = FPGA_LIB.c_malloc(c_size_t(res_size * CIPHER_BYTE))
+    out_base = FPGA_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    out_exp = FPGA_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    device = __get_FPGA_device_num(store.mem_type)
+    FPGA_LIB.pen_sum_with_multi_index(
+        c_char_p(in_pen),
+        c_void_p(in_base),
+        c_void_p(in_exp),
+        c_char_p(out_pen),
+        c_void_p(out_base),
+        c_void_p(out_exp),
+        c_size_t(index_num),
+        c_size_t(node_num),
+        c_int64(min_value),
+        c_void_p(valid_store.data),
+        c_void_p(node_id_store.data),
+        pub_key.n,
+        pub_key.g,
+        pub_key.nsquare,
+        pub_key.max_int,
+        c_size_t(count),
+        c_size_t(CIPHER_BITS),
+        c_uint32(device),
+    )
+    return _pi_init_ss(
+        None,
+        out_pen,
+        out_base,
+        out_exp,
+        res_size,
+        None,
+        out_shape,
+        MEM_HOST,
+        store.data_type,
+        store.encode_n,
+        store.encode_max_int,
+    )
+
+
+def pi_accumulate(pub_key, store, shape):
+    '''
+    Perform accumulate add for a vector through FPGA_LIB.gmp_accumulate
+    ----------------
+    Paras:
+        pub_key:     PubKeyStorage,
+        store:       PaillierEncryptedStorage
+        shape:       TensorShapeStorage; a 1-D shape (n,) is handled as (1, n)
+    Return:
+        tuple:       (PaillierEncryptedStorage, TensorShapeStorage), from _pi_init_ss
+    '''
+    in_pen = store.pen_storage
+    in_base = store.base_storage
+    in_exp = store.exp_storage
+    count = store.vec_size
+    # the result has as many elements as the input
+    out_pen = FPGA_LIB.c_malloc(c_size_t(CIPHER_BYTE * count))
+    out_base = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * count))
+    out_exp = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * count))
+    dims = shape.to_tuple()
+    if len(dims) == 1:
+        dims = (1, dims[0])
+
+    FPGA_LIB.gmp_accumulate(
+        c_char_p(in_pen),
+        c_void_p(in_base),
+        c_void_p(in_exp),
+        c_void_p(out_pen),
+        c_void_p(out_base),
+        c_void_p(out_exp),
+        c_size_t(dims[0]),
+        c_size_t(dims[1]),
+        pub_key.n,
+        pub_key.g,
+        pub_key.nsquare,
+        pub_key.max_int,
+        c_size_t(CIPHER_BITS),
+        0,
+    )
+
+    return _pi_init_ss(
+        None,
+        out_pen,
+        out_base,
+        out_exp,
+        count,
+        None,
+        dims,
+        store.mem_type,
+        store.data_type,
+        store.encode_n,
+        store.encode_max_int,
+    )
diff --git a/gpu/fate-tensor-fpga/fate_tensor_fpga/fpga_tensor.py b/gpu/fate-tensor-fpga/fate_tensor_fpga/fpga_tensor.py
new file mode 100644
index 0000000000..f2725c578f
--- /dev/null
+++ b/gpu/fate-tensor-fpga/fate_tensor_fpga/fpga_tensor.py
@@ -0,0 +1,511 @@
+#
+#  Copyright 2022 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import typing
+
+import numpy as np
+
+from .fpga_engine import (
+    PaillierEncryptedStorage,
+    TensorShapeStorage,
+    pi_add,
+    te_p2c,
+    fp_encode,
+    pi_encrypt,
+    pi_mul,
+    pi_matmul,
+    pi_rmatmul,
+    pi_sum,
+    pi_p2c_pub_key,
+    pi_decrypt,
+    pi_p2c_priv_key,
+    te_c2p,
+)
+from .secureprotol.fate_paillier import (
+    PaillierPublicKey,
+    PaillierPrivateKey,
+    PaillierKeypair,
+)
+
+
+class Cipherblock:  # encrypted tensor: ciphertext store + shape + the PK wrapper used for every operation
+    def __init__(
+            self,
+            store: PaillierEncryptedStorage,
+            shape: TensorShapeStorage,
+            pk: "PK"):
+        self.store = store
+        self.shape = shape
+        self.pk = pk
+
+    def get_shape(self):
+        return self.shape.to_tuple()
+
+    def get_size(self):
+        return self.shape.size()
+
+    @staticmethod
+    def gen_shape(other):  # builds a TensorShapeStorage from other.shape
+        return TensorShapeStorage().from_tuple(other.shape)
+
+    def _add_plaintext(self, other) -> "Cipherblock":  # other is encoded AND encrypted, then added via pi_add
+        fp_store = fp_encode(
+            te_p2c(other),
+            self.pk.pub_key.n,
+            self.pk.pub_key.max_int)
+        pi_store = pi_encrypt(self.pk.cpu_pub_key, fp_store)
+        res_store, res_shape = pi_add(
+            self.pk.cpu_pub_key, self.store, pi_store, self.shape, self.gen_shape(other))
+        return Cipherblock(res_store, res_shape, self.pk)
+
+    def _mul_plaintext(self, other) -> "Cipherblock":  # other is only fixed-point encoded, not encrypted
+        fp_store = fp_encode(
+            te_p2c(other),
+            self.pk.pub_key.n,
+            self.pk.pub_key.max_int)
+        res_store, res_shape = pi_mul(
+            self.pk.cpu_pub_key, self.store, fp_store, self.shape, self.gen_shape(other))
+        return Cipherblock(res_store, res_shape, self.pk)
+
+    def _matmul_plaintext(self, other) -> "Cipherblock":  # self's store/shape are passed before other's
+        fp_store = fp_encode(
+            te_p2c(other),
+            self.pk.pub_key.n,
+            self.pk.pub_key.max_int)
+        res_store, res_shape = pi_matmul(
+            self.pk.cpu_pub_key, self.store, fp_store, self.shape, self.gen_shape(other))
+        return Cipherblock(res_store, res_shape, self.pk)
+
+    def _rmatmul_plaintext(self, other) -> "Cipherblock":  # operands swapped: other's store/shape go first
+        fp_store = fp_encode(
+            te_p2c(other),
+            self.pk.pub_key.n,
+            self.pk.pub_key.max_int)
+        res_store, res_shape = pi_rmatmul(
+            self.pk.cpu_pub_key, fp_store, self.store, self.gen_shape(other), self.shape)
+        return Cipherblock(res_store, res_shape, self.pk)
+
+    def add_cipherblock(self, other: "Cipherblock") -> "Cipherblock":  # uses self.pk; other.pk is not consulted
+        res_store, res_shape = pi_add(
+            self.pk.cpu_pub_key, self.store, other.store, self.shape, other.shape)
+        return Cipherblock(res_store, res_shape, self.pk)
+
+    def add_plaintext_f64(self, other) -> "Cipherblock":  # the f64/f32/i64/i32 variants differ only in name
+        return self._add_plaintext(other)
+
+    def add_plaintext_f32(self, other) -> "Cipherblock":
+        return self._add_plaintext(other)
+
+    def add_plaintext_i64(self, other) -> "Cipherblock":
+        return self._add_plaintext(other)
+
+    def add_plaintext_i32(self, other) -> "Cipherblock":
+        return self._add_plaintext(other)
+
+    def add_plaintext_scalar_f64(
+        self, other: typing.Union[float, np.float64]
+    ) -> "Cipherblock":
+        other_array = np.asarray([other], dtype=np.float64)  # scalar wrapped in a 1-element array
+        return self._add_plaintext(other_array)
+
+    def add_plaintext_scalar_f32(
+        self, other: typing.Union[float, np.float32]
+    ) -> "Cipherblock":
+        other_array = np.asarray([other], dtype=np.float32)
+        return self._add_plaintext(other_array)
+
+    def add_plaintext_scalar_i64(
+        self, other: typing.Union[int, np.int64]
+    ) -> "Cipherblock":
+        other_array = np.asarray([other], dtype=np.int64)
+        return self._add_plaintext(other_array)
+
+    def add_plaintext_scalar_i32(
+        self, other: typing.Union[int, np.int32]
+    ) -> "Cipherblock":
+        other_array = np.asarray([other], dtype=np.int32)
+        return self._add_plaintext(other_array)
+
+    def sub_cipherblock(self, other: "Cipherblock") -> "Cipherblock":
+        return self.add_cipherblock(other.mul_plaintext_scalar_i32(-1))  # a - b computed as a + (b * -1)
+
+    def sub_plaintext_f64(self, other) -> "Cipherblock":  # every sub_* negates other and delegates to add_*
+        return self.add_plaintext_f64(other * -1)
+
+    def sub_plaintext_f32(self, other) -> "Cipherblock":
+        return self.add_plaintext_f32(other * -1)
+
+    def sub_plaintext_i64(self, other) -> "Cipherblock":
+        return self.add_plaintext_i64(other * -1)
+
+    def sub_plaintext_i32(self, other) -> "Cipherblock":
+        return self.add_plaintext_i32(other * -1)
+
+    def sub_plaintext_scalar_f64(
+        self, other: typing.Union[float, np.float64]
+    ) -> "Cipherblock":
+        return self.add_plaintext_scalar_f64(other * -1)
+
+    def sub_plaintext_scalar_f32(
+        self, other: typing.Union[float, np.float32]
+    ) -> "Cipherblock":
+        return self.add_plaintext_scalar_f32(other * -1)
+
+    def sub_plaintext_scalar_i64(
+        self, other: typing.Union[int, np.int64]
+    ) -> "Cipherblock":
+        return self.add_plaintext_scalar_i64(other * -1)
+
+    def sub_plaintext_scalar_i32(
+        self, other: typing.Union[int, np.int32]
+    ) -> "Cipherblock":
+        return self.add_plaintext_scalar_i32(other * -1)
+
+    def mul_plaintext_f64(self, other) -> "Cipherblock":  # the f64/f32/i64/i32 variants differ only in name
+        return self._mul_plaintext(other)
+
+    def mul_plaintext_f32(self, other) -> "Cipherblock":
+        return self._mul_plaintext(other)
+
+    def mul_plaintext_i64(self, other) -> "Cipherblock":
+        return self._mul_plaintext(other)
+
+    def mul_plaintext_i32(self, other) -> "Cipherblock":
+        return self._mul_plaintext(other)
+
+    def mul_plaintext_scalar_f64(
+        self, other: typing.Union[float, np.float64]
+    ) -> "Cipherblock":
+        other_array = np.asarray([other], dtype=np.float64)  # scalar wrapped in a 1-element array
+        return self._mul_plaintext(other_array)
+
+    def mul_plaintext_scalar_f32(
+        self, other: typing.Union[float, np.float32]
+    ) -> "Cipherblock":
+        other_array = np.asarray([other], dtype=np.float32)
+        return self._mul_plaintext(other_array)
+
+    def mul_plaintext_scalar_i64(
+        self, other: typing.Union[int, np.int64]
+    ) -> "Cipherblock":
+        other_array = np.asarray([other], dtype=np.int64)
+        return self._mul_plaintext(other_array)
+
+    def mul_plaintext_scalar_i32(
+        self, other: typing.Union[int, np.int32]
+    ) -> "Cipherblock":
+        other_array = np.asarray([other], dtype=np.int32)
+        return self._mul_plaintext(other_array)
+
+    def matmul_plaintext_ix2_f64(self, other) -> "Cipherblock":  # ix1/ix2 and dtype variants share one implementation
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix2_f32(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix2_i64(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix2_i32(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix1_f64(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix1_f32(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix1_i64(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix1_i32(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def rmatmul_plaintext_ix2_f64(self, other) -> "Cipherblock":  # ix1/ix2 and dtype variants share one implementation
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix2_f32(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix2_i64(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix2_i32(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix1_f64(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix1_f32(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix1_i64(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix1_i32(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def sum(self) -> "Cipherblock":  # pi_sum with axis=None
+        res_store, res_shape = pi_sum(
+            self.pk.cpu_pub_key, self.store, self.shape, axis=None
+        )
+        return Cipherblock(res_store, res_shape, self.pk)
+
+    def sum_axis(self, axis=None):  # same as sum() but forwards the caller's axis; returns a Cipherblock
+        res_store, res_shape = pi_sum(
+            self.pk.cpu_pub_key, self.store, self.shape, axis)
+        return Cipherblock(res_store, res_shape, self.pk)
+
+    def mean(self) -> "Cipherblock":  # sum() scaled by the plaintext factor 1 / get_size()
+        return self.sum().mul_plaintext_scalar_f64(float(1 / self.get_size()))
+
+    """parallel"""  # every *_par method below forwards to its non-_par counterpart
+
+    def add_cipherblock_par(self, other: "Cipherblock") -> "Cipherblock":
+        return self.add_cipherblock(other)
+
+    def add_plaintext_f64_par(self, other) -> "Cipherblock":
+        return self.add_plaintext_f64(other)
+
+    def add_plaintext_f32_par(self, other) -> "Cipherblock":
+        return self.add_plaintext_f32(other)
+
+    def add_plaintext_i64_par(self, other) -> "Cipherblock":
+        return self.add_plaintext_i64(other)
+
+    def add_plaintext_scalar_f64_par(
+        self, other: typing.Union[float, np.float64]
+    ) -> "Cipherblock":
+        return self.add_plaintext_scalar_f64(other)
+
+    def add_plaintext_scalar_f32_par(
+        self, other: typing.Union[float, np.float32]
+    ) -> "Cipherblock":
+        return self.add_plaintext_scalar_f32(other)
+
+    def add_plaintext_scalar_i64_par(
+        self, other: typing.Union[int, np.int64]
+    ) -> "Cipherblock":
+        return self.add_plaintext_scalar_i64(other)
+
+    def add_plaintext_scalar_i32_par(
+        self, other: typing.Union[int, np.int32]
+    ) -> "Cipherblock":
+        return self.add_plaintext_scalar_i32(other)
+
+    def add_plaintext_i32_par(self, other) -> "Cipherblock":
+        return self.add_plaintext_i32(other)
+
+    def sub_cipherblock_par(self, other: "Cipherblock") -> "Cipherblock":
+        return self.sub_cipherblock(other)
+
+    def sub_plaintext_f64_par(self, other) -> "Cipherblock":
+        return self.sub_plaintext_f64(other)
+
+    def sub_plaintext_f32_par(self, other) -> "Cipherblock":
+        return self.sub_plaintext_f32(other)
+
+    def sub_plaintext_i64_par(self, other) -> "Cipherblock":
+        return self.sub_plaintext_i64(other)
+
+    def sub_plaintext_i32_par(self, other) -> "Cipherblock":
+        return self.sub_plaintext_i32(other)
+
+    def sub_plaintext_scalar_f64_par(
+        self, other: typing.Union[float, np.float64]
+    ) -> "Cipherblock":
+        return self.sub_plaintext_scalar_f64(other)
+
+    def sub_plaintext_scalar_f32_par(
+        self, other: typing.Union[float, np.float32]
+    ) -> "Cipherblock":
+        return self.sub_plaintext_scalar_f32(other)
+
+    def sub_plaintext_scalar_i64_par(
+        self, other: typing.Union[int, np.int64]
+    ) -> "Cipherblock":
+        return self.sub_plaintext_scalar_i64(other)
+
+    def sub_plaintext_scalar_i32_par(
+        self, other: typing.Union[int, np.int32]
+    ) -> "Cipherblock":
+        return self.sub_plaintext_scalar_i32(other)
+
+    def mul_plaintext_f64_par(self, other) -> "Cipherblock":
+        return self.mul_plaintext_f64(other)
+
+    def mul_plaintext_f32_par(self, other) -> "Cipherblock":
+        return self.mul_plaintext_f32(other)
+
+    def mul_plaintext_i64_par(self, other) -> "Cipherblock":
+        return self.mul_plaintext_i64(other)
+
+    def mul_plaintext_i32_par(self, other) -> "Cipherblock":
+        return self.mul_plaintext_i32(other)
+
+    def mul_plaintext_scalar_f64_par(
+        self, other: typing.Union[float, np.float64]
+    ) -> "Cipherblock":
+        return self.mul_plaintext_scalar_f64(other)
+
+    def mul_plaintext_scalar_f32_par(
+        self, other: typing.Union[float, np.float32]
+    ) -> "Cipherblock":
+        return self.mul_plaintext_scalar_f32(other)
+
+    def mul_plaintext_scalar_i64_par(
+        self, other: typing.Union[int, np.int64]
+    ) -> "Cipherblock":
+        return self.mul_plaintext_scalar_i64(other)
+
+    def mul_plaintext_scalar_i32_par(
+        self, other: typing.Union[int, np.int32]
+    ) -> "Cipherblock":
+        return self.mul_plaintext_scalar_i32(other)
+
+    def matmul_plaintext_ix2_f64_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix2_f64(other)
+
+    def matmul_plaintext_ix2_f32_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix2_f32(other)
+
+    def matmul_plaintext_ix2_i64_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix2_i64(other)
+
+    def matmul_plaintext_ix2_i32_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix2_i32(other)
+
+    def matmul_plaintext_ix1_f64_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix1_f64(other)
+
+    def matmul_plaintext_ix1_f32_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix1_f32(other)
+
+    def matmul_plaintext_ix1_i64_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix1_i64(other)
+
+    def matmul_plaintext_ix1_i32_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix1_i32(other)
+
+    def rmatmul_plaintext_ix2_f64_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix2_f64(other)
+
+    def rmatmul_plaintext_ix2_f32_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix2_f32(other)
+
+    def rmatmul_plaintext_ix2_i64_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix2_i64(other)
+
+    def rmatmul_plaintext_ix2_i32_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix2_i32(other)
+
+    def rmatmul_plaintext_ix1_f64_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix1_f64(other)
+
+    def rmatmul_plaintext_ix1_f32_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix1_f32(other)
+
+    def rmatmul_plaintext_ix1_i64_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix1_i64(other)
+
+    def rmatmul_plaintext_ix1_i32_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix1_i32(other)
+
+    def sum_par(self) -> "Cipherblock":
+        return self.sum()
+
+    def mean_par(self) -> "Cipherblock":
+        return self.mean()
+
+
+class PK:  # public-key wrapper: keeps the Python key plus its pi_p2c_pub_key conversion
+    def __init__(self, pub_key: PaillierPublicKey):
+        self.pub_key = pub_key
+        self.cpu_pub_key = pi_p2c_pub_key(None, self.pub_key)  # converted once, reused by every engine call
+
+    def _encrypt(self, a) -> Cipherblock:  # encode a, encrypt it, and pair the result with a.shape
+        shape = TensorShapeStorage().from_tuple(a.shape)
+        fp_store = fp_encode(te_p2c(a), self.pub_key.n, self.pub_key.max_int)
+        pi_store = pi_encrypt(self.cpu_pub_key, fp_store)
+        return Cipherblock(pi_store, shape, self)
+
+    def encrypt_f64(self, a) -> Cipherblock:  # the dtype-suffixed variants all forward to _encrypt
+        return self._encrypt(a)
+
+    def encrypt_f32(self, a) -> Cipherblock:
+        return self._encrypt(a)
+
+    def encrypt_i64(self, a) -> Cipherblock:
+        return self._encrypt(a)
+
+    def encrypt_i32(self, a) -> Cipherblock:
+        return self._encrypt(a)
+
+    def encrypt_f64_par(self, a) -> Cipherblock:  # *_par variants forward to the non-_par methods
+        return self.encrypt_f64(a)
+
+    def encrypt_f32_par(self, a) -> Cipherblock:
+        return self.encrypt_f32(a)
+
+    def encrypt_i64_par(self, a) -> Cipherblock:
+        return self.encrypt_i64(a)
+
+    def encrypt_i32_par(self, a) -> Cipherblock:
+        return self.encrypt_i32(a)
+
+
+class SK:  # private-key wrapper; also keeps a reference to the matching PK
+    def __init__(self, priv_key: PaillierPrivateKey, pk: PK):
+        self.priv_key = priv_key
+        self.cpu_priv_key = pi_p2c_priv_key(None, priv_key)  # converted once, reused by every decrypt
+        self.pk = pk
+
+    def _decrypt(self, a: Cipherblock):
+        if a.store.vec_size == 0:  # empty storage: pi_decrypt is skipped
+            return np.asarray([])  # NOTE(review): 1-D empty array, not reshaped to a.get_shape()
+        te_res = pi_decrypt(a.pk.cpu_pub_key, self.cpu_priv_key, a.store)  # public key comes from a.pk, not self.pk
+        return te_c2p(te_res).reshape(a.get_shape())
+
+    def decrypt_f64(self, a: Cipherblock):  # decrypt_* differ only in the dtype the result is cast to
+        return self._decrypt(a).astype(np.float64)
+
+    def decrypt_f32(self, a: Cipherblock):
+        return self._decrypt(a).astype(np.float32)
+
+    def decrypt_i64(self, a: Cipherblock):
+        return self._decrypt(a).astype(np.int64)
+
+    def decrypt_i32(self, a: Cipherblock):
+        return self._decrypt(a).astype(np.int32)
+
+    def decrypt_f64_par(self, a: Cipherblock):  # *_par variants forward to the non-_par methods
+        return self.decrypt_f64(a)
+
+    def decrypt_f32_par(self, a: Cipherblock):
+        return self.decrypt_f32(a)
+
+    def decrypt_i64_par(self, a: Cipherblock):
+        return self.decrypt_i64(a)
+
+    def decrypt_i32_par(self, a: Cipherblock):
+        return self.decrypt_i32(a)
+
+
+def keygen(bit_size) -> typing.Tuple[PK, SK]:
+    # bit_size is handed to PaillierKeypair.generate_keypair as n_length
+    public, private = PaillierKeypair.generate_keypair(n_length=bit_size)
+    wrapped_pk = PK(public)
+    return wrapped_pk, SK(private, wrapped_pk)
diff --git a/gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/__init__.py b/gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/fate_paillier.py b/gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/fate_paillier.py
new file mode 100644
index 0000000000..72c8dc9f6a
--- /dev/null
+++ b/gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/fate_paillier.py
@@ -0,0 +1,364 @@
+#
+#  Copyright 2019 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import random
+
+from . import gmpy_math
+from .fixedpoint import FixedPointNumber
+
+
+class PaillierKeypair(object):
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def generate_keypair(n_length=1024):
+        """return a new :class:`PaillierPublicKey` and :class:`PaillierPrivateKey`."""
+        p = q = n = None
+        n_len = 0
+
+        while n_len != n_length:
+            p = gmpy_math.getprimeover(n_length // 2)
+            q = p
+            while q == p:
+                q = gmpy_math.getprimeover(n_length // 2)
+            n = p * q
+            n_len = n.bit_length()
+
+        public_key = PaillierPublicKey(n)
+        private_key = PaillierPrivateKey(public_key, p, q)
+
+        return public_key, private_key
+
+
+class PaillierPublicKey(object):
+    """Contains a public key and associated encryption methods."""
+
+    def __init__(self, n):
+        self.g = n + 1
+        self.n = n
+        self.nsquare = n * n
+        self.max_int = n // 3 - 1
+
+    def __repr__(self):
+        hashcode = hex(hash(self))[2:]
+        return "<PaillierPublicKey {}>".format(hashcode[:10])
+
+    def __eq__(self, other):
+        return self.n == other.n
+
+    def __hash__(self):
+        return hash(self.n)
+
+    def apply_obfuscator(self, ciphertext, random_value=None):
+        """ """
+        r = random_value or random.SystemRandom().randrange(1, self.n)
+        obfuscator = gmpy_math.powmod(r, self.n, self.nsquare)
+
+        return (ciphertext * obfuscator) % self.nsquare
+
+    def raw_encrypt(self, plaintext, random_value=None):
+        """ """
+        if not isinstance(plaintext, int):
+            raise TypeError(
+                "plaintext should be int, but got: %s" %
+                type(plaintext))
+
+        if plaintext >= (self.n - self.max_int) and plaintext < self.n:
+            # Very large plaintext, take a sneaky shortcut using inverses
+            neg_plaintext = self.n - plaintext  # = abs(plaintext - nsquare)
+            neg_ciphertext = (self.n * neg_plaintext + 1) % self.nsquare
+            ciphertext = gmpy_math.invert(neg_ciphertext, self.nsquare)
+        else:
+            ciphertext = (self.n * plaintext + 1) % self.nsquare
+
+        ciphertext = self.apply_obfuscator(ciphertext, random_value)
+
+        return ciphertext
+
+    def encrypt(self, value, precision=None, random_value=None):
+        """Encode and Paillier encrypt a real number value."""
+        if isinstance(value, FixedPointNumber):
+            value = value.decode()
+        encoding = FixedPointNumber.encode(
+            value, self.n, self.max_int, precision)
+        obfuscator = random_value or 1
+        ciphertext = self.raw_encrypt(
+            encoding.encoding, random_value=obfuscator)
+        encryptednumber = PaillierEncryptedNumber(
+            self, ciphertext, encoding.exponent)
+        if random_value is None:
+            encryptednumber.apply_obfuscator()
+
+        return encryptednumber
+
+
+class PaillierPrivateKey(object):
+    """Contains a private key and associated decryption method."""
+
+    def __init__(self, public_key, p, q):
+        if not p * q == public_key.n:
+            raise ValueError(
+                "given public key does not match the given p and q")
+        if p == q:
+            raise ValueError("p and q have to be different")
+        self.public_key = public_key
+        if q < p:
+            self.p = q
+            self.q = p
+        else:
+            self.p = p
+            self.q = q
+        self.psquare = self.p * self.p
+        self.qsquare = self.q * self.q
+        self.q_inverse = gmpy_math.invert(self.q, self.p)
+        self.hp = self.h_func(self.p, self.psquare)
+        self.hq = self.h_func(self.q, self.qsquare)
+
+    def __eq__(self, other):
+        return self.p == other.p and self.q == other.q
+
+    def __hash__(self):
+        return hash((self.p, self.q))
+
+    def __repr__(self):
+        hashcode = hex(hash(self))[2:]
+
+        return "<PaillierPrivateKey {}>".format(hashcode[:10])
+
+    def h_func(self, x, xsquare):
+        """Computes the h-function as defined in Paillier's paper page."""
+        return gmpy_math.invert(
+            self.l_func(
+                gmpy_math.powmod(
+                    self.public_key.g,
+                    x - 1,
+                    xsquare),
+                x),
+            x)
+
+    def l_func(self, x, p):
+        """computes the L function as defined in Paillier's paper."""
+
+        return (x - 1) // p
+
+    def crt(self, mp, mq):
+        """the Chinese Remainder Theorem as needed for decryption.
+        return the solution modulo n=pq.
+        """
+        u = (mp - mq) * self.q_inverse % self.p
+        x = (mq + (u * self.q)) % self.public_key.n
+
+        return x
+
+    def raw_decrypt(self, ciphertext):
+        """return raw plaintext."""
+        if not isinstance(ciphertext, int):
+            raise TypeError(
+                "ciphertext should be an int, not: %s" %
+                type(ciphertext))
+
+        mp = self.l_func(gmpy_math.powmod(ciphertext, self.p - 1, self.psquare), self.p) * self.hp % self.p
+
+        mq = self.l_func(gmpy_math.powmod(ciphertext, self.q - 1, self.qsquare), self.q) * self.hq % self.q
+
+        return self.crt(mp, mq)
+
+    def decrypt(self, encrypted_number):
+        """return the decrypted & decoded plaintext of encrypted_number."""
+        if not isinstance(encrypted_number, PaillierEncryptedNumber):
+            raise TypeError(
+                "encrypted_number should be an PaillierEncryptedNumber, \
+                             not: %s"
+                % type(encrypted_number)
+            )
+
+        if self.public_key != encrypted_number.public_key:
+            raise ValueError(
+                "encrypted_number was encrypted against a different key!")
+
+        encoded = self.raw_decrypt(
+            encrypted_number.ciphertext(
+                be_secure=False))
+        encoded = FixedPointNumber(
+            encoded,
+            encrypted_number.exponent,
+            self.public_key.n,
+            self.public_key.max_int,
+        )
+        decrypt_value = encoded.decode()
+
+        return decrypt_value
+
+
+class PaillierEncryptedNumber(object):
+    """Represents the Paillier encryption of a float or int."""
+
+    def __init__(self, public_key, ciphertext, exponent=0):
+        self.public_key = public_key
+        self.__ciphertext = ciphertext
+        self.exponent = exponent
+        self.__is_obfuscator = False
+
+        if not isinstance(self.__ciphertext, int):
+            raise TypeError(
+                "ciphertext should be an int, not: %s" %
+                type(
+                    self.__ciphertext))
+
+        if not isinstance(self.public_key, PaillierPublicKey):
+            raise TypeError(
+                "public_key should be a PaillierPublicKey, not: %s"
+                % type(self.public_key)
+            )
+
+    def ciphertext(self, be_secure=True):
+        """return the ciphertext of the PaillierEncryptedNumber."""
+        if be_secure and not self.__is_obfuscator:
+            self.apply_obfuscator()
+
+        return self.__ciphertext
+
+    def apply_obfuscator(self):
+        """ciphertext by multiplying by r ** n with random r"""
+        self.__ciphertext = self.public_key.apply_obfuscator(self.__ciphertext)
+        self.__is_obfuscator = True
+
+    def __add__(self, other):
+        if isinstance(other, PaillierEncryptedNumber):
+            return self.__add_encryptednumber(other)
+        else:
+            return self.__add_scalar(other)
+
+    def __radd__(self, other):
+        return self.__add__(other)
+
+    def __sub__(self, other):
+        return self + (other * -1)
+
+    def __rsub__(self, other):
+        return other + (self * -1)
+
+    def __rmul__(self, scalar):
+        return self.__mul__(scalar)
+
+    def __truediv__(self, scalar):
+        return self.__mul__(1 / scalar)
+
+    def __mul__(self, scalar):
+        """return Multiply by an scalar(such as int, float)"""
+        if isinstance(scalar, FixedPointNumber):
+            scalar = scalar.decode()
+        encode = FixedPointNumber.encode(
+            scalar, self.public_key.n, self.public_key.max_int
+        )
+        plaintext = encode.encoding
+
+        if plaintext < 0 or plaintext >= self.public_key.n:
+            raise ValueError("Scalar out of bounds: %i" % plaintext)
+
+        if plaintext >= self.public_key.n - self.public_key.max_int:
+            # Very large plaintext, play a sneaky trick using inverses
+            neg_c = gmpy_math.invert(
+                self.ciphertext(False),
+                self.public_key.nsquare)
+            neg_scalar = self.public_key.n - plaintext
+            ciphertext = gmpy_math.powmod(
+                neg_c, neg_scalar, self.public_key.nsquare)
+        else:
+            ciphertext = gmpy_math.powmod(
+                self.ciphertext(False), plaintext, self.public_key.nsquare
+            )
+
+        exponent = self.exponent + encode.exponent
+
+        return PaillierEncryptedNumber(self.public_key, ciphertext, exponent)
+
+    def increase_exponent_to(self, new_exponent):
+        """return PaillierEncryptedNumber:
+        new PaillierEncryptedNumber with same value but having great exponent.
+        """
+        if new_exponent < self.exponent:
+            raise ValueError(
+                "New exponent %i should be great than old exponent %i"
+                % (new_exponent, self.exponent)
+            )
+
+        factor = pow(FixedPointNumber.BASE, new_exponent - self.exponent)
+        new_encryptednumber = self.__mul__(factor)
+        new_encryptednumber.exponent = new_exponent
+
+        return new_encryptednumber
+
+    def __align_exponent(self, x, y):
+        """return x,y with same exponet"""
+        if x.exponent < y.exponent:
+            x = x.increase_exponent_to(y.exponent)
+        elif x.exponent > y.exponent:
+            y = y.increase_exponent_to(x.exponent)
+
+        return x, y
+
+    def __add_scalar(self, scalar):
+        """return PaillierEncryptedNumber: z = E(x) + y"""
+        if isinstance(scalar, FixedPointNumber):
+            scalar = scalar.decode()
+        encoded = FixedPointNumber.encode(
+            scalar,
+            self.public_key.n,
+            self.public_key.max_int,
+            max_exponent=self.exponent,
+        )
+        return self.__add_fixpointnumber(encoded)
+
+    def __add_fixpointnumber(self, encoded):
+        """return PaillierEncryptedNumber: z = E(x) + FixedPointNumber(y)"""
+        if self.public_key.n != encoded.n:
+            raise ValueError(
+                "Attempted to add numbers encoded against different public keys!"
+            )
+
+        # their exponents must match, and align.
+        x, y = self.__align_exponent(self, encoded)
+
+        encrypted_scalar = x.public_key.raw_encrypt(y.encoding, 1)
+        encryptednumber = self.__raw_add(
+            x.ciphertext(False), encrypted_scalar, x.exponent
+        )
+
+        return encryptednumber
+
+    def __add_encryptednumber(self, other):
+        """return PaillierEncryptedNumber: z = E(x) + E(y)"""
+        if self.public_key != other.public_key:
+            raise ValueError("add two numbers have different public key!")
+
+        # their exponents must match, and align.
+        x, y = self.__align_exponent(self, other)
+
+        encryptednumber = self.__raw_add(
+            x.ciphertext(False), y.ciphertext(False), x.exponent
+        )
+
+        return encryptednumber
+
+    def __raw_add(self, e_x, e_y, exponent):
+        """return the integer E(x + y) given ints E(x) and E(y)."""
+        ciphertext = gmpy_math.mpz(
+            e_x) * gmpy_math.mpz(e_y) % self.public_key.nsquare
+
+        return PaillierEncryptedNumber(
+            self.public_key, int(ciphertext), exponent)
diff --git a/gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/fixedpoint.py b/gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/fixedpoint.py
new file mode 100644
index 0000000000..dca6d0fcda
--- /dev/null
+++ b/gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/fixedpoint.py
@@ -0,0 +1,322 @@
+#
+#  Copyright 2019 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import math
+import sys
+
+import numpy as np
+
+
+class FixedPointNumber(object):
+    """Represents a float or int fixedpoint encoding;."""
+
+    BASE = 16
+    LOG2_BASE = math.log(BASE, 2)
+    FLOAT_MANTISSA_BITS = sys.float_info.mant_dig
+
+    Q = 293973345475167247070445277780365744413 ** 2
+
+    def __init__(self, encoding, exponent, n=None, max_int=None):
+        if n is None:
+            self.n = FixedPointNumber.Q
+            self.max_int = self.n // 2
+        else:
+            self.n = n
+            if max_int is None:
+                self.max_int = self.n // 2
+            else:
+                self.max_int = max_int
+
+        self.encoding = encoding
+        self.exponent = exponent
+
+    @classmethod
+    def calculate_exponent_from_precision(cls, precision):
+        exponent = math.floor(math.log(precision, cls.BASE))
+        return exponent
+
+    @classmethod
+    def encode(
+            cls,
+            scalar,
+            n=None,
+            max_int=None,
+            precision=None,
+            max_exponent=None):
+        """return an encoding of an int or float."""
+        # Calculate the maximum exponent for desired precision
+        exponent = None
+
+        #  Too low value preprocess;
+        #  avoid "OverflowError: int too large to convert to float"
+
+        if np.abs(scalar) < 1e-200:
+            scalar = 0
+
+        if n is None:
+            n = cls.Q
+            max_int = n // 2
+
+        if precision is None:
+            if isinstance(scalar, int) or isinstance(scalar, np.int16) or isinstance(scalar, np.int32) or isinstance(
+                    scalar, np.int64):
+                exponent = 0
+            elif isinstance(scalar, float) or isinstance(scalar, np.float16) or isinstance(scalar,
+                                                                                           np.float32) or isinstance(
+                    scalar, np.float64):
+                flt_exponent = math.frexp(scalar)[1]
+                lsb_exponent = cls.FLOAT_MANTISSA_BITS - flt_exponent
+                exponent = math.floor(lsb_exponent / cls.LOG2_BASE)
+            else:
+                raise TypeError(
+                    "Don't know the precision of type %s." %
+                    type(scalar))
+        else:
+            exponent = cls.calculate_exponent_from_precision(precision)
+
+        if max_exponent is not None:
+            exponent = max(max_exponent, exponent)
+
+        int_fixpoint = int(round(scalar * pow(cls.BASE, exponent)))
+
+        if abs(int_fixpoint) > max_int:
+            raise ValueError(
+                f"Integer needs to be within +/- {max_int},but got {int_fixpoint},"
+                f"basic info, scalar={scalar}, base={cls.BASE}, exponent={exponent}")
+
+        return cls(int_fixpoint % n, exponent, n, max_int)
+
+    def decode(self):
+        """return decode plaintext."""
+        if self.encoding >= self.n:
+            # Should be mod n
+            raise ValueError('Attempted to decode corrupted number')
+        elif self.encoding <= self.max_int:
+            # Positive
+            mantissa = self.encoding
+        elif self.encoding >= self.n - self.max_int:
+            # Negative
+            mantissa = self.encoding - self.n
+        else:
+            raise OverflowError(
+                f'Overflow detected in decode number, encoding: {self.encoding}，'
+                f'{self.exponent}'
+                f' {self.n}')
+
+        return mantissa * pow(self.BASE, -self.exponent)
+
+    def increase_exponent_to(self, new_exponent):
+        """return FixedPointNumber: new encoding with same value but having great exponent."""
+        if new_exponent < self.exponent:
+            raise ValueError(
+                'New exponent %i should be greater than'
+                'old exponent %i' % (new_exponent, self.exponent)
+            )
+
+        factor = pow(self.BASE, new_exponent - self.exponent)
+        new_encoding = self.encoding * factor % self.n
+
+        return FixedPointNumber(
+            new_encoding,
+            new_exponent,
+            self.n,
+            self.max_int)
+
+    def __align_exponent(self, x, y):
+        """return x,y with same exponent"""
+        if x.exponent < y.exponent:
+            x = x.increase_exponent_to(y.exponent)
+        elif x.exponent > y.exponent:
+            y = y.increase_exponent_to(x.exponent)
+
+        return x, y
+
+    def __truncate(self, a):
+        scalar = a.decode()
+        return FixedPointNumber.encode(scalar, n=self.n, max_int=self.max_int)
+
+    def __add__(self, other):
+        if isinstance(other, FixedPointNumber):
+            return self.__add_fixedpointnumber(other)
+        elif type(other).__name__ == "PaillierEncryptedNumber":
+            return other + self.decode()
+        else:
+            return self.__add_scalar(other)
+
+    def __radd__(self, other):
+        return self.__add__(other)
+
+    def __sub__(self, other):
+        if isinstance(other, FixedPointNumber):
+            return self.__sub_fixedpointnumber(other)
+        elif type(other).__name__ == "PaillierEncryptedNumber":
+            return (other - self.decode()) * -1
+        else:
+            return self.__sub_scalar(other)
+
+    def __rsub__(self, other):
+        if type(other).__name__ == "PaillierEncryptedNumber":
+            return other - self.decode()
+
+        x = self.__sub__(other)
+        x = -1 * x.decode()
+        return self.encode(x, n=self.n, max_int=self.max_int)
+
+    def __rmul__(self, other):
+        return self.__mul__(other)
+
+    def __mul__(self, other):
+        if isinstance(other, FixedPointNumber):
+            return self.__mul_fixedpointnumber(other)
+        elif type(other).__name__ == "PaillierEncryptedNumber":
+            return other * self.decode()
+        else:
+            return self.__mul_scalar(other)
+
+    def __truediv__(self, other):
+        if isinstance(other, FixedPointNumber):
+            scalar = other.decode()
+        else:
+            scalar = other
+
+        return self.__mul__(1 / scalar)
+
+    def __rtruediv__(self, other):
+        res = 1.0 / self.__truediv__(other).decode()
+        return FixedPointNumber.encode(res, n=self.n, max_int=self.max_int)
+
+    def __lt__(self, other):
+        x = self.decode()
+        if isinstance(other, FixedPointNumber):
+            y = other.decode()
+        else:
+            y = other
+        if x < y:
+            return True
+        else:
+            return False
+
+    def __gt__(self, other):
+        x = self.decode()
+        if isinstance(other, FixedPointNumber):
+            y = other.decode()
+        else:
+            y = other
+        if x > y:
+            return True
+        else:
+            return False
+
+    def __le__(self, other):
+        x = self.decode()
+        if isinstance(other, FixedPointNumber):
+            y = other.decode()
+        else:
+            y = other
+        if x <= y:
+            return True
+        else:
+            return False
+
+    def __ge__(self, other):
+        x = self.decode()
+        if isinstance(other, FixedPointNumber):
+            y = other.decode()
+        else:
+            y = other
+
+        if x >= y:
+            return True
+        else:
+            return False
+
+    def __eq__(self, other):
+        x = self.decode()
+        if isinstance(other, FixedPointNumber):
+            y = other.decode()
+        else:
+            y = other
+        if x == y:
+            return True
+        else:
+            return False
+
+    def __ne__(self, other):
+        x = self.decode()
+        if isinstance(other, FixedPointNumber):
+            y = other.decode()
+        else:
+            y = other
+        if x != y:
+            return True
+        else:
+            return False
+
+    def __add_fixedpointnumber(self, other):
+        if self.n != other.n:
+            other = self.encode(other.decode(), n=self.n, max_int=self.max_int)
+        x, y = self.__align_exponent(self, other)
+        encoding = (x.encoding + y.encoding) % self.n
+        return FixedPointNumber(
+            encoding,
+            x.exponent,
+            n=self.n,
+            max_int=self.max_int)
+
+    def __add_scalar(self, scalar):
+        encoded = self.encode(scalar, n=self.n, max_int=self.max_int)
+        return self.__add_fixedpointnumber(encoded)
+
+    def __sub_fixedpointnumber(self, other):
+        if self.n != other.n:
+            other = self.encode(other.decode(), n=self.n, max_int=self.max_int)
+        x, y = self.__align_exponent(self, other)
+        encoding = (x.encoding - y.encoding) % self.n
+
+        return FixedPointNumber(
+            encoding,
+            x.exponent,
+            n=self.n,
+            max_int=self.max_int)
+
+    def __sub_scalar(self, scalar):
+        scalar = -1 * scalar
+        return self.__add_scalar(scalar)
+
+    def __mul_fixedpointnumber(self, other):
+        return self.__mul_scalar(other.decode())
+
+    def __mul_scalar(self, scalar):
+        val = self.decode()
+        z = val * scalar
+        z_encode = FixedPointNumber.encode(z, n=self.n, max_int=self.max_int)
+        return z_encode
+
+    def __abs__(self):
+        if self.encoding <= self.max_int:
+            # Positive
+            return self
+        elif self.encoding >= self.n - self.max_int:
+            # Negative
+            return self * -1
+
+    def __mod__(self, other):
+        return FixedPointNumber(
+            self.encoding %
+            other,
+            self.exponent,
+            n=self.n,
+            max_int=self.max_int)
diff --git a/gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/gmpy_math.py b/gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/gmpy_math.py
new file mode 100644
index 0000000000..a316ead0ff
--- /dev/null
+++ b/gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/gmpy_math.py
@@ -0,0 +1,133 @@
+#
+#  Copyright 2019 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import random
+import gmpy2
+
+POWMOD_GMP_SIZE = pow(2, 64)
+
+
+def powmod(a, b, c):
+    """
+    return int: (a ** b) % c
+    """
+
+    if a == 1:
+        return 1
+
+    if max(a, b, c) < POWMOD_GMP_SIZE:
+        return pow(a, b, c)
+
+    else:
+        return int(gmpy2.powmod(a, b, c))
+
+
+def crt_coefficient(p, q):
+    """
+    return crt coefficient
+    """
+    tq = gmpy2.invert(p, q)
+    tp = gmpy2.invert(q, p)
+    return tp * q, tq * p
+
+
+def powmod_crt(x, d, n, p, q, cp, cq):
+    """
+    return int: (a ** b) % n
+    """
+
+    rp = gmpy2.powmod(x, d % (p - 1), p)
+    rq = gmpy2.powmod(x, d % (q - 1), q)
+    return int((rp * cp + rq * cq) % n)
+
+
+def invert(a, b):
+    """return int: x, where a * x == 1 mod b"""
+    x = int(gmpy2.invert(a, b))
+
+    if x == 0:
+        raise ZeroDivisionError("invert(a, b) no inverse exists")
+
+    return x
+
+
+def getprimeover(n):
+    """return a random n-bit prime number"""
+    r = gmpy2.mpz(random.SystemRandom().getrandbits(n))
+    r = gmpy2.bit_set(r, n - 1)
+
+    return int(gmpy2.next_prime(r))
+
+
+def isqrt(n):
+    """return the integer square root of N"""
+
+    return int(gmpy2.isqrt(n))
+
+
+def is_prime(n):
+    """
+    true if n is probably a prime, false otherwise
+    :param n:
+    :return:
+    """
+    return gmpy2.is_prime(int(n))
+
+
+def legendre(a, p):
+    return pow(a, (p - 1) // 2, p)
+
+
+def tonelli(n, p):
+    # assert legendre(n, p) == 1, "not a square (mod p)"
+    q = p - 1
+    s = 0
+    while q % 2 == 0:
+        q //= 2
+        s += 1
+    if s == 1:
+        return pow(n, (p + 1) // 4, p)
+    for z in range(2, p):
+        if p - 1 == legendre(z, p):
+            break
+    c = pow(z, q, p)
+    r = pow(n, (q + 1) // 2, p)
+    t = pow(n, q, p)
+    m = s
+    while (t - 1) % p != 0:
+        t2 = (t * t) % p
+        for i in range(1, m):
+            if (t2 - 1) % p == 0:
+                break
+            t2 = (t2 * t2) % p
+        b = pow(c, 1 << (m - i - 1), p)
+        r = (r * b) % p
+        c = (b * b) % p
+        t = (t * c) % p
+        m = i
+    return r
+
+
+def gcd(a, b):
+    return int(gmpy2.gcd(a, b))
+
+
+def next_prime(n):
+    return int(gmpy2.next_prime(n))
+
+
+def mpz(n):
+    return gmpy2.mpz(n)
diff --git a/gpu/fate-tensor-fpga/fate_tensor_fpga/tests/__init__.py b/gpu/fate-tensor-fpga/fate_tensor_fpga/tests/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/gpu/fate-tensor-fpga/pyproject.toml b/gpu/fate-tensor-fpga/pyproject.toml
new file mode 100644
index 0000000000..4e3db6d158
--- /dev/null
+++ b/gpu/fate-tensor-fpga/pyproject.toml
@@ -0,0 +1,17 @@
+[tool.poetry]
+name = "fate-tensor-fpga"
+version = "0.1.0"
+description = "This project is an industrial-level heterogeneous acceleration system to support and speed up federated learning. We've designed and implemented a heterogeneous acceleration solutions using FPGA, respectively, that can significantly accelerate the Paillier cryptosystem while maintaining functionality, accuracy and scalability."
+authors = ["Xiaolong.Gao <1506957902@qq.com>"]
+
+[tool.poetry.dependencies]
+python = "^3.6"
+numpy = "~1.18.4"
+gmpy2 = "^2.0.8"
+
+[tool.poetry.dev-dependencies]
+pytest = "^5.2"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"

From 4e614617700e24df36628e4aaba67a30848ca70d Mon Sep 17 00:00:00 2001
From: "Xiaolong.Gao" <1506957902@qq.com>
Date: Fri, 22 Jul 2022 15:55:12 +0800
Subject: [PATCH 6/8] refactor: format GPU tensor

Signed-off-by: Xiaolong.Gao <1506957902@qq.com>
---
 .../secureprotol/fate_paillier.py             | 364 -----------------
 .../secureprotol/fixedpoint.py                | 322 ---------------
 .../secureprotol/gmpy_math.py                 | 133 -------
 .../fate_tensor_gpu/secureprotol/__init__.py  |   0
 .../secureprotol/fate_paillier.py             | 366 ------------------
 .../secureprotol/fixedpoint.py                | 322 ---------------
 .../fate_tensor_gpu/secureprotol/gmpy_math.py | 133 -------
 .../fate_tensor_gpu/tests/__init__.py         |   0
 .../paillier_gpu/paillier_gpu}/__init__.py    |   0
 .../paillier_gpu/paillier_gpu}/gpu_engine.py  |   4 +-
 .../paillier_gpu/paillier_gpu}/gpu_tensor.py  |   2 +-
 .../paillier_gpu}/tests/__init__.py           |   0
 .../paillier_gpu}/tests/test_gpu_engine.py    |   8 +-
 .../tests/test_gpu_performance.py             | 276 +++++++++++++
 .../paillier_gpu}/pyproject.toml              |   2 +-
 .../blocks/python_paillier_block/__init__.py  |  12 +-
 16 files changed, 295 insertions(+), 1649 deletions(-)
 delete mode 100644 gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/fate_paillier.py
 delete mode 100644 gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/fixedpoint.py
 delete mode 100644 gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/gmpy_math.py
 delete mode 100644 gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/__init__.py
 delete mode 100644 gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fate_paillier.py
 delete mode 100644 gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fixedpoint.py
 delete mode 100644 gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/gmpy_math.py
 delete mode 100644 gpu/fate-tensor-gpu/fate_tensor_gpu/tests/__init__.py
 rename gpu/{fate-tensor-gpu/fate_tensor_gpu => tensor/paillier_gpu/paillier_gpu}/__init__.py (100%)
 rename gpu/{fate-tensor-gpu/fate_tensor_gpu => tensor/paillier_gpu/paillier_gpu}/gpu_engine.py (99%)
 rename gpu/{fate-tensor-gpu/fate_tensor_gpu => tensor/paillier_gpu/paillier_gpu}/gpu_tensor.py (99%)
 rename gpu/{fate-tensor-fpga/fate_tensor_fpga => tensor/paillier_gpu/paillier_gpu}/tests/__init__.py (100%)
 rename gpu/{fate-tensor-gpu/fate_tensor_gpu => tensor/paillier_gpu/paillier_gpu}/tests/test_gpu_engine.py (99%)
 create mode 100755 gpu/tensor/paillier_gpu/paillier_gpu/tests/test_gpu_performance.py
 rename gpu/{fate-tensor-gpu => tensor/paillier_gpu}/pyproject.toml (96%)

diff --git a/gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/fate_paillier.py b/gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/fate_paillier.py
deleted file mode 100644
index 72c8dc9f6a..0000000000
--- a/gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/fate_paillier.py
+++ /dev/null
@@ -1,364 +0,0 @@
-#
-#  Copyright 2019 The FATE Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-
-import random
-
-from . import gmpy_math
-from .fixedpoint import FixedPointNumber
-
-
-class PaillierKeypair(object):
-    def __init__(self):
-        pass
-
-    @staticmethod
-    def generate_keypair(n_length=1024):
-        """return a new :class:`PaillierPublicKey` and :class:`PaillierPrivateKey`."""
-        p = q = n = None
-        n_len = 0
-
-        while n_len != n_length:
-            p = gmpy_math.getprimeover(n_length // 2)
-            q = p
-            while q == p:
-                q = gmpy_math.getprimeover(n_length // 2)
-            n = p * q
-            n_len = n.bit_length()
-
-        public_key = PaillierPublicKey(n)
-        private_key = PaillierPrivateKey(public_key, p, q)
-
-        return public_key, private_key
-
-
-class PaillierPublicKey(object):
-    """Contains a public key and associated encryption methods."""
-
-    def __init__(self, n):
-        self.g = n + 1
-        self.n = n
-        self.nsquare = n * n
-        self.max_int = n // 3 - 1
-
-    def __repr__(self):
-        hashcode = hex(hash(self))[2:]
-        return "<PaillierPublicKey {}>".format(hashcode[:10])
-
-    def __eq__(self, other):
-        return self.n == other.n
-
-    def __hash__(self):
-        return hash(self.n)
-
-    def apply_obfuscator(self, ciphertext, random_value=None):
-        """ """
-        r = random_value or random.SystemRandom().randrange(1, self.n)
-        obfuscator = gmpy_math.powmod(r, self.n, self.nsquare)
-
-        return (ciphertext * obfuscator) % self.nsquare
-
-    def raw_encrypt(self, plaintext, random_value=None):
-        """ """
-        if not isinstance(plaintext, int):
-            raise TypeError(
-                "plaintext should be int, but got: %s" %
-                type(plaintext))
-
-        if plaintext >= (self.n - self.max_int) and plaintext < self.n:
-            # Very large plaintext, take a sneaky shortcut using inverses
-            neg_plaintext = self.n - plaintext  # = abs(plaintext - nsquare)
-            neg_ciphertext = (self.n * neg_plaintext + 1) % self.nsquare
-            ciphertext = gmpy_math.invert(neg_ciphertext, self.nsquare)
-        else:
-            ciphertext = (self.n * plaintext + 1) % self.nsquare
-
-        ciphertext = self.apply_obfuscator(ciphertext, random_value)
-
-        return ciphertext
-
-    def encrypt(self, value, precision=None, random_value=None):
-        """Encode and Paillier encrypt a real number value."""
-        if isinstance(value, FixedPointNumber):
-            value = value.decode()
-        encoding = FixedPointNumber.encode(
-            value, self.n, self.max_int, precision)
-        obfuscator = random_value or 1
-        ciphertext = self.raw_encrypt(
-            encoding.encoding, random_value=obfuscator)
-        encryptednumber = PaillierEncryptedNumber(
-            self, ciphertext, encoding.exponent)
-        if random_value is None:
-            encryptednumber.apply_obfuscator()
-
-        return encryptednumber
-
-
-class PaillierPrivateKey(object):
-    """Contains a private key and associated decryption method."""
-
-    def __init__(self, public_key, p, q):
-        if not p * q == public_key.n:
-            raise ValueError(
-                "given public key does not match the given p and q")
-        if p == q:
-            raise ValueError("p and q have to be different")
-        self.public_key = public_key
-        if q < p:
-            self.p = q
-            self.q = p
-        else:
-            self.p = p
-            self.q = q
-        self.psquare = self.p * self.p
-        self.qsquare = self.q * self.q
-        self.q_inverse = gmpy_math.invert(self.q, self.p)
-        self.hp = self.h_func(self.p, self.psquare)
-        self.hq = self.h_func(self.q, self.qsquare)
-
-    def __eq__(self, other):
-        return self.p == other.p and self.q == other.q
-
-    def __hash__(self):
-        return hash((self.p, self.q))
-
-    def __repr__(self):
-        hashcode = hex(hash(self))[2:]
-
-        return "<PaillierPrivateKey {}>".format(hashcode[:10])
-
-    def h_func(self, x, xsquare):
-        """Computes the h-function as defined in Paillier's paper page."""
-        return gmpy_math.invert(
-            self.l_func(
-                gmpy_math.powmod(
-                    self.public_key.g,
-                    x - 1,
-                    xsquare),
-                x),
-            x)
-
-    def l_func(self, x, p):
-        """computes the L function as defined in Paillier's paper."""
-
-        return (x - 1) // p
-
-    def crt(self, mp, mq):
-        """the Chinese Remainder Theorem as needed for decryption.
-        return the solution modulo n=pq.
-        """
-        u = (mp - mq) * self.q_inverse % self.p
-        x = (mq + (u * self.q)) % self.public_key.n
-
-        return x
-
-    def raw_decrypt(self, ciphertext):
-        """return raw plaintext."""
-        if not isinstance(ciphertext, int):
-            raise TypeError(
-                "ciphertext should be an int, not: %s" %
-                type(ciphertext))
-
-        mp = self.l_func(gmpy_math.powmod(ciphertext, self.p - 1, self.psquare), self.p) * self.hp % self.p
-
-        mq = self.l_func(gmpy_math.powmod(ciphertext, self.q - 1, self.qsquare), self.q) * self.hq % self.q
-
-        return self.crt(mp, mq)
-
-    def decrypt(self, encrypted_number):
-        """return the decrypted & decoded plaintext of encrypted_number."""
-        if not isinstance(encrypted_number, PaillierEncryptedNumber):
-            raise TypeError(
-                "encrypted_number should be an PaillierEncryptedNumber, \
-                             not: %s"
-                % type(encrypted_number)
-            )
-
-        if self.public_key != encrypted_number.public_key:
-            raise ValueError(
-                "encrypted_number was encrypted against a different key!")
-
-        encoded = self.raw_decrypt(
-            encrypted_number.ciphertext(
-                be_secure=False))
-        encoded = FixedPointNumber(
-            encoded,
-            encrypted_number.exponent,
-            self.public_key.n,
-            self.public_key.max_int,
-        )
-        decrypt_value = encoded.decode()
-
-        return decrypt_value
-
-
-class PaillierEncryptedNumber(object):
-    """Represents the Paillier encryption of a float or int."""
-
-    def __init__(self, public_key, ciphertext, exponent=0):
-        self.public_key = public_key
-        self.__ciphertext = ciphertext
-        self.exponent = exponent
-        self.__is_obfuscator = False
-
-        if not isinstance(self.__ciphertext, int):
-            raise TypeError(
-                "ciphertext should be an int, not: %s" %
-                type(
-                    self.__ciphertext))
-
-        if not isinstance(self.public_key, PaillierPublicKey):
-            raise TypeError(
-                "public_key should be a PaillierPublicKey, not: %s"
-                % type(self.public_key)
-            )
-
-    def ciphertext(self, be_secure=True):
-        """return the ciphertext of the PaillierEncryptedNumber."""
-        if be_secure and not self.__is_obfuscator:
-            self.apply_obfuscator()
-
-        return self.__ciphertext
-
-    def apply_obfuscator(self):
-        """ciphertext by multiplying by r ** n with random r"""
-        self.__ciphertext = self.public_key.apply_obfuscator(self.__ciphertext)
-        self.__is_obfuscator = True
-
-    def __add__(self, other):
-        if isinstance(other, PaillierEncryptedNumber):
-            return self.__add_encryptednumber(other)
-        else:
-            return self.__add_scalar(other)
-
-    def __radd__(self, other):
-        return self.__add__(other)
-
-    def __sub__(self, other):
-        return self + (other * -1)
-
-    def __rsub__(self, other):
-        return other + (self * -1)
-
-    def __rmul__(self, scalar):
-        return self.__mul__(scalar)
-
-    def __truediv__(self, scalar):
-        return self.__mul__(1 / scalar)
-
-    def __mul__(self, scalar):
-        """return Multiply by an scalar(such as int, float)"""
-        if isinstance(scalar, FixedPointNumber):
-            scalar = scalar.decode()
-        encode = FixedPointNumber.encode(
-            scalar, self.public_key.n, self.public_key.max_int
-        )
-        plaintext = encode.encoding
-
-        if plaintext < 0 or plaintext >= self.public_key.n:
-            raise ValueError("Scalar out of bounds: %i" % plaintext)
-
-        if plaintext >= self.public_key.n - self.public_key.max_int:
-            # Very large plaintext, play a sneaky trick using inverses
-            neg_c = gmpy_math.invert(
-                self.ciphertext(False),
-                self.public_key.nsquare)
-            neg_scalar = self.public_key.n - plaintext
-            ciphertext = gmpy_math.powmod(
-                neg_c, neg_scalar, self.public_key.nsquare)
-        else:
-            ciphertext = gmpy_math.powmod(
-                self.ciphertext(False), plaintext, self.public_key.nsquare
-            )
-
-        exponent = self.exponent + encode.exponent
-
-        return PaillierEncryptedNumber(self.public_key, ciphertext, exponent)
-
-    def increase_exponent_to(self, new_exponent):
-        """return PaillierEncryptedNumber:
-        new PaillierEncryptedNumber with same value but having great exponent.
-        """
-        if new_exponent < self.exponent:
-            raise ValueError(
-                "New exponent %i should be great than old exponent %i"
-                % (new_exponent, self.exponent)
-            )
-
-        factor = pow(FixedPointNumber.BASE, new_exponent - self.exponent)
-        new_encryptednumber = self.__mul__(factor)
-        new_encryptednumber.exponent = new_exponent
-
-        return new_encryptednumber
-
-    def __align_exponent(self, x, y):
-        """return x,y with same exponet"""
-        if x.exponent < y.exponent:
-            x = x.increase_exponent_to(y.exponent)
-        elif x.exponent > y.exponent:
-            y = y.increase_exponent_to(x.exponent)
-
-        return x, y
-
-    def __add_scalar(self, scalar):
-        """return PaillierEncryptedNumber: z = E(x) + y"""
-        if isinstance(scalar, FixedPointNumber):
-            scalar = scalar.decode()
-        encoded = FixedPointNumber.encode(
-            scalar,
-            self.public_key.n,
-            self.public_key.max_int,
-            max_exponent=self.exponent,
-        )
-        return self.__add_fixpointnumber(encoded)
-
-    def __add_fixpointnumber(self, encoded):
-        """return PaillierEncryptedNumber: z = E(x) + FixedPointNumber(y)"""
-        if self.public_key.n != encoded.n:
-            raise ValueError(
-                "Attempted to add numbers encoded against different public keys!"
-            )
-
-        # their exponents must match, and align.
-        x, y = self.__align_exponent(self, encoded)
-
-        encrypted_scalar = x.public_key.raw_encrypt(y.encoding, 1)
-        encryptednumber = self.__raw_add(
-            x.ciphertext(False), encrypted_scalar, x.exponent
-        )
-
-        return encryptednumber
-
-    def __add_encryptednumber(self, other):
-        """return PaillierEncryptedNumber: z = E(x) + E(y)"""
-        if self.public_key != other.public_key:
-            raise ValueError("add two numbers have different public key!")
-
-        # their exponents must match, and align.
-        x, y = self.__align_exponent(self, other)
-
-        encryptednumber = self.__raw_add(
-            x.ciphertext(False), y.ciphertext(False), x.exponent
-        )
-
-        return encryptednumber
-
-    def __raw_add(self, e_x, e_y, exponent):
-        """return the integer E(x + y) given ints E(x) and E(y)."""
-        ciphertext = gmpy_math.mpz(
-            e_x) * gmpy_math.mpz(e_y) % self.public_key.nsquare
-
-        return PaillierEncryptedNumber(
-            self.public_key, int(ciphertext), exponent)
diff --git a/gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/fixedpoint.py b/gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/fixedpoint.py
deleted file mode 100644
index dca6d0fcda..0000000000
--- a/gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/fixedpoint.py
+++ /dev/null
@@ -1,322 +0,0 @@
-#
-#  Copyright 2019 The FATE Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-
-import math
-import sys
-
-import numpy as np
-
-
-class FixedPointNumber(object):
-    """Represents a float or int fixedpoint encoding;."""
-
-    BASE = 16
-    LOG2_BASE = math.log(BASE, 2)
-    FLOAT_MANTISSA_BITS = sys.float_info.mant_dig
-
-    Q = 293973345475167247070445277780365744413 ** 2
-
-    def __init__(self, encoding, exponent, n=None, max_int=None):
-        if n is None:
-            self.n = FixedPointNumber.Q
-            self.max_int = self.n // 2
-        else:
-            self.n = n
-            if max_int is None:
-                self.max_int = self.n // 2
-            else:
-                self.max_int = max_int
-
-        self.encoding = encoding
-        self.exponent = exponent
-
-    @classmethod
-    def calculate_exponent_from_precision(cls, precision):
-        exponent = math.floor(math.log(precision, cls.BASE))
-        return exponent
-
-    @classmethod
-    def encode(
-            cls,
-            scalar,
-            n=None,
-            max_int=None,
-            precision=None,
-            max_exponent=None):
-        """return an encoding of an int or float."""
-        # Calculate the maximum exponent for desired precision
-        exponent = None
-
-        #  Too low value preprocess;
-        #  avoid "OverflowError: int too large to convert to float"
-
-        if np.abs(scalar) < 1e-200:
-            scalar = 0
-
-        if n is None:
-            n = cls.Q
-            max_int = n // 2
-
-        if precision is None:
-            if isinstance(scalar, int) or isinstance(scalar, np.int16) or isinstance(scalar, np.int32) or isinstance(
-                    scalar, np.int64):
-                exponent = 0
-            elif isinstance(scalar, float) or isinstance(scalar, np.float16) or isinstance(scalar,
-                                                                                           np.float32) or isinstance(
-                    scalar, np.float64):
-                flt_exponent = math.frexp(scalar)[1]
-                lsb_exponent = cls.FLOAT_MANTISSA_BITS - flt_exponent
-                exponent = math.floor(lsb_exponent / cls.LOG2_BASE)
-            else:
-                raise TypeError(
-                    "Don't know the precision of type %s." %
-                    type(scalar))
-        else:
-            exponent = cls.calculate_exponent_from_precision(precision)
-
-        if max_exponent is not None:
-            exponent = max(max_exponent, exponent)
-
-        int_fixpoint = int(round(scalar * pow(cls.BASE, exponent)))
-
-        if abs(int_fixpoint) > max_int:
-            raise ValueError(
-                f"Integer needs to be within +/- {max_int},but got {int_fixpoint},"
-                f"basic info, scalar={scalar}, base={cls.BASE}, exponent={exponent}")
-
-        return cls(int_fixpoint % n, exponent, n, max_int)
-
-    def decode(self):
-        """return decode plaintext."""
-        if self.encoding >= self.n:
-            # Should be mod n
-            raise ValueError('Attempted to decode corrupted number')
-        elif self.encoding <= self.max_int:
-            # Positive
-            mantissa = self.encoding
-        elif self.encoding >= self.n - self.max_int:
-            # Negative
-            mantissa = self.encoding - self.n
-        else:
-            raise OverflowError(
-                f'Overflow detected in decode number, encoding: {self.encoding}，'
-                f'{self.exponent}'
-                f' {self.n}')
-
-        return mantissa * pow(self.BASE, -self.exponent)
-
-    def increase_exponent_to(self, new_exponent):
-        """return FixedPointNumber: new encoding with same value but having great exponent."""
-        if new_exponent < self.exponent:
-            raise ValueError(
-                'New exponent %i should be greater than'
-                'old exponent %i' % (new_exponent, self.exponent)
-            )
-
-        factor = pow(self.BASE, new_exponent - self.exponent)
-        new_encoding = self.encoding * factor % self.n
-
-        return FixedPointNumber(
-            new_encoding,
-            new_exponent,
-            self.n,
-            self.max_int)
-
-    def __align_exponent(self, x, y):
-        """return x,y with same exponent"""
-        if x.exponent < y.exponent:
-            x = x.increase_exponent_to(y.exponent)
-        elif x.exponent > y.exponent:
-            y = y.increase_exponent_to(x.exponent)
-
-        return x, y
-
-    def __truncate(self, a):
-        scalar = a.decode()
-        return FixedPointNumber.encode(scalar, n=self.n, max_int=self.max_int)
-
-    def __add__(self, other):
-        if isinstance(other, FixedPointNumber):
-            return self.__add_fixedpointnumber(other)
-        elif type(other).__name__ == "PaillierEncryptedNumber":
-            return other + self.decode()
-        else:
-            return self.__add_scalar(other)
-
-    def __radd__(self, other):
-        return self.__add__(other)
-
-    def __sub__(self, other):
-        if isinstance(other, FixedPointNumber):
-            return self.__sub_fixedpointnumber(other)
-        elif type(other).__name__ == "PaillierEncryptedNumber":
-            return (other - self.decode()) * -1
-        else:
-            return self.__sub_scalar(other)
-
-    def __rsub__(self, other):
-        if type(other).__name__ == "PaillierEncryptedNumber":
-            return other - self.decode()
-
-        x = self.__sub__(other)
-        x = -1 * x.decode()
-        return self.encode(x, n=self.n, max_int=self.max_int)
-
-    def __rmul__(self, other):
-        return self.__mul__(other)
-
-    def __mul__(self, other):
-        if isinstance(other, FixedPointNumber):
-            return self.__mul_fixedpointnumber(other)
-        elif type(other).__name__ == "PaillierEncryptedNumber":
-            return other * self.decode()
-        else:
-            return self.__mul_scalar(other)
-
-    def __truediv__(self, other):
-        if isinstance(other, FixedPointNumber):
-            scalar = other.decode()
-        else:
-            scalar = other
-
-        return self.__mul__(1 / scalar)
-
-    def __rtruediv__(self, other):
-        res = 1.0 / self.__truediv__(other).decode()
-        return FixedPointNumber.encode(res, n=self.n, max_int=self.max_int)
-
-    def __lt__(self, other):
-        x = self.decode()
-        if isinstance(other, FixedPointNumber):
-            y = other.decode()
-        else:
-            y = other
-        if x < y:
-            return True
-        else:
-            return False
-
-    def __gt__(self, other):
-        x = self.decode()
-        if isinstance(other, FixedPointNumber):
-            y = other.decode()
-        else:
-            y = other
-        if x > y:
-            return True
-        else:
-            return False
-
-    def __le__(self, other):
-        x = self.decode()
-        if isinstance(other, FixedPointNumber):
-            y = other.decode()
-        else:
-            y = other
-        if x <= y:
-            return True
-        else:
-            return False
-
-    def __ge__(self, other):
-        x = self.decode()
-        if isinstance(other, FixedPointNumber):
-            y = other.decode()
-        else:
-            y = other
-
-        if x >= y:
-            return True
-        else:
-            return False
-
-    def __eq__(self, other):
-        x = self.decode()
-        if isinstance(other, FixedPointNumber):
-            y = other.decode()
-        else:
-            y = other
-        if x == y:
-            return True
-        else:
-            return False
-
-    def __ne__(self, other):
-        x = self.decode()
-        if isinstance(other, FixedPointNumber):
-            y = other.decode()
-        else:
-            y = other
-        if x != y:
-            return True
-        else:
-            return False
-
-    def __add_fixedpointnumber(self, other):
-        if self.n != other.n:
-            other = self.encode(other.decode(), n=self.n, max_int=self.max_int)
-        x, y = self.__align_exponent(self, other)
-        encoding = (x.encoding + y.encoding) % self.n
-        return FixedPointNumber(
-            encoding,
-            x.exponent,
-            n=self.n,
-            max_int=self.max_int)
-
-    def __add_scalar(self, scalar):
-        encoded = self.encode(scalar, n=self.n, max_int=self.max_int)
-        return self.__add_fixedpointnumber(encoded)
-
-    def __sub_fixedpointnumber(self, other):
-        if self.n != other.n:
-            other = self.encode(other.decode(), n=self.n, max_int=self.max_int)
-        x, y = self.__align_exponent(self, other)
-        encoding = (x.encoding - y.encoding) % self.n
-
-        return FixedPointNumber(
-            encoding,
-            x.exponent,
-            n=self.n,
-            max_int=self.max_int)
-
-    def __sub_scalar(self, scalar):
-        scalar = -1 * scalar
-        return self.__add_scalar(scalar)
-
-    def __mul_fixedpointnumber(self, other):
-        return self.__mul_scalar(other.decode())
-
-    def __mul_scalar(self, scalar):
-        val = self.decode()
-        z = val * scalar
-        z_encode = FixedPointNumber.encode(z, n=self.n, max_int=self.max_int)
-        return z_encode
-
-    def __abs__(self):
-        if self.encoding <= self.max_int:
-            # Positive
-            return self
-        elif self.encoding >= self.n - self.max_int:
-            # Negative
-            return self * -1
-
-    def __mod__(self, other):
-        return FixedPointNumber(
-            self.encoding %
-            other,
-            self.exponent,
-            n=self.n,
-            max_int=self.max_int)
diff --git a/gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/gmpy_math.py b/gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/gmpy_math.py
deleted file mode 100644
index a316ead0ff..0000000000
--- a/gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/gmpy_math.py
+++ /dev/null
@@ -1,133 +0,0 @@
-#
-#  Copyright 2019 The FATE Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-
-import random
-import gmpy2
-
-POWMOD_GMP_SIZE = pow(2, 64)
-
-
-def powmod(a, b, c):
-    """
-    return int: (a ** b) % c
-    """
-
-    if a == 1:
-        return 1
-
-    if max(a, b, c) < POWMOD_GMP_SIZE:
-        return pow(a, b, c)
-
-    else:
-        return int(gmpy2.powmod(a, b, c))
-
-
-def crt_coefficient(p, q):
-    """
-    return crt coefficient
-    """
-    tq = gmpy2.invert(p, q)
-    tp = gmpy2.invert(q, p)
-    return tp * q, tq * p
-
-
-def powmod_crt(x, d, n, p, q, cp, cq):
-    """
-    return int: (a ** b) % n
-    """
-
-    rp = gmpy2.powmod(x, d % (p - 1), p)
-    rq = gmpy2.powmod(x, d % (q - 1), q)
-    return int((rp * cp + rq * cq) % n)
-
-
-def invert(a, b):
-    """return int: x, where a * x == 1 mod b"""
-    x = int(gmpy2.invert(a, b))
-
-    if x == 0:
-        raise ZeroDivisionError("invert(a, b) no inverse exists")
-
-    return x
-
-
-def getprimeover(n):
-    """return a random n-bit prime number"""
-    r = gmpy2.mpz(random.SystemRandom().getrandbits(n))
-    r = gmpy2.bit_set(r, n - 1)
-
-    return int(gmpy2.next_prime(r))
-
-
-def isqrt(n):
-    """return the integer square root of N"""
-
-    return int(gmpy2.isqrt(n))
-
-
-def is_prime(n):
-    """
-    true if n is probably a prime, false otherwise
-    :param n:
-    :return:
-    """
-    return gmpy2.is_prime(int(n))
-
-
-def legendre(a, p):
-    return pow(a, (p - 1) // 2, p)
-
-
-def tonelli(n, p):
-    # assert legendre(n, p) == 1, "not a square (mod p)"
-    q = p - 1
-    s = 0
-    while q % 2 == 0:
-        q //= 2
-        s += 1
-    if s == 1:
-        return pow(n, (p + 1) // 4, p)
-    for z in range(2, p):
-        if p - 1 == legendre(z, p):
-            break
-    c = pow(z, q, p)
-    r = pow(n, (q + 1) // 2, p)
-    t = pow(n, q, p)
-    m = s
-    while (t - 1) % p != 0:
-        t2 = (t * t) % p
-        for i in range(1, m):
-            if (t2 - 1) % p == 0:
-                break
-            t2 = (t2 * t2) % p
-        b = pow(c, 1 << (m - i - 1), p)
-        r = (r * b) % p
-        c = (b * b) % p
-        t = (t * c) % p
-        m = i
-    return r
-
-
-def gcd(a, b):
-    return int(gmpy2.gcd(a, b))
-
-
-def next_prime(n):
-    return int(gmpy2.next_prime(n))
-
-
-def mpz(n):
-    return gmpy2.mpz(n)
diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/__init__.py b/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fate_paillier.py b/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fate_paillier.py
deleted file mode 100644
index fbb9e57abf..0000000000
--- a/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fate_paillier.py
+++ /dev/null
@@ -1,366 +0,0 @@
-"""Paillier encryption library for partially homomorphic encryption."""
-
-#
-#  Copyright 2019 The FATE Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-
-import random
-
-from . import gmpy_math
-from .fixedpoint import FixedPointNumber
-
-
-class PaillierKeypair(object):
-    def __init__(self):
-        pass
-
-    @staticmethod
-    def generate_keypair(n_length=1024):
-        """return a new :class:`PaillierPublicKey` and :class:`PaillierPrivateKey`."""
-        p = q = n = None
-        n_len = 0
-
-        while n_len != n_length:
-            p = gmpy_math.getprimeover(n_length // 2)
-            q = p
-            while q == p:
-                q = gmpy_math.getprimeover(n_length // 2)
-            n = p * q
-            n_len = n.bit_length()
-
-        public_key = PaillierPublicKey(n)
-        private_key = PaillierPrivateKey(public_key, p, q)
-
-        return public_key, private_key
-
-
-class PaillierPublicKey(object):
-    """Contains a public key and associated encryption methods."""
-
-    def __init__(self, n):
-        self.g = n + 1
-        self.n = n
-        self.nsquare = n * n
-        self.max_int = n // 3 - 1
-
-    def __repr__(self):
-        hashcode = hex(hash(self))[2:]
-        return "<PaillierPublicKey {}>".format(hashcode[:10])
-
-    def __eq__(self, other):
-        return self.n == other.n
-
-    def __hash__(self):
-        return hash(self.n)
-
-    def apply_obfuscator(self, ciphertext, random_value=None):
-        """ """
-        r = random_value or random.SystemRandom().randrange(1, self.n)
-        obfuscator = gmpy_math.powmod(r, self.n, self.nsquare)
-
-        return (ciphertext * obfuscator) % self.nsquare
-
-    def raw_encrypt(self, plaintext, random_value=None):
-        """ """
-        if not isinstance(plaintext, int):
-            raise TypeError(
-                "plaintext should be int, but got: %s" %
-                type(plaintext))
-
-        if plaintext >= (self.n - self.max_int) and plaintext < self.n:
-            # Very large plaintext, take a sneaky shortcut using inverses
-            neg_plaintext = self.n - plaintext  # = abs(plaintext - nsquare)
-            neg_ciphertext = (self.n * neg_plaintext + 1) % self.nsquare
-            ciphertext = gmpy_math.invert(neg_ciphertext, self.nsquare)
-        else:
-            ciphertext = (self.n * plaintext + 1) % self.nsquare
-
-        ciphertext = self.apply_obfuscator(ciphertext, random_value)
-
-        return ciphertext
-
-    def encrypt(self, value, precision=None, random_value=None):
-        """Encode and Paillier encrypt a real number value."""
-        if isinstance(value, FixedPointNumber):
-            value = value.decode()
-        encoding = FixedPointNumber.encode(
-            value, self.n, self.max_int, precision)
-        obfuscator = random_value or 1
-        ciphertext = self.raw_encrypt(
-            encoding.encoding, random_value=obfuscator)
-        encryptednumber = PaillierEncryptedNumber(
-            self, ciphertext, encoding.exponent)
-        if random_value is None:
-            encryptednumber.apply_obfuscator()
-
-        return encryptednumber
-
-
-class PaillierPrivateKey(object):
-    """Contains a private key and associated decryption method."""
-
-    def __init__(self, public_key, p, q):
-        if not p * q == public_key.n:
-            raise ValueError(
-                "given public key does not match the given p and q")
-        if p == q:
-            raise ValueError("p and q have to be different")
-        self.public_key = public_key
-        if q < p:
-            self.p = q
-            self.q = p
-        else:
-            self.p = p
-            self.q = q
-        self.psquare = self.p * self.p
-        self.qsquare = self.q * self.q
-        self.q_inverse = gmpy_math.invert(self.q, self.p)
-        self.hp = self.h_func(self.p, self.psquare)
-        self.hq = self.h_func(self.q, self.qsquare)
-
-    def __eq__(self, other):
-        return self.p == other.p and self.q == other.q
-
-    def __hash__(self):
-        return hash((self.p, self.q))
-
-    def __repr__(self):
-        hashcode = hex(hash(self))[2:]
-
-        return "<PaillierPrivateKey {}>".format(hashcode[:10])
-
-    def h_func(self, x, xsquare):
-        """Computes the h-function as defined in Paillier's paper page."""
-        return gmpy_math.invert(
-            self.l_func(
-                gmpy_math.powmod(
-                    self.public_key.g,
-                    x - 1,
-                    xsquare),
-                x),
-            x)
-
-    def l_func(self, x, p):
-        """computes the L function as defined in Paillier's paper."""
-
-        return (x - 1) // p
-
-    def crt(self, mp, mq):
-        """the Chinese Remainder Theorem as needed for decryption.
-        return the solution modulo n=pq.
-        """
-        u = (mp - mq) * self.q_inverse % self.p
-        x = (mq + (u * self.q)) % self.public_key.n
-
-        return x
-
-    def raw_decrypt(self, ciphertext):
-        """return raw plaintext."""
-        if not isinstance(ciphertext, int):
-            raise TypeError(
-                "ciphertext should be an int, not: %s" %
-                type(ciphertext))
-
-        mp = self.l_func(gmpy_math.powmod(ciphertext, self.p - 1, self.psquare), self.p) * self.hp % self.p
-
-        mq = self.l_func(gmpy_math.powmod(ciphertext, self.q - 1, self.qsquare), self.q) * self.hq % self.q
-
-        return self.crt(mp, mq)
-
-    def decrypt(self, encrypted_number):
-        """return the decrypted & decoded plaintext of encrypted_number."""
-        if not isinstance(encrypted_number, PaillierEncryptedNumber):
-            raise TypeError(
-                "encrypted_number should be an PaillierEncryptedNumber, \
-                             not: %s"
-                % type(encrypted_number)
-            )
-
-        if self.public_key != encrypted_number.public_key:
-            raise ValueError(
-                "encrypted_number was encrypted against a different key!")
-
-        encoded = self.raw_decrypt(
-            encrypted_number.ciphertext(
-                be_secure=False))
-        encoded = FixedPointNumber(
-            encoded,
-            encrypted_number.exponent,
-            self.public_key.n,
-            self.public_key.max_int,
-        )
-        decrypt_value = encoded.decode()
-
-        return decrypt_value
-
-
-class PaillierEncryptedNumber(object):
-    """Represents the Paillier encryption of a float or int."""
-
-    def __init__(self, public_key, ciphertext, exponent=0):
-        self.public_key = public_key
-        self.__ciphertext = ciphertext
-        self.exponent = exponent
-        self.__is_obfuscator = False
-
-        if not isinstance(self.__ciphertext, int):
-            raise TypeError(
-                "ciphertext should be an int, not: %s" %
-                type(
-                    self.__ciphertext))
-
-        if not isinstance(self.public_key, PaillierPublicKey):
-            raise TypeError(
-                "public_key should be a PaillierPublicKey, not: %s"
-                % type(self.public_key)
-            )
-
-    def ciphertext(self, be_secure=True):
-        """return the ciphertext of the PaillierEncryptedNumber."""
-        if be_secure and not self.__is_obfuscator:
-            self.apply_obfuscator()
-
-        return self.__ciphertext
-
-    def apply_obfuscator(self):
-        """ciphertext by multiplying by r ** n with random r"""
-        self.__ciphertext = self.public_key.apply_obfuscator(self.__ciphertext)
-        self.__is_obfuscator = True
-
-    def __add__(self, other):
-        if isinstance(other, PaillierEncryptedNumber):
-            return self.__add_encryptednumber(other)
-        else:
-            return self.__add_scalar(other)
-
-    def __radd__(self, other):
-        return self.__add__(other)
-
-    def __sub__(self, other):
-        return self + (other * -1)
-
-    def __rsub__(self, other):
-        return other + (self * -1)
-
-    def __rmul__(self, scalar):
-        return self.__mul__(scalar)
-
-    def __truediv__(self, scalar):
-        return self.__mul__(1 / scalar)
-
-    def __mul__(self, scalar):
-        """return Multiply by an scalar(such as int, float)"""
-        if isinstance(scalar, FixedPointNumber):
-            scalar = scalar.decode()
-        encode = FixedPointNumber.encode(
-            scalar, self.public_key.n, self.public_key.max_int
-        )
-        plaintext = encode.encoding
-
-        if plaintext < 0 or plaintext >= self.public_key.n:
-            raise ValueError("Scalar out of bounds: %i" % plaintext)
-
-        if plaintext >= self.public_key.n - self.public_key.max_int:
-            # Very large plaintext, play a sneaky trick using inverses
-            neg_c = gmpy_math.invert(
-                self.ciphertext(False),
-                self.public_key.nsquare)
-            neg_scalar = self.public_key.n - plaintext
-            ciphertext = gmpy_math.powmod(
-                neg_c, neg_scalar, self.public_key.nsquare)
-        else:
-            ciphertext = gmpy_math.powmod(
-                self.ciphertext(False), plaintext, self.public_key.nsquare
-            )
-
-        exponent = self.exponent + encode.exponent
-
-        return PaillierEncryptedNumber(self.public_key, ciphertext, exponent)
-
-    def increase_exponent_to(self, new_exponent):
-        """return PaillierEncryptedNumber:
-        new PaillierEncryptedNumber with same value but having great exponent.
-        """
-        if new_exponent < self.exponent:
-            raise ValueError(
-                "New exponent %i should be great than old exponent %i"
-                % (new_exponent, self.exponent)
-            )
-
-        factor = pow(FixedPointNumber.BASE, new_exponent - self.exponent)
-        new_encryptednumber = self.__mul__(factor)
-        new_encryptednumber.exponent = new_exponent
-
-        return new_encryptednumber
-
-    def __align_exponent(self, x, y):
-        """return x,y with same exponet"""
-        if x.exponent < y.exponent:
-            x = x.increase_exponent_to(y.exponent)
-        elif x.exponent > y.exponent:
-            y = y.increase_exponent_to(x.exponent)
-
-        return x, y
-
-    def __add_scalar(self, scalar):
-        """return PaillierEncryptedNumber: z = E(x) + y"""
-        if isinstance(scalar, FixedPointNumber):
-            scalar = scalar.decode()
-        encoded = FixedPointNumber.encode(
-            scalar,
-            self.public_key.n,
-            self.public_key.max_int,
-            max_exponent=self.exponent,
-        )
-        return self.__add_fixpointnumber(encoded)
-
-    def __add_fixpointnumber(self, encoded):
-        """return PaillierEncryptedNumber: z = E(x) + FixedPointNumber(y)"""
-        if self.public_key.n != encoded.n:
-            raise ValueError(
-                "Attempted to add numbers encoded against different public keys!"
-            )
-
-        # their exponents must match, and align.
-        x, y = self.__align_exponent(self, encoded)
-
-        encrypted_scalar = x.public_key.raw_encrypt(y.encoding, 1)
-        encryptednumber = self.__raw_add(
-            x.ciphertext(False), encrypted_scalar, x.exponent
-        )
-
-        return encryptednumber
-
-    def __add_encryptednumber(self, other):
-        """return PaillierEncryptedNumber: z = E(x) + E(y)"""
-        if self.public_key != other.public_key:
-            raise ValueError("add two numbers have different public key!")
-
-        # their exponents must match, and align.
-        x, y = self.__align_exponent(self, other)
-
-        encryptednumber = self.__raw_add(
-            x.ciphertext(False), y.ciphertext(False), x.exponent
-        )
-
-        return encryptednumber
-
-    def __raw_add(self, e_x, e_y, exponent):
-        """return the integer E(x + y) given ints E(x) and E(y)."""
-        ciphertext = gmpy_math.mpz(
-            e_x) * gmpy_math.mpz(e_y) % self.public_key.nsquare
-
-        return PaillierEncryptedNumber(
-            self.public_key, int(ciphertext), exponent)
diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fixedpoint.py b/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fixedpoint.py
deleted file mode 100644
index dca6d0fcda..0000000000
--- a/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/fixedpoint.py
+++ /dev/null
@@ -1,322 +0,0 @@
-#
-#  Copyright 2019 The FATE Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-
-import math
-import sys
-
-import numpy as np
-
-
-class FixedPointNumber(object):
-    """Represents a float or int fixedpoint encoding;."""
-
-    BASE = 16
-    LOG2_BASE = math.log(BASE, 2)
-    FLOAT_MANTISSA_BITS = sys.float_info.mant_dig
-
-    Q = 293973345475167247070445277780365744413 ** 2
-
-    def __init__(self, encoding, exponent, n=None, max_int=None):
-        if n is None:
-            self.n = FixedPointNumber.Q
-            self.max_int = self.n // 2
-        else:
-            self.n = n
-            if max_int is None:
-                self.max_int = self.n // 2
-            else:
-                self.max_int = max_int
-
-        self.encoding = encoding
-        self.exponent = exponent
-
-    @classmethod
-    def calculate_exponent_from_precision(cls, precision):
-        exponent = math.floor(math.log(precision, cls.BASE))
-        return exponent
-
-    @classmethod
-    def encode(
-            cls,
-            scalar,
-            n=None,
-            max_int=None,
-            precision=None,
-            max_exponent=None):
-        """return an encoding of an int or float."""
-        # Calculate the maximum exponent for desired precision
-        exponent = None
-
-        #  Too low value preprocess;
-        #  avoid "OverflowError: int too large to convert to float"
-
-        if np.abs(scalar) < 1e-200:
-            scalar = 0
-
-        if n is None:
-            n = cls.Q
-            max_int = n // 2
-
-        if precision is None:
-            if isinstance(scalar, int) or isinstance(scalar, np.int16) or isinstance(scalar, np.int32) or isinstance(
-                    scalar, np.int64):
-                exponent = 0
-            elif isinstance(scalar, float) or isinstance(scalar, np.float16) or isinstance(scalar,
-                                                                                           np.float32) or isinstance(
-                    scalar, np.float64):
-                flt_exponent = math.frexp(scalar)[1]
-                lsb_exponent = cls.FLOAT_MANTISSA_BITS - flt_exponent
-                exponent = math.floor(lsb_exponent / cls.LOG2_BASE)
-            else:
-                raise TypeError(
-                    "Don't know the precision of type %s." %
-                    type(scalar))
-        else:
-            exponent = cls.calculate_exponent_from_precision(precision)
-
-        if max_exponent is not None:
-            exponent = max(max_exponent, exponent)
-
-        int_fixpoint = int(round(scalar * pow(cls.BASE, exponent)))
-
-        if abs(int_fixpoint) > max_int:
-            raise ValueError(
-                f"Integer needs to be within +/- {max_int},but got {int_fixpoint},"
-                f"basic info, scalar={scalar}, base={cls.BASE}, exponent={exponent}")
-
-        return cls(int_fixpoint % n, exponent, n, max_int)
-
-    def decode(self):
-        """return decode plaintext."""
-        if self.encoding >= self.n:
-            # Should be mod n
-            raise ValueError('Attempted to decode corrupted number')
-        elif self.encoding <= self.max_int:
-            # Positive
-            mantissa = self.encoding
-        elif self.encoding >= self.n - self.max_int:
-            # Negative
-            mantissa = self.encoding - self.n
-        else:
-            raise OverflowError(
-                f'Overflow detected in decode number, encoding: {self.encoding}，'
-                f'{self.exponent}'
-                f' {self.n}')
-
-        return mantissa * pow(self.BASE, -self.exponent)
-
-    def increase_exponent_to(self, new_exponent):
-        """return FixedPointNumber: new encoding with same value but having great exponent."""
-        if new_exponent < self.exponent:
-            raise ValueError(
-                'New exponent %i should be greater than'
-                'old exponent %i' % (new_exponent, self.exponent)
-            )
-
-        factor = pow(self.BASE, new_exponent - self.exponent)
-        new_encoding = self.encoding * factor % self.n
-
-        return FixedPointNumber(
-            new_encoding,
-            new_exponent,
-            self.n,
-            self.max_int)
-
-    def __align_exponent(self, x, y):
-        """return x,y with same exponent"""
-        if x.exponent < y.exponent:
-            x = x.increase_exponent_to(y.exponent)
-        elif x.exponent > y.exponent:
-            y = y.increase_exponent_to(x.exponent)
-
-        return x, y
-
-    def __truncate(self, a):
-        scalar = a.decode()
-        return FixedPointNumber.encode(scalar, n=self.n, max_int=self.max_int)
-
-    def __add__(self, other):
-        if isinstance(other, FixedPointNumber):
-            return self.__add_fixedpointnumber(other)
-        elif type(other).__name__ == "PaillierEncryptedNumber":
-            return other + self.decode()
-        else:
-            return self.__add_scalar(other)
-
-    def __radd__(self, other):
-        return self.__add__(other)
-
-    def __sub__(self, other):
-        if isinstance(other, FixedPointNumber):
-            return self.__sub_fixedpointnumber(other)
-        elif type(other).__name__ == "PaillierEncryptedNumber":
-            return (other - self.decode()) * -1
-        else:
-            return self.__sub_scalar(other)
-
-    def __rsub__(self, other):
-        if type(other).__name__ == "PaillierEncryptedNumber":
-            return other - self.decode()
-
-        x = self.__sub__(other)
-        x = -1 * x.decode()
-        return self.encode(x, n=self.n, max_int=self.max_int)
-
-    def __rmul__(self, other):
-        return self.__mul__(other)
-
-    def __mul__(self, other):
-        if isinstance(other, FixedPointNumber):
-            return self.__mul_fixedpointnumber(other)
-        elif type(other).__name__ == "PaillierEncryptedNumber":
-            return other * self.decode()
-        else:
-            return self.__mul_scalar(other)
-
-    def __truediv__(self, other):
-        if isinstance(other, FixedPointNumber):
-            scalar = other.decode()
-        else:
-            scalar = other
-
-        return self.__mul__(1 / scalar)
-
-    def __rtruediv__(self, other):
-        res = 1.0 / self.__truediv__(other).decode()
-        return FixedPointNumber.encode(res, n=self.n, max_int=self.max_int)
-
-    def __lt__(self, other):
-        x = self.decode()
-        if isinstance(other, FixedPointNumber):
-            y = other.decode()
-        else:
-            y = other
-        if x < y:
-            return True
-        else:
-            return False
-
-    def __gt__(self, other):
-        x = self.decode()
-        if isinstance(other, FixedPointNumber):
-            y = other.decode()
-        else:
-            y = other
-        if x > y:
-            return True
-        else:
-            return False
-
-    def __le__(self, other):
-        x = self.decode()
-        if isinstance(other, FixedPointNumber):
-            y = other.decode()
-        else:
-            y = other
-        if x <= y:
-            return True
-        else:
-            return False
-
-    def __ge__(self, other):
-        x = self.decode()
-        if isinstance(other, FixedPointNumber):
-            y = other.decode()
-        else:
-            y = other
-
-        if x >= y:
-            return True
-        else:
-            return False
-
-    def __eq__(self, other):
-        x = self.decode()
-        if isinstance(other, FixedPointNumber):
-            y = other.decode()
-        else:
-            y = other
-        if x == y:
-            return True
-        else:
-            return False
-
-    def __ne__(self, other):
-        x = self.decode()
-        if isinstance(other, FixedPointNumber):
-            y = other.decode()
-        else:
-            y = other
-        if x != y:
-            return True
-        else:
-            return False
-
-    def __add_fixedpointnumber(self, other):
-        if self.n != other.n:
-            other = self.encode(other.decode(), n=self.n, max_int=self.max_int)
-        x, y = self.__align_exponent(self, other)
-        encoding = (x.encoding + y.encoding) % self.n
-        return FixedPointNumber(
-            encoding,
-            x.exponent,
-            n=self.n,
-            max_int=self.max_int)
-
-    def __add_scalar(self, scalar):
-        encoded = self.encode(scalar, n=self.n, max_int=self.max_int)
-        return self.__add_fixedpointnumber(encoded)
-
-    def __sub_fixedpointnumber(self, other):
-        if self.n != other.n:
-            other = self.encode(other.decode(), n=self.n, max_int=self.max_int)
-        x, y = self.__align_exponent(self, other)
-        encoding = (x.encoding - y.encoding) % self.n
-
-        return FixedPointNumber(
-            encoding,
-            x.exponent,
-            n=self.n,
-            max_int=self.max_int)
-
-    def __sub_scalar(self, scalar):
-        scalar = -1 * scalar
-        return self.__add_scalar(scalar)
-
-    def __mul_fixedpointnumber(self, other):
-        return self.__mul_scalar(other.decode())
-
-    def __mul_scalar(self, scalar):
-        val = self.decode()
-        z = val * scalar
-        z_encode = FixedPointNumber.encode(z, n=self.n, max_int=self.max_int)
-        return z_encode
-
-    def __abs__(self):
-        if self.encoding <= self.max_int:
-            # Positive
-            return self
-        elif self.encoding >= self.n - self.max_int:
-            # Negative
-            return self * -1
-
-    def __mod__(self, other):
-        return FixedPointNumber(
-            self.encoding %
-            other,
-            self.exponent,
-            n=self.n,
-            max_int=self.max_int)
diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/gmpy_math.py b/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/gmpy_math.py
deleted file mode 100644
index a316ead0ff..0000000000
--- a/gpu/fate-tensor-gpu/fate_tensor_gpu/secureprotol/gmpy_math.py
+++ /dev/null
@@ -1,133 +0,0 @@
-#
-#  Copyright 2019 The FATE Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-
-import random
-import gmpy2
-
-POWMOD_GMP_SIZE = pow(2, 64)
-
-
-def powmod(a, b, c):
-    """
-    return int: (a ** b) % c
-    """
-
-    if a == 1:
-        return 1
-
-    if max(a, b, c) < POWMOD_GMP_SIZE:
-        return pow(a, b, c)
-
-    else:
-        return int(gmpy2.powmod(a, b, c))
-
-
-def crt_coefficient(p, q):
-    """
-    return crt coefficient
-    """
-    tq = gmpy2.invert(p, q)
-    tp = gmpy2.invert(q, p)
-    return tp * q, tq * p
-
-
-def powmod_crt(x, d, n, p, q, cp, cq):
-    """
-    return int: (a ** b) % n
-    """
-
-    rp = gmpy2.powmod(x, d % (p - 1), p)
-    rq = gmpy2.powmod(x, d % (q - 1), q)
-    return int((rp * cp + rq * cq) % n)
-
-
-def invert(a, b):
-    """return int: x, where a * x == 1 mod b"""
-    x = int(gmpy2.invert(a, b))
-
-    if x == 0:
-        raise ZeroDivisionError("invert(a, b) no inverse exists")
-
-    return x
-
-
-def getprimeover(n):
-    """return a random n-bit prime number"""
-    r = gmpy2.mpz(random.SystemRandom().getrandbits(n))
-    r = gmpy2.bit_set(r, n - 1)
-
-    return int(gmpy2.next_prime(r))
-
-
-def isqrt(n):
-    """return the integer square root of N"""
-
-    return int(gmpy2.isqrt(n))
-
-
-def is_prime(n):
-    """
-    true if n is probably a prime, false otherwise
-    :param n:
-    :return:
-    """
-    return gmpy2.is_prime(int(n))
-
-
-def legendre(a, p):
-    return pow(a, (p - 1) // 2, p)
-
-
-def tonelli(n, p):
-    # assert legendre(n, p) == 1, "not a square (mod p)"
-    q = p - 1
-    s = 0
-    while q % 2 == 0:
-        q //= 2
-        s += 1
-    if s == 1:
-        return pow(n, (p + 1) // 4, p)
-    for z in range(2, p):
-        if p - 1 == legendre(z, p):
-            break
-    c = pow(z, q, p)
-    r = pow(n, (q + 1) // 2, p)
-    t = pow(n, q, p)
-    m = s
-    while (t - 1) % p != 0:
-        t2 = (t * t) % p
-        for i in range(1, m):
-            if (t2 - 1) % p == 0:
-                break
-            t2 = (t2 * t2) % p
-        b = pow(c, 1 << (m - i - 1), p)
-        r = (r * b) % p
-        c = (b * b) % p
-        t = (t * c) % p
-        m = i
-    return r
-
-
-def gcd(a, b):
-    return int(gmpy2.gcd(a, b))
-
-
-def next_prime(n):
-    return int(gmpy2.next_prime(n))
-
-
-def mpz(n):
-    return gmpy2.mpz(n)
diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/tests/__init__.py b/gpu/fate-tensor-gpu/fate_tensor_gpu/tests/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/__init__.py b/gpu/tensor/paillier_gpu/paillier_gpu/__init__.py
similarity index 100%
rename from gpu/fate-tensor-gpu/fate_tensor_gpu/__init__.py
rename to gpu/tensor/paillier_gpu/paillier_gpu/__init__.py
diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/gpu_engine.py b/gpu/tensor/paillier_gpu/paillier_gpu/gpu_engine.py
similarity index 99%
rename from gpu/fate-tensor-gpu/fate_tensor_gpu/gpu_engine.py
rename to gpu/tensor/paillier_gpu/paillier_gpu/gpu_engine.py
index 3fa53ac0bc..8f492aea9b 100644
--- a/gpu/fate-tensor-gpu/fate_tensor_gpu/gpu_engine.py
+++ b/gpu/tensor/paillier_gpu/paillier_gpu/gpu_engine.py
@@ -30,12 +30,12 @@
     c_size_t,
 )
 
-from .secureprotol.fate_paillier import (
+from fate_arch.tensor.impl.blocks.python_paillier_block import (
     PaillierPublicKey,
     PaillierPrivateKey,
     PaillierEncryptedNumber,
+    FixedPointNumber
 )
-from .secureprotol.fixedpoint import FixedPointNumber
 
 from concurrent.futures import ProcessPoolExecutor as Executor
 
diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/gpu_tensor.py b/gpu/tensor/paillier_gpu/paillier_gpu/gpu_tensor.py
similarity index 99%
rename from gpu/fate-tensor-gpu/fate_tensor_gpu/gpu_tensor.py
rename to gpu/tensor/paillier_gpu/paillier_gpu/gpu_tensor.py
index 82e82865f5..6bd5049364 100644
--- a/gpu/fate-tensor-gpu/fate_tensor_gpu/gpu_tensor.py
+++ b/gpu/tensor/paillier_gpu/paillier_gpu/gpu_tensor.py
@@ -35,7 +35,7 @@
     pi_h2d_priv_key,
     pi_p2c_priv_key,
 )
-from .secureprotol.fate_paillier import (
+from fate_arch.tensor.impl.blocks.python_paillier_block import (
     PaillierPublicKey,
     PaillierPrivateKey,
     PaillierKeypair,
diff --git a/gpu/fate-tensor-fpga/fate_tensor_fpga/tests/__init__.py b/gpu/tensor/paillier_gpu/paillier_gpu/tests/__init__.py
similarity index 100%
rename from gpu/fate-tensor-fpga/fate_tensor_fpga/tests/__init__.py
rename to gpu/tensor/paillier_gpu/paillier_gpu/tests/__init__.py
diff --git a/gpu/fate-tensor-gpu/fate_tensor_gpu/tests/test_gpu_engine.py b/gpu/tensor/paillier_gpu/paillier_gpu/tests/test_gpu_engine.py
similarity index 99%
rename from gpu/fate-tensor-gpu/fate_tensor_gpu/tests/test_gpu_engine.py
rename to gpu/tensor/paillier_gpu/paillier_gpu/tests/test_gpu_engine.py
index 48e95d7ab1..c0a3d0d7c5 100755
--- a/gpu/fate-tensor-gpu/fate_tensor_gpu/tests/test_gpu_engine.py
+++ b/gpu/tensor/paillier_gpu/paillier_gpu/tests/test_gpu_engine.py
@@ -20,14 +20,14 @@
 import functools
 import time
 
-from fate_tensor_gpu.secureprotol.fixedpoint import FixedPointNumber
-from fate_tensor_gpu.secureprotol import gmpy_math
-from fate_tensor_gpu.secureprotol.fate_paillier import (
+from fate_arch.tensor.impl.blocks.python_paillier_block import (
     PaillierKeypair,
     PaillierEncryptedNumber,
+    FixedPointNumber,
+    gmpy_math,
 )
 
-from fate_tensor_gpu.gpu_engine import (
+from ..gpu_engine import (
     FLOAT_TYPE,
     INT64_TYPE,
     pi_p2c_pub_key,
diff --git a/gpu/tensor/paillier_gpu/paillier_gpu/tests/test_gpu_performance.py b/gpu/tensor/paillier_gpu/paillier_gpu/tests/test_gpu_performance.py
new file mode 100755
index 0000000000..1b09afc3d6
--- /dev/null
+++ b/gpu/tensor/paillier_gpu/paillier_gpu/tests/test_gpu_performance.py
@@ -0,0 +1,276 @@
+#
+#  Copyright 2022 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import functools
+import time
+import unittest
+import numpy
+from fate_arch.tensor.impl.blocks.python_paillier_block import FixedPointNumber, PaillierKeypair
+
+from ..gpu_engine import (
+    FLOAT_TYPE,
+    INT64_TYPE,
+    pi_p2c_pub_key,
+    pi_p2c_priv_key,
+    pi_h2d_pub_key,
+    pi_h2d_priv_key,
+    TensorShapeStorage,
+    bi_alloc,
+    PLAIN_BYTE,
+    MEM_HOST,
+    te_alloc,
+    fp_alloc,
+    pi_alloc,
+    te_p2c,
+    fp_encode,
+    fp_decode,
+    te_c2p,
+    pi_encrypt,
+    pi_gen_obf_seed,
+    CIPHER_BITS,
+    pi_obfuscate,
+    pi_decrypt,
+    fp_mul,
+    fp_c2p,
+    pi_add,
+    pi_mul,
+    pi_sum,
+    bi_free,
+    te_free,
+    fp_free,
+    pi_free,
+    initialize_device,
+    pi_matmul,
+)
+
+RAND_TYPE = FLOAT_TYPE  # SWITCH DATA TYPE HERE: EITHER INT64_TYPE OR FLOAT_TYPE
+NUM_ROWS = 666  # row count of the test matrix shape
+NUM_COLS = 666  # column count of the test matrix shape
+TEST_SIZE = NUM_ROWS * NUM_COLS  # number of elements in each flat test array
+ERROR_TOLERANCE = 1e-10  # upper bound on the relative error accepted by assert_diff
+
+
+def generate_rand(test_size):
+    """Draw `test_size` random samples whose kind is selected by the module-level RAND_TYPE."""
+    if RAND_TYPE == FLOAT_TYPE:
+        return numpy.random.normal(loc=0, scale=5, size=test_size)
+    if RAND_TYPE == INT64_TYPE:
+        return numpy.random.randint(low=-2 ** 10, high=2 ** 10, size=test_size)
+    raise TypeError("Invalid data type")
+
+
+def assert_diff(res, ref):
+    """Assert res and ref are both zero, or agree to a relative error below ERROR_TOLERANCE."""
+    if res == 0 or ref == 0:
+        assert res == 0
+        assert ref == 0
+        return
+    gap = res - ref
+    assert abs(gap / res) < ERROR_TOLERANCE and abs(gap / ref) < ERROR_TOLERANCE
+
+
+def assert_ndarray_diff(res, ref):
+    """Compare two equally-shaped arrays element-wise; mismatches are printed, not raised."""
+    assert res.shape == ref.shape
+    flat_res, flat_ref = res.flatten(), ref.flatten()
+    assert flat_res.shape == flat_ref.shape
+    for idx, (got, want) in enumerate(zip(flat_res, flat_ref)):
+        try:
+            assert_diff(got, want)
+        except AssertionError:
+            print("Assertion Error at location", idx, ", GPU result:", got, ", reference result:", want)
+
+
+def profile(func):
+    """Wrap func so each call returns (result, elapsed_seconds) measured on a monotonic clock."""
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        # perf_counter is monotonic and high-resolution; time.time can jump with clock adjustments
+        start_time = time.perf_counter()
+        res = func(*args, **kwargs)
+        return res, time.perf_counter() - start_time
+    return wrapper
+
+
+def compare_time(gpu_time, cpu_time, num_instances=TEST_SIZE):  # prints a GPU-vs-CPU timing report
+    print("GPU time:", gpu_time, "second(s)")
+    print("CPU time:", cpu_time, "second(s)")
+    print("GPU throughput:", num_instances / gpu_time, "instance(s) per second")  # ZeroDivisionError if gpu_time == 0
+    print("CPU throughput:", num_instances / cpu_time, "instance(s) per second")
+    print("Speedup:", cpu_time / gpu_time)  # a ratio above 1 means the GPU run took less time
+
+
+class TestGPUPerformance(unittest.TestCase):
+    """Profile each GPU Paillier operator once and print it next to a fixed CPU baseline throughput."""
+    @classmethod
+    def setUpClass(cls):  # runs once per class: device init plus key material shared by the test
+        initialize_device()
+        cls._pub_key, cls._priv_key = PaillierKeypair.generate_keypair()
+        cls.n, cls.max_int = cls._pub_key.n, cls._pub_key.max_int
+        cls._cpu_pub_key = pi_p2c_pub_key(None, cls._pub_key)
+        cls._cpu_priv_key = pi_p2c_priv_key(None, cls._priv_key)
+        cls._gpu_pub_key = pi_h2d_pub_key(None, cls._cpu_pub_key)  # built from the pi_p2c_pub_key result
+        cls._gpu_priv_key = pi_h2d_priv_key(None, cls._cpu_priv_key)  # built from the pi_p2c_priv_key result
+        print("\n\n", "*" * 100, "\n\nInitialization complete\nTest Size:", TEST_SIZE)
+
+    # times each operator in turn; every cpu_*_time is an instance count divided by a hard-coded throughput
+    def test_performance(self):
+        print("\n\n", "*" * 100, "\n\nTest performance begins")
+
+        print("\n>>>>> generate data and allocate memory spaces")
+        raw, raw2 = generate_rand(TEST_SIZE), generate_rand(TEST_SIZE)  # two independent random operands
+        shape_tuple, shape_tuple_T = (NUM_ROWS, NUM_COLS), (NUM_COLS, NUM_ROWS)
+        shape_store, shape_store_T = TensorShapeStorage(*shape_tuple), TensorShapeStorage(*shape_tuple_T)
+        gpu_bi_store, gpu_bi_store2 = bi_alloc(None, TEST_SIZE, PLAIN_BYTE, MEM_HOST), bi_alloc(None, TEST_SIZE,
+                                                                                                PLAIN_BYTE, MEM_HOST)
+        gpu_te_store, gpu_te_store2 = te_alloc(None, TEST_SIZE, MEM_HOST), te_alloc(None, TEST_SIZE, MEM_HOST)
+        gpu_fp_store, gpu_fp_store2 = fp_alloc(None, TEST_SIZE, MEM_HOST), fp_alloc(None, TEST_SIZE, MEM_HOST)
+        gpu_pi_store, gpu_pi_store2 = pi_alloc(None, TEST_SIZE, MEM_HOST), pi_alloc(None, TEST_SIZE, MEM_HOST)
+        gpu_te_store, gpu_te_store2 = te_p2c(raw, gpu_te_store), te_p2c(raw2, gpu_te_store2)
+
+        print("\n>>>>> fp_encode profiling begins")
+        gpu_encoded, gpu_encode_time = profile(fp_encode)(gpu_te_store, self.n, self.max_int, res=gpu_fp_store)
+        cpu_encode_time = TEST_SIZE / 62303.97  # divisor is a fixed CPU baseline in instances per second
+        compare_time(gpu_encode_time, cpu_encode_time)
+
+        print("\n>>>>> fp_decode profiling begins")
+        gpu_decoded, gpu_decode_time = profile(fp_decode)(gpu_encoded, gpu_te_store, None)
+        cpu_decode_time = TEST_SIZE / 567913.21
+        compare_time(gpu_decode_time, cpu_decode_time)
+
+        # check decoded results
+        assert_ndarray_diff(te_c2p(gpu_decoded), numpy.asarray(raw))
+
+        print("\n>>>>> pi_encrypt profiling begins")
+        print("This function calculates (encoding * n + 1) % nsquare")
+        gpu_encrypted, gpu_encrypt_time = profile(pi_encrypt)(self._gpu_pub_key, gpu_encoded, gpu_pi_store, None)
+        cpu_encrypt_time = TEST_SIZE / 205864.74
+        compare_time(gpu_encrypt_time, cpu_encrypt_time)
+
+        print("\n>>>>> pi_gen_obf_seed profiling begins")
+        print("This function calculates (rand() ^ n) % nsquare")
+        gpu_obf_seeds, gpu_gen_obf_seeds_time = profile(pi_gen_obf_seed)(gpu_bi_store, self._gpu_pub_key, TEST_SIZE,
+                                                                         CIPHER_BITS // 6, 0, None)
+        cpu_gen_obf_seeds_time = TEST_SIZE / 444.05
+        compare_time(gpu_gen_obf_seeds_time, cpu_gen_obf_seeds_time)
+
+        print("\n>>>>> pi_obfuscate profiling begins")
+        print("This function calculates (raw_cipher * obf_seed) % nsquare,")
+        print("\twhere raw_cipher and obf_seed are calculated in pi_encrypt and pi_gen_obf_seeds, respectively")
+        gpu_obfuscated, gpu_obfuscate_time = profile(pi_obfuscate)(self._gpu_pub_key, gpu_encrypted, gpu_obf_seeds,
+                                                                   gpu_pi_store, None)
+        cpu_obfuscate_time = TEST_SIZE / 60236.27
+        compare_time(gpu_obfuscate_time, cpu_obfuscate_time)
+
+        print("\n>>>>> pi_decrypt profiling begins")
+        print("This function calculates L(cipher ^ lambda % nsquare) * L(g ^ lambda % nsquare) ^ -1 % n")
+        print("fp_decode is by default included in pi_decrypt")
+        fps_buffer = fp_alloc(None, TEST_SIZE, MEM_HOST)  # extra fixed-point buffer handed to pi_decrypt
+        gpu_decrypted, gpu_decrypt_time = profile(pi_decrypt)(self._gpu_pub_key, self._gpu_priv_key, gpu_obfuscated,
+                                                              gpu_te_store, fps_buffer)
+        cpu_decrypt_time = TEST_SIZE / 1590.48
+        compare_time(gpu_decrypt_time, cpu_decrypt_time)
+
+        # check decrypted results
+        assert_ndarray_diff(te_c2p(gpu_decrypted), numpy.asarray(raw))
+
+        print("\n>>>>> generating the other array")
+        gpu_encoded2 = fp_encode(gpu_te_store2, self.n, self.max_int, res=gpu_fp_store2)
+        gpu_encrypted2 = pi_encrypt(self._gpu_pub_key, gpu_encoded2, gpu_pi_store2, None)
+        gpu_obf_seeds2 = pi_gen_obf_seed(gpu_bi_store2, self._gpu_pub_key, TEST_SIZE, CIPHER_BITS // 6, 1, None)
+        gpu_obfuscated2 = pi_obfuscate(self._gpu_pub_key, gpu_encrypted2, gpu_obf_seeds2, gpu_pi_store2, None)
+
+        print("\n>>>>> fp_mul profiling begins")
+        gpu_fp_mul_store = fp_alloc(None, TEST_SIZE, MEM_HOST)
+        (gpu_fp_mul_res, _), gpu_fp_mul_time = profile(fp_mul)(gpu_encoded, gpu_encoded2, shape_store, shape_store,
+                                                               gpu_fp_mul_store, shape_store, None)
+        cpu_fp_mul_time = TEST_SIZE / 228424.79
+        compare_time(gpu_fp_mul_time, cpu_fp_mul_time)
+
+        # recompute the product on the CPU with FixedPointNumber and compare it field by field
+        cpu_encoded = [FixedPointNumber.encode(v, self.n, self.max_int) for v in raw]
+        cpu_encoded2 = [FixedPointNumber.encode(v, self.n, self.max_int) for v in raw2]
+        cpu_fp_mul_res = [FixedPointNumber((cpu_encoded[i].encoding * cpu_encoded2[i].encoding) % cpu_encoded[i].n,
+                                           cpu_encoded[i].exponent + cpu_encoded2[i].exponent, cpu_encoded[i].n,
+                                           cpu_encoded[i].max_int)
+                          for i in range(TEST_SIZE)]
+        received_fp_mul_res = fp_c2p(gpu_fp_mul_res)
+        for i in range(TEST_SIZE):
+            assert_diff(received_fp_mul_res[i].encoding, cpu_fp_mul_res[i].encoding)
+            assert received_fp_mul_res[i].BASE == cpu_fp_mul_res[i].BASE
+            assert received_fp_mul_res[i].exponent == cpu_fp_mul_res[i].exponent
+
+        print("\n>>>>> pi_add profiling begins")
+        (gpu_add_res, _), gpu_add_time = profile(pi_add)(self._gpu_pub_key, gpu_obfuscated, gpu_obfuscated2,
+                                                         shape_store, shape_store, gpu_pi_store, shape_store, None)
+        cpu_add_time = TEST_SIZE / 29759.90
+        compare_time(gpu_add_time, cpu_add_time)
+
+        print("\n>>>>> pi_mul profiling begins")
+        (gpu_mul_res, _), gpu_mul_time = profile(pi_mul)(self._gpu_pub_key, gpu_add_res, gpu_encoded2, shape_store,
+                                                         shape_store, gpu_pi_store, shape_store, None)
+        cpu_mul_time = TEST_SIZE / 6175.70
+        compare_time(gpu_mul_time, cpu_mul_time)
+
+        print("\n>>>>> pi_matmul profiling begins")
+        print("sizes are", shape_tuple, "and", shape_tuple_T)
+        gpu_pi_matmul_store = pi_alloc(None, NUM_ROWS * NUM_ROWS, MEM_HOST)
+        (gpu_matmul_res, gpu_matmul_shape), gpu_matmul_time = profile(pi_matmul)(self._gpu_pub_key, gpu_mul_res,
+                                                                                 gpu_encoded2, shape_store,
+                                                                                 shape_store_T, gpu_pi_matmul_store,
+                                                                                 None, None)
+        cpu_matmul_time = NUM_ROWS * TEST_SIZE / 4178.43  # instance count here is NUM_ROWS * TEST_SIZE
+        compare_time(gpu_matmul_time, cpu_matmul_time, NUM_ROWS * TEST_SIZE)
+
+        print("\n>>>>> pi_sum profiling begins")
+        print("shape is", gpu_matmul_shape.to_tuple())
+        gpu_pi_sum_store = pi_alloc(None, max(NUM_ROWS, NUM_COLS), MEM_HOST)
+        decrypted_matmul_res = numpy.asarray(
+            te_c2p(pi_decrypt(self._gpu_pub_key, self._gpu_priv_key, gpu_matmul_res, None, None))).reshape(
+            gpu_matmul_shape.to_tuple())
+        for axis in [0, 1, None]:
+            print(">>> axis:", axis)
+            (gpu_sum_res, _), gpu_sum_time = profile(pi_sum)(self._gpu_pub_key, gpu_matmul_res, gpu_matmul_shape, axis,
+                                                             gpu_pi_sum_store, None, None)
+            cpu_sum_time = TEST_SIZE / (12865.10 if axis == 0 else (15919.62 if axis == 1 else 10277.66))
+            compare_time(gpu_sum_time, cpu_sum_time)
+
+            # decrypt the GPU sum and compare it with numpy's sum over the decrypted matmul result
+            gpu_decrypted = te_c2p(pi_decrypt(self._gpu_pub_key, self._gpu_priv_key, gpu_sum_res, None, None))
+            cpu_sum = decrypted_matmul_res.sum(axis)
+            if axis is None:
+                cpu_sum = numpy.asarray([cpu_sum])
+            assert_ndarray_diff(gpu_decrypted, cpu_sum)
+
+        print("\n>>>>> free all allocated spaces")  # NOTE: skipped if an assert above raises (no try/finally)
+        bi_free(gpu_bi_store)
+        bi_free(gpu_bi_store2)
+        te_free(gpu_te_store)
+        te_free(gpu_te_store2)
+        fp_free(gpu_fp_store)
+        fp_free(gpu_fp_store2)
+        fp_free(fps_buffer)
+        fp_free(gpu_fp_mul_store)
+        pi_free(gpu_pi_store)
+        pi_free(gpu_pi_store2)
+        pi_free(gpu_pi_matmul_store)
+        pi_free(gpu_pi_sum_store)
+
+        print("test passed")
+
+
+if __name__ == "__main__":  # this module uses relative imports: launch via "python -m", not as a script path
+    unittest.main()
diff --git a/gpu/fate-tensor-gpu/pyproject.toml b/gpu/tensor/paillier_gpu/pyproject.toml
similarity index 96%
rename from gpu/fate-tensor-gpu/pyproject.toml
rename to gpu/tensor/paillier_gpu/pyproject.toml
index 16df1d9d42..2a5f5b3008 100644
--- a/gpu/fate-tensor-gpu/pyproject.toml
+++ b/gpu/tensor/paillier_gpu/pyproject.toml
@@ -1,5 +1,5 @@
 [tool.poetry]
-name = "fate-tensor-gpu"
+name = "paillier-gpu"
 version = "0.1.0"
 description = "This project is an industrial-level heterogeneous acceleration system to support and speed up federated learning. We've designed and implemented a heterogeneous acceleration solutions using GPU, respectively, that can significantly accelerate the Paillier cryptosystem while maintaining functionality, accuracy and scalability."
 authors = ["Xiaolong.Gao <1506957902@qq.com>"]
diff --git a/python/fate_arch/tensor/impl/blocks/python_paillier_block/__init__.py b/python/fate_arch/tensor/impl/blocks/python_paillier_block/__init__.py
index 66e5fdfd25..8a8919c760 100644
--- a/python/fate_arch/tensor/impl/blocks/python_paillier_block/__init__.py
+++ b/python/fate_arch/tensor/impl/blocks/python_paillier_block/__init__.py
@@ -3,5 +3,15 @@
     BlockPaillierDecryptor,
     BlockPaillierEncryptor,
 )
+from ._fate_paillier import (
+    PaillierEncryptedNumber,
+    PaillierPrivateKey,
+    PaillierPublicKey,
+    PaillierKeypair,
+)
+from ._fixedpoint import FixedPointNumber, FixedPointEndec
+from . import _gmpy_math as gmpy_math
 
-__all__ = ["BlockPaillierCipher", "BlockPaillierEncryptor", "BlockPaillierDecryptor"]
+__all__ = ["BlockPaillierCipher", "BlockPaillierEncryptor", "BlockPaillierDecryptor", "PaillierEncryptedNumber",
+           "PaillierPrivateKey", "PaillierPublicKey", "PaillierKeypair", "FixedPointNumber", "FixedPointEndec",
+           "gmpy_math"]

From 12fa768ae88f205b6677c65111542823e498b492 Mon Sep 17 00:00:00 2001
From: "Xiaolong.Gao" <1506957902@qq.com>
Date: Fri, 22 Jul 2022 15:55:42 +0800
Subject: [PATCH 7/8] refactor: format FPGA tensor

Signed-off-by: Xiaolong.Gao <1506957902@qq.com>
---
 .../paillier_fpga/paillier_fpga}/__init__.py  |    0
 .../paillier_fpga}/fpga_engine.py             |    6 +-
 .../paillier_fpga}/fpga_tensor.py             |   50 +-
 .../paillier_fpga/tests}/__init__.py          |    0
 .../paillier_fpga/tests/test_fpga_engine.py   | 1133 +++++++++++++++++
 .../tests/test_fpga_performance.py            |  324 +++++
 .../paillier_fpga}/pyproject.toml             |    2 +-
 7 files changed, 1486 insertions(+), 29 deletions(-)
 rename gpu/{fate-tensor-fpga/fate_tensor_fpga => tensor/paillier_fpga/paillier_fpga}/__init__.py (100%)
 rename gpu/{fate-tensor-fpga/fate_tensor_fpga => tensor/paillier_fpga/paillier_fpga}/fpga_engine.py (99%)
 rename gpu/{fate-tensor-fpga/fate_tensor_fpga => tensor/paillier_fpga/paillier_fpga}/fpga_tensor.py (92%)
 rename gpu/{fate-tensor-fpga/fate_tensor_fpga/secureprotol => tensor/paillier_fpga/paillier_fpga/tests}/__init__.py (100%)
 create mode 100755 gpu/tensor/paillier_fpga/paillier_fpga/tests/test_fpga_engine.py
 create mode 100755 gpu/tensor/paillier_fpga/paillier_fpga/tests/test_fpga_performance.py
 rename gpu/{fate-tensor-fpga => tensor/paillier_fpga}/pyproject.toml (96%)

diff --git a/gpu/fate-tensor-fpga/fate_tensor_fpga/__init__.py b/gpu/tensor/paillier_fpga/paillier_fpga/__init__.py
similarity index 100%
rename from gpu/fate-tensor-fpga/fate_tensor_fpga/__init__.py
rename to gpu/tensor/paillier_fpga/paillier_fpga/__init__.py
diff --git a/gpu/fate-tensor-fpga/fate_tensor_fpga/fpga_engine.py b/gpu/tensor/paillier_fpga/paillier_fpga/fpga_engine.py
similarity index 99%
rename from gpu/fate-tensor-fpga/fate_tensor_fpga/fpga_engine.py
rename to gpu/tensor/paillier_fpga/paillier_fpga/fpga_engine.py
index 1fb0c2f872..abf4edf65b 100644
--- a/gpu/fate-tensor-fpga/fate_tensor_fpga/fpga_engine.py
+++ b/gpu/tensor/paillier_fpga/paillier_fpga/fpga_engine.py
@@ -32,12 +32,12 @@
     c_uint64,
     c_size_t,
 )
-from .secureprotol.fate_paillier import (
+from fate_arch.tensor.impl.blocks.python_paillier_block import (
     PaillierPublicKey,
     PaillierPrivateKey,
-    PaillierEncryptedNumber
+    PaillierEncryptedNumber,
+    FixedPointNumber,
 )
-from .secureprotol.fixedpoint import FixedPointNumber
 
 from concurrent.futures import ProcessPoolExecutor as Executor
 
diff --git a/gpu/fate-tensor-fpga/fate_tensor_fpga/fpga_tensor.py b/gpu/tensor/paillier_fpga/paillier_fpga/fpga_tensor.py
similarity index 92%
rename from gpu/fate-tensor-fpga/fate_tensor_fpga/fpga_tensor.py
rename to gpu/tensor/paillier_fpga/paillier_fpga/fpga_tensor.py
index f2725c578f..b672a512e7 100644
--- a/gpu/fate-tensor-fpga/fate_tensor_fpga/fpga_tensor.py
+++ b/gpu/tensor/paillier_fpga/paillier_fpga/fpga_tensor.py
@@ -34,7 +34,7 @@
     pi_p2c_priv_key,
     te_c2p,
 )
-from .secureprotol.fate_paillier import (
+from fate_arch.tensor.impl.blocks.python_paillier_block import (
     PaillierPublicKey,
     PaillierPrivateKey,
     PaillierKeypair,
@@ -116,25 +116,25 @@ def add_plaintext_i32(self, other) -> "Cipherblock":
         return self._add_plaintext(other)
 
     def add_plaintext_scalar_f64(
-        self, other: typing.Union[float, np.float64]
+            self, other: typing.Union[float, np.float64]
     ) -> "Cipherblock":
         other_array = np.asarray([other], dtype=np.float64)
         return self._add_plaintext(other_array)
 
     def add_plaintext_scalar_f32(
-        self, other: typing.Union[float, np.float32]
+            self, other: typing.Union[float, np.float32]
     ) -> "Cipherblock":
         other_array = np.asarray([other], dtype=np.float32)
         return self._add_plaintext(other_array)
 
     def add_plaintext_scalar_i64(
-        self, other: typing.Union[int, np.int64]
+            self, other: typing.Union[int, np.int64]
     ) -> "Cipherblock":
         other_array = np.asarray([other], dtype=np.int64)
         return self._add_plaintext(other_array)
 
     def add_plaintext_scalar_i32(
-        self, other: typing.Union[int, np.int32]
+            self, other: typing.Union[int, np.int32]
     ) -> "Cipherblock":
         other_array = np.asarray([other], dtype=np.int32)
         return self._add_plaintext(other_array)
@@ -155,22 +155,22 @@ def sub_plaintext_i32(self, other) -> "Cipherblock":
         return self.add_plaintext_i32(other * -1)
 
     def sub_plaintext_scalar_f64(
-        self, other: typing.Union[float, np.float64]
+            self, other: typing.Union[float, np.float64]
     ) -> "Cipherblock":
         return self.add_plaintext_scalar_f64(other * -1)
 
     def sub_plaintext_scalar_f32(
-        self, other: typing.Union[float, np.float32]
+            self, other: typing.Union[float, np.float32]
     ) -> "Cipherblock":
         return self.add_plaintext_scalar_f32(other * -1)
 
     def sub_plaintext_scalar_i64(
-        self, other: typing.Union[int, np.int64]
+            self, other: typing.Union[int, np.int64]
     ) -> "Cipherblock":
         return self.add_plaintext_scalar_i64(other * -1)
 
     def sub_plaintext_scalar_i32(
-        self, other: typing.Union[int, np.int32]
+            self, other: typing.Union[int, np.int32]
     ) -> "Cipherblock":
         return self.add_plaintext_scalar_i32(other * -1)
 
@@ -187,25 +187,25 @@ def mul_plaintext_i32(self, other) -> "Cipherblock":
         return self._mul_plaintext(other)
 
     def mul_plaintext_scalar_f64(
-        self, other: typing.Union[float, np.float64]
+            self, other: typing.Union[float, np.float64]
     ) -> "Cipherblock":
         other_array = np.asarray([other], dtype=np.float64)
         return self._mul_plaintext(other_array)
 
     def mul_plaintext_scalar_f32(
-        self, other: typing.Union[float, np.float32]
+            self, other: typing.Union[float, np.float32]
     ) -> "Cipherblock":
         other_array = np.asarray([other], dtype=np.float32)
         return self._mul_plaintext(other_array)
 
     def mul_plaintext_scalar_i64(
-        self, other: typing.Union[int, np.int64]
+            self, other: typing.Union[int, np.int64]
     ) -> "Cipherblock":
         other_array = np.asarray([other], dtype=np.int64)
         return self._mul_plaintext(other_array)
 
     def mul_plaintext_scalar_i32(
-        self, other: typing.Union[int, np.int32]
+            self, other: typing.Union[int, np.int32]
     ) -> "Cipherblock":
         other_array = np.asarray([other], dtype=np.int32)
         return self._mul_plaintext(other_array)
@@ -287,22 +287,22 @@ def add_plaintext_i64_par(self, other) -> "Cipherblock":
         return self.add_plaintext_i64(other)
 
     def add_plaintext_scalar_f64_par(
-        self, other: typing.Union[float, np.float64]
+            self, other: typing.Union[float, np.float64]
     ) -> "Cipherblock":
         return self.add_plaintext_scalar_f64(other)
 
     def add_plaintext_scalar_f32_par(
-        self, other: typing.Union[float, np.float32]
+            self, other: typing.Union[float, np.float32]
     ) -> "Cipherblock":
         return self.add_plaintext_scalar_f32(other)
 
     def add_plaintext_scalar_i64_par(
-        self, other: typing.Union[int, np.int64]
+            self, other: typing.Union[int, np.int64]
     ) -> "Cipherblock":
         return self.add_plaintext_scalar_i64(other)
 
     def add_plaintext_scalar_i32_par(
-        self, other: typing.Union[int, np.int32]
+            self, other: typing.Union[int, np.int32]
     ) -> "Cipherblock":
         return self.add_plaintext_scalar_i32(other)
 
@@ -325,22 +325,22 @@ def sub_plaintext_i32_par(self, other) -> "Cipherblock":
         return self.sub_plaintext_i32(other)
 
     def sub_plaintext_scalar_f64_par(
-        self, other: typing.Union[float, np.float64]
+            self, other: typing.Union[float, np.float64]
     ) -> "Cipherblock":
         return self.sub_plaintext_scalar_f64(other)
 
     def sub_plaintext_scalar_f32_par(
-        self, other: typing.Union[float, np.float32]
+            self, other: typing.Union[float, np.float32]
     ) -> "Cipherblock":
         return self.sub_plaintext_scalar_f32(other)
 
     def sub_plaintext_scalar_i64_par(
-        self, other: typing.Union[int, np.int64]
+            self, other: typing.Union[int, np.int64]
     ) -> "Cipherblock":
         return self.sub_plaintext_scalar_i64(other)
 
     def sub_plaintext_scalar_i32_par(
-        self, other: typing.Union[int, np.int32]
+            self, other: typing.Union[int, np.int32]
     ) -> "Cipherblock":
         return self.sub_plaintext_scalar_i32(other)
 
@@ -357,22 +357,22 @@ def mul_plaintext_i32_par(self, other) -> "Cipherblock":
         return self.mul_plaintext_i32(other)
 
     def mul_plaintext_scalar_f64_par(
-        self, other: typing.Union[float, np.float64]
+            self, other: typing.Union[float, np.float64]
     ) -> "Cipherblock":
         return self.mul_plaintext_scalar_f64(other)
 
     def mul_plaintext_scalar_f32_par(
-        self, other: typing.Union[float, np.float32]
+            self, other: typing.Union[float, np.float32]
     ) -> "Cipherblock":
         return self.mul_plaintext_scalar_f32(other)
 
     def mul_plaintext_scalar_i64_par(
-        self, other: typing.Union[int, np.int64]
+            self, other: typing.Union[int, np.int64]
     ) -> "Cipherblock":
         return self.mul_plaintext_scalar_i64(other)
 
     def mul_plaintext_scalar_i32_par(
-        self, other: typing.Union[int, np.int32]
+            self, other: typing.Union[int, np.int32]
     ) -> "Cipherblock":
         return self.mul_plaintext_scalar_i32(other)
 
diff --git a/gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/__init__.py b/gpu/tensor/paillier_fpga/paillier_fpga/tests/__init__.py
similarity index 100%
rename from gpu/fate-tensor-fpga/fate_tensor_fpga/secureprotol/__init__.py
rename to gpu/tensor/paillier_fpga/paillier_fpga/tests/__init__.py
diff --git a/gpu/tensor/paillier_fpga/paillier_fpga/tests/test_fpga_engine.py b/gpu/tensor/paillier_fpga/paillier_fpga/tests/test_fpga_engine.py
new file mode 100755
index 0000000000..48c243c806
--- /dev/null
+++ b/gpu/tensor/paillier_fpga/paillier_fpga/tests/test_fpga_engine.py
@@ -0,0 +1,1133 @@
+#
+#  Copyright 2022 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import numpy
+import unittest
+import random
+import functools
+import operator
+import time
+
+from fate_arch.tensor.impl.blocks.python_paillier_block import (
+    PaillierKeypair,
+    PaillierEncryptedNumber,
+    FixedPointNumber,
+)
+
+from ..fpga_engine import (
+    FLOAT_TYPE,
+    INT64_TYPE,
+    pi_p2c_pub_key,
+    pi_p2c_priv_key,
+    pi_h2d_pub_key,
+    pi_h2d_priv_key,
+    TensorShapeStorage,
+    bi_alloc,
+    PLAIN_BYTE,
+    MEM_HOST,
+    te_alloc,
+    fp_alloc,
+    pi_alloc,
+    te_p2c,
+    fp_encode,
+    fp_decode,
+    te_c2p,
+    pi_encrypt,
+    pi_gen_obf_seed,
+    CIPHER_BITS,
+    pi_obfuscate,
+    pi_c2p,
+    pi_decrypt,
+    fp_mul,
+    fp_c2p,
+    pi_add,
+    pi_mul,
+    pi_sum,
+    bi_free,
+    te_free,
+    fp_free,
+    pi_free, te_slice, initialize_device, fp_p2c, pi_p2c, bi_gen_rand, bi_c2p, pi_transpose, pi_matmul, fp_transpose,
+    CIPHER_BYTE, te_c2bytes, te_bytes2c, fp_c2bytes, fp_bytes2c, pi_c2bytes, pi_bytes2c, pi_slice, pi_reshape,
+    te_c2p_first, TensorStorage, te_c2p_shape, te_cat, te_pow, te_add, te_mul, te_truediv, te_floordiv, te_sub,
+    te_matmul, te_abs, te_transpose, te_reshape, te_exp, te_hstack, pi_cat, te_sum, fp_slice, te_p2c_shape, fp_cat,
+    te_neg,
+)
+
+# Selects the kind of random test data produced by generate_rand.
+# Set this to either INT64_TYPE or FLOAT_TYPE.
+RAND_TYPE = INT64_TYPE
+
+TEST_SIZE = 6  # element count used for the random test arrays
+ERROR_TOLERANCE = 1e-10  # upper bound on the relative error accepted by assert_diff
+
+
+def generate_rand(test_size):
+    """Draw `test_size` random samples whose kind is selected by the module-level RAND_TYPE."""
+    if RAND_TYPE == FLOAT_TYPE:
+        return numpy.random.normal(loc=0, scale=10, size=test_size)
+    if RAND_TYPE == INT64_TYPE:
+        return numpy.random.randint(low=-2 ** 30, high=2 ** 30, size=test_size)
+    raise TypeError("Invalid data type")
+
+
+def assert_diff(res, ref):
+    """Assert res and ref are both zero, or agree to a relative error below ERROR_TOLERANCE."""
+    if res == 0 or ref == 0:
+        assert res == 0
+        assert ref == 0
+        return
+    gap = res - ref
+    assert abs(gap / res) < ERROR_TOLERANCE and abs(gap / ref) < ERROR_TOLERANCE
+
+
+def assert_ndarray_diff(res, ref):  # element-wise assert_diff over two equally-shaped ndarrays
+    assert res.shape == ref.shape
+    flat_res, flat_ref = res.flatten(), ref.flatten()
+    assert flat_res.shape == flat_ref.shape
+    for got, want in zip(flat_res, flat_ref):
+        assert_diff(got, want)
+
+
+class TestOperators(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        """Initialize the device, generate a Paillier key pair and cache its pi_p2c_*/pi_h2d_* conversions."""
+        initialize_device()
+        pub_key, priv_key = PaillierKeypair.generate_keypair()
+        cls._pub_key, cls._priv_key = pub_key, priv_key
+        cls.n, cls.max_int = pub_key.n, pub_key.max_int
+        # each key goes through pi_p2c_* first; that result is then handed to pi_h2d_*
+        cpu_pub, cpu_priv = pi_p2c_pub_key(None, pub_key), pi_p2c_priv_key(None, priv_key)
+        cls._cpu_pub_key, cls._cpu_priv_key = cpu_pub, cpu_priv
+        cls._fpga_pub_key, cls._fpga_priv_key = pi_h2d_pub_key(None, cpu_pub), pi_h2d_priv_key(None, cpu_priv)
+        print("\n\n", "*" * 100, "\n\nInitialization complete\nTest Size:", TEST_SIZE)
+
+    # test encode and decode
+    # using operators: te_p2c, fp_encode, fp_c2p, fp_decode, te_c2p
+    def test_encode_and_decode(self):
+        """Check that FPGA fixed-point encode/decode agrees with the CPU FixedPointNumber reference."""
+        print("\n\n", "*" * 100, "\n\nTest Encode and Decode Begins")
+
+        raw = generate_rand(TEST_SIZE)
+        raw[TEST_SIZE // 2] = 0  # force one exact zero so that encoding zero is exercised
+        store = te_p2c(raw, None)
+        precision = 10000 if RAND_TYPE == FLOAT_TYPE else None
+
+        # encode on both sides and compare the fixed-point fields one by one
+        fpga_encoded_store = fp_encode(store, self.n, self.max_int, precision, None)
+        fpga_encoded = fp_c2p(fpga_encoded_store)
+        cpu_encoded = [FixedPointNumber.encode(value, self.n, self.max_int, precision) for value in raw]
+        assert len(fpga_encoded) == TEST_SIZE
+        assert len(cpu_encoded) == TEST_SIZE
+        for idx in range(TEST_SIZE):
+            fpga_fp, cpu_fp = fpga_encoded[idx], cpu_encoded[idx]
+            print("i:", idx, ", raw data:", raw[idx])
+            print("FPGA encoding:", fpga_fp.encoding, ", base:", fpga_fp.BASE, ", exp:", fpga_fp.exponent)
+            print("CPU encoding:", cpu_fp.encoding, ", base:", cpu_fp.BASE, ", exp:", cpu_fp.exponent)
+        for idx in range(TEST_SIZE):
+            fpga_fp, cpu_fp = fpga_encoded[idx], cpu_encoded[idx]
+            assert fpga_fp.encoding == cpu_fp.encoding
+            assert fpga_fp.BASE == cpu_fp.BASE
+            assert fpga_fp.exponent == cpu_fp.exponent
+
+        # decode every encoder/decoder pairing; CPU-encoded + CPU-decoded serves as the reference
+        reference = [fp.decode() for fp in cpu_encoded]
+        candidates = (
+            te_c2p(fp_decode(fp_p2c(None, cpu_encoded, RAND_TYPE), None, None)),  # CPU encoded, FPGA decoded
+            [fp.decode() for fp in fpga_encoded],  # FPGA encoded, CPU decoded
+            te_c2p(fp_decode(fpga_encoded_store, None, None)),  # FPGA encoded, FPGA decoded
+        )
+        assert len(reference) == TEST_SIZE
+        for decoded in candidates:
+            assert len(decoded) == TEST_SIZE
+        for idx in range(TEST_SIZE):
+            print("decoded compare: i:", idx, reference[idx], *(decoded[idx] for decoded in candidates))
+            for decoded in candidates:
+                assert_diff(decoded[idx], reference[idx])
+
+        print("test passed")
+
+    # test encrypt and decrypt
+    # using operators: fp_encode, pi_encrypt, pi_decrypt, te_p2c, te_c2p, pi_c2p
+    def test_encrypt_and_decrypt(self):
+        """Check that values survive encryption and decryption across the FPGA and CPU paths.
+
+        Part 1 runs pi_encrypt followed by pi_decrypt, Part 2 decrypts the
+        pi_encrypt output with the CPU private key, and Part 3 pushes CPU
+        ciphertexts through pi_p2c into pi_decrypt.
+        """
+        print("\n\n", "*" * 100, "\n\nTest Encrypt And Decrypt Begins")
+
+        print("\nPart 1: FPGA encrypt, FPGA decrypt")
+        raw = generate_rand(TEST_SIZE)
+        store = te_p2c(raw, None)
+        encoded = fp_encode(store, self.n, self.max_int)
+        encrypted = pi_encrypt(self._fpga_pub_key, encoded, None, None)
+        decrypted = pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, encrypted, None, None)
+        ref1 = te_c2p(decrypted)
+        # every stage of the pipeline must report the element type of the input
+        for stage in (store, encoded, encrypted, decrypted):
+            assert stage.data_type == RAND_TYPE
+        for i in range(TEST_SIZE):
+            print("i:", i, ", original:", raw[i], ", decrypted:", ref1[i])
+            assert_diff(raw[i], ref1[i])
+
+        print("\nPart 2: FPGA encrypt, CPU decrypt")
+        tmp_enc, _, tmp_exp = pi_c2p(encrypted)
+        pen_recv = []
+        for i in range(TEST_SIZE):
+            # the exponent is rounded and cast to int before building the CPU-side number
+            pen_recv.append(PaillierEncryptedNumber(self._pub_key, tmp_enc[i], int(round(tmp_exp[i]))))
+        ref2 = [self._priv_key.decrypt(pen) for pen in pen_recv]
+        for i in range(TEST_SIZE):
+            print("i:", i, ", original:", raw[i], ", decrypted:", ref2[i])
+            assert_diff(raw[i], ref2[i])
+
+        print("\nPart 3: CPU encrypt, FPGA decrypt")
+        cpu_encrypted = []
+        for i in range(TEST_SIZE):
+            cpu_encrypted.append(self._pub_key.encrypt(raw[i], None, 0))
+        for i in range(TEST_SIZE):
+            fpga_pen, cpu_pen = pen_recv[i], cpu_encrypted[i]
+            print("FPGA: i:", i, ", cipher text:", fpga_pen.ciphertext(False), ", exp:", fpga_pen.exponent)
+            print("CPU: i:", i, ", cipher text:", cpu_pen.ciphertext(False), ", exp:",
+                  cpu_pen.exponent)
+            assert fpga_pen.exponent == cpu_pen.exponent
+            try:
+                assert fpga_pen.ciphertext(False) == cpu_pen.ciphertext(False)
+            except AssertionError:
+                # Note that there's an approx 1/1000 probability that these ciphers don't match
+                # However, this shouldn't affect the final result, so the mismatch is only dumped
+                print("\n>>>>>> The following cipher texts didn't match:")
+                print("raw number:", raw[i])
+                print("FPGA encoding:", fp_c2p(encoded)[i].encoding)
+                print("CPU encoding:", FixedPointNumber.encode(raw[i], self.n, self.max_int).encoding)
+                print("FPGA cipher:", fpga_pen.ciphertext(False))
+                print("CPU cipher:", cpu_pen.ciphertext(False))
+                print("pub_key.n:", self._pub_key.n)
+                print("pub_key.nsquare:", self._pub_key.nsquare)
+                print("priv_key.p:", self._priv_key.p)
+                print("priv_key.q:", self._priv_key.q)
+                print(">>>>>> End Dumping\n")
+        pi_store = pi_p2c(None, cpu_encrypted, RAND_TYPE)
+        fpga_decrypted = pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, pi_store, None, None)
+        ref3 = te_c2p(fpga_decrypted)
+        for i in range(TEST_SIZE):
+            assert_diff(raw[i], ref3[i])
+            print("i:", i, ", original:", raw[i], ", decrypted:", ref3[i])
+
+        print("test passed")
+
+    def test_pi_add(self):
+        """Add two encrypted operands declared with different shapes and check the sums.
+
+        The first operand holds 2 values and is declared as (2, 1); the second
+        holds TEST_SIZE values and is declared as (2, 3). Element i of the result
+        is compared against raw_1[i // 3] + raw_2[i].
+        """
+        print("\n\n", "*" * 100, "\n\nTest Paillier Encrypted Number Add Begins")
+        raw_1, raw_2 = generate_rand(2), generate_rand(TEST_SIZE)
+        te_store_1 = te_p2c(raw_1, None)
+        te_store_2 = te_p2c(raw_2, None)
+        encoded_1 = fp_encode(te_store_1, self.n, self.max_int)
+        encoded_2 = fp_encode(te_store_2, self.n, self.max_int)
+        encrypted_1 = pi_encrypt(self._fpga_pub_key, encoded_1, None, None)
+        encrypted_2 = pi_encrypt(self._fpga_pub_key, encoded_2, None, None)
+        shape_1 = TensorShapeStorage(2, 1)
+        shape_2 = TensorShapeStorage(2, 3)  # passed different shapes
+        res_store, res_shape = pi_add(self._fpga_pub_key, encrypted_1, encrypted_2, shape_1, shape_2, None, None, None)
+        assert res_shape.to_tuple() == (2, 3)
+        received = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, res_store, None, None))
+        for i in range(TEST_SIZE):
+            # row i // 3 of the first operand is paired with every element of that row
+            total = raw_1[i // 3] + raw_2[i]
+            print("i:", i, ", raw result:", total, ", FPGA result:", received[i])
+            assert_diff(total, received[i])
+        print("test passed")
+
+    def test_pi_mul(self):
+        """Multiply an encrypted operand by an encoded operand and check the products.
+
+        The encrypted operand holds 3 values and is declared as (3,); the encoded
+        operand holds TEST_SIZE values and is declared as (2, 3). Element i of the
+        result is compared against raw_1[i % 3] * raw_2[i].
+        """
+        print("\n\n", "*" * 100, "\n\nTest PEN Multiplies FPN Begins")
+        raw_1, raw_2 = generate_rand(3), generate_rand(TEST_SIZE)
+        te_store_1 = te_p2c(raw_1, None)
+        te_store_2 = te_p2c(raw_2, None)
+        encoded_1 = fp_encode(te_store_1, self.n, self.max_int)
+        encoded_2 = fp_encode(te_store_2, self.n, self.max_int)
+        # only the first operand is encrypted; the second is passed to pi_mul as encoded
+        encrypted = pi_encrypt(self._fpga_pub_key, encoded_1, None, None)
+        shape_1 = TensorShapeStorage(3)
+        shape_2 = TensorShapeStorage(2, 3)  # passed different shapes
+        res_store, res_shape = pi_mul(self._fpga_pub_key, encrypted, encoded_2, shape_1, shape_2, None, None, None)
+        assert res_shape.to_tuple() == (2, 3)
+        received = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, res_store, None, None))
+        for i in range(TEST_SIZE):
+            product = raw_1[i % 3] * raw_2[i]
+            print("i:", i, ", raw result:", product, ", FPGA result:", received[i])
+            assert_diff(product, received[i])
+        print("test passed")
+
+    def test_gen_obf_seed(self):
+        """Generate TEST_SIZE obfuscators and check that each has a plausible bit length.
+
+        Every value must be at least 90% of CIPHER_BITS long and at most
+        CIPHER_BITS long.
+        """
+        print("\n\n", "*" * 100, "\n\nTest Generate Obfscator Begins")
+        # why divided by 6, see pi_gen_obf_seed implementation
+        seed_bits = CIPHER_BITS // 6
+        bi_store = pi_gen_obf_seed(None, self._fpga_pub_key, TEST_SIZE, seed_bits, 0, None)
+        obfuscators = bi_c2p(bi_store.bigint_storage, 0, TEST_SIZE)
+        for i in range(TEST_SIZE):
+            print("i:", i, "obfuscator:", obfuscators[i])
+            # lower bound first, then upper bound
+            assert obfuscators[i].bit_length() >= CIPHER_BITS * 0.9
+            assert CIPHER_BITS >= obfuscators[i].bit_length()
+        print("test passed")
+
+    def test_obfuscate(self):
+        """Check that obfuscated ciphertexts still decrypt correctly and match the CPU ciphertexts.
+
+        The same seed (0) is passed to bi_gen_rand and pi_gen_obf_seed; Part 3
+        asserts that CPU encryption using the bi_gen_rand values yields exactly
+        the ciphertexts produced by pi_obfuscate.
+        """
+        print("\n\n", "*" * 100, "\n\nTest Obfuscate Begins")
+
+        # generate big random values
+        seed_bits = CIPHER_BITS // 6
+        bi_rand_store = bi_gen_rand(seed_bits, TEST_SIZE, None, 0, None)
+        bi_rand_vals = bi_c2p(bi_rand_store.bigint_storage, 0, TEST_SIZE)
+        obf_rand_store = pi_gen_obf_seed(None, self._fpga_pub_key, TEST_SIZE, seed_bits, 0, None)
+
+        print("\nPart 1: FPGA encrypt, FPGA decrypt")
+        raw = generate_rand(TEST_SIZE)
+        store = te_p2c(raw, None)
+        encoded = fp_encode(store, self.n, self.max_int)
+        raw_encrypted = pi_encrypt(self._fpga_pub_key, encoded, None, None)
+        encrypted = pi_obfuscate(self._fpga_pub_key, raw_encrypted, obf_rand_store, None, None)
+        decrypted = pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, encrypted, None, None)
+        ref1 = te_c2p(decrypted)
+        for i in range(TEST_SIZE):
+            assert_diff(raw[i], ref1[i])
+            print("i:", i, ", original:", raw[i], ", decrypted:", ref1[i])
+
+        print("\nPart 2: FPGA encrypt, CPU decrypt")
+        tmp_enc, _, tmp_exp = pi_c2p(encrypted)
+        pen_recv = []
+        for i in range(TEST_SIZE):
+            pen_recv.append(PaillierEncryptedNumber(self._pub_key, tmp_enc[i], int(round(tmp_exp[i]))))
+        ref2 = [self._priv_key.decrypt(pen) for pen in pen_recv]
+        for i in range(TEST_SIZE):
+            assert_diff(raw[i], ref2[i])
+            print("i:", i, ", original:", raw[i], ", decrypted:", ref2[i])
+
+        print("\nPart 3: CPU encrypt, FPGA decrypt")
+        cpu_encrypted = []
+        for i in range(TEST_SIZE):
+            # the CPU side obfuscates with the i-th value read back from bi_gen_rand
+            cpu_encrypted.append(self._pub_key.encrypt(raw[i], None, bi_rand_vals[i]))
+        for i in range(TEST_SIZE):
+            fpga_pen, cpu_pen = pen_recv[i], cpu_encrypted[i]
+            print("FPGA: i:", i, ", encoding:", fpga_pen.ciphertext(False), ", exp:", fpga_pen.exponent)
+            print("CPU: i:", i, ", encoding:", cpu_pen.ciphertext(False), ", exp:", cpu_pen.exponent)
+            assert fpga_pen.ciphertext(False) == cpu_pen.ciphertext(False)
+            assert fpga_pen.exponent == cpu_pen.exponent
+        pi_store = pi_p2c(None, cpu_encrypted, RAND_TYPE)
+        fpga_decrypted = pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, pi_store, None, None)
+        ref3 = te_c2p(fpga_decrypted)
+        for i in range(TEST_SIZE):
+            assert_diff(raw[i], ref3[i])
+            print("i:", i, ", original:", raw[i], ", decrypted:", ref3[i])
+
+        print("test passed")
+
+    # tests both PEN and FPN transpose
+    def test_transpose(self):
+        """Transpose a 2x3 encoded store and a 2x3 encrypted store and compare element by element.
+
+        Element (i, j) of the original must equal element (j, i) of the
+        transposed result in every field read back by fp_c2p and pi_c2p.
+        """
+        print("\n\n", "*" * 100, "\n\nTest transpose of both FPN and PEN matrices Begins")
+        raw = generate_rand(TEST_SIZE)
+        # build the encoded store and the encrypted store that will be transposed
+        te_store = te_p2c(raw, None)
+        encoded = fp_encode(te_store, self.n, self.max_int)
+        encrypted = pi_encrypt(self._fpga_pub_key, encoded, None, None)
+        rows, cols = 2, 3
+        shape = TensorShapeStorage(rows, cols)
+        pi_transpose_store, pi_transpose_shape = pi_transpose(encrypted, shape, None, None, None)
+        fp_transpose_store, fp_transpose_shape = fp_transpose(encoded, shape, None, None, None)
+        print("original shape:", shape.to_tuple(), ", transposed FPN shape:", fp_transpose_shape.to_tuple(),
+              ", transposed PEN shape", pi_transpose_shape.to_tuple())
+        expected_shape = (cols, rows)
+        assert pi_transpose_shape.to_tuple() == expected_shape
+        assert fp_transpose_shape.to_tuple() == expected_shape
+        fp_original = fp_c2p(encoded)
+        fp_transposed = fp_c2p(fp_transpose_store)
+        pi_original_cipher, pi_original_base, pi_original_exp = pi_c2p(encrypted)
+        pi_transposed_cipher, pi_transposed_base, pi_transposed_exp = pi_c2p(pi_transpose_store)
+        for i in range(rows):
+            for j in range(cols):
+                print("testing index (", i, ", ", j, ")")
+                # flat positions of (i, j) in the original and (j, i) in the transposed data
+                src = i * cols + j
+                dst = j * rows + i
+                assert fp_original[src].encoding == fp_transposed[dst].encoding
+                assert fp_original[src].BASE == fp_transposed[dst].BASE
+                assert fp_original[src].exponent == fp_transposed[dst].exponent
+                assert pi_original_cipher[src] == pi_transposed_cipher[dst]
+                assert pi_original_base[src] == pi_transposed_base[dst]
+                assert pi_original_exp[src] == pi_transposed_exp[dst]
+        print("test passed")
+
+    def test_pi_sum(self):
+        """Sum an encrypted 2x3 matrix along axis 0, along axis 1 and over all elements.
+
+        Each pi_sum result is decrypted with pi_decrypt and compared through
+        assert_diff against the NumPy sum of the same raw values.
+        """
+        print("\n\n", "*" * 100, "\n\nTest Sum Begins")
+        # generate raw data
+        raw = generate_rand(TEST_SIZE)
+        # generate test PaillierEncryptedStorage and its shape
+        te_store = te_p2c(raw, None)
+        encoded = fp_encode(te_store, self.n, self.max_int)
+        encrypted = pi_encrypt(self._fpga_pub_key, encoded, None, None)
+        rows, cols = 2, 3
+        shape = TensorShapeStorage(rows, cols)
+        # reshape once; every CPU reference value below is derived from this matrix
+        raw_matrix = numpy.asarray(raw).reshape(rows, cols)
+
+        print("raw matrix:\n", raw_matrix)
+
+        print("TEST AXIS = 0")
+        res_sum_axis0, res_shape_axis0 = pi_sum(self._fpga_pub_key, encrypted, shape, 0, None, None, None)
+        res_axis0_fpga = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, res_sum_axis0, None, None))
+        res_axis0_cpu = raw_matrix.sum(axis=0)
+        print("result shape:", res_shape_axis0.to_tuple())
+        for i in range(cols):
+            print("column:", i, ", CPU result:", res_axis0_cpu[i], ", FPGA result:", res_axis0_fpga[i])
+            assert_diff(res_axis0_cpu[i], res_axis0_fpga[i])
+
+        print("TEST AXIS = 1")
+        res_sum_axis1, res_shape_axis1 = pi_sum(self._fpga_pub_key, encrypted, shape, 1, None, None, None)
+        res_axis1_fpga = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, res_sum_axis1, None, None))
+        res_axis1_cpu = raw_matrix.sum(axis=1)
+        print("result shape:", res_shape_axis1.to_tuple())
+        for i in range(rows):
+            # summing along axis 1 yields one value per row, so the label is "row"
+            print("row:", i, ", CPU result:", res_axis1_cpu[i], ", FPGA result:", res_axis1_fpga[i])
+            assert_diff(res_axis1_cpu[i], res_axis1_fpga[i])
+
+        print("TEST AXIS = None")
+        res_sum_axis, res_shape_axis = pi_sum(self._fpga_pub_key, encrypted, shape, None, None, None, None)
+        res_axis_fpga = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, res_sum_axis, None, None))
+        res_axis_cpu = raw_matrix.sum()
+        print("result shape:", res_shape_axis.to_tuple())
+        # A full reduction has exactly one reference value, so element 0 is checked
+        # directly; the check must never depend on unrelated size constants.
+        print("result:", 0, ", CPU result:", res_axis_cpu, ", FPGA result:", res_axis_fpga[0])
+        assert_diff(res_axis_cpu, res_axis_fpga[0])
+        print("test passed")
+
+    def test_pi_matmul(self):
+        """Matrix-multiply an encrypted 2x3 operand by an encoded 3x2 operand and compare with NumPy."""
+        print("\n\n", "*" * 100, "\n\nTest PEN Matrix_Multiplies FPN Begins")
+        raw_1, raw_2 = generate_rand(TEST_SIZE), generate_rand(TEST_SIZE)
+        # build the two operands: the left one encrypted, the right one only encoded
+        te_store_1 = te_p2c(raw_1, None)
+        te_store_2 = te_p2c(raw_2, None)
+        encoded_1 = fp_encode(te_store_1, self.n, self.max_int)
+        encoded_2 = fp_encode(te_store_2, self.n, self.max_int)
+        encrypted = pi_encrypt(self._fpga_pub_key, encoded_1, None, None)
+        P, Q, R, S = 2, 3, 3, 2
+        shape_1 = TensorShapeStorage(P, Q)
+        shape_2 = TensorShapeStorage(R, S)
+        # run the matmul, then decrypt the product
+        res_store, res_shape = pi_matmul(self._fpga_pub_key, encrypted, encoded_2, shape_1, shape_2, None, None, None)
+        decrypted = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, res_store, None, None))
+        res = numpy.asarray(decrypted).reshape(res_shape.to_tuple())
+        left = numpy.asarray(raw_1).reshape(P, Q)
+        right = numpy.asarray(raw_2).reshape(R, S)
+        ref = left @ right
+        print("FPGA result shape:", res_shape.to_tuple(), ", CPU result shape:", ref.shape)
+        assert res_shape.to_tuple() == ref.shape
+        print("CPU result:\n", ref, "\nFPGA result:\n", res)
+        assert_ndarray_diff(res, ref)
+        print("test passed")
+
+    def test_combination(self):
+        """Chain add, element-wise multiply and matmul on encrypted data and check every step.
+
+        Computes ((A1 + A3) * A4) @ A2, where A1 and A3 are encrypted and
+        obfuscated 2x3 matrices, A4 is an encoded 2x3 matrix and A2 is an encoded
+        3x2 matrix. Each intermediate result is decrypted and compared with NumPy.
+        """
+        print("\n\n", "*" * 100, "\n\nTest Combination Begins")
+
+        # generate operands
+        raw_1, raw_3 = generate_rand(TEST_SIZE), generate_rand(TEST_SIZE)
+        if RAND_TYPE == INT64_TYPE:
+            # the integer multiplicands are reduced modulo 16384
+            raw_2 = [i % 16384 for i in generate_rand(TEST_SIZE)]
+            raw_4 = [i % 16384 for i in generate_rand(TEST_SIZE)]
+        elif RAND_TYPE == FLOAT_TYPE:
+            raw_2 = generate_rand(TEST_SIZE)
+            raw_4 = generate_rand(TEST_SIZE)
+        else:
+            raise PermissionError("Invalid Data Type")
+        print('Raw data:\nraw_1:', raw_1, '\nraw_2:', raw_2, '\nraw_3:', raw_3, '\nraw_4:', raw_4)
+
+        # generate shapes and NumPy arrays
+        rows, cols = 2, 3
+        array_1 = numpy.asarray(raw_1).reshape(rows, cols)
+        array_2 = numpy.asarray(raw_2).reshape(cols, rows)
+        array_3 = numpy.asarray(raw_3).reshape(rows, cols)
+        array_4 = numpy.asarray(raw_4).reshape(rows, cols)
+        shape_1 = TensorShapeStorage(rows, cols)
+        shape_2 = TensorShapeStorage(cols, rows)
+        shape_3 = TensorShapeStorage(rows, cols)
+        shape_4 = TensorShapeStorage(rows, cols)
+
+        # transfer and encode
+        te_store_1 = te_p2c(raw_1, None)
+        te_store_2 = te_p2c(raw_2, None)
+        te_store_3 = te_p2c(raw_3, None)
+        te_store_4 = te_p2c(raw_4, None)
+        encoded_1 = fp_encode(te_store_1, self.n, self.max_int)
+        encoded_2 = fp_encode(te_store_2, self.n, self.max_int)
+        encoded_3 = fp_encode(te_store_3, self.n, self.max_int)
+        encoded_4 = fp_encode(te_store_4, self.n, self.max_int)
+
+        # perform encrypt and obfs
+        encrypted_old_1 = pi_encrypt(self._fpga_pub_key, encoded_1, None, None)
+        encrypted_old_3 = pi_encrypt(self._fpga_pub_key, encoded_3, None, None)
+        seed_bits = CIPHER_BITS // 6
+        rand_store_1 = pi_gen_obf_seed(None, self._fpga_pub_key, TEST_SIZE, seed_bits, time.time(), None)
+        rand_store_3 = pi_gen_obf_seed(None, self._fpga_pub_key, TEST_SIZE, seed_bits, time.time(), None)
+        encrypted_1 = pi_obfuscate(self._fpga_pub_key, encrypted_old_1, rand_store_1, None, None)
+        encrypted_3 = pi_obfuscate(self._fpga_pub_key, encrypted_old_3, rand_store_3, None, None)
+
+        def decrypt_as_array(cipher_store, cipher_shape):
+            # decrypt a result store and give it the shape reported alongside it
+            plain = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, cipher_store, None, None))
+            return plain.reshape(cipher_shape.to_tuple())
+
+        print("Perform Add")
+        add_res_store, add_res_shape = pi_add(self._fpga_pub_key, encrypted_1, encrypted_3, shape_1, shape_3, None,
+                                              None, None)
+        add_res = decrypt_as_array(add_res_store, add_res_shape)
+        add_ref = array_1 + array_3
+        print("FPGA intermediate result:", add_res)
+        print("NumPy intermediate result:", add_ref)
+        assert_ndarray_diff(add_res, add_ref)
+
+        print("Perform Mul")
+        # the still-encrypted sum is fed straight into the multiplication
+        mul_res_store, mul_res_shape = pi_mul(self._fpga_pub_key, add_res_store, encoded_4, add_res_shape, shape_4,
+                                              None, None, None)
+        mul_res = decrypt_as_array(mul_res_store, mul_res_shape)
+        mul_ref = (array_1 + array_3) * array_4
+        print("FPGA intermediate result:", mul_res)
+        print("NumPy intermediate result:", mul_ref)
+        assert_ndarray_diff(mul_res, mul_ref)
+
+        print("Perform Matmul")
+        matmul_res_store, matmul_res_shape = pi_matmul(self._fpga_pub_key, mul_res_store, encoded_2, mul_res_shape,
+                                                       shape_2, None, None, None)
+        matmul_res = decrypt_as_array(matmul_res_store, matmul_res_shape)
+        matmul_ref = ((array_1 + array_3) * array_4) @ array_2
+        print("FPGA result shape:", matmul_res_shape.to_tuple(), ", CPU result shape:", matmul_ref.shape)
+        print("CPU result:\n", matmul_ref)
+        print("FPGA result:\n", matmul_res)
+        assert_ndarray_diff(matmul_res, matmul_ref)
+
+        print("test passed")
+
+    def test_c2bytes_and_bytes2c(self):
+        """Serialise each store kind to bytes, rebuild it, and check the data survives.
+
+        Covers the te_*, fp_* and pi_* pairs of c2bytes/bytes2c in that order.
+        """
+        print("\n\n", "*" * 100, "\n\nTest bytes and c transformation begins")
+
+        raw = generate_rand(TEST_SIZE)
+        print("Raw Data:", raw)
+
+        print("\nPart 1: test te_c2bytes and te_bytes2c")
+        te_store = te_p2c(raw, None)
+        te_bytes = te_c2bytes(te_store, None)
+        te_store_recv = te_bytes2c(te_bytes, te_store)
+        te_ref = list(te_c2p(te_store_recv))
+        print("Bytes Representation:", te_bytes)
+        print("Received data:", te_ref)
+        for checked in (te_store, te_store_recv):
+            assert checked.data_type == RAND_TYPE
+        for i in range(TEST_SIZE):
+            assert_diff(raw[i], te_ref[i])
+
+        print("\nPart 2: test fp_c2bytes and fp_bytes2c")
+        fp_store = fp_encode(te_store, self.n, self.max_int)
+        fp_bytes = fp_c2bytes(fp_store, None)
+        fp_store_recv = fp_bytes2c(fp_bytes, fp_store)
+        fp_decoded = fp_decode(fp_store_recv, None, None)
+        fp_ref = list(te_c2p(fp_decoded))
+        # only a fixed slice of the byte string is printed
+        print("Bytes Representation (excerpt):", fp_bytes[1888:1999])
+        print("Received data:", fp_ref)
+        for checked in (fp_store, fp_store_recv):
+            assert checked.data_type == RAND_TYPE
+        for i in range(TEST_SIZE):
+            assert_diff(raw[i], fp_ref[i])
+
+        print("\nPart 3: test pi_c2bytes and pi_bytes2c")
+        pi_store = pi_encrypt(self._fpga_pub_key, fp_store, None, None)
+        pi_bytes = pi_c2bytes(pi_store, None)
+        pi_store_recv = pi_bytes2c(pi_bytes, pi_store)
+        pi_decrypted = pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, pi_store_recv, None, None)
+        pi_ref = list(te_c2p(pi_decrypted))
+        print("Bytes Representation (excerpt):", pi_bytes[1888:1999])
+        print("Received data:", pi_ref)
+        for checked in (pi_store, pi_store_recv):
+            assert checked.data_type == RAND_TYPE
+        for i in range(TEST_SIZE):
+            assert_diff(raw[i], pi_ref[i])
+
+        print("test passed")
+
+    def test_fp_slice(self):
+        """Slice an encoded 3x4 matrix along axis 0 and axis 1 and compare with the raw values.
+
+        Axis 0 takes rows [2, 3) and axis 1 takes columns [1, 3).
+        """
+        print("\n\n", "*" * 100, "\n\nTest fp_slice begins")
+        rows, cols = 3, 4
+        shape = [rows, cols]
+        begin_h, end_h = 2, 3
+        begin_v, end_v = 1, 3
+        element_count = rows * cols
+        raw = numpy.asarray(generate_rand(element_count)).reshape(shape)
+        store = te_p2c(raw, None)
+        encoded = fp_encode(store, self.n, self.max_int)
+        slice_h_store, slice_h_shape = fp_slice(encoded, TensorShapeStorage(*shape), begin_h, end_h, 0, None, None,
+                                                None)
+        slice_v_store, slice_v_shape = fp_slice(encoded, TensorShapeStorage(*shape), begin_v, end_v, 1, None, None,
+                                                None)
+        decoded_h = te_c2p(fp_decode(slice_h_store, None, None))
+        recv_h = numpy.asarray(decoded_h).reshape(slice_h_shape)
+        decoded_v = te_c2p(fp_decode(slice_v_store, None, None))
+        recv_v = numpy.asarray(decoded_v).reshape(slice_v_shape)
+        print("raw array:\n", raw)
+        print("horizontal slice:\n", recv_h)
+        print("vertical slice:\n", recv_v)
+        # row slice: result row k corresponds to source row begin_h + k
+        for k, src_row in enumerate(range(begin_h, end_h)):
+            for j in range(cols):
+                assert_diff(raw[src_row][j], recv_h[k][j])
+        # column slice: result column k corresponds to source column begin_v + k
+        for i in range(rows):
+            for k, src_col in enumerate(range(begin_v, end_v)):
+                assert_diff(raw[i][src_col], recv_v[i][k])
+        assert slice_h_store.data_type == RAND_TYPE
+        assert slice_v_store.data_type == RAND_TYPE
+        print("test passed")
+
+    def test_pi_slice(self):
+        """Slice an encrypted 3x4 matrix along axis 0 and axis 1 and compare with the raw values.
+
+        Axis 0 takes rows [2, 3) and axis 1 takes columns [1, 3); each slice is
+        decrypted before the comparison.
+        """
+        print("\n\n", "*" * 100, "\n\nTest pi_slice begins")
+        rows, cols = 3, 4
+        shape = [rows, cols]
+        begin_h, end_h = 2, 3
+        begin_v, end_v = 1, 3
+        element_count = rows * cols
+        raw = numpy.asarray(generate_rand(element_count)).reshape(shape)
+        store = te_p2c(raw, None)
+        encoded = fp_encode(store, self.n, self.max_int)
+        encrypted = pi_encrypt(self._fpga_pub_key, encoded, None, None)
+        slice_h_store, slice_h_shape = pi_slice(encrypted, TensorShapeStorage(*shape), begin_h, end_h, 0, None, None,
+                                                None)
+        slice_v_store, slice_v_shape = pi_slice(encrypted, TensorShapeStorage(*shape), begin_v, end_v, 1, None, None,
+                                                None)
+        decrypted_h = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, slice_h_store, None, None))
+        recv_h = numpy.asarray(decrypted_h).reshape(slice_h_shape)
+        decrypted_v = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, slice_v_store, None, None))
+        recv_v = numpy.asarray(decrypted_v).reshape(slice_v_shape)
+        print("raw array:\n", raw)
+        print("horizontal slice:\n", recv_h)
+        print("vertical slice:\n", recv_v)
+        # row slice: result row k corresponds to source row begin_h + k
+        for k, src_row in enumerate(range(begin_h, end_h)):
+            for j in range(cols):
+                assert_diff(raw[src_row][j], recv_h[k][j])
+        # column slice: result column k corresponds to source column begin_v + k
+        for i in range(rows):
+            for k, src_col in enumerate(range(begin_v, end_v)):
+                assert_diff(raw[i][src_col], recv_v[i][k])
+        assert slice_h_store.data_type == RAND_TYPE
+        assert slice_v_store.data_type == RAND_TYPE
+        print("test passed")
+
+    def test_pi_reshape(self):
+        """Reshape an encrypted store from (2, 3) to (3, 2) and check the data is unchanged.
+
+        The input store is also passed as the result store, and the test asserts
+        that pi_reshape returns that very object.
+        """
+        print("\n\n", "*" * 100, "\n\nTest pi_reshape begins")
+        raw = generate_rand(TEST_SIZE)
+        store = te_p2c(raw, None)
+        encoded = fp_encode(store, self.n, self.max_int)
+        encrypted = pi_encrypt(self._fpga_pub_key, encoded, None, None)
+        old_shape = TensorShapeStorage(2, 3)
+        new_shape = TensorShapeStorage(3, 2)
+        # PREVENT DOUBLE FREE: option 1 - hand the input store in as the result store
+        new_store_res, new_shape_res = pi_reshape(encrypted, old_shape, new_shape, encrypted, None,
+                                                  None)
+        print("PyObject ids before and after reshape:", id(new_store_res), id(encrypted))
+        assert new_store_res is encrypted
+        # PREVENT DOUBLE FREE: option 2 (not used here) would be to set
+        # encrypted.exp_storage, encrypted.pen_storage and encrypted.base_storage to None
+
+        print("original shape:", old_shape.to_tuple(), ", returned shape:", new_shape_res.to_tuple(),
+              ", expected new shape:", new_shape.to_tuple())
+        assert new_shape.to_tuple() == new_shape_res.to_tuple()
+        decrypted = pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, new_store_res, None, None)
+        recv = te_c2p(decrypted)
+        for i in range(TEST_SIZE):
+            assert_diff(raw[i], recv[i])
+        for checked in (encoded, encrypted, new_store_res):
+            assert checked.data_type == RAND_TYPE
+        print("raw tensor:\n", numpy.asarray(raw).reshape(old_shape.to_tuple()))
+        print("reshaped tensor:\n", numpy.asarray(recv).reshape(new_shape_res.to_tuple()))
+
+        print("test passed")
+
+    def test_fp_mul(self):
+        """Multiply two encoded 2x3 stores element-wise and compare with a CPU reference.
+
+        The CPU reference multiplies the encodings modulo self.n and adds the
+        exponents; both the decoded values and the raw encoded fields are checked.
+        """
+        print("\n\n", "*" * 100, "\n\nTest fp_mul begins")
+        raw_1, raw_2 = generate_rand(TEST_SIZE), generate_rand(TEST_SIZE)
+        store_1 = te_p2c(raw_1, None)
+        store_2 = te_p2c(raw_2, None)
+        encoded_1 = fp_encode(store_1, self.n, self.max_int)
+        encoded_2 = fp_encode(store_2, self.n, self.max_int)
+        res_store, res_shape = fp_mul(encoded_1, encoded_2, TensorShapeStorage(2, 3), TensorShapeStorage(2, 3), None,
+                                      None, None)
+        decoded = fp_decode(res_store, None, None)
+        recv = te_c2p(decoded)
+        assert res_shape.to_tuple() == (2, 3)
+        for checked in (encoded_1, encoded_2, res_store, decoded):
+            assert checked.data_type == RAND_TYPE
+
+        cpu_encoded_1 = [FixedPointNumber.encode(v, self.n, self.max_int) for v in raw_1]
+        cpu_encoded_2 = [FixedPointNumber.encode(v, self.n, self.max_int) for v in raw_2]
+        cpu_res = []
+        for i in range(TEST_SIZE):
+            lhs, rhs = cpu_encoded_1[i], cpu_encoded_2[i]
+            # product of the encodings modulo n, with the exponents added
+            cpu_res.append(FixedPointNumber((lhs.encoding * rhs.encoding) % self.n,
+                                            lhs.exponent + rhs.exponent, self.n, self.max_int))
+        cpu_ref = [fpn.decode() for fpn in cpu_res]
+
+        print("FPGA result:", list(recv))
+        print("CPU result:", list(cpu_ref))
+
+        res_fp = fp_c2p(res_store)
+        for i in range(TEST_SIZE):
+            assert_diff(recv[i], cpu_ref[i])
+            assert_diff(res_fp[i].encoding, cpu_res[i].encoding)
+            assert res_fp[i].BASE == cpu_res[i].BASE
+            assert res_fp[i].exponent == cpu_res[i].exponent
+
+        print("test passed")
+
+    def test_te_c2p_first(self):
+        """Check that te_c2p_first returns the first value that te_p2c stored."""
+        print("\n\n", "*" * 100, "\n\nTest te_c2p_first begins")
+
+        raw = generate_rand(TEST_SIZE)
+        store = te_p2c(raw, None)
+        expected_first = raw[0]
+        print(expected_first, te_c2p_first(store))
+        # exact equality is required here, not an approximate comparison
+        assert expected_first == te_c2p_first(store)
+
+        print("test passed")
+
+    def test_malloc(self):
+        """Run an encode/encrypt/obfuscate/decrypt round trip on explicitly allocated stores.
+
+        The four stores are created up front with the *_alloc functions on
+        MEM_HOST, passed to the operators in place of None, and released with
+        the matching *_free functions at the end.
+        """
+        print("\n\n", "*" * 100, "\n\nTest malloc begins")
+
+        # one pre-allocated store per storage kind
+        bi_store = bi_alloc(None, TEST_SIZE, PLAIN_BYTE, MEM_HOST)
+        te_store = te_alloc(None, TEST_SIZE, MEM_HOST)
+        fp_store = fp_alloc(None, TEST_SIZE, MEM_HOST)
+        pi_store = pi_alloc(None, TEST_SIZE, MEM_HOST)
+
+        raw = generate_rand(TEST_SIZE)
+        store = te_p2c(raw, te_store)
+        encoded = fp_encode(store, self.n, self.max_int, None, None, fp_store, None)
+        print("PyObject ids before and after encode:", id(encoded), id(fp_store))
+        # fp_encode must hand back the very store object it was given
+        assert id(encoded) == id(fp_store)
+        encrypted = pi_encrypt(self._fpga_pub_key, encoded, pi_store, None)
+        obf_seeds = pi_gen_obf_seed(bi_store, self._fpga_pub_key, TEST_SIZE, CIPHER_BITS // 6, time.time(), None)
+        # pi_store is passed as both the second and the fourth argument
+        # NOTE(review): presumably input and result store, i.e. in-place obfuscation - confirm
+        encrypted = pi_obfuscate(self._fpga_pub_key, pi_store, obf_seeds, pi_store, None)
+        # te_store is passed again here, this time to pi_decrypt
+        decrypted = pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, encrypted, te_store, None)
+        received = te_c2p(decrypted)
+        print("raw data:", raw, "\nreceived data:", list(received))
+        for i in range(TEST_SIZE):
+            assert_diff(raw[i], received[i])
+
+        # release each pre-allocated store with its matching free function
+        bi_free(bi_store)
+        te_free(te_store)
+        fp_free(fp_store)
+        pi_free(pi_store)
+        # NOTE(review): the disabled line below clears fp_store's storage fields;
+        # presumably an alternative guard against a double free - confirm before removing
+        # fp_store.base_storage, fp_store.bigint_storage, fp_store.exp_storage = None, None, None
+
+        print("test passed")
+
+    def test_p2c(self):
+        """Transfer Python lists and 2x3 ndarrays into te/fp/pi stores and read the values back.
+
+        Each of te_p2c, fp_p2c and pi_p2c is exercised once with a list and once
+        with an ndarray built from the same values.
+        """
+        print("\n\n", "*" * 100, "\n\nTest fp_p2c & pi_p2c Begins")
+
+        def report_and_check(label, values):
+            # print the round-tripped values next to the raw ones, then compare them
+            print("raw data:", raw, label, list(values))
+            for i in range(TEST_SIZE):
+                assert_diff(raw[i], values[i])
+
+        print("Part 1.1: test te_p2c for list")
+        raw = generate_rand(TEST_SIZE)
+        te_store = te_p2c(raw, None)
+        received = te_c2p(te_store)
+        report_and_check("\nreceived data:", received)
+
+        print("Part 1.2: test te_p2c for ndarray")
+        np_raw = numpy.asarray(raw).reshape(2, 3)
+        te_store = te_p2c(np_raw, None)
+        received = te_c2p(te_store)
+        report_and_check("\nreceived data:", received)
+
+        print("Part 2.1: test fp_p2c for list")
+        cpu_encoded = [FixedPointNumber.encode(v, self.n, self.max_int) for v in raw]
+        fp_store = fp_p2c(None, cpu_encoded, RAND_TYPE)
+        decoded = te_c2p(fp_decode(fp_store, None, None))
+        report_and_check("\ndecoded data:", decoded)
+
+        print("Part 2.2: test fp_p2c for ndarray")
+        np_cpu_encoded = numpy.asarray(cpu_encoded).reshape(2, 3)
+        fp_store = fp_p2c(None, np_cpu_encoded, RAND_TYPE)
+        decoded = te_c2p(fp_decode(fp_store, None, None))
+        report_and_check("\ndecoded data:", decoded)
+
+        print("Part 3.1: test pi_p2c for list")
+        cpu_encrypted = []
+        for i in range(TEST_SIZE):
+            cpu_encrypted.append(self._pub_key.encrypt(raw[i], None, 0))
+        pi_store = pi_p2c(None, cpu_encrypted, RAND_TYPE)
+        decrypted = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, pi_store, None, None))
+        report_and_check("\ndecrypted data:", decrypted)
+
+        print("Part 3.2: test pi_p2c for ndarray")
+        np_cpu_encrypted = numpy.asarray(cpu_encrypted).reshape(2, 3)
+        pi_store = pi_p2c(None, np_cpu_encrypted, RAND_TYPE)
+        decrypted = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, pi_store, None, None))
+        report_and_check("\ndecrypted data:", decrypted)
+
+        print("test passed")
+
+    def test_tensor(self):
+        """Check each plain-tensor te_* operator against the equivalent NumPy expression."""
+        print("\n\n", "*" * 100, "\n\nTest Tensor begins")
+        if RAND_TYPE == INT64_TYPE:
+            raw_1 = [i % 128 + 1 for i in generate_rand(TEST_SIZE)]  # keeps every integer operand in 1..128
+            raw_2 = [i % 128 + 1 for i in generate_rand(TEST_SIZE)]
+        elif RAND_TYPE == FLOAT_TYPE:
+            raw_1, raw_2 = generate_rand(TEST_SIZE), generate_rand(TEST_SIZE)
+        else:
+            raise TypeError("Invalid Data Type")  # configuration error, not a permission problem
+        rows, cols = 2, 3
+        shape, transposed_shape = TensorShapeStorage(rows, cols), TensorShapeStorage(cols, rows)
+        array_1 = numpy.asarray(raw_1).reshape(shape.to_tuple())
+        array_2 = numpy.asarray(raw_2).reshape(shape.to_tuple())
+        array_3 = array_2.transpose()  # (cols, rows) right-hand operand for the matmul part
+        store_1 = TensorStorage(array_1, TEST_SIZE, MEM_HOST, RAND_TYPE)
+        store_2 = TensorStorage(array_2, TEST_SIZE, MEM_HOST, RAND_TYPE)
+        store_3 = TensorStorage(array_3, TEST_SIZE, MEM_HOST, RAND_TYPE)
+        print("raw data:\n", array_1, "\n", array_2)
+
+        print("Part 1: test shape")
+
+        def __run_test_shape(dims):
+            expected = tuple(dims)  # distinct name so the enclosing ``shape`` storage is not shadowed
+            c_shape = te_p2c_shape(expected, None)
+            py_shape = te_c2p_shape(c_shape)
+            print("compare shapes:", expected, c_shape.to_tuple(), py_shape)
+            assert expected == c_shape.to_tuple()
+            assert expected == py_shape
+
+        __run_test_shape([])
+        __run_test_shape([1])
+        __run_test_shape([1, 2])
+
+        print("Part 2: test te_slice")
+        res_store, res_shape = te_slice(store_1, shape, 1, 2, 0, None, None, None)  # rows 1:2 (axis 0)
+        assert (res_store.data == array_1[1:2]).all()
+        assert res_shape.to_tuple() == (1, cols)
+        res_store, res_shape = te_slice(store_1, shape, 0, 2, 1, None, None, None)  # columns 0:2 (axis 1)
+        assert (res_store.data == array_1[:, 0:2]).all()
+        assert res_shape.to_tuple() == (rows, 2)
+
+        print("Part 3: test te_cat")
+        res_store, res_shape = te_cat([store_1, store_2], 0, None, None)
+        assert (res_store.data == numpy.vstack((array_1, array_2))).all()
+        assert res_shape.to_tuple() == (rows * 2, cols)
+        res_store, res_shape = te_cat([store_1, store_2], 1, None, None)
+        assert (res_store.data == numpy.hstack((array_1, array_2))).all()
+        assert res_shape.to_tuple() == (rows, cols * 2)
+
+        print("Part 4: test te_pow")
+        res_store, res_shape = te_pow(store_1, 9, shape, None, None, None)
+        assert (res_store.data == array_1 ** 9).all()
+        assert res_shape.to_tuple() == shape.to_tuple()
+
+        print("Part 5: test te_add")
+        res_store, res_shape = te_add(store_1, store_2, shape, shape, None, None, None)
+        assert (res_store.data == array_1 + array_2).all()
+        assert res_shape.to_tuple() == shape.to_tuple()
+
+        print("Part 6: test te_mul")
+        res_store, res_shape = te_mul(store_1, store_2, shape, shape, None, None, None)
+        assert (res_store.data == array_1 * array_2).all()
+        assert res_shape.to_tuple() == shape.to_tuple()
+
+        print("Part 7: test te_truediv")
+        res_store, res_shape = te_truediv(store_1, store_2, shape, shape, None, None, None)
+        assert (res_store.data == array_1 / array_2).all()
+        assert res_shape.to_tuple() == shape.to_tuple()
+
+        print("Part 8: test te_floordiv")
+        res_store, res_shape = te_floordiv(store_1, store_2, shape, shape, None, None, None)
+        assert (res_store.data == array_1 // array_2).all()
+        assert res_shape.to_tuple() == shape.to_tuple()
+
+        print("Part 9: test te_sub")
+        res_store, res_shape = te_sub(store_1, store_2, shape, shape, None, None, None)
+        assert (res_store.data == array_1 - array_2).all()
+        assert res_shape.to_tuple() == shape.to_tuple()
+
+        print("Part 10: test te_matmul")
+        res_store, res_shape = te_matmul(store_1, store_3, shape, transposed_shape, None, None, None)
+        print(res_store.data)
+        assert_ndarray_diff(res_store.data, array_1 @ array_2.transpose())
+        assert res_shape.to_tuple() == (rows, rows)
+
+        print("Part 11: test te_abs")
+        res_store, res_shape = te_abs(store_1, shape, None, None, None)
+        assert (res_store.data == abs(array_1)).all()
+        assert res_shape.to_tuple() == shape.to_tuple()
+
+        print("Part 12: test te_neg")
+        res_store, res_shape = te_neg(store_1, shape, None, None, None)
+        assert (res_store.data == -array_1).all()
+        assert res_shape.to_tuple() == shape.to_tuple()
+
+        print("Part 13: test te_transpose")
+        res_store, res_shape = te_transpose(store_1, shape, None, None, None)
+        assert (res_store.data == array_1.transpose()).all()
+        assert res_shape.to_tuple() == transposed_shape.to_tuple()
+
+        print("Part 14: test te_sum")
+        res_store, res_shape = te_sum(store_1, shape, None, None, None, None)  # axis None: sum of all elements
+        assert (res_store.data == array_1.sum()).all()
+        assert res_shape.to_tuple() == ()
+        res_store, res_shape = te_sum(store_1, shape, 0, None, None, None)
+        assert (res_store.data == array_1.sum(axis=0)).all()
+        assert res_shape.to_tuple() == (cols,)
+        res_store, res_shape = te_sum(store_1, shape, 1, None, None, None)
+        assert (res_store.data == array_1.sum(axis=1)).all()
+        assert res_shape.to_tuple() == (rows,)
+
+        print("Part 15: test te_reshape")
+        res_store, res_shape = te_reshape(store_1, shape, transposed_shape, None, None, None)
+        assert (res_store.data == array_1.reshape(transposed_shape.to_tuple())).all()
+        assert res_shape.to_tuple() == transposed_shape.to_tuple()
+
+        print("Part 16: test te_exp")
+        res_store, res_shape = te_exp(store_1, shape, None, None, None)
+        assert (res_store.data == numpy.exp(array_1)).all()
+        assert res_shape.to_tuple() == shape.to_tuple()
+
+        print("Part 17: test te_hstack")
+        res_store, res_shape = te_hstack(store_1, store_2, shape, shape, None, None, None)
+        assert (res_store.data == numpy.hstack((array_1, array_2))).all()
+        assert res_shape.to_tuple() == (rows, cols * 2)
+
+        print("Test passed")
+
+    def test_matmul_fix(self):  # regression test: encrypted add -> mul -> matmul, each step checked against NumPy
+        print("\n\n", "*" * 100, "\n\nTest matmul_fix Begins")
+        print("This test is to test whether the previous overflow bug in matmul has been fixed")
+
+        # fixed operands instead of random data (presumably the values that exposed the overflow; TODO confirm)
+        raw_1 = [-6.328172916615867, -2.8424299647675904, 5.161324580891171, -0.23598534366587853, 0.8092957262188305,
+                 19.50497470592641]
+        raw_2 = [-0.048743928478232584, 6.191889562038381, 2.7177577835259017, 17.09697900858307, 11.31935499510339,
+                 -4.881758293445916]
+        raw_3 = [14.051643909583548, 5.246105161671397, 6.764067053406746, 4.727717881071932, -6.361020843266641,
+                 -12.94175161066905]
+        raw_4 = [-0.003912522017777569, 14.519125724575714, -5.401608455748054, 13.918193685722846, 5.97460357170185,
+                 -3.960383753671568]
+
+        print('Raw data:\n', raw_1, '\n', raw_2, '\n', raw_3, '\n', raw_4)
+
+        # NumPy reference arrays and matching shape storages; operand 2 is laid out (cols, rows) for the matmul
+        rows, cols = 2, 3
+        array_1, array_2 = numpy.asarray(raw_1).reshape(rows, cols), numpy.asarray(raw_2).reshape(cols, rows)
+        array_3, array_4 = numpy.asarray(raw_3).reshape(rows, cols), numpy.asarray(raw_4).reshape(rows, cols)
+        shape_1, shape_2 = TensorShapeStorage(rows, cols), TensorShapeStorage(cols, rows)
+        shape_3, shape_4 = TensorShapeStorage(rows, cols), TensorShapeStorage(rows, cols)
+
+        # copy the raw lists into tensor storages and encode all four as fixed-point numbers
+        te_store_1, te_store_2 = te_p2c(raw_1, None), te_p2c(raw_2, None)
+        te_store_3, te_store_4 = te_p2c(raw_3, None), te_p2c(raw_4, None)
+        encoded_1, encoded_2 = fp_encode(te_store_1, self.n, self.max_int), fp_encode(te_store_2, self.n, self.max_int)
+        encoded_3, encoded_4 = fp_encode(te_store_3, self.n, self.max_int), fp_encode(te_store_4, self.n, self.max_int)
+
+        # encrypt operands 1 and 3 only, then obfuscate each ciphertext with its own generated seeds
+        encrypted_old_1 = pi_encrypt(self._fpga_pub_key, encoded_1, None, None)
+        encrypted_old_3 = pi_encrypt(self._fpga_pub_key, encoded_3, None, None)
+        rand_store_1 = pi_gen_obf_seed(None, self._fpga_pub_key, TEST_SIZE, CIPHER_BITS // 6, time.time(), None)
+        rand_store_3 = pi_gen_obf_seed(None, self._fpga_pub_key, TEST_SIZE, CIPHER_BITS // 6, time.time(), None)
+        encrypted_1 = pi_obfuscate(self._fpga_pub_key, encrypted_old_1, rand_store_1, None, None)
+        encrypted_3 = pi_obfuscate(self._fpga_pub_key, encrypted_old_3, rand_store_3, None, None)
+
+        print("Perform Add")  # ciphertext + ciphertext
+        add_res_store, add_res_shape = pi_add(self._fpga_pub_key, encrypted_1, encrypted_3, shape_1, shape_3, None,
+                                              None, None)
+        add_res = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, add_res_store, None, None)).reshape(
+            add_res_shape.to_tuple())
+        add_ref = array_1 + array_3
+        print("FPGA intermediate result:", add_res)
+        print("NumPy intermediate result:", add_ref)
+        assert_ndarray_diff(add_res, add_ref)
+
+        print("Perform Mul")  # encrypted sum times the fixed-point (unencrypted) operand 4
+        mul_res_store, mul_res_shape = pi_mul(self._fpga_pub_key, add_res_store, encoded_4, add_res_shape, shape_4,
+                                              None, None, None)
+        mul_res = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, mul_res_store, None, None)).reshape(
+            mul_res_shape.to_tuple())
+        mul_ref = (array_1 + array_3) * array_4
+        print("FPGA intermediate result:", mul_res)
+        print("NumPy intermediate result:", mul_ref)
+        assert_ndarray_diff(mul_res, mul_ref)
+
+        # The following code is to dump PEN and FPN storages into stdout
+        # print("n (big endian bytes):", self._pub_key.n.to_bytes(CIPHER_BYTE, 'big').hex())
+        # print("nsquare (big endian bytes):", self._pub_key.nsquare.to_bytes(CIPHER_BYTE, 'big').hex())
+        # fp_list = fp_c2p(encoded_2)
+        # pi_cipher, pi_base, pi_exp = pi_c2p(mul_res_store)
+        # print("\n\n>>>>>>>>>>>>>> dumping pen storage\n")
+        # for i in range(TEST_SIZE):
+        #     print("=====================id:", i)
+        #     print("PEN cipher (big endian bytes):", pi_cipher[i].to_bytes(CIPHER_BYTE, 'big').hex())
+        #     print("PEN base (decimal):", pi_base[i])
+        #     print("PEN exponent (decimal):", pi_exp[i])
+        # print("\n\n>>>>>>>>>>>>>> dumping fpn storage\n")
+        # for i in range(TEST_SIZE):
+        #     print("=====================id:", i)
+        #     print("FPN encoding (big endian bytes):", fp_list[i].encoding.to_bytes(CIPHER_BYTE, 'big').hex())
+        #     print("FPN base (decimal):", fp_list[i].BASE)
+        #     print("FPN exponent (decimal):", fp_list[i].exponent)
+
+        # The following code is essentially to decrypt and encrypt again.
+        # However, the numbers might be truncated so that the overflow could be mitigated
+        # tmp_te_store = pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, mul_res_store, None, None)
+        # mul_res_store = pi_encrypt(self._fpga_pub_key, fp_encode(tmp_te_store, self.n, self.max_int), None, None)
+        # mul_res_store = pi_obfuscate(self._fpga_pub_key, mul_res_store, rand_store_1, None, None)
+
+        print("Perform Matmul: PEN shape (2, 3), FPN shape (3, 2)")  # right operand encoded_2 is unencrypted too
+        matmul_res_store, matmul_res_shape = pi_matmul(self._fpga_pub_key, mul_res_store, encoded_2, mul_res_shape,
+                                                       shape_2, None, None, None)
+        matmul_res = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, matmul_res_store, None, None)).reshape(
+            matmul_res_shape.to_tuple())
+        matmul_ref = ((array_1 + array_3) * array_4) @ array_2
+        print("FPGA result shape:", matmul_res_shape.to_tuple(), ", CPU result shape:", matmul_ref.shape)
+        print("CPU result:\n", matmul_ref)
+        print("FPGA result:\n", matmul_res)
+        assert_ndarray_diff(matmul_res, matmul_ref)
+
+        print("test passed")
+
+    def test_te_vertical_slice(self):
+        """Slice columns 2:3 (axis 1) of a 2x3 tensor and round-trip the slice through te_p2c/te_c2p."""
+        print("\n\n", "*" * 100, "\n\nTest Tensor Vertical Slice Begins")
+        shape = (2, 3)
+        np_raw = numpy.asarray(generate_rand(TEST_SIZE)).reshape(shape)
+        print("raw data:\n", np_raw)
+        np_raw_store = TensorStorage(np_raw, TEST_SIZE, MEM_HOST, RAND_TYPE)
+        np_slice_store, np_slice_shape = te_slice(np_raw_store, TensorShapeStorage(*shape), 2, 3, 1, None, None, None)
+        print("numpy slice data:\n", np_slice_store.data)
+        slice_recv = te_c2p(te_p2c(np_slice_store.data, None)).reshape(np_slice_shape.to_tuple())  # plain tuple
+        print("received slice data:\n", slice_recv)
+        assert_ndarray_diff(np_slice_store.data, slice_recv)
+        print("Test Passed")
+
+    def test_encode_precision_1(self):
+        """Encode 19.12634 with 1 as fp_encode's fourth argument and expect 19 after decoding."""
+        print("\n\n", "*" * 100, "\n\nTesting encode with precision 1")
+        source_values = [19.12634]
+        # encode -> decode round trip; the trailing 1 is the precision named in the banner above
+        encoded = fp_encode(te_p2c(source_values, None), self.n, self.max_int, 1)
+        decoded_values = te_c2p(fp_decode(encoded, None, None))
+        print("result:", decoded_values)
+        assert decoded_values[0] == 19
+        print("Test passed")
+
+    def test_matmul_limits(self):  # informational: a mismatch is reported and ends the loop, it does not fail the test
+        print("\n\n", "*" * 100, "\n\nTest after how many matmul would cause our internal data structure overflow")
+        shape_tuple = (TEST_SIZE // 2, TEST_SIZE // 2)
+        shape_store = TensorShapeStorage(*shape_tuple)
+        shape_size = functools.reduce(operator.mul, [*shape_tuple], 1)
+        raw_1, raw_2 = [random.gauss(0, 1) for _ in range(shape_size)], [random.gauss(0, 1) for _ in range(shape_size)]
+        left_array, right_array = numpy.asarray(raw_1).reshape(shape_tuple), numpy.asarray(raw_2).reshape(shape_tuple)
+
+        # encode both operands; only the left one is encrypted and obfuscated, the right one stays fixed-point
+        left_store = pi_encrypt(self._fpga_pub_key, fp_encode(te_p2c(raw_1, None), self.n, self.max_int), None, None)
+        obf_seeds = pi_gen_obf_seed(None, self._fpga_pub_key, shape_size, CIPHER_BITS // 6, time.time(), None)
+        left_store = pi_obfuscate(self._fpga_pub_key, left_store, obf_seeds, left_store, None)
+        right_store = fp_encode(te_p2c(raw_2, None), self.n, self.max_int)
+
+        for i in range(1, 100):  # at most 99 chained matmuls
+            # dump the exponents and bases of both operands before this iteration's matmul
+            print("\n>>>>>>>>>>>>>>> iteration:", i)
+            _, base, exp = pi_c2p(left_store)
+            fp_py_store = fp_c2p(right_store)
+            all_exponents = [*exp, *[v.exponent for v in fp_py_store]]
+            max_exponent = max(*all_exponents)
+            if i == 1:  # remember the starting exponent for the final report
+                initial_max_exp = max_exponent
+            print("all exponents:", all_exponents)
+            print("max base:", max(*base, *[v.BASE for v in fp_py_store]), ", max exponent:", max_exponent)
+
+            # advance the NumPy reference and the accelerator by one matmul; the product becomes the new left operand
+            left_array = left_array @ right_array
+            left_store, tmp_shape = pi_matmul(self._fpga_pub_key, left_store, right_store, shape_store, shape_store,
+                                              left_store, None, None)
+
+            # Get matmul result of the current iteration and compare
+            tmp_res = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, left_store, None, None)).reshape(
+                tmp_shape.to_tuple())
+            print("FPGA result:\n", tmp_res, "\nCPU result:\n", left_array)
+            try:
+                assert_ndarray_diff(tmp_res, left_array)
+            except AssertionError:  # first mismatch: print diagnostics and stop iterating
+                final_exponents = [*pi_c2p(left_store)[2], *[v.exponent for v in fp_c2p(right_store)]]
+                final_max_exp = max(*final_exponents)
+                print("final exponents:", final_exponents)
+                print("initial max exponent:", initial_max_exp, ", final max exponent", final_max_exp)
+                print(">>>>> FPGA and CPU results didn't match at iteration #{}.".format(i))
+                # The following assertions are deprecated as we treat max_exp for FPN and PEN separately
+                # assert 256 <= final_max_exp < 512
+                # assert initial_max_exp * int(round(2 ** i)) == final_max_exp
+                # assert i == math.ceil(8 - math.log2(initial_max_exp))
+                break
+
+        print("Test passed")
+
+    def test_fp_cat(self):
+        """Exercise fp_cat on two fixed-point encoded 2x3 tensors.
+
+        Each axis is concatenated in turn; the result is decoded, reshaped to the
+        shape reported by fp_cat and compared with numpy.concatenate on the raw arrays.
+        """
+        print("\n\n", "*" * 100, "\n\nTest fp_cat begins")
+        dims = (2, 3)
+        shape_store = TensorShapeStorage(*dims)
+        count = int(round(numpy.prod(dims)))
+        array_1 = numpy.asarray(generate_rand(count)).reshape(dims)
+        array_2 = numpy.asarray(generate_rand(count)).reshape(dims)
+        print("array_1:\n", array_1, "\narray_2:\n", array_2)
+
+        # both operands are only encoded (no encryption) before being concatenated
+        encoded_1 = fp_encode(te_p2c(array_1, None), self.n, self.max_int)
+        encoded_2 = fp_encode(te_p2c(array_2, None), self.n, self.max_int)
+
+        # (banner, axis, expected result shape) for the vertical and horizontal cases
+        cases = (
+            ("Part 1: test vertical cat", 0, (dims[0] * 2, dims[1])),
+            ("Part 2: test horizontal cat", 1, (dims[0], dims[1] * 2)),
+        )
+        for banner, axis, expected_shape in cases:
+            print(banner)
+            joined_store, joined_shape = fp_cat([encoded_1, encoded_2], [shape_store, shape_store], axis, None, None)
+            print("result shape:", joined_shape.to_tuple())
+            assert joined_shape.to_tuple() == expected_shape
+            # decode, reshape to the reported shape and compare with NumPy
+            decoded_store = fp_decode(joined_store, None, None)
+            actual = te_c2p(decoded_store).reshape(joined_shape.to_tuple())
+            expected = numpy.concatenate((array_1, array_2), axis)
+            print("result tensor:\n", actual)
+            print("reference tensor:\n", expected)
+            assert_ndarray_diff(actual, expected)
+
+        print("test passed")
+
+    def test_pi_cat(self):  # pi_cat on encrypted 2x3 tensors: horizontal, vertical, then a combined concatenation
+        print("\n\n", "*" * 100, "\n\nTest pi_cat begins")
+        shape_tuple = (2, 3)
+        shape = TensorShapeStorage(*shape_tuple)
+        shape_size = int(round(numpy.prod(shape_tuple)))
+        array_1 = numpy.asarray(generate_rand(shape_size)).reshape(shape_tuple)
+        array_2 = numpy.asarray(generate_rand(shape_size)).reshape(shape_tuple)
+        array_3 = numpy.asarray(generate_rand(shape_size)).reshape(shape_tuple)
+        print("array_1:\n", array_1, "\narray_2:\n", array_2, "\narray_3:\n", array_3)
+
+        fp_store_1 = fp_encode(te_p2c(array_1, None), self.n, self.max_int)
+        fp_store_2 = fp_encode(te_p2c(array_2, None), self.n, self.max_int)
+        fp_store_3 = fp_encode(te_p2c(array_3, None), self.n, self.max_int)
+        pi_store_1 = pi_encrypt(self._fpga_pub_key, fp_store_1, None, None)  # no obfuscation step in this test
+        pi_store_2 = pi_encrypt(self._fpga_pub_key, fp_store_2, None, None)
+        pi_store_3 = pi_encrypt(self._fpga_pub_key, fp_store_3, None, None)
+
+        # test horizontal cat: the three ciphertext tensors joined along axis 1
+        print("Part 1: test horizontal cat")
+        cat_store, cat_shape = pi_cat([pi_store_1, pi_store_2, pi_store_3], [shape, shape, shape], 1, None, None)
+        print("result shape:", cat_shape.to_tuple())
+        assert cat_shape.to_tuple() == (shape_tuple[0], shape_tuple[1] * 3)
+        decrypted = pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, cat_store, None, None)
+        res = te_c2p(decrypted).reshape(cat_shape.to_tuple())
+        ref = numpy.concatenate((array_1, array_2, array_3), 1)
+        print("result tensor:\n", res)
+        print("reference tensor:\n", ref)
+        assert_ndarray_diff(res, ref)
+
+        # test vertical cat: the same operands joined along axis 0
+        print("Part 2: test vertical cat")
+        cat_store, cat_shape = pi_cat([pi_store_1, pi_store_2, pi_store_3], [shape, shape, shape], 0, None, None)
+        print("result shape:", cat_shape.to_tuple())
+        assert cat_shape.to_tuple() == (shape_tuple[0] * 3, shape_tuple[1])
+        decrypted = pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, cat_store, None, None)
+        res = te_c2p(decrypted).reshape(cat_shape.to_tuple())
+        ref = numpy.concatenate((array_1, array_2, array_3), 0)
+        print("result tensor:\n", res)
+        print("reference tensor:\n", ref)
+        assert_ndarray_diff(res, ref)
+
+        # test concat combination: append an encrypted column vector to the vertical result (reuses cat_store/cat_shape)
+        print("Part 3: test combined cat")
+        vector_size = cat_shape.to_tuple()[0]
+        vector_shape = TensorShapeStorage(vector_size, 1)
+        vector = numpy.asarray(generate_rand(vector_size)).reshape(vector_size, 1)
+        print("vector:\n", vector)
+        vector_te_store = te_p2c(vector, None)
+        vector_fp_store = fp_encode(vector_te_store, self.n, self.max_int)
+        vector_pi_store = pi_encrypt(self._fpga_pub_key, vector_fp_store, None, None)
+        cat_store, cat_shape = pi_cat([cat_store, vector_pi_store], [cat_shape, vector_shape], 1, None, None)
+        print("result shape:", cat_shape.to_tuple())
+        assert cat_shape.to_tuple() == (shape_tuple[0] * 3, shape_tuple[1] + 1)
+        decrypted = pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, cat_store, None, None)
+        res = te_c2p(decrypted).reshape(cat_shape.to_tuple())
+        ref = numpy.concatenate([numpy.concatenate((array_1, array_2, array_3), 0), vector], 1)
+        print("result tensor:\n", res)
+        print("reference tensor:\n", ref)
+        assert_ndarray_diff(res, ref)
+
+        print("test passed")
+
+
+if __name__ == "__main__":
+    unittest.main()  # run this module's test cases when it is executed as a script
diff --git a/gpu/tensor/paillier_fpga/paillier_fpga/tests/test_fpga_performance.py b/gpu/tensor/paillier_fpga/paillier_fpga/tests/test_fpga_performance.py
new file mode 100755
index 0000000000..88013399e0
--- /dev/null
+++ b/gpu/tensor/paillier_fpga/paillier_fpga/tests/test_fpga_performance.py
@@ -0,0 +1,324 @@
+#
+#  Copyright 2022 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import numpy
+import unittest
+import random
+import functools
+import time
+
+from fate_arch.tensor.impl.blocks.python_paillier_block import (
+    PaillierKeypair,
+    PaillierEncryptedNumber,
+    FixedPointNumber,
+    gmpy_math,
+)
+
+from ..fpga_engine import (
+    FLOAT_TYPE,
+    INT64_TYPE,
+    pi_p2c_pub_key,
+    pi_p2c_priv_key,
+    pi_h2d_pub_key,
+    pi_h2d_priv_key,
+    TensorShapeStorage,
+    bi_alloc,
+    PLAIN_BYTE,
+    MEM_HOST,
+    te_alloc,
+    fp_alloc,
+    pi_alloc,
+    te_p2c,
+    fp_encode,
+    fp_decode,
+    te_c2p,
+    pi_encrypt,
+    pi_gen_obf_seed,
+    CIPHER_BITS,
+    pi_obfuscate,
+    pi_c2p,
+    pi_decrypt,
+    fp_mul,
+    fp_c2p,
+    pi_add,
+    pi_mul,
+    pi_sum,
+    bi_free,
+    te_free,
+    fp_free,
+    pi_free, initialize_device, pi_matmul,
+)
+
+RAND_TYPE = FLOAT_TYPE  # SWITCH DATA TYPE HERE: EITHER INT64_TYPE OR FLOAT_TYPE
+NUM_ROWS = 666  # row count of the benchmark matrices
+NUM_COLS = 666  # column count of the benchmark matrices
+TEST_SIZE = NUM_ROWS * NUM_COLS  # number of elements in each operand
+ERROR_TOLERANCE = 1e-10  # largest relative difference accepted by assert_diff
+
+
+def generate_rand(test_size):
+    """Draw ``test_size`` random samples; RAND_TYPE picks Gaussian floats or bounded integers."""
+    if RAND_TYPE == FLOAT_TYPE:
+        return numpy.random.normal(0, 5, test_size)  # mean 0, standard deviation 5
+    if RAND_TYPE != INT64_TYPE:
+        raise TypeError("Invalid data type")
+    return numpy.random.randint(-2 ** 10, 2 ** 10, test_size)  # integers in [-1024, 1024)
+
+
+def assert_diff(res, ref):
+    """Assert both values are zero, or that their difference is within ERROR_TOLERANCE relative to each."""
+    if res != 0 and ref != 0:
+        delta = res - ref
+        for scale in (res, ref):  # the relative error must hold against either value
+            assert abs(delta / scale) < ERROR_TOLERANCE
+    else:
+        assert res == 0 and ref == 0  # a zero on one side requires an exact zero on the other
+
+
+def assert_ndarray_diff(res, ref):
+    """Raise on a shape mismatch; element mismatches found by assert_diff are only printed."""
+    assert res.shape == ref.shape
+    flat_res, flat_ref = res.flatten(), ref.flatten()
+    assert flat_res.shape == flat_ref.shape
+    for idx, (got, want) in enumerate(zip(flat_res, flat_ref)):
+        try:
+            assert_diff(got, want)
+        except AssertionError:
+            print("Assertion Error at location", idx, ", FPGA result:", got, ", reference result:", want)
+
+
+def profile(func):
+    """Wrap ``func`` so that every call returns ``(result, elapsed_seconds)``."""
+    @functools.wraps(func)
+    def timed(*args, **kwargs):
+        began = time.time()
+        outcome = func(*args, **kwargs)
+        return outcome, time.time() - began
+
+    return timed
+
+
+def compare_time(fpga_time, cpu_time, num_instances=TEST_SIZE):  # prints timings, throughputs and the speedup
+    for label, seconds in (("FPGA time:", fpga_time), ("CPU time:", cpu_time)):
+        print(label, seconds, "second(s)")
+    for label, seconds in (("FPGA throughput:", fpga_time), ("CPU throughput:", cpu_time)):
+        print(label, num_instances / seconds, "instance(s) per second")
+    print("Speedup:", cpu_time / fpga_time)
+
+
+def cpu_pi_gen_obf_seed(res_store, public_key, count, elem_size, rand_seed, stream):
+    random.seed(rand_seed)  # CPU reference for r^n mod n^2; res_store and stream are accepted but unused
+    draws = [random.randrange(1, 8 ** elem_size) for _ in range(count)]
+    return [gmpy_math.powmod(draw, public_key.n, public_key.nsquare) for draw in draws]
+
+
+def cpu_pi_obfuscate(public_key, encrypted_numbers, obf_seeds, exponents, res_store, stream):
+    return [PaillierEncryptedNumber(public_key, (cipher * obf_seeds[idx]) % public_key.nsquare, exponents[idx])
+            for idx, cipher in enumerate(encrypted_numbers)]  # CPU reference; res_store and stream are unused
+
+
+def cpu_fp_mul(left, right):
+    """CPU reference: multiply encodings modulo the left operand's ``n`` and add exponents, per element."""
+    return [FixedPointNumber((lhs.encoding * right[i].encoding) % lhs.n, lhs.exponent + right[i].exponent,
+                             lhs.n, lhs.max_int) for i, lhs in enumerate(left)]
+
+
+class TestOperators(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):  # one-time fixture: device initialisation plus a Paillier key pair shared by the tests
+        initialize_device()
+        cls._pub_key, cls._priv_key = PaillierKeypair.generate_keypair()
+        cls.n, cls.max_int = cls._pub_key.n, cls._pub_key.max_int  # passed to fp_encode / FixedPointNumber.encode
+        cls._cpu_pub_key = pi_p2c_pub_key(None, cls._pub_key)  # presumably Python -> C key conversion; TODO confirm
+        cls._cpu_priv_key = pi_p2c_priv_key(None, cls._priv_key)
+        cls._fpga_pub_key = pi_h2d_pub_key(None, cls._cpu_pub_key)  # presumably host -> device copy; TODO confirm
+        cls._fpga_priv_key = pi_h2d_priv_key(None, cls._cpu_priv_key)
+        print("\n\n", "*" * 100, "\n\nInitialization complete\nTest Size:", TEST_SIZE)
+
+    # test performance
+    def test_performance(self):
+        print("\n\n", "*" * 100, "\n\nTest performance begins")
+
+        print("\n>>>>> generate data and allocate memory spaces")
+        raw, raw2 = generate_rand(TEST_SIZE), generate_rand(TEST_SIZE)
+        shape_tuple, shape_tuple_T = (NUM_ROWS, NUM_COLS), (NUM_COLS, NUM_ROWS)
+        shape_store, shape_store_T = TensorShapeStorage(*shape_tuple), TensorShapeStorage(*shape_tuple_T)
+        fpga_bi_store, fpga_bi_store2 = bi_alloc(
+            None, TEST_SIZE, PLAIN_BYTE, MEM_HOST), bi_alloc(
+            None, TEST_SIZE, PLAIN_BYTE, MEM_HOST)
+        fpga_te_store, fpga_te_store2 = te_alloc(None, TEST_SIZE, MEM_HOST), te_alloc(None, TEST_SIZE, MEM_HOST)
+        fpga_fp_store, fpga_fp_store2 = fp_alloc(None, TEST_SIZE, MEM_HOST), fp_alloc(None, TEST_SIZE, MEM_HOST)
+        fpga_pi_store, fpga_pi_store2 = pi_alloc(None, TEST_SIZE, MEM_HOST), pi_alloc(None, TEST_SIZE, MEM_HOST)
+        fpga_te_store, fpga_te_store2 = te_p2c(raw, fpga_te_store), te_p2c(raw2, fpga_te_store2)
+
+        print("\n>>>>> fp_encode profiling begins")
+        fpga_encoded, fpga_encode_time = profile(fp_encode)(fpga_te_store, self.n, self.max_int, res=fpga_fp_store)
+        cpu_encoded, cpu_encode_time = profile(
+            lambda l: [
+                FixedPointNumber.encode(
+                    v, self.n, self.max_int) for v in l])(raw)
+        compare_time(fpga_encode_time, cpu_encode_time)
+
+        print("\n>>>>> fp_decode profiling begins")
+        fpga_decoded, fpga_decode_time = profile(fp_decode)(fpga_encoded, fpga_te_store, None)
+        cpu_decoded, cpu_decode_time = profile(lambda l: [v.decode() for v in l])(cpu_encoded)
+        compare_time(fpga_decode_time, cpu_decode_time)
+
+        # check decoded results
+        assert_ndarray_diff(te_c2p(fpga_decoded), numpy.asarray(cpu_decoded))
+
+        print("\n>>>>> pi_encrypt profiling begins")
+        print("This function calculates (encoding * n + 1) % nsquare")
+        fpga_encrypted, fpga_encrypt_time = profile(pi_encrypt)(self._fpga_pub_key, fpga_encoded, fpga_pi_store, None)
+        cpu_encrypted, cpu_encrypt_time = profile(
+            lambda l: [
+                self._pub_key.raw_encrypt(
+                    v.encoding, 1) for v in l])(cpu_encoded)
+        compare_time(fpga_encrypt_time, cpu_encrypt_time)
+
+        print("\n>>>>> pi_gen_obf_seed profiling begins")
+        print("This function calculates (rand() ^ n) % nsquare")
+        fpga_obf_seeds, fpga_gen_obf_seeds_time = profile(pi_gen_obf_seed)(
+            fpga_bi_store, self._fpga_pub_key, TEST_SIZE, CIPHER_BITS // 6, 0, None)
+        cpu_obf_seeds, cpu_gen_obf_seefs_time = profile(cpu_pi_gen_obf_seed)(
+            None, self._pub_key, TEST_SIZE, CIPHER_BITS // 6, 0, None)
+        compare_time(fpga_gen_obf_seeds_time, cpu_gen_obf_seefs_time)
+
+        print("\n>>>>> pi_obfuscate profiling begins")
+        print("This function calculates (raw_cipher * obf_seed) % nsquare,")
+        print("\twhere raw_cipher and obf_seed are calculated in pi_encrypt and pi_gen_obf_seeds, respectively")
+        fpga_obfuscated, fpga_obfuscate_time = profile(pi_obfuscate)(
+            self._fpga_pub_key, fpga_encrypted, fpga_obf_seeds, fpga_pi_store, None)
+        cpu_obfuscated, cpu_obfuscate_time = profile(cpu_pi_obfuscate)(
+            self._pub_key, cpu_encrypted, cpu_obf_seeds, [
+                v.exponent for v in cpu_encoded], None, None)
+        compare_time(fpga_obfuscate_time, cpu_obfuscate_time)
+
+        # check intermediate result
+        assert_ndarray_diff(numpy.asarray(pi_c2p(fpga_obfuscated)[0]), numpy.asarray(
+            [v.ciphertext(False) for v in cpu_obfuscated]))
+
+        print("\n>>>>> pi_decrypt profiling begins")
+        print("This function calculates L(cipher ^ lambda % nsquare) * L(g ^ lambda % nsquare) ^ -1 % n")
+        print("fp_decode is by default included in pi_decrypt")
+        fps_buffer = fp_alloc(None, TEST_SIZE, MEM_HOST)
+        fpga_decrypted, fpga_decrypt_time = profile(pi_decrypt)(
+            self._fpga_pub_key, self._fpga_priv_key, fpga_obfuscated, fpga_te_store, fps_buffer)
+        cpu_decrypted, cpu_decrypt_time = profile(lambda l: [self._priv_key.decrypt(v) for v in l])(cpu_obfuscated)
+        compare_time(fpga_decrypt_time, cpu_decrypt_time)
+
+        # check decrypted results
+        assert_ndarray_diff(te_c2p(fpga_decrypted), numpy.asarray(cpu_decrypted))
+
+        print("\n>>>>> generating the other array")
+        # encode the other array
+        fpga_encoded2 = fp_encode(fpga_te_store2, self.n, self.max_int, res=fpga_fp_store2)
+        cpu_encoded2 = [FixedPointNumber.encode(v, self.n, self.max_int) for v in raw2]
+        # encrypt the other array
+        fpga_encrypted2 = pi_encrypt(self._fpga_pub_key, fpga_encoded2, fpga_pi_store2, None)
+        cpu_encrypted2 = [self._pub_key.raw_encrypt(v.encoding, 1) for v in cpu_encoded2]
+        # generate obfuscation seeds (obfuscators) for the other array using a different random seed
+        fpga_obf_seeds2 = pi_gen_obf_seed(fpga_bi_store2, self._fpga_pub_key, TEST_SIZE, CIPHER_BITS // 6, 1, None)
+        cpu_obf_seeds2 = cpu_pi_gen_obf_seed(None, self._pub_key, TEST_SIZE, CIPHER_BITS // 6, 1, None)
+        # obfuscate the other array
+        fpga_obfuscated2 = pi_obfuscate(self._fpga_pub_key, fpga_encrypted2, fpga_obf_seeds2, fpga_pi_store2, None)
+        cpu_obfuscated2 = cpu_pi_obfuscate(
+            self._pub_key, cpu_encrypted2, cpu_obf_seeds2, [
+                v.exponent for v in cpu_encoded2], None, None)
+        # check intermediate result
+        assert_ndarray_diff(numpy.asarray(pi_c2p(fpga_obfuscated2)[0]), numpy.asarray(
+            [v.ciphertext(False) for v in cpu_obfuscated2]))
+
+        print("\n>>>>> fp_mul profiling begins")
+        fpga_fp_mul_store = fp_alloc(None, TEST_SIZE, MEM_HOST)
+        (fpga_fp_mul_res, _), fpga_fp_mul_time = profile(fp_mul)(fpga_encoded,
+                                                                 fpga_encoded2, shape_store, shape_store,
+                                                                 fpga_fp_mul_store, shape_store, None)
+        cpu_fp_mul_res, cpu_fp_mul_time = profile(cpu_fp_mul)(cpu_encoded, cpu_encoded2)
+        compare_time(fpga_fp_mul_time, cpu_fp_mul_time)
+
+        # Compare results
+        received_fp_mul_res = fp_c2p(fpga_fp_mul_res)
+        for i in range(TEST_SIZE):
+            assert_diff(received_fp_mul_res[i].encoding, cpu_fp_mul_res[i].encoding)
+            assert received_fp_mul_res[i].BASE == cpu_fp_mul_res[i].BASE
+            assert received_fp_mul_res[i].exponent == cpu_fp_mul_res[i].exponent
+
+        print("\n>>>>> pi_add profiling begins")
+        (fpga_add_res, _), fpga_add_time = profile(pi_add)(self._fpga_pub_key, fpga_obfuscated,
+                                                           fpga_obfuscated2, shape_store, shape_store, fpga_pi_store,
+                                                           shape_store, None)
+        cpu_add_res, cpu_add_time = profile(lambda a, b: [a[i] + b[i]
+                                                          for i in range(TEST_SIZE)])(cpu_obfuscated, cpu_obfuscated2)
+        compare_time(fpga_add_time, cpu_add_time)
+
+        print("\n>>>>> pi_mul profiling begins")
+        (fpga_mul_res, _), fpga_mul_time = profile(pi_mul)(self._fpga_pub_key, fpga_add_res,
+                                                           fpga_encoded2, shape_store, shape_store, fpga_pi_store,
+                                                           shape_store, None)
+        cpu_mul_res, cpu_mul_time = profile(lambda a, b: [a[i] * b[i]
+                                                          for i in range(TEST_SIZE)])(cpu_add_res, cpu_encoded2)
+        compare_time(fpga_mul_time, cpu_mul_time)
+
+        print("\n>>>>> pi_matmul profiling begins")
+        print("sizes are", shape_tuple, "and", shape_tuple_T)
+        fpga_pi_matmul_store = pi_alloc(None, NUM_ROWS * NUM_ROWS, MEM_HOST)
+        (fpga_matmul_res, fpga_matmul_shape), fpga_matmul_time = profile(pi_matmul)(self._fpga_pub_key,
+                                                                                    fpga_mul_res, fpga_encoded2,
+                                                                                    shape_store, shape_store_T,
+                                                                                    fpga_pi_matmul_store, None, None)
+        cpu_matmul_res, cpu_matmul_time = profile(
+            lambda a, b: a @ b)(numpy.asarray(cpu_mul_res).reshape(shape_tuple),
+                                numpy.asarray(cpu_encoded2).reshape(shape_tuple_T))
+        compare_time(fpga_matmul_time, cpu_matmul_time, NUM_ROWS * TEST_SIZE)
+
+        print("\n>>>>> pi_sum profiling begins")
+        print("shape is", fpga_matmul_shape.to_tuple())
+        fpga_pi_sum_store = pi_alloc(None, max(NUM_ROWS, NUM_COLS), MEM_HOST)
+        for axis in [0, 1, None]:
+            print(">>> axis:", axis)
+            (fpga_sum_res, _), fpga_sum_time = profile(pi_sum)(self._fpga_pub_key,
+                                                               fpga_matmul_res, fpga_matmul_shape, axis,
+                                                               fpga_pi_sum_store, None, None)
+            cpu_sum_res, cpu_sum_time = profile(lambda a: numpy.sum(a, axis))(cpu_matmul_res)
+            compare_time(fpga_sum_time, cpu_sum_time)
+
+            # check result
+            fpga_decrypted = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, fpga_sum_res, None, None))
+            cpu_decrypted = numpy.asarray([self._priv_key.decrypt(v) for v in cpu_sum_res.flat]
+                                          if axis is not None else [self._priv_key.decrypt(cpu_sum_res)])
+            assert_ndarray_diff(fpga_decrypted, cpu_decrypted)
+
+        print("\n>>>>> free all allocated spaces")
+        bi_free(fpga_bi_store)
+        bi_free(fpga_bi_store2)
+        te_free(fpga_te_store)
+        te_free(fpga_te_store2)
+        fp_free(fpga_fp_store)
+        fp_free(fpga_fp_store2)
+        fp_free(fps_buffer)
+        fp_free(fpga_fp_mul_store)
+        pi_free(fpga_pi_store)
+        pi_free(fpga_pi_store2)
+        pi_free(fpga_pi_matmul_store)
+        pi_free(fpga_pi_sum_store)
+
+        print("test passed")
+
+
+if __name__ == "__main__":  # only run the unittest cases when this file is executed as a script
+    unittest.main()
diff --git a/gpu/fate-tensor-fpga/pyproject.toml b/gpu/tensor/paillier_fpga/pyproject.toml
similarity index 96%
rename from gpu/fate-tensor-fpga/pyproject.toml
rename to gpu/tensor/paillier_fpga/pyproject.toml
index 4e3db6d158..6db3fdecf9 100644
--- a/gpu/fate-tensor-fpga/pyproject.toml
+++ b/gpu/tensor/paillier_fpga/pyproject.toml
@@ -1,5 +1,5 @@
 [tool.poetry]
-name = "fate-tensor-fpga"
+name = "paillier-fpga"
 version = "0.1.0"
 description = "This project is an industrial-level heterogeneous acceleration system to support and speed up federated learning. We've designed and implemented a heterogeneous acceleration solutions using FPGA, respectively, that can significantly accelerate the Paillier cryptosystem while maintaining functionality, accuracy and scalability."
 authors = ["Xiaolong.Gao <1506957902@qq.com>"]

From a5ad38ffe303415a92a62e65903a760539045cae Mon Sep 17 00:00:00 2001
From: "Xiaolong.Gao" <1506957902@qq.com>
Date: Fri, 22 Jul 2022 15:58:18 +0800
Subject: [PATCH 8/8] feat: add README

Signed-off-by: Xiaolong.Gao <1506957902@qq.com>
---
 gpu/tensor/README.md | 75 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 gpu/tensor/README.md

diff --git a/gpu/tensor/README.md b/gpu/tensor/README.md
new file mode 100644
index 0000000000..15f1dd7e2c
--- /dev/null
+++ b/gpu/tensor/README.md
@@ -0,0 +1,75 @@
+# GPU & FPGA - A Great Heterogeneous Acceleration Engine for Federated Learning
+
+This project is an industrial-level heterogeneous acceleration system to support and speed up federated learning. We've designed and implemented two different heterogeneous acceleration solutions using GPU and FPGA, respectively, that can significantly accelerate the Paillier cryptosystem while maintaining functionality, accuracy and scalability.  
+
+### How to test GPU engine
+- Requirements / Recommendations:
+    - At least one capable NVIDIA GPU device is required.
+        - We recommend a device with the Volta microarchitecture or later, such as Tesla V100 or Tesla V100S, to fully utilize the hardware features that our CUDA code relies on.
+    - CentOS with version >= 7.8.2003
+        - We haven't tested whether our engine works well on other Linux distributions, such as Ubuntu and Debian. However, it should work with, at most, some slight modifications.
+    - Python with version >= 3.6.8
+        - The latest version of NumPy (1.19.4 as of now) is recommended.
+        - You may need to install other essential Python packages.
+    - If you would like to compile the CUDA code:
+        - gcc version 4.8.5 would suffice. Don't use a gcc version later than 7, since nvcc doesn't support it.
+        - nvcc version 10.0.130 would suffice.
+- To test GPU engine functionality
+    ```bash
+    python3 -m paillier_gpu.tests.test_gpu_engine
+    ```
+- To test GPU engine performance (profiling)
+    ```bash
+    python3 -m paillier_gpu.tests.test_gpu_performance
+    ```
+- You may switch the RAND_TYPE variable between INT64_TYPE and FLOAT_TYPE in the test file. Running the tests with both settings is recommended, to make sure that both float64 (double) and int64 (long long) types can pass all assertions.
+
+### How to test FPGA engine
+- Requirements / Recommendations:
+    - At least one capable Xilinx FPGA device, such as Alveo U250, is required.
+    - CentOS with version >= 7.8.2003
+        - We haven't tested whether our engine works well on other Linux distributions, such as Ubuntu and Debian. However, it should work with, at most, some slight modifications.
+    - Python with version >= 3.6.8
+        - The latest version of NumPy (1.19.4 as of now) is recommended.
+        - You may need to install other essential Python packages.
+    - GCC with version >= 4.8.5 if you would like to compile the C source code.
+    - Superuser privileges are required, as we need to access sensitive directories.
+        - Note that the Python path with and without sudo may differ.
+- To test FPGA engine functionality
+    ```bash
+    sudo python3 -m paillier_fpga.tests.test_fpga_engine
+    ```
+- To test FPGA engine performance (profiling)
+    ```bash
+    sudo python3 -m paillier_fpga.tests.test_fpga_performance
+    ```
+- You may switch the RAND_TYPE variable between INT64_TYPE and FLOAT_TYPE in the test file. Running the tests with both settings is recommended, to make sure that both float64 (double) and int64 (long long) types can pass all assertions.
+
+### Profiling Information
+The profiling result was obtained from a server with the following configuration.  
+|Hardware Type|Model|Quantity|Remark|
+|-|-|-|-|
+|CPU|Intel Xeon Silver 4114 CPU @ 2.20GHz|2|only 1 core is used in profiling|
+|GPU|NVIDIA Tesla V100 PCIe 32GB|4|only 1 GPU card is used in profiling|
+|FPGA|Xilinx Alveo U250|1||
+|Memory|Samsung 16GiB DIMM DDR4 Synchronous 2666 MHz (0.4 ns)|12|192 GB in total|
+|Hard Disk|2TB WDC WD20SPZX-60U|1||  
+
+The table below is an overview of the profiling information of our GPU and FPGA engines compared to a CPU implementation under a unified shape of 666×666, where the throughput means the number of operations (instances, either fixed-point numbers or Paillier-encrypted numbers) a device is capable of computing within a second. For matrix multiplication, we consider the number of operations as the number of modular exponentiations we have to compute under a naive O(n^3) algorithm.  
+We don't count the memory allocation time, as it could take a significant share of the total time for I/O-bound operators, such as those involving modular multiplication instead of modular exponentiation. As a result, we recommend that users reuse already-allocated CPU memory space as much as possible, in a way similar to register renaming.
+
+|Operator|CPU Throughput|GPU Throughput|GPU Speedup|FPGA Throughput|FPGA Speedup|
+|-|-|-|-|-|-|
+|fp_encode|62303.97|33611720.05|539.48|7215836.85|115.82|
+|fp_decode|567913.21|25958708.28|45.71|583509.90|1.03|
+|pi_encrypt|205864.74|24814051.60|120.54|687947.44|3.34|
+|pi_gen_obf_seed|444.05|86766.80|195.40|33653.43|75.79|
+|pi_obfuscate|60236.27|11101085.43|184.29|2035691.96|33.80|
+|pi_decrypt|1590.48|299298.46|188.18|69354.57|43.61|
+|fp_mul|228424.79|11480248.47|50.26|1695313.95|7.42|
+|pi_add|29759.90|1203071.88|40.43|423378.92|14.23|
+|pi_mul|6175.70|1068244.51|172.98|359942.47|58.28|
+|pi_matmul|4178.43|620310.10|148.46|150362.36|35.99|
+|pi_sum(axis=0)|12865.10|1675271.14|130.22|844531.30|65.65|
+|pi_sum(axis=1)|15919.62|4651463.65|292.18|947461.90|59.52|
+|pi_sum(axis=None)|10277.66|4677684.56|455.13|877720.61|85.40|
\ No newline at end of file