Llama65B static batch reproduction steps #844

Open
bmers opened this issue Nov 30, 2023 · 0 comments

1. Related repositories:

Paddle:https://github.com/MyAngelAyase/Paddle
PaddleCustomDevice:https://github.com/MyAngelAyase/PaddleCustomDevice
PaddleNLP:https://github.com/MyAngelAyase/PaddleNLP

2. Build and install

# Install the latest acceleration library (ATB) .run package, then source its environment
source /usr/local/Ascend/atb/set_env.sh

# Paddle (5832764483 2023-10-13)
git clone https://github.com/MyAngelAyase/Paddle.git -b develop
cd Paddle && mkdir -p build && cd build
cmake .. -DPY_VERSION=3.9 -DPYTHON_EXECUTABLE=`which python3` -DWITH_ARM=OFF -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release -DON_INFER=ON -DWITH_XBYAK=OFF -DWITH_CUSTOM_DEVICE=ON -DWITH_DISTRIBUTE=ON -DWITH_PSCORE=ON
make -j$(nproc)
# then pip install the wheel produced by the build (typically under build/python/dist)

# PaddleCustomDevice (a800ff7)
git clone https://github.com/MyAngelAyase/PaddleCustomDevice.git -b develop
cd PaddleCustomDevice
export WITH_ASCEND_TRANSFORMER_ACC=ON
bash tools/compile.sh

# PaddleNLP (913d569)
git clone https://github.com/MyAngelAyase/PaddleNLP.git -b develop
cd PaddleNLP && pip install -e .
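As a quick sanity check after installing the three components (this snippet is only a suggestion; it assumes the Paddle wheel built above and the paddle-custom-npu wheel from PaddleCustomDevice have both been pip-installed):

import paddle
import paddlenlp

print(paddle.version.commit)        # should match the Paddle commit checked out above
print(paddlenlp.__version__)

# With PaddleCustomDevice installed, the npu custom device backend should be visible.
print(paddle.device.get_all_custom_device_type())  # expected to contain 'npu'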

# Post-processing custom operators
wget xxxx/paddle_post_processALL.zip

Modify paddle_post_processALL/src/ops/ascendc/op_kernel/set_mask_value.cpp:30.
(PaddleNLP currently moves the (attention_mask - 1) * 1e4 operation out of the model and into the predictor, see #pr74 and #pr17, so set_mask_value has to be adapted to write 0 instead of 1. Business models without a similar operation do not need this change.)

(screenshot of the set_mask_value.cpp change omitted)
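For reference, a minimal illustration of the mask preprocessing mentioned above (the tensor values are made up; this is not the actual PaddleNLP predictor code): a 0/1 padding mask becomes an additive mask, so masked positions already carry the -1e4 bias and set_mask_value only needs to write 0.

import paddle

# Toy 0/1 attention mask: 1 = valid token, 0 = padding (values are illustrative).
attention_mask = paddle.to_tensor([[1, 1, 1, 0],
                                   [1, 1, 0, 0]], dtype="float16")

# The transform now done in the predictor: valid positions become 0,
# masked positions become -1e4, ready to be added to the attention scores.
additive_mask = (attention_mask - 1) * 1e4
print(additive_mask)  # [[0, 0, 0, -10000], [0, 0, -10000, -10000]]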

cd build && bash build_ops.sh
bash aie_ops.run --extract=/workspace  # a vendors directory will be generated under the extract directory
# Set the custom operator library path
export ASCEND_CUSTOM_OPP_PATH=/workspace/vendors/aie_ascendc

3. Model export

# Split the weights
Row-splitting the vocabulary (embedding) weight would require composing the embedding from several small operators; to avoid device-memory fragmentation, for now the vocabulary weight is simply not split (a column split may be considered later) and the Llama65B acceleration-library embedding is used instead.

The corresponding PaddleNLP change is in #pr21.
Script contents:

"""
Author(Zhengzekang):

If we use PaddleNLP to export distributed model by using dy2static directly, 
each device will read full model which easily cause OOM. 

This script only use single device to read full model and split model by assigned NRANKS

"""

import paddle 
import os 
import json 

# Define Weight Name List. 
LLAMA_COLUMN_SPLIT_WEIGHT_LIST = [
    "self_attn.q_proj.weight",
    "self_attn.k_proj.weight",
    "self_attn.v_proj.weight",
    "mlp.gate_proj.weight",
    "mlp.up_proj.weight",
]

LLAMA_ROW_SPLIT_WEIGHT_LIST = [
    "self_attn.o_proj.weight",
    "mlp.down_proj.weight"
]

LLAMA_NO_SPLIT_WEIGHT_LIST = [
    "input_layernorm.weight",
    "post_attention_layernorm.weight",
]

LM_HEAD_COLUMN_SPLIT_WEIGHT_LIST = [
    "lm_head.weight"
]

EMBEDDING_ROW_SPLIT_WEIGHT_LIST = [
    "llama.embed_tokens.weight"
]

FINAL_NORM_WEIGHT_LIST = [
    "llama.norm.weight"
]

def parse_arguments():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--model_dir", required=True, help="The directory of model.")
    parser.add_argument("--output_model_dir", required=True, help="The directory of model.")
    parser.add_argument("--nranks", type=int, default="1", help="The number of distributed model num. ")
    return parser.parse_args()

def col_split(weight, nranks):
    # Column parallelism: split along axis 1 (the output dimension).
    return paddle.split(paddle.to_tensor(weight, place=paddle.CPUPlace()), axis=1, num_or_sections=nranks)

def row_split(weight, nranks):
    # Row parallelism: split along axis 0 (the input dimension).
    return paddle.split(paddle.to_tensor(weight, place=paddle.CPUPlace()), axis=0, num_or_sections=nranks)

def split_model_weight(model_dir, nranks, output_model_path): 
    model_state_path = os.path.join(model_dir, "model_state.pdparams")
    origin_model = paddle.load(model_state_path, return_numpy=True)
    config = None 
    with open(os.path.join(model_dir, "config.json"), "r") as f: 
        config = json.load(f) 

    for rank_id in range(nranks): 
        print("Now process rank: ", rank_id)
        split_state_dict = dict()
        col_split_lm_head_weight = col_split(origin_model[LM_HEAD_COLUMN_SPLIT_WEIGHT_LIST[0]], nranks)[rank_id]
        #row_split_embed_token_weight = row_split(origin_model[EMBEDDING_ROW_SPLIT_WEIGHT_LIST[0]], nranks)[rank_id]
        split_state_dict[LM_HEAD_COLUMN_SPLIT_WEIGHT_LIST[0]] = col_split_lm_head_weight
        #split_state_dict[EMBEDDING_ROW_SPLIT_WEIGHT_LIST[0]] = row_split_embed_token_weight
        split_state_dict[EMBEDDING_ROW_SPLIT_WEIGHT_LIST[0]] = paddle.to_tensor(origin_model[EMBEDDING_ROW_SPLIT_WEIGHT_LIST[0]], place=paddle.CPUPlace())

        print(split_state_dict)
        for layer_id in range(config["num_hidden_layers"]): 
            print("Now process LayerIdx: ", layer_id)
            for column_split_weight_name in LLAMA_COLUMN_SPLIT_WEIGHT_LIST: 
                full_column_split_weight_name = "llama.layers.{}.".format(layer_id) + column_split_weight_name
                column_split_weight = col_split(origin_model[full_column_split_weight_name], nranks)[rank_id]
                split_state_dict[full_column_split_weight_name] = column_split_weight

            for row_split_weight_name in LLAMA_ROW_SPLIT_WEIGHT_LIST: 
                full_row_split_weight_name = "llama.layers.{}.".format(layer_id) + row_split_weight_name
                row_split_weight = row_split(origin_model[full_row_split_weight_name], nranks)[rank_id]
                split_state_dict[full_row_split_weight_name] = row_split_weight

            for no_split_weight_name in LLAMA_NO_SPLIT_WEIGHT_LIST: 
                full_no_split_weight_name = "llama.layers.{}.".format(layer_id) + no_split_weight_name
                split_state_dict[full_no_split_weight_name] = paddle.to_tensor(origin_model[full_no_split_weight_name], place=paddle.CPUPlace())

        last_norm_weight_name = FINAL_NORM_WEIGHT_LIST[0]
        split_state_dict[last_norm_weight_name] = paddle.to_tensor(origin_model[last_norm_weight_name], place=paddle.CPUPlace())
        paddle.save(split_state_dict, os.path.join(output_model_path, "model_state.tp0{}.pdparams".format(rank_id)))

if __name__ == "__main__": 
    args = parse_arguments()
    split_model_weight(args.model_dir, 
                       args.nranks, 
                       args.output_model_dir)
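How the script is typically invoked, plus a small shape check on the result (the file name split_weights.py and the ./llama-65b paths below are placeholders, not from the original issue):

# python split_weights.py --model_dir ./llama-65b --output_model_dir ./llama-65b-mp8 --nranks 8

# Sanity check on one rank file: column-split weights keep 1/nranks of the columns,
# row-split weights keep 1/nranks of the rows.
import paddle

nranks = 8
full = paddle.load("./llama-65b/model_state.pdparams", return_numpy=True)
rank0 = paddle.load("./llama-65b-mp8/model_state.tp00.pdparams", return_numpy=True)

name = "llama.layers.0.self_attn.q_proj.weight"      # column split (axis=1)
assert rank0[name].shape[1] * nranks == full[name].shape[1]

name = "llama.layers.0.mlp.down_proj.weight"         # row split (axis=0)
assert rank0[name].shape[0] * nranks == full[name].shape[0]
print("split shapes look consistent")
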
# Export the 8-card inference model on the NPU with PaddleNLP
cd PaddleNLP/llm
python -m paddle.distributed.launch --devices "0,1,2,3,4,5,6,7" export_model.py \
    --model_name_or_path facebook/llama-65b \
    --output_path ./export_llama65b_fp16_mp8 \
    --dtype float16 \
    --inference_model
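The export writes one static graph per rank; a small check that all eight rank programs exist (this assumes the default .pdmodel / .pdiparams naming that paddle.static.load_inference_model expects, as used in the snippet further below):

import os

export_dir = "./export_llama65b_fp16_mp8"
for rank_id in range(8):
    prefix = os.path.join(export_dir, "rank_{}".format(rank_id), "model")
    assert os.path.exists(prefix + ".pdmodel"), prefix
    assert os.path.exists(prefix + ".pdiparams"), prefix
print("all 8 rank models exported")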

Make sure the reduce_sum operators in front of the less_than op have reduce_all = True (otherwise inference on the NPU fails later).
Run the following program to dump the readable code of the model and inspect the attributes of the reduce_sum operators.

import paddle

paddle.enable_static()
exe = paddle.static.Executor(paddle.CPUPlace())

# load inference model  
[infer_program, feed_target_names, fetch_targets] = paddle.static.load_inference_model("./export_llama65b_fp16_mp8/rank_0/model", exe)

# dump readable_code of global block
with open('readable_code_global_block.log', 'w') as file:
    global_block = infer_program.global_block()
    file.write(global_block._to_readable_code())

Grep the dumped log for the less_than and reduce_sum operators; the result should look like this:

grep "\ less_than" readable_code_global_block.log
grep "\ reduce_sum" readable_code_global_block.log
# 期望得到的输出结果如下
{Out=['sum_1.tmp_0']} = reduce_sum(inputs={X=['cast_2.tmp_0']}, dim = [], in_dtype = -1, keep_dim = False, op_device = , op_namescope = /, op_role = 0, op_role_var = [], out_dtype = -1, reduce_all = True)
{Out=['sum_2.tmp_0']} = reduce_sum(inputs={X=['cast_3.tmp_0']}, dim = [], in_dtype = -1, keep_dim = False, op_device = , op_namescope = /, op_role = 0, op_role_var = [], out_dtype = -1, reduce_all = True)
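The same check can also be done programmatically instead of grepping the dumped text; a short sketch reusing the exported rank_0 program:

import paddle

paddle.enable_static()
exe = paddle.static.Executor(paddle.CPUPlace())
[infer_program, feed_target_names, fetch_targets] = paddle.static.load_inference_model(
    "./export_llama65b_fp16_mp8/rank_0/model", exe)

# Every reduce_sum op must carry reduce_all = True for NPU inference to work.
for op in infer_program.global_block().ops:
    if op.type == "reduce_sum":
        print("reduce_sum reduce_all =", op.attr("reduce_all"))
        assert op.attr("reduce_all"), "found a reduce_sum with reduce_all != True"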

4. Model inference

Set model_dir in infer_llama_npu.sh to the exported model directory above, then run the script to start inference.