1. Related repositories:

Paddle: https://github.com/MyAngelAyase/Paddle
PaddleCustomDevice: https://github.com/MyAngelAyase/PaddleCustomDevice
PaddleNLP: https://github.com/MyAngelAyase/PaddleNLP

2. Installation and build

```bash
# Install the latest acceleration library run package, then source its environment
source /usr/local/Ascend/atb/set_env.sh

# Paddle (5832764483 2023-10-13)
git clone https://github.com/MyAngelAyase/Paddle.git -b develop
cd Paddle && mkdir -p build && cd build   # cmake .. below assumes an out-of-source build directory
cmake .. -DPY_VERSION=3.9 -DPYTHON_EXECUTABLE=`which python3` -DWITH_ARM=OFF -DWITH_TESTING=OFF \
    -DCMAKE_BUILD_TYPE=Release -DON_INFER=ON -DWITH_XBYAK=OFF -DWITH_CUSTOM_DEVICE=ON \
    -DWITH_DISTRIBUTE=ON -DWITH_PSCORE=ON
make -j$(nproc)

# PaddleCustomDevice (a800ff7)
git clone https://github.com/MyAngelAyase/PaddleCustomDevice.git -b develop
cd PaddleCustomDevice
export WITH_ASCEND_TRANSFORMER_ACC=ON
bash tools/compile.sh

# PaddleNLP (913d569)
git clone https://github.com/MyAngelAyase/PaddleNLP.git -b develop
cd PaddleNLP && pip install -e .

# Post-processing custom operators
wget xxxx/paddle_post_processALL.zip
```
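
One assumption worth flagging (this step is not spelled out above): after `make -j$(nproc)` finishes, a source build of Paddle typically produces a wheel under `build/python/dist`, which still needs to be installed with pip before PaddleCustomDevice and PaddleNLP are built against it. The exact wheel name and path depend on the build options, so adjust to your actual build output.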

Modify paddle_post_processALL/src/ops/ascendc/op_kernel/set_mask_value.cpp:30.
(PaddleNLP currently hoists the (attention_mask - 1) * 1e4 operation out of the model and into the predictor, see #pr74 and #pr17, so set_mask_value has to be adapted to assign 0 instead of 1. If your model does not perform a similar transformation, this change is not needed.)
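
As a rough illustration only (the tensor values are assumptions, not part of the required change), the sketch below shows the effect of the (attention_mask - 1) * 1e4 transformation now done in the predictor: positions that may be attended to (mask value 1) map to 0 and masked positions map to -1e4, which is presumably why the kernel should write 0 rather than 1 for newly valid positions.

```python
import paddle

# Original 0/1 mask: 1 marks a position that may be attended to.
attention_mask = paddle.to_tensor([[1.0, 1.0, 1.0, 0.0]], dtype="float32")

# After the predictor-side rewrite the mask is additive:
# attended positions become 0, masked positions become -1e4.
additive_mask = (attention_mask - 1.0) * 1e4
print(additive_mask.numpy())  # [[0. 0. 0. -10000.]]
```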

```bash
cd build && bash build_ops.sh
bash aie_ops.run --extract=/workspace   # a vendors directory is generated under /workspace

# Set the custom operator library path
export ASCEND_CUSTOM_OPP_PATH=/workspace/vendors/aie_ascendc
```

3. Model export

Split the weights. Row-splitting the vocabulary table would mean composing the embedding out of several small operators; to avoid fragmenting device memory, the vocabulary weight is for now simply left unsplit (column splitting may be considered later), and the Llama-65B acceleration library embedding is used.

The corresponding PaddleNLP change is in #pr21. Script content:
""" Author(Zhengzekang): If we use PaddleNLP to export distributed model by using dy2static directly, each device will read full model which easily cause OOM. This script only use single device to read full model and split model by assigned NRANKS """ import paddle import os import json # Define Weight Name List. LLAMA_COLUMN_SPLIT_WEIGHT_LIST = [ "self_attn.q_proj.weight", "self_attn.k_proj.weight", "self_attn.v_proj.weight", "mlp.gate_proj.weight", "mlp.up_proj.weight", ] LLAMA_ROW_SPLIT_WEIGHT_LIST = [ "self_attn.o_proj.weight", "mlp.down_proj.weight" ] LLAMA_NO_SPLIT_WEIGHT_LIST = [ "input_layernorm.weight", "post_attention_layernorm.weight", ] LM_HEAD_COLUMN_SPLIT_WEIGHT_LIST = [ "lm_head.weight" ] EMBEDDING_ROW_SPLIT_WEIGHT_LIST = [ "llama.embed_tokens.weight" ] FINAL_NORM_WEIGHT_LSIT = [ "llama.norm.weight" ] def parse_arguments(): import argparse parser = argparse.ArgumentParser() parser.add_argument("--model_dir", required=True, help="The directory of model.") parser.add_argument("--output_model_dir", required=True, help="The directory of model.") parser.add_argument("--nranks", type=int, default="1", help="The number of distributed model num. ") return parser.parse_args() def col_split(weight, nranks): return paddle.split(paddle.to_tensor(weight, place=paddle.CPUPlace()), axis=1, num_or_sections=nranks) def row_split(weight, nranks): return paddle.split(paddle.to_tensor(weight, place=paddle.CPUPlace()), axis=0, num_or_sections=nranks) def split_model_weight(model_dir, nranks, output_model_path): model_state_path = os.path.join(model_dir, "model_state.pdparams") origin_model = paddle.load(model_state_path, return_numpy=True) config = None with open(os.path.join(model_dir, "config.json"), "r") as f: config = json.load(f) for rank_id in range(nranks): print("Now process rank: ", rank_id) split_state_dict = dict() col_split_lm_head_weight = col_split(origin_model[LM_HEAD_COLUMN_SPLIT_WEIGHT_LIST[0]], nranks)[rank_id] #row_split_embed_token_weight = row_split(origin_model[EMBEDDING_ROW_SPLIT_WEIGHT_LIST[0]], nranks)[rank_id] split_state_dict[LM_HEAD_COLUMN_SPLIT_WEIGHT_LIST[0]] = col_split_lm_head_weight #split_state_dict[EMBEDDING_ROW_SPLIT_WEIGHT_LIST[0]] = row_split_embed_token_weight split_state_dict[EMBEDDING_ROW_SPLIT_WEIGHT_LIST[0]] = paddle.to_tensor(origin_model[EMBEDDING_ROW_SPLIT_WEIGHT_LIST[0]], place=paddle.CPUPlace()) print(split_state_dict) for layer_id in range(config["num_hidden_layers"]): print("Now process LayerIdx: ", layer_id) for column_split_weight_name in LLAMA_COLUMN_SPLIT_WEIGHT_LIST: full_column_split_weight_name = "llama.layers.{}.".format(layer_id) + column_split_weight_name column_split_weight = col_split(origin_model[full_column_split_weight_name], nranks)[rank_id] split_state_dict[full_column_split_weight_name] = column_split_weight for row_split_weight_name in LLAMA_ROW_SPLIT_WEIGHT_LIST: full_row_split_weight_name = "llama.layers.{}.".format(layer_id) + row_split_weight_name row_split_weight = row_split(origin_model[full_row_split_weight_name], nranks)[rank_id] split_state_dict[full_row_split_weight_name] = row_split_weight for no_split_weight_name in LLAMA_NO_SPLIT_WEIGHT_LIST: full_no_split_weight_name = "llama.layers.{}.".format(layer_id) + no_split_weight_name split_state_dict[full_no_split_weight_name] = paddle.to_tensor(origin_model[full_no_split_weight_name], place=paddle.CPUPlace()) last_norm_weight_name = FINAL_NORM_WEIGHT_LSIT[0] split_state_dict[last_norm_weight_name] = paddle.to_tensor(origin_model[last_norm_weight_name], 
place=paddle.CPUPlace()) paddle.save(split_state_dict, os.path.join(output_model_path, "model_state.tp0{}.pdparams".format(rank_id))) if __name__ == "__main__": args = parse_arguments() split_model_weight(args.model_dir, args.nranks, args.output_model_dir)
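
For example, assuming the script above is saved as `split_model_weight.py` (the filename and paths below are placeholders, not given in the original), an 8-way split matching the 8-card export in the next step would be produced with `python split_model_weight.py --model_dir ./llama-65b --output_model_dir ./llama-65b-mp8 --nranks 8`.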

```bash
# Export the 8-card inference model on the NPU, using PaddleNLP
cd PaddleNLP/llm
python -m paddle.distributed.launch --devices "0,1,2,3,4,5,6,7" export_model.py \
    --model_name_or_path facebook/llama-65b \
    --output_path ./export_llama65b_fp16_mp8 \
    --dtype float16 \
    --inference_model
```

Make sure the reduce_sum operators that feed less_than have reduce_all=True (inference on the NPU fails later otherwise, so reduce_all=True is required). Run the following program to dump the readable code and show the attributes of the reduce_sum operators in the model.

```python
import paddle

paddle.enable_static()
exe = paddle.static.Executor(paddle.CPUPlace())

# load inference model
[infer_program, feed_target_names, fetch_targets] = paddle.static.load_inference_model(
    "./export_llama65b_fp16_mp8/rank_0/model", exe)

# dump readable_code of global block
with open('readable_code_global_block.log', 'w') as file:
    global_block = infer_program.global_block()
    file.write(global_block._to_readable_code())
```

Grep the reduce_sum (and less_than) operators in the dumped log; the result should look like the following:
grep "\ less_than" readable_code_global_block.log grep "\ reduce_sum" readable_code_global_block.log # 期望得到的输出结果如下 {Out=['sum_1.tmp_0']} = reduce_sum(inputs={X=['cast_2.tmp_0']}, dim = [], in_dtype = -1, keep_dim = False, op_device = , op_namescope = /, op_role = 0, op_role_var = [], out_dtype = -1, reduce_all = True) {Out=['sum_2.tmp_0']} = reduce_sum(inputs={X=['cast_3.tmp_0']}, dim = [], in_dtype = -1, keep_dim = False, op_device = , op_namescope = /, op_role = 0, op_role_var = [], out_dtype = -1, reduce_all = True)

4. Model prediction

Modify model_dir in infer_llama_npu.sh.
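
For reference, model_dir would typically point at the export output from the previous step (e.g. ./export_llama65b_fp16_mp8); adjust it to wherever the exported model actually lives.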