update gpt&baichuan&qwen ce name #9697

Merged: 5 commits, Dec 26, 2024

Changes from all commits:
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-param="model_item=gpt3-13b_pretrain "
+param="model_item=gpt-3-13b_pretrain "
 param+="run_mode=DP1_MP2_PP4_VPP5_Sharding4_Stage1 "
 param+="device_num=N4C32 "
 param+="global_batch_size=128 "
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-param="model_item=baichuan-inc-baichaun-2-13b_pretrain "
-param+="run_mode=DP1_MP2_PP4_1F1B_Sharding8_Stage2 "
+param="model_item=baichuan-inc-baichuan-2-13b_pretrain "
+param+="run_mode=DP1_MP4_PP2_1F1B_Sharding4_Stage1 "
 param+="device_num=N4C32 "
 param+="global_batch_size=128 "
 param+="nnodes=4 "
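Note: the run_mode string encodes Paddle's hybrid-parallel layout, and the product of the DP/MP/PP/sharding degrees has to match the card count in device_num (N4C32 = 4 nodes, 32 cards total). That is presumably why this file moves from MP2*PP4*Sharding8 (64 ways) to MP4*PP2*Sharding4 (32 ways). A minimal illustrative parser, assuming the naming convention above; this is not the benchmark launcher's actual code:

# Illustrative only: parse run_mode strings like the ones above, assuming
# each degree token is <Name><degree> (DP, MP, PP, VPP, Sharding) and that
# the DP/MP/PP/Sharding degrees must multiply to the card count in
# device_num (e.g. N4C32 = 32 cards). Not PaddleNLP's actual launcher code.
import re

def parallel_degrees(run_mode: str) -> dict:
    return {name: int(num)
            for name, num in re.findall(r"(DP|MP|PP|VPP|Sharding)(\d+)", run_mode)}

def check_world_size(run_mode: str, device_num: str) -> bool:
    cards = int(re.search(r"C(\d+)", device_num).group(1))
    d = parallel_degrees(run_mode)
    # VPP is virtual pipeline stages inside each PP stage, and Stage1/Stage2
    # is the sharding (ZeRO) stage -- neither consumes extra cards.
    world = d.get("DP", 1) * d.get("MP", 1) * d.get("PP", 1) * d.get("Sharding", 1)
    return world == cards

print(check_world_size("DP1_MP2_PP4_1F1B_Sharding8_Stage2", "N4C32"))  # False: 64 != 32
print(check_world_size("DP1_MP4_PP2_1F1B_Sharding4_Stage1", "N4C32"))  # True: 32 == 32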
@@ -44,12 +44,12 @@
     "fused_linear": 1,
     "fused_linear_param_grad_add": 1,
     "use_fused_rope": true,
-    "use_fused_rms_norm": false,
+    "use_fused_rms_norm": true,
     "max_seq_length": 4096,
     "sequence_parallel": false,
     "sharding": "stage1",
     "sharding_parallel_config": "enable_stage1_tensor_fusion enable_stage1_overlap",
-    "tensor_parallel_config": "enable_mp_async_allreduce",
+    "tensor_parallel_config": "enable_mp_async_allreduce replace_with_parallel_cross_entropy",
     "data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate",
     "pipeline_parallel_config": "enable_send_recv_overlap enable_split_backward"
 }
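Note: replace_with_parallel_cross_entropy asks the framework to compute the loss directly on the tensor-parallel (vocab-sharded) logits, so no rank ever gathers the full-vocab logits just to take a softmax. A single-process numpy sketch of the idea, simulating mp ranks with array shards and allreduces with plain maxes/sums; this is not Paddle's implementation:

# Single-process sketch of vocab-parallel cross entropy -- the idea behind
# replace_with_parallel_cross_entropy. Real implementations use collective
# ops across mp ranks; here "allreduce" is simulated over a list of shards.
import numpy as np

def parallel_cross_entropy(logit_shards, target, vocab_start):
    """logit_shards: per-rank [vocab_shard] logits for one token;
    vocab_start[r] is the first vocab id owned by rank r."""
    # 1) global max for numerical stability (allreduce-max across ranks)
    gmax = max(s.max() for s in logit_shards)
    # 2) global sum of exp (allreduce-sum across ranks)
    gsum = sum(np.exp(s - gmax).sum() for s in logit_shards)
    # 3) the target's logit lives on exactly one rank
    for r, s in enumerate(logit_shards):
        lo, hi = vocab_start[r], vocab_start[r] + len(s)
        if lo <= target < hi:
            target_logit = s[target - lo]
    return np.log(gsum) - (target_logit - gmax)  # -log softmax[target]

logits = np.random.default_rng(0).standard_normal(8)  # full vocab of 8
shards = [logits[:4], logits[4:]]                     # two "mp ranks"
ref = -np.log(np.exp(logits)[5] / np.exp(logits).sum())
print(np.isclose(parallel_cross_entropy(shards, 5, [0, 4]), ref))  # True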
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-param="model_item=gpt3-13b_pretrain_dy2st "
+param="model_item=gpt-3-13b_pretrain_dy2st "
 param+="run_mode=DP1_MP2_PP4_1F1B_Sharding4_Stage1 "
 param+="device_num=N4C32 "
 param+="global_batch_size=128 "
@@ -39,7 +39,7 @@
     "fuse_attention_qkv": 1,
     "fused_linear_param_grad_add": 1,
     "use_fused_rope": true,
-    "use_fused_rms_norm": false,
+    "use_fused_rms_norm": true,
     "recompute": 0,
     "recompute_use_reentrant": true,
     "recompute_granularity": "full",
@@ -52,7 +52,7 @@
     "attention_probs_dropout_prob": 0.1,
     "hidden_dropout_prob": 0.1,
     "sharding_parallel_config": "enable_stage1_tensor_fusion enable_stage1_overlap",
-    "tensor_parallel_config": "enable_mp_async_allreduce",
+    "tensor_parallel_config": "enable_mp_async_allreduce replace_with_parallel_cross_entropy",
     "data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate",
     "pipeline_parallel_config": "enable_send_recv_overlap enable_split_backward"
 }
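Note: flipping use_fused_rms_norm to true only changes how RMSNorm is executed (one fused kernel instead of separate elementwise/reduction ops), not what it computes, assuming the fused kernel is available in the build. An unfused numpy reference of the math:

# Reference (unfused) RMSNorm -- what the fused kernel computes in one
# pass. Enabling use_fused_rms_norm changes execution, not the math, so
# benchmark configs can flip it without affecting convergence.
import numpy as np

def rms_norm(x, weight, eps=1e-6):
    variance = (x * x).mean(axis=-1, keepdims=True)  # mean of squares
    return x / np.sqrt(variance + eps) * weight      # scale only, no bias

x = np.random.default_rng(1).standard_normal((2, 8)).astype("float32")
w = np.ones(8, dtype="float32")
print(rms_norm(x, w).shape)  # (2, 8)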
@@ -53,7 +53,7 @@
     "pipeline_schedule_mode": "VPP",
     "data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate",
     "sharding_parallel_config": "enable_stage1_overlap",
-    "tensor_parallel_config": "enable_mp_async_allreduce",
+    "tensor_parallel_config": "enable_mp_async_allreduce replace_with_parallel_cross_entropy",
     "max_seq_length": 4096,
     "to_static": true,
     "eliminate_transpose": 1,
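Note: the _dy2st suffix and "to_static": true refer to Paddle's dynamic-to-static (dy2st) conversion, which traces the dygraph model into a static graph for whole-program optimization. A minimal sketch with the public paddle.jit.to_static API (the benchmark harness enables this through the config rather than a decorator):

# Minimal dy2st example: paddle.jit.to_static converts a dygraph function
# into a static graph the first time it runs.
import paddle

@paddle.jit.to_static
def scaled_relu(x):
    return paddle.nn.functional.relu(x) * 2.0

out = scaled_relu(paddle.ones([2, 3]))
print(out.shape)  # [2, 3]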
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-param="model_item=qwen-2-14b_pretrain_dy2st "
+param="model_item=qwen-14b_pretrain_dy2st "
 param+="run_mode=DP1_MP2_PP4_1F1B_Sharding4_Stage1 "
 param+="device_num=N4C32 "
 param+="global_batch_size=128 "
@@ -50,6 +50,6 @@
     "auto_parallel_resume_form_hybrid_parallel": true,
     "data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate",
     "sharding_parallel_config": "enable_stage1_overlap",
-    "tensor_parallel_config": "enable_mp_async_allreduce",
+    "tensor_parallel_config": "enable_mp_async_allreduce replace_with_parallel_cross_entropy",
     "pipeline_parallel_config": "enable_send_recv_overlap enable_split_backward"
 }