From c18d0ade9b0e0c8148b05d274003169747e806ce Mon Sep 17 00:00:00 2001 From: nsarka Date: Fri, 11 Oct 2024 11:04:02 -0400 Subject: [PATCH] TL/UCP: Grace tuning (#1027) * TL/UCP: Grace tuning * UTIL: Allow generic conf matching in ucc.conf * TL/UCP: Grace generic conf --------- Co-authored-by: nsarkauskas Co-authored-by: nsarkauskas --- contrib/ucc.conf | 58 ++++++++++++++++++++++++++++++++++++++++++ src/utils/ini.h | 2 +- src/utils/ucc_parser.c | 23 ++++++++++++++--- src/utils/ucc_parser.h | 5 +++- 4 files changed, 83 insertions(+), 5 deletions(-) diff --git a/contrib/ucc.conf b/contrib/ucc.conf index b594f97a9b..b7c591ebb3 100644 --- a/contrib/ucc.conf +++ b/contrib/ucc.conf @@ -89,3 +89,61 @@ UCC_TL_UCP_TUNE=allreduce:0-16k:@0#allreduce:16k-inf:@1 UCC_TL_UCP_ALLREDUCE_KN_RADIX=0-8k:host:8,8k-inf:host:2 UCC_TL_UCP_ALLREDUCE_SRA_KN_RADIX=8 UCC_TL_UCP_TUNE=allreduce:0-8k:@0#allreduce:8k-inf:@1 + +#NVIDIA Grace, Generic 1 node +[vendor=nvidia model=grace nnodes=1] +UCC_TL_UCP_ALLREDUCE_KN_RADIX=0-8:host:2,8-64:host:3,64-4k:host:2 +UCC_TL_UCP_ALLREDUCE_SRA_KN_RADIX=4096-8192:host:96,8192-16384:host:3,16384-32768:host:8,32768-65536:host:16,65536-131072:host:32,131072-262144:host:2,262144-524288:host:3,524288-1048576:host:2 +UCC_TL_UCP_TUNE=allreduce:0-4k:@0#allreduce:4k-inf:@1 + +#NVIDIA Grace, 2 socket (C2): +[vendor=nvidia model=grace team_size=144 sock=72 nnodes=1] +UCC_TL_UCP_ALLREDUCE_KN_RADIX=0-4k:host:2 +UCC_TL_UCP_ALLREDUCE_SRA_KN_RADIX=4096-8192:host:96,8192-16384:host:4,16384-32768:host:6,32768-65536:host:18,65536-131072:host:32,131072-262144:host:72,262144-524288:host:3,524288-1048576:host:2 +UCC_TL_UCP_TUNE=allreduce:0-4k:@0#allreduce:4k-inf:@1 + +[vendor=nvidia model=grace team_size=128 sock=64 nnodes=1] +UCC_TL_UCP_ALLREDUCE_KN_RADIX=0-4k:host:2 +UCC_TL_UCP_ALLREDUCE_SRA_KN_RADIX=4096-8192:host:72,8192-16384:host:4,16384-32768:host:8,32768-65536:host:16,65536-131072:host:32,131072-262144:host:64,262144-524288:host:3,524288-1048576:host:3 +UCC_TL_UCP_TUNE=allreduce:0-4k:@0#allreduce:4k-inf:@1 + +[vendor=nvidia model=grace team_size=64 sock=32 nnodes=1] +UCC_TL_UCP_ALLREDUCE_KN_RADIX=0-4k:host:2 +UCC_TL_UCP_ALLREDUCE_SRA_KN_RADIX=4096-8192:host:2,8192-16384:host:4,16384-32768:host:8,32768-65536:host:16,65536-131072:host:32,131072-262144:host:3,262144-524288:host:3,524288-1048576:host:2 +UCC_TL_UCP_TUNE=allreduce:0-4k:@0#allreduce:4k-inf:@1 + +[vendor=nvidia model=grace team_size=32 sock=16 nnodes=1] +UCC_TL_UCP_ALLREDUCE_KN_RADIX=0-4k:host:2 +UCC_TL_UCP_ALLREDUCE_SRA_KN_RADIX=4096-8192:host:2,8192-16384:host:4,16384-32768:host:8,32768-65536:host:16,65536-131072:host:3,131072-262144:host:2,262144-524288:host:2,524288-1048576:host:2 +UCC_TL_UCP_TUNE=allreduce:0-4k:@0#allreduce:4k-inf:@1 + +[vendor=nvidia model=grace team_size=16 sock=8 nnodes=1] +UCC_TL_UCP_ALLREDUCE_KN_RADIX=0-4k:host:2 +UCC_TL_UCP_ALLREDUCE_SRA_KN_RADIX=4096-8192:host:2,8192-16384:host:8,16384-32768:host:8,32768-65536:host:2,65536-131072:host:2,131072-262144:host:2,262144-524288:host:2,524288-1048576:host:2 +UCC_TL_UCP_TUNE=allreduce:0-4k:@0#allreduce:4k-inf:@1 + +#NVIDIA Grace, 1 socket (CG): +[vendor=nvidia model=grace team_size=72 sock=72 nnodes=1] +UCC_TL_UCP_ALLREDUCE_KN_RADIX=0-64:host:3,64-4k:host:2 +UCC_TL_UCP_ALLREDUCE_SRA_KN_RADIX=4096-8192:host:6,8192-16384:host:6,16384-32768:host:6,32768-65536:host:16,65536-131072:host:32,131072-262144:host:48,262144-524288:host:2,524288-1048576:host:2 +UCC_TL_UCP_TUNE=allreduce:0-4k:@0#allreduce:4k-inf:@1 + +[vendor=nvidia model=grace team_size=64 sock=64 nnodes=1] +UCC_TL_UCP_ALLREDUCE_KN_RADIX=0-4k:host:2 +UCC_TL_UCP_ALLREDUCE_SRA_KN_RADIX=4096-8192:host:2,8192-16384:host:4,16384-32768:host:8,32768-65536:host:18,65536-131072:host:32,131072-262144:host:48,262144-524288:host:2,524288-1048576:host:2 +UCC_TL_UCP_TUNE=allreduce:0-4k:@0#allreduce:4k-inf:@1 + +[vendor=nvidia model=grace team_size=32 sock=32 nnodes=1] +UCC_TL_UCP_ALLREDUCE_KN_RADIX=0-4k:host:2 +UCC_TL_UCP_ALLREDUCE_SRA_KN_RADIX=4096-8192:host:2,8192-16384:host:4,16384-32768:host:8,32768-65536:host:16,65536-131072:host:144,131072-262144:host:2,262144-524288:host:2,524288-1048576:host:4 +UCC_TL_UCP_TUNE=allreduce:0-4k:@0#allreduce:4k-inf:@1 + +[vendor=nvidia model=grace team_size=16 sock=16 nnodes=1] +UCC_TL_UCP_ALLREDUCE_KN_RADIX=0-4k:host:2 +UCC_TL_UCP_ALLREDUCE_SRA_KN_RADIX=4096-8192:host:2,8192-16384:host:4,16384-32768:host:8,32768-65536:host:16,65536-131072:host:3,131072-262144:host:2,262144-524288:host:4,524288-1048576:host:4 +UCC_TL_UCP_TUNE=allreduce:0-4k:@0#allreduce:4k-inf:@1 + +[vendor=nvidia model=grace team_size=8 sock=8 nnodes=1] +UCC_TL_UCP_ALLREDUCE_KN_RADIX=0-8k:host:2 +UCC_TL_UCP_ALLREDUCE_SRA_KN_RADIX=8192-16384:host:2,16384-32768:host:4,32768-65536:host:16,65536-131072:host:18,131072-262144:host:6,262144-524288:host:96,524288-1048576:host:4 +UCC_TL_UCP_TUNE=allreduce:0-8k:@0#allreduce:8k-inf:@1 diff --git a/src/utils/ini.h b/src/utils/ini.h index a1437c6a4c..2cf5200480 100644 --- a/src/utils/ini.h +++ b/src/utils/ini.h @@ -99,7 +99,7 @@ int ucc_ini_parse_string(const char* string, ini_handler handler, void* user); /* Maximum line length for any line in INI file (stack or heap). Note that this must be 3 more than the longest line (due to '\r', '\n', and '\0'). */ #ifndef UCC_INI_MAX_LINE -#define UCC_INI_MAX_LINE 200 +#define UCC_INI_MAX_LINE 500 #endif /* Nonzero to allow heap line buffer to grow via realloc(), zero for a diff --git a/src/utils/ucc_parser.c b/src/utils/ucc_parser.c index ea9a6d0fa3..9ba638be83 100644 --- a/src/utils/ucc_parser.c +++ b/src/utils/ucc_parser.c @@ -49,6 +49,8 @@ static int ucc_check_section(ucc_section_desc_t sec_desc, ucc_rank_t team_size, ucc_rank_t ppn_min, ucc_rank_t ppn_max, + ucc_rank_t sock_min, + ucc_rank_t sock_max, ucc_rank_t nnodes) { if (sec_desc.mask & UCC_TUNING_DESC_FIELD_VENDOR) { @@ -72,6 +74,11 @@ static int ucc_check_section(ucc_section_desc_t sec_desc, return 0; } } + if (sec_desc.mask & UCC_TUNING_DESC_FIELD_SOCK) { + if (sock_min < sec_desc.min_sock || sock_max > sec_desc.max_sock) { + return 0; + } + } if (sec_desc.mask & UCC_TUNING_DESC_FIELD_NNODES) { if (nnodes < sec_desc.min_nnodes || nnodes > sec_desc.max_nnodes) { return 0; @@ -160,6 +167,13 @@ ucc_parse_section_name_to_desc(const char *sec_name, ucc_section_desc_t *desc) } desc->mask |= UCC_TUNING_DESC_FIELD_PPN; } + else if (strcmp(cur_str[0], "sock") == 0) { + if (!ucc_check_range(cur_str[1], &desc->min_sock, + &desc->max_sock)) { + goto err_key; + } + desc->mask |= UCC_TUNING_DESC_FIELD_SOCK; + } else if (strcmp(cur_str[0], "nnodes") == 0) { if (!ucc_check_range(cur_str[1], &desc->min_nnodes, &desc->max_nnodes)) { @@ -576,8 +590,11 @@ ucc_status_t ucc_add_team_sections(void *team_cfg, ucc_cpu_model_t model = ucc_arch_get_cpu_model(); ucc_rank_t ppn_min = ucc_topo_min_ppn(team_topo); ucc_rank_t ppn_max = ucc_topo_max_ppn(team_topo); + ucc_rank_t sock_min = ucc_topo_min_socket_size(team_topo); + ucc_rank_t sock_max = ucc_topo_max_socket_size(team_topo); ucc_rank_t nnodes = ucc_topo_nnodes(team_topo); ucc_rank_t team_size = team_topo->set.map.ep_num; + int found = 0; khash_t(ucc_sec) *sec_h; khiter_t i, j; const char *sec_name; @@ -589,7 +606,7 @@ ucc_status_t ucc_add_team_sections(void *team_cfg, sec_name = kh_key(sections, i); sec = kh_val(sections, i); if (ucc_check_section(sec->desc, vendor, model, team_size, - ppn_min, ppn_max, nnodes)) { + ppn_min, ppn_max, sock_min, sock_max, nnodes)) { sec_h = &sec->vals_h; j = kh_get(ucc_sec, sec_h, tune_key); if (j != kh_end(sec_h)) { @@ -597,10 +614,10 @@ ucc_status_t ucc_add_team_sections(void *team_cfg, } status = ucc_apply_file_cfg(team_cfg, tl_fields, env_prefix, component_prefix, sec_name); - return status; + found = 1; } } - return UCC_ERR_NOT_FOUND; + return found ? status : UCC_ERR_NOT_FOUND; } ucc_status_t ucc_config_parser_fill_opts(void *opts, ucs_config_global_list_entry_t *entry, diff --git a/src/utils/ucc_parser.h b/src/utils/ucc_parser.h index 90a1c085ef..01748186ee 100644 --- a/src/utils/ucc_parser.h +++ b/src/utils/ucc_parser.h @@ -97,7 +97,8 @@ enum tuning_mask { UCC_TUNING_DESC_FIELD_MODEL = UCC_BIT(1), UCC_TUNING_DESC_FIELD_TEAM_SIZE = UCC_BIT(2), UCC_TUNING_DESC_FIELD_PPN = UCC_BIT(3), - UCC_TUNING_DESC_FIELD_NNODES = UCC_BIT(4) + UCC_TUNING_DESC_FIELD_NNODES = UCC_BIT(4), + UCC_TUNING_DESC_FIELD_SOCK = UCC_BIT(5) }; typedef struct ucc_section_desc { @@ -108,6 +109,8 @@ typedef struct ucc_section_desc { ucc_rank_t max_team_size; ucc_rank_t min_ppn; ucc_rank_t max_ppn; + ucc_rank_t min_sock; + ucc_rank_t max_sock; ucc_rank_t min_nnodes; ucc_rank_t max_nnodes; } ucc_section_desc_t;