diff --git a/config/ucx.conf b/config/ucx.conf index d40117752a3..98813d65bab 100644 --- a/config/ucx.conf +++ b/config/ucx.conf @@ -6,6 +6,7 @@ UCX_IB_MLX5_DEVX_OBJECTS= UCX_GDR_COPY_BW=0MBs,get_dedicated:30GBs,put_dedicated:30GBs UCX_GDR_COPY_LAT=30e-9 UCX_DISTANCE_BW=auto,sys:16500MBs +UCX_CUDA_COPY_BW=h2d:400GBs,d2h:300GBs,d2d:400GBs,other:10000MBs [Fujitsu ARM] CPU vendor=Fujitsu ARM diff --git a/src/uct/cuda/cuda_copy/cuda_copy_iface.c b/src/uct/cuda/cuda_copy/cuda_copy_iface.c index 49b45df9bf1..da5f4fb303c 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_iface.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_iface.c @@ -37,14 +37,24 @@ static ucs_config_field_t uct_cuda_copy_iface_config_table[] = { "Max number of cuda events. -1 is infinite", ucs_offsetof(uct_cuda_copy_iface_config_t, max_cuda_events), UCS_CONFIG_TYPE_UINT}, - {"BW", "10000MBs", - "Effective memory bandwidth", - ucs_offsetof(uct_cuda_copy_iface_config_t, bandwidth), UCS_CONFIG_TYPE_BW}, + /* TODO: 1. Add separate keys for shared and dedicated bandwidth + 2. Remove the "other" key (use pref_loc for managed memory) */ + {"BW", "10000MBs,h2d:8300MBs,d2h:11660MBs,d2d:320GBs", + "Effective memory bandwidth", 0, + UCS_CONFIG_TYPE_KEY_VALUE(UCS_CONFIG_TYPE_BW, + {"h2d", "host to device bandwidth", + ucs_offsetof(uct_cuda_copy_iface_config_t, bw.h2d)}, + {"d2h", "device to host bandwidth", + ucs_offsetof(uct_cuda_copy_iface_config_t, bw.d2h)}, + {"d2d", "device to device bandwidth", + ucs_offsetof(uct_cuda_copy_iface_config_t, bw.d2d)}, + {"other", "any other memory types combinations bandwidth", + ucs_offsetof(uct_cuda_copy_iface_config_t, bw.other)}, + {NULL})}, {NULL} }; - /* Forward declaration for the delete function */ static void UCS_CLASS_DELETE_FUNC_NAME(uct_cuda_copy_iface_t)(uct_iface_t*); @@ -134,7 +144,7 @@ static ucs_status_t uct_cuda_copy_iface_query(uct_iface_h tl_iface, iface_attr->latency = UCT_CUDA_COPY_IFACE_LATENCY; iface_attr->bandwidth.dedicated = 0; - iface_attr->bandwidth.shared = iface->config.bandwidth; + iface_attr->bandwidth.shared = iface->config.bw.other; iface_attr->overhead = UCT_CUDA_COPY_IFACE_OVERHEAD; iface_attr->priority = 0; @@ -407,16 +417,17 @@ uct_cuda_copy_estimate_perf(uct_iface_h tl_iface, uct_perf_attr_t *perf_attr) perf_attr->bandwidth.dedicated = 0; if ((src_mem_type == UCS_MEMORY_TYPE_HOST) && (dst_mem_type == UCS_MEMORY_TYPE_CUDA)) { - perf_attr->bandwidth.shared = (zcopy ? 8300.0 : 7900.0) * UCS_MBYTE; + perf_attr->bandwidth.shared = zcopy ? iface->config.bw.h2d : + iface->config.bw.h2d * 0.95; } else if ((src_mem_type == UCS_MEMORY_TYPE_CUDA) && (dst_mem_type == UCS_MEMORY_TYPE_HOST)) { - perf_attr->bandwidth.shared = (zcopy ? 11660.0 : 9320.0) * - UCS_MBYTE; + perf_attr->bandwidth.shared = zcopy ? iface->config.bw.d2h : + iface->config.bw.d2h * 0.95; } else if ((src_mem_type == UCS_MEMORY_TYPE_CUDA) && (dst_mem_type == UCS_MEMORY_TYPE_CUDA)) { - perf_attr->bandwidth.shared = 320.0 * UCS_GBYTE; + perf_attr->bandwidth.shared = iface->config.bw.d2d; } else { - perf_attr->bandwidth.shared = iface->config.bandwidth; + perf_attr->bandwidth.shared = iface->config.bw.other; } } @@ -491,7 +502,10 @@ static UCS_CLASS_INIT_FUNC(uct_cuda_copy_iface_t, uct_md_h md, uct_worker_h work self->id = ucs_generate_uuid((uintptr_t)self); self->config.max_poll = config->max_poll; self->config.max_cuda_events = config->max_cuda_events; - self->config.bandwidth = config->bandwidth; + self->config.bw.h2d = config->bw.h2d; + self->config.bw.d2h = config->bw.d2h; + self->config.bw.d2d = config->bw.d2d; + self->config.bw.other = config->bw.other; UCS_STATIC_BITMAP_RESET_ALL(&self->streams_to_sync); ucs_mpool_params_reset(&mp_params); diff --git a/src/uct/cuda/cuda_copy/cuda_copy_iface.h b/src/uct/cuda/cuda_copy/cuda_copy_iface.h index 700345d22a2..1b54298dfaa 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_iface.h +++ b/src/uct/cuda/cuda_copy/cuda_copy_iface.h @@ -69,7 +69,12 @@ typedef struct uct_cuda_copy_iface { struct { unsigned max_poll; unsigned max_cuda_events; - double bandwidth; + struct { + double h2d; + double d2h; + double d2d; + double other; + } bw; } config; /* handler to support arm/wakeup feature */ struct { @@ -87,7 +92,12 @@ typedef struct uct_cuda_copy_iface_config { uct_iface_config_t super; unsigned max_poll; unsigned max_cuda_events; - double bandwidth; + struct { + double h2d; + double d2h; + double d2d; + double other; + } bw; } uct_cuda_copy_iface_config_t;