diff --git a/config/m4/cuda.m4 b/config/m4/cuda.m4
index 8fc12fa38e6..ebf13449bc6 100644
--- a/config/m4/cuda.m4
+++ b/config/m4/cuda.m4
@@ -1 +1,84 @@
-AM_CONDITIONAL([HAVE_CUDA], [true])
+#
+# Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED.
+# See file LICENSE for terms.
+#
+
+#
+# Check for CUDA support
+#
+cuda_happy="no"
+
+AC_ARG_WITH([cuda],
+            [AS_HELP_STRING([--with-cuda=(DIR)], [Enable the use of CUDA (default is no).])],
+            [], [with_cuda=no])
+
+AS_IF([test "x$with_cuda" != "xno"],
+
+      [AS_IF([test "x$with_cuda" = "x" || test "x$with_cuda" = "xguess" || test "x$with_cuda" = "xyes"],
+             [
+              AC_MSG_NOTICE([CUDA path was not specified. Guessing ...])
+              with_cuda=/usr/local/cuda
+             ],
+             [:])
+       AC_CHECK_HEADERS([$with_cuda/include/cuda.h $with_cuda/include/cuda_runtime.h],
+                        [AC_CHECK_DECLS([cuPointerGetAttribute],
+                                        [cuda_happy="yes"],
+                                        [AC_MSG_WARN([CUDA runtime not detected. Disabling.])
+                                         cuda_happy="no"],
+                                        [#include <$with_cuda/include/cuda.h>])
+                         AS_IF([test "x$cuda_happy" = "xyes"],
+                               [AC_DEFINE([HAVE_CUDA], 1, [Enable CUDA support])
+                                AC_SUBST(CUDA_CPPFLAGS, "-I$with_cuda/include/")
+                                AC_SUBST(CUDA_CFLAGS, "-I$with_cuda/include/")
+                                AC_SUBST(CUDA_LDFLAGS, "-lcudart -lcuda -L$with_cuda/lib64")
+                                CFLAGS="$CFLAGS $CUDA_CFLAGS"
+                                CPPFLAGS="$CPPFLAGS $CUDA_CPPFLAGS"
+                                LDFLAGS="$LDFLAGS $CUDA_LDFLAGS"],
+                               [])],
+                        [AC_MSG_WARN([CUDA not found])
+                         AC_DEFINE([HAVE_CUDA], [0], [Disable the use of CUDA])])],
+      [AC_MSG_WARN([CUDA was explicitly disabled])
+       AC_DEFINE([HAVE_CUDA], [0], [Disable the use of CUDA])]
+)
+
+
+AM_CONDITIONAL([HAVE_CUDA], [test "x$cuda_happy" != xno])
+
+gdrcopy_happy="no"
+
+AC_ARG_WITH([gdrcopy],
+            [AS_HELP_STRING([--with-gdrcopy=(DIR)], [Enable the use of GDR_COPY (default is no).])],
+            [], [with_gdrcopy=no])
+
+AS_IF([test "x$with_gdrcopy" != "xno"],
+
+      [AS_IF([test "x$with_gdrcopy" = "x" || test "x$with_gdrcopy" = "xguess" || test "x$with_gdrcopy" = "xyes"],
+             [
+              AC_MSG_NOTICE([GDR_COPY path was not specified. Guessing ...])
+              with_gdrcopy=/usr/local/gdrcopy
+             ],
+             [:])
+       AC_CHECK_HEADERS([$with_gdrcopy/include/gdrapi.h],
+                        [AC_CHECK_DECLS([gdr_pin_buffer],
+                                        [gdrcopy_happy="yes"],
+                                        [AC_MSG_WARN([GDR_COPY runtime not detected. Disabling.])
+                                         gdrcopy_happy="no"],
+                                        [#include <$with_gdrcopy/include/gdrapi.h>])
+                         AS_IF([test "x$gdrcopy_happy" = "xyes"],
+                               [AC_DEFINE([HAVE_GDR_COPY], 1, [Enable GDR_COPY support])
+                                AC_SUBST(GDR_COPY_CPPFLAGS, "-I$with_gdrcopy/include/")
+                                AC_SUBST(GDR_COPY_CFLAGS, "-I$with_gdrcopy/include/")
+                                AC_SUBST(GDR_COPY_LDFLAGS, "-lgdrapi -L$with_gdrcopy/lib64")
+                                CFLAGS="$CFLAGS $GDR_COPY_CFLAGS"
+                                CPPFLAGS="$CPPFLAGS $GDR_COPY_CPPFLAGS"
+                                LDFLAGS="$LDFLAGS $GDR_COPY_LDFLAGS"],
+                               [])],
+                        [AC_MSG_WARN([GDR_COPY not found])
+                         AC_DEFINE([HAVE_GDR_COPY], [0], [Disable the use of GDR_COPY])])],
+      [AC_MSG_WARN([GDR_COPY was explicitly disabled])
+       AC_DEFINE([HAVE_GDR_COPY], [0], [Disable the use of GDR_COPY])]
+)
+
+
+AM_CONDITIONAL([HAVE_GDR_COPY], [test "x$gdrcopy_happy" != xno])
+
+AC_DEFINE([HAVE_CUDA_GDR], [1], [Enable GPU Direct RDMA])
+AM_CONDITIONAL([HAVE_CUDA_GDR], [true])
diff --git a/src/ucm/Makefile.am b/src/ucm/Makefile.am
index 2215ddd2111..7db3ea7ccef 100644
--- a/src/ucm/Makefile.am
+++ b/src/ucm/Makefile.am
@@ -47,6 +47,13 @@ libucm_la_SOURCES = \
 	util/reloc.c \
 	util/sys.c
 
+if HAVE_CUDA
+libucm_la_SOURCES += \
+	cuda/install.c \
+	cuda/replace.c
+
+endif
+
 if HAVE_UCM_PTMALLOC283
 libucm_la_CPPFLAGS += \
 	-I$(srcdir)/ptmalloc283/sysdeps/pthread \
diff --git a/src/ucm/api/ucm.h b/src/ucm/api/ucm.h
index 0b024e3d29b..2eb55603ae6 100644
--- a/src/ucm/api/ucm.h
+++ b/src/ucm/api/ucm.h
@@ -9,7 +9,10 @@
 #define UCM_H_
 
 #include 
-
+#if HAVE_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#endif
 BEGIN_C_DECLS
 
 #include 
@@ -32,6 +35,8 @@ typedef enum ucm_event_type {
     UCM_EVENT_SHMAT        = UCS_BIT(3),
     UCM_EVENT_SHMDT        = UCS_BIT(4),
     UCM_EVENT_SBRK         = UCS_BIT(5),
+    /* cuda events */
+    UCM_EVENT_CUDAFREE     = UCS_BIT(6),
 
     /* Aggregate events */
     UCM_EVENT_VM_MAPPED    = UCS_BIT(16),
@@ -113,6 +118,17 @@ typedef union ucm_event {
         intptr_t       increment;
     } sbrk;
 
+#if HAVE_CUDA
+    /*
+     * UCM_EVENT_CUDAFREE
+     * cudaFree() is called.
+     */
+    struct {
+        int            result;
+        void           *address;
+    } cudaFree;
+#endif
+
     /*
      * UCM_EVENT_VM_MAPPED, UCM_EVENT_VM_UNMAPPED
      *
@@ -351,6 +367,23 @@ int ucm_shmdt(const void *shmaddr);
  */
 void *ucm_sbrk(intptr_t increment);
 
+#if HAVE_CUDA
+
+/**
+ * @brief Call the original implementation of @ref cudaFree without triggering events.
+ */
+cudaError_t ucm_orig_cudaFree(void *address);
+
+
+/**
+ * @brief Call the original implementation of @ref cudaFree and all handlers
+ *        associated with it.
+ */
+cudaError_t ucm_cudaFree(void *address);
+
+#endif
+
+
 END_C_DECLS
diff --git a/src/ucm/cuda/cudamem.h b/src/ucm/cuda/cudamem.h
new file mode 100644
index 00000000000..e6665bc7704
--- /dev/null
+++ b/src/ucm/cuda/cudamem.h
@@ -0,0 +1,22 @@
+/**
+ * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED.
+ *
+ * See file LICENSE for terms.
+ */
+
+#ifndef UCM_CUDAMEM_H_
+#define UCM_CUDAMEM_H_
+
+#include 
+#include 
+#include 
+
+ucs_status_t ucm_cudamem_install(int events);
+
+void ucm_cudamem_event_test_callback(ucm_event_type_t event_type,
+                                     ucm_event_t *event, void *arg);
+
+
+cudaError_t ucm_override_cudaFree(void *addr);
+
+#endif
diff --git a/src/ucm/cuda/install.c b/src/ucm/cuda/install.c
new file mode 100644
index 00000000000..596ea0a9738
--- /dev/null
+++ b/src/ucm/cuda/install.c
@@ -0,0 +1,154 @@
+/**
+ * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED.
+ * Copyright (C) ARM Ltd. 2016. ALL RIGHTS RESERVED.
+ *
+ * See file LICENSE for terms.
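+ *
+ * install.c wires the cudaFree hook into UCM: ucm_cudamem_install() first
+ * fires a small cudaMalloc/cudaFree probe (ucm_cudamem_test) to check
+ * whether the requested events are already delivered, and only patches the
+ * relocation table entry for cudaFree when they are not. A static mutex
+ * serializes the probe/patch sequence against concurrent callers.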
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include "config.h"
+#endif
+
+#include "cudamem.h"
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+
+typedef struct ucm_cudamem_func {
+    ucm_reloc_patch_t patch;
+    ucm_event_type_t  event_type;
+} ucm_cudamem_func_t;
+
+static ucm_cudamem_func_t ucm_cudamem_funcs[] = {
+    { {"cudaFree", ucm_override_cudaFree}, UCM_EVENT_CUDAFREE},
+    { {NULL, NULL}, 0}
+};
+
+void ucm_cudamem_event_test_callback(ucm_event_type_t event_type,
+                                     ucm_event_t *event, void *arg)
+{
+    int *out_events = arg;
+
+    *out_events |= event_type;
+}
+
+/* Called with lock held */
+static ucs_status_t ucm_cudamem_test(int events)
+{
+    static int installed_events = 0;
+    ucm_event_handler_t handler;
+    int out_events = 0;
+    void *p;
+
+    if (ucs_test_all_flags(installed_events, events)) {
+        /* All requested events are already installed */
+        return UCS_OK;
+    }
+
+    /* Install a temporary event handler which will add the supported event
+     * type to the out_events bitmap.
+     */
+    handler.events   = events;
+    handler.priority = -1;
+    handler.cb       = ucm_cudamem_event_test_callback;
+    handler.arg      = &out_events;
+    out_events       = 0;
+
+    ucm_event_handler_add(&handler);
+
+    if (events & (UCM_EVENT_CUDAFREE)) {
+        if (cudaSuccess != cudaMalloc(&p, 64)) {
+            ucm_error("cudaMalloc failed");
+            ucm_event_handler_remove(&handler);
+            return UCS_ERR_UNSUPPORTED;
+        }
+        cudaFree(p);
+    }
+
+    ucm_event_handler_remove(&handler);
+
+    /* TODO check address / stop all threads */
+    installed_events |= out_events;
+    ucm_debug("cudamem test: got 0x%x out of 0x%x, total: 0x%x", out_events,
+              events, installed_events);
+
+    /* Return success iff we caught all wanted events */
+    if (!ucs_test_all_flags(out_events, events)) {
+        return UCS_ERR_UNSUPPORTED;
+    }
+
+    return UCS_OK;
+}
+
+/* Called with lock held */
+static ucs_status_t ucm_cudamem_install_reloc(int events)
+{
+    static int installed_events = 0;
+    ucm_cudamem_func_t *entry;
+    ucs_status_t status;
+
+    if (!ucm_global_config.enable_cuda_hooks) {
+        ucm_debug("installing cudamem relocations is disabled by configuration");
+        return UCS_ERR_UNSUPPORTED;
+    }
+
+    for (entry = ucm_cudamem_funcs; entry->patch.symbol != NULL; ++entry) {
+        if (!(entry->event_type & events)) {
+            /* Not required */
+            continue;
+        }
+
+        if (entry->event_type & installed_events) {
+            /* Already installed */
+            continue;
+        }
+
+        ucm_debug("cudamem: installing relocation table entry for %s = %p for event 0x%x",
+                  entry->patch.symbol, entry->patch.value, entry->event_type);
+
+        status = ucm_reloc_modify(&entry->patch);
+        if (status != UCS_OK) {
+            ucm_warn("failed to install relocation table entry for '%s'",
+                     entry->patch.symbol);
+            return status;
+        }
+
+        installed_events |= entry->event_type;
+    }
+
+    return UCS_OK;
+}
+
+ucs_status_t ucm_cudamem_install(int events)
+{
+    static pthread_mutex_t install_mutex = PTHREAD_MUTEX_INITIALIZER;
+    ucs_status_t status;
+
+    pthread_mutex_lock(&install_mutex);
+
+    status = ucm_cudamem_test(events);
+    if (status == UCS_OK) {
+        goto out_unlock;
+    }
+
+    status = ucm_cudamem_install_reloc(events);
+    if (status != UCS_OK) {
+        ucm_debug("failed to install relocations for cudamem");
+        goto out_unlock;
+    }
+
+    status = ucm_cudamem_test(events);
+
+out_unlock:
+    pthread_mutex_unlock(&install_mutex);
+    return status;
+}
diff --git a/src/ucm/cuda/replace.c b/src/ucm/cuda/replace.c
new file mode 100644
index 00000000000..4a88adf98d6
--- /dev/null
+++ b/src/ucm/cuda/replace.c
@@ -0,0 +1,101 @@
+/**
+ * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED.
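+ *
+ * replace.c provides the interposition layer. UCM_DEFINE_CUDA_FUNC(cudaFree,
+ * cudaError_t, -1, void*) below generates two entry points:
+ *
+ *   ucm_orig_cudaFree()     - resolves the original cudaFree once via
+ *                             ucm_reloc_get_orig() and calls it, bypassing
+ *                             the event handlers;
+ *   ucm_override_cudaFree() - the replacement installed in the relocation
+ *                             table, which forwards to ucm_cudaFree() so
+ *                             that UCM_EVENT_CUDAFREE handlers run.
+ *
+ * The ucm_cudamem_get_orig_thread guard returns _fail_val when cudaFree
+ * re-enters the hook from inside symbol resolution.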
+ *
+ * See file LICENSE for terms.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include "config.h"
+#endif
+
+#include "cudamem.h"
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+
+#define MAP_FAILED ((void*)-1)
+
+static pthread_mutex_t ucm_cudamem_get_orig_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
+static pthread_t volatile ucm_cudamem_get_orig_thread = -1;
+
+
+/**
+ * Define a replacement function for a CUDA memory function call, which calls
+ * the event handler, and if the event handler returns an error code - calls
+ * the original function.
+ */
+#define UCM_DEFINE_CUDA_FUNC(_name, _rettype, _fail_val, ...) \
+    \
+    _rettype ucm_override_##_name(UCM_FUNC_DEFINE_ARGS(__VA_ARGS__)); \
+    \
+    /* Call the original function using dlsym(RTLD_NEXT) */ \
+    _rettype ucm_orig_##_name(UCM_FUNC_DEFINE_ARGS(__VA_ARGS__)) \
+    { \
+        typedef _rettype (*func_ptr_t) (__VA_ARGS__); \
+        static func_ptr_t orig_func_ptr = NULL; \
+        \
+        ucm_trace("%s()", __FUNCTION__); \
+        \
+        if (ucs_unlikely(orig_func_ptr == NULL)) { \
+            pthread_mutex_lock(&ucm_cudamem_get_orig_lock); \
+            ucm_cudamem_get_orig_thread = pthread_self(); \
+            orig_func_ptr = ucm_reloc_get_orig(UCS_PP_QUOTE(_name), \
+                                               ucm_override_##_name); \
+            ucm_cudamem_get_orig_thread = -1; \
+            pthread_mutex_unlock(&ucm_cudamem_get_orig_lock); \
+        } \
+        return orig_func_ptr(UCM_FUNC_PASS_ARGS(__VA_ARGS__)); \
+    } \
+    \
+    /* Define a symbol which goes to the replacement - in case we are loaded first */ \
+    _rettype ucm_override_##_name(UCM_FUNC_DEFINE_ARGS(__VA_ARGS__)) \
+    { \
+        ucm_trace("%s()", __FUNCTION__); \
+        \
+        if (ucs_unlikely(ucm_cudamem_get_orig_thread == pthread_self())) { \
+            return _fail_val; \
+        } \
+        return ucm_##_name(UCM_FUNC_PASS_ARGS(__VA_ARGS__)); \
+    }
+
+#define UCM_OVERRIDE_CUDA_FUNC(_name) \
+    cudaError_t _name() __attribute__ ((alias ("ucm_override_" UCS_PP_QUOTE(_name)))); \
+
+
+/*
+ * Define argument list with given types.
+ */
+#define UCM_FUNC_DEFINE_ARGS(...) \
+    UCS_PP_FOREACH_SEP(_UCM_FUNC_ARG_DEFINE, _, \
+                       UCS_PP_ZIP((UCS_PP_SEQ(UCS_PP_NUM_ARGS(__VA_ARGS__))), \
+                                  (__VA_ARGS__)))
+
+/*
+ * Pass auto-generated arguments to a function call.
+ */
+#define UCM_FUNC_PASS_ARGS(...) \
+    UCS_PP_FOREACH_SEP(_UCM_FUNC_ARG_PASS, _, UCS_PP_SEQ(UCS_PP_NUM_ARGS(__VA_ARGS__)))
+
+
+/*
+ * Helpers
+ */
+#define _UCM_FUNC_ARG_DEFINE(_, _bundle) \
+    __UCM_FUNC_ARG_DEFINE(_, UCS_PP_TUPLE_0 _bundle, UCS_PP_TUPLE_1 _bundle)
+#define __UCM_FUNC_ARG_DEFINE(_, _index, _type) \
+    _type UCS_PP_TOKENPASTE(arg, _index)
+#define _UCM_FUNC_ARG_PASS(_, _index) \
+    UCS_PP_TOKENPASTE(arg, _index)
+
+
+UCM_DEFINE_CUDA_FUNC(cudaFree, cudaError_t, -1, void*)
+
+#if ENABLE_SYMBOL_OVERRIDE
+UCM_OVERRIDE_CUDA_FUNC(cudaFree)
+#endif
diff --git a/src/ucm/event/event.c b/src/ucm/event/event.c
index f572dae6bbe..0c0fee9bf0b 100644
--- a/src/ucm/event/event.c
+++ b/src/ucm/event/event.c
@@ -13,6 +13,9 @@
 #include 
 #include 
 #include 
+#if HAVE_CUDA
+#include <ucm/cuda/cudamem.h>
+#endif
 #include 
 #include 
 #include 
@@ -89,6 +92,13 @@ static void ucm_event_call_orig(ucm_event_type_t event_type, ucm_event_t *event,
             event->sbrk.result = ucm_orig_sbrk(event->sbrk.increment);
         }
         break;
+#if HAVE_CUDA
+    case UCM_EVENT_CUDAFREE:
+        if (event->cudaFree.result == -1) {
+            event->cudaFree.result = ucm_orig_cudaFree(event->cudaFree.address);
+        }
+        break;
+#endif
     default:
         ucm_warn("Got unknown event %d", event_type);
         break;
@@ -102,7 +112,7 @@ static void ucm_event_call_orig(ucm_event_type_t event_type, ucm_event_t *event,
 static ucm_event_handler_t ucm_event_orig_handler = {
     .list     = UCS_LIST_INITIALIZER(&ucm_event_handlers, &ucm_event_handlers),
     .events   = UCM_EVENT_MMAP  | UCM_EVENT_MUNMAP | UCM_EVENT_MREMAP |
-                UCM_EVENT_SHMAT | UCM_EVENT_SHMDT  | UCM_EVENT_SBRK, /* All events */
+                UCM_EVENT_SHMAT | UCM_EVENT_SHMDT  | UCM_EVENT_SBRK  |
+                UCM_EVENT_CUDAFREE, /* All events */
     .priority = 0,  /* Between negative and positive handlers */
     .cb       = ucm_event_call_orig
 };
@@ -334,6 +344,27 @@ void *ucm_sbrk(intptr_t increment)
     return event.sbrk.result;
 }
 
+#if HAVE_CUDA
+cudaError_t ucm_cudaFree(void *addr)
+{
+    ucm_event_t event;
+
+    ucm_event_enter();
+
+    ucm_trace("ucm_cudaFree(addr=%p)", addr);
+
+    ucm_dispatch_vm_munmap(addr, 0);
+
+    event.cudaFree.result  = -1;
+    event.cudaFree.address = addr;
+    ucm_event_dispatch(UCM_EVENT_CUDAFREE, &event);
+
+    ucm_event_leave();
+
+    return event.cudaFree.result;
+}
+#endif
+
 void ucm_event_handler_add(ucm_event_handler_t *handler)
 {
     ucm_event_handler_t *elem;
@@ -390,6 +421,19 @@ static ucs_status_t ucm_event_install(int events)
     }
     ucm_debug("malloc hooks are ready");
 
+#if HAVE_CUDA
+    if (events & UCM_EVENT_VM_UNMAPPED) {
+        status = ucm_cudamem_install(UCM_EVENT_CUDAFREE);
+        if (status != UCS_OK) {
+            ucm_debug("failed to install cudamem events");
+            goto out_unlock;
+        }
+        ucm_debug("cudaFree hooks are ready");
+    }
+#endif
+
     status = UCS_OK;
 
 out_unlock:
diff --git a/src/ucm/util/ucm_config.c b/src/ucm/util/ucm_config.c
index 2d9d2b43301..fd2081ff37d 100644
--- a/src/ucm/util/ucm_config.c
+++ b/src/ucm/util/ucm_config.c
@@ -20,6 +20,7 @@
 #define UCM_EN_MMAP_RELOC_VAR    "MMAP_RELOC"
 #define UCM_EN_MALLOC_HOOKS_VAR  "MALLOC_HOOKS"
 #define UCM_EN_MALLOC_RELOC_VAR  "MALLOC_RELOC"
+#define UCM_EN_CUDA_HOOKS_VAR    "CUDA_HOOKS"
 
 
 ucm_config_t ucm_global_config = {
@@ -28,7 +29,10 @@ ucm_config_t ucm_global_config = {
     .enable_events       = 1,
     .enable_mmap_reloc   = 1,
     .enable_malloc_hooks = 1,
-    .enable_malloc_reloc = 0
+    .enable_malloc_reloc = 0,
+#if HAVE_CUDA
+    .enable_cuda_hooks   = 1
+#endif
 };
 
 static const char *ucm_config_bool_to_string(int value)
@@ -107,6 +111,10 @@ void ucm_config_print(FILE *stream, ucs_config_print_flags_t print_flags)
             print_flags);
     fprintf(stream, "%s%s=%s\n", UCM_ENV_PREFIX,
             UCM_EN_MALLOC_RELOC_VAR,
             ucm_config_bool_to_string(ucm_global_config.enable_malloc_reloc));
+#if HAVE_CUDA
+    fprintf(stream, "%s%s=%s\n", UCM_ENV_PREFIX, UCM_EN_CUDA_HOOKS_VAR,
+            ucm_config_bool_to_string(ucm_global_config.enable_cuda_hooks));
+#endif
 }
 
 static void ucm_config_set_value_table(const char *str_value, const char **table,
@@ -157,6 +165,10 @@ ucs_status_t ucm_config_modify(const char *name, const char *value)
         ucm_config_set_value_bool(value, &ucm_global_config.enable_malloc_hooks);
     } else if (!strcmp(name, UCM_EN_MALLOC_RELOC_VAR)) {
         ucm_config_set_value_bool(value, &ucm_global_config.enable_malloc_reloc);
+#if HAVE_CUDA
+    } else if (!strcmp(name, UCM_EN_CUDA_HOOKS_VAR)) {
+        ucm_config_set_value_bool(value, &ucm_global_config.enable_cuda_hooks);
+#endif
     } else {
         return UCS_ERR_INVALID_PARAM;
     }
diff --git a/src/ucm/util/ucm_config.h b/src/ucm/util/ucm_config.h
index 317608a57c6..bc42a600b12 100644
--- a/src/ucm/util/ucm_config.h
+++ b/src/ucm/util/ucm_config.h
@@ -18,6 +18,9 @@ typedef struct ucm_config {
     int            enable_mmap_reloc;
     int            enable_malloc_hooks;
     int            enable_malloc_reloc;
+#if HAVE_CUDA
+    int            enable_cuda_hooks;
+#endif
     size_t         alloc_alignment;
 } ucm_config_t;
diff --git a/src/ucp/api/ucp_def.h b/src/ucp/api/ucp_def.h
index d094541e5e4..84cd2e83382 100644
--- a/src/ucp/api/ucp_def.h
+++ b/src/ucp/api/ucp_def.h
@@ -141,6 +141,13 @@ typedef struct ucp_rkey       *ucp_rkey_h;
  */
 typedef struct ucp_mem        *ucp_mem_h;
 
+/**
+ * @ingroup UCP_ADDR_DN
+ * @brief UCP Address Domain
+ *
+ * An address domain handle is an opaque object representing a memory address
+ * domain.
+ */
+typedef struct ucp_addr_dn    *ucp_addr_dn_h;
 
 /**
  * @ingroup UCP_WORKER
diff --git a/src/ucp/core/ucp_context.c b/src/ucp/core/ucp_context.c
index a10dd627e38..b575e007cb4 100644
--- a/src/ucp/core/ucp_context.c
+++ b/src/ucp/core/ucp_context.c
@@ -150,6 +150,10 @@ static ucs_config_field_t ucp_config_table[] = {
   "Also the value has to be bigger than UCX_TM_THRESH to take an effect."
   , ucs_offsetof(ucp_config_t, ctx.tm_max_bcopy), UCS_CONFIG_TYPE_MEMUNITS},
 
+  {"RNDV_FRAG_SIZE", "65536",
+   "RNDV fragment size",
+   ucs_offsetof(ucp_config_t, ctx.rndv_frag_size), UCS_CONFIG_TYPE_MEMUNITS},
+
   {NULL}
 };
diff --git a/src/ucp/core/ucp_context.h b/src/ucp/core/ucp_context.h
index 827f486ef87..4e848c2c618 100644
--- a/src/ucp/core/ucp_context.h
+++ b/src/ucp/core/ucp_context.h
@@ -51,6 +51,8 @@ typedef struct ucp_context_config {
     ucp_atomic_mode_t atomic_mode;
     /** If use mutex for MT support or not */
     int               use_mt_mutex;
+    /** RNDV pipeline fragment size */
+    size_t            rndv_frag_size;
     /** On-demand progress */
     int               adaptive_progress;
 } ucp_context_config_t;
diff --git a/src/ucp/core/ucp_ep.c b/src/ucp/core/ucp_ep.c
index b8c350d1fec..d6c8f9a6d24 100644
--- a/src/ucp/core/ucp_ep.c
+++ b/src/ucp/core/ucp_ep.c
@@ -885,6 +885,7 @@ void ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config)
 {
     ucp_context_h context = worker->context;
     ucp_ep_rma_config_t *rma_config;
+    ucp_ep_addr_domain_config_t *domain_config;
     uct_iface_attr_t *iface_attr;
     uct_md_attr_t *md_attr;
     ucp_rsc_index_t rsc_index;
@@ -903,6 +904,7 @@ void ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config)
     config->tag.eager.zcopy_auto_thresh = 0;
     config->am.zcopy_auto_thresh        = 0;
     config->p2p_lanes                   = 0;
+    config->domain_lanes                = 0;
     config->bcopy_thresh                = context->config.ext.bcopy_thresh;
     config->tag.lane                    = UCP_NULL_LANE;
     config->tag.proto                   = &ucp_tag_eager_proto;
@@ -990,6 +992,23 @@ void ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config)
         }
     }
 
+    /* Configuration for memory domains */
+    for (lane = 0; lane < config->key.num_lanes; ++lane) {
+        if (config->key.domain_lanes[lane] == UCP_NULL_LANE) {
+            continue;
+        }
+        config->domain_lanes |= UCS_BIT(lane);
+
+        domain_config = &config->domain[lane];
+        rsc_index     = config->key.lanes[lane].rsc_index;
+        iface_attr    = &worker->ifaces[rsc_index].attr;
+
+        domain_config->tag.eager.max_short = iface_attr->cap.am.max_short;
+        /* TODO: the zcopy threshold should be based on the ep AM lane
+         * capability with the domain address (i.e. can UCT do zcopy from
+         * the domain) */
+        memset(domain_config->tag.eager.zcopy_thresh, 0,
+               UCP_MAX_IOV * sizeof(size_t));
+    }
+
     /* Configuration for remote memory access */
     for (lane = 0; lane < config->key.num_lanes; ++lane) {
         if (ucp_ep_config_get_rma_prio(config->key.rma_lanes, lane) == -1) {
diff --git a/src/ucp/core/ucp_ep.h b/src/ucp/core/ucp_ep.h
index f6fd896ee1b..9033846af63 100644
--- a/src/ucp/core/ucp_ep.h
+++ b/src/ucp/core/ucp_ep.h
@@ -81,6 +81,9 @@ typedef struct ucp_ep_config_key {
     /* Lanes for atomic operations, sorted by priority, highest first */
     ucp_lane_index_t       amo_lanes[UCP_MAX_LANES];
 
+    /* Lanes for domain operations, sorted by priority, highest first */
+    ucp_lane_index_t       domain_lanes[UCP_MAX_LANES];
+
     /* Bitmap of remote mds which are reachable from this endpoint (with any set
      * of transports which could be selected in the future).
      */
@@ -106,6 +109,17 @@ typedef struct ucp_ep_rma_config {
 } ucp_ep_rma_config_t;
 
 
+#define UCP_IS_DEFAULT_ADDR_DOMAIN(_addr_dn_h) \
+    ((_addr_dn_h) == &ucp_addr_dn_dummy_handle)
+
+typedef struct ucp_ep_addr_domain_config {
+    struct {
+        struct {
+            ssize_t max_short;
+            size_t  zcopy_thresh[UCP_MAX_IOV];
+        } eager;
+    } tag;
+} ucp_ep_addr_domain_config_t;
+
 /*
  * Configuration for AM and tag offload protocols
  */
@@ -136,6 +150,10 @@ typedef struct ucp_ep_config {
      */
     ucp_lane_map_t         p2p_lanes;
 
+    /* Bitmap of which lanes are domain lanes */
+    ucp_lane_map_t         domain_lanes;
+
     /* Configuration for each lane that provides RMA */
     ucp_ep_rma_config_t    rma[UCP_MAX_LANES];
     /* Threshold for switching from put_short to put_bcopy */
@@ -179,8 +197,11 @@ typedef struct ucp_ep_config {
          * (currently it's only AM based). */
         const ucp_proto_t   *proto;
     } stream;
-} ucp_ep_config_t;
 
+    /* Configuration of all domains */
+    ucp_ep_addr_domain_config_t domain[UCP_MAX_LANES];
+
+} ucp_ep_config_t;
 
 /**
  * Remote protocol layer endpoint
@@ -245,4 +266,8 @@ size_t ucp_ep_config_get_zcopy_auto_thresh(size_t iovcnt,
                                            const ucp_context_h context,
                                            double bandwidth);
 
+ucp_lane_index_t ucp_config_find_domain_lane(const ucp_ep_config_t *config,
+                                             const ucp_lane_index_t *lanes,
+                                             ucp_md_map_t dn_md_map);
+ucs_status_t ucp_ep_set_domain_lanes(ucp_ep_h ep, ucp_addr_dn_h addr_dn_h);
 #endif
diff --git a/src/ucp/core/ucp_mm.c b/src/ucp/core/ucp_mm.c
index 9c89adec845..58b26db7cee 100644
--- a/src/ucp/core/ucp_mm.c
+++ b/src/ucp/core/ucp_mm.c
@@ -24,6 +24,12 @@ static ucp_mem_t ucp_mem_dummy_handle = {
     .md_map  = 0
 };
 
+ucp_addr_dn_t ucp_addr_dn_dummy_handle = {
+    .md_map = 0,
+    .id     = UCT_MD_ADDR_DOMAIN_LAST
+};
+
+
 /**
  * Unregister memory from all memory domains.
  * Save in *alloc_md_memh_p the memory handle of the allocating MD, if such exists.
@@ -106,6 +112,40 @@ static ucs_status_t ucp_memh_reg_mds(ucp_context_h context, ucp_mem_h memh,
     return UCS_OK;
 }
 
+ucs_status_t ucp_addr_domain_detect_mds(ucp_context_h context, void *addr,
+                                        ucp_addr_dn_h *addr_dn_h)
+{
+    ucs_status_t status;
+    unsigned md_index;
+    uct_addr_domain_t domain_id = UCT_MD_ADDR_DOMAIN_DEFAULT;
+
+    *addr_dn_h = &ucp_addr_dn_dummy_handle;
+
+    /* TODO: return early if no MD supports address domain detection */
+
+    for (md_index = 0; md_index < context->num_mds; ++md_index) {
+        if (context->tl_mds[md_index].attr.cap.flags & UCT_MD_FLAG_ADDR_DN) {
+            if (domain_id == UCT_MD_ADDR_DOMAIN_DEFAULT) {
+                status = uct_md_mem_detect(context->tl_mds[md_index].md, addr);
+                if (status == UCS_OK) {
+                    domain_id = context->tl_mds[md_index].attr.cap.addr_dn;
+
+                    *addr_dn_h = ucs_malloc(sizeof(ucp_addr_dn_t), "ucp_addr_dn_h");
+                    if (*addr_dn_h == NULL) {
+                        return UCS_ERR_NO_MEMORY;
+                    }
+
+                    (*addr_dn_h)->id     = domain_id;
+                    (*addr_dn_h)->md_map = UCS_BIT(md_index);
+                }
+            } else {
+                if (domain_id == context->tl_mds[md_index].attr.cap.addr_dn) {
+                    (*addr_dn_h)->md_map |= UCS_BIT(md_index);
+                }
+            }
+        }
+    }
+    return UCS_OK;
+}
 
 /**
  * @return Whether MD number 'md_index' is selected by the configuration as part
  *         of allocation method number 'config_method_index'.
diff --git a/src/ucp/core/ucp_mm.h b/src/ucp/core/ucp_mm.h
index 3ddfbbfc842..7877c9da03f 100644
--- a/src/ucp/core/ucp_mm.h
+++ b/src/ucp/core/ucp_mm.h
@@ -64,6 +64,16 @@ typedef struct ucp_mem_desc {
 } ucp_mem_desc_t;
 
 
+/**
+ * Memory address domain descriptor.
+ * Contains domain information for the memory address it belongs to.
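+ *
+ * md_map holds one bit per MD that claims the address: the first MD whose
+ * uct_md_mem_detect() succeeds, plus any later MD reporting the same domain
+ * id. eager_lane is filled in by ucp_ep_set_domain_lanes() with a lane
+ * whose interface can PUT-zcopy into this domain.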
+ */
+typedef struct ucp_addr_dn {
+    ucp_md_map_t       md_map;     /* Bitmap of MDs that own this address domain */
+    uct_addr_domain_t  id;         /* Address domain id */
+    ucp_lane_index_t   eager_lane; /* Lane used for eager sends in this domain */
+} ucp_addr_dn_t;
+
 void ucp_rkey_resolve_inner(ucp_rkey_h rkey, ucp_ep_h ep);
 
 ucs_status_t ucp_mpool_malloc(ucs_mpool_t *mp, size_t *size_p, void **chunk_p);
@@ -72,6 +82,16 @@ void ucp_mpool_free(ucs_mpool_t *mp, void *chunk);
 
 void ucp_mpool_obj_init(ucs_mpool_t *mp, void *obj, void *chunk);
 
+/**
+ * Detect the address domain of 'addr' across all MDs. Detection is skipped
+ * on subsequent MDs once one MD has successfully detected the domain.
+ */
+ucs_status_t ucp_addr_domain_detect_mds(ucp_context_h context, void *addr,
+                                        ucp_addr_dn_h *addr_dn_h);
+
+
+extern ucp_addr_dn_t ucp_addr_dn_dummy_handle;
+
 static UCS_F_ALWAYS_INLINE uct_mem_h
 ucp_memh2uct(ucp_mem_h memh, ucp_md_index_t md_idx)
 {
diff --git a/src/ucp/core/ucp_request.h b/src/ucp/core/ucp_request.h
index 52c92dbddc2..3ca2b5c492e 100644
--- a/src/ucp/core/ucp_request.h
+++ b/src/ucp/core/ucp_request.h
@@ -78,6 +78,7 @@ typedef void (*ucp_request_callback_t)(ucp_request_t *req);
 struct ucp_request {
     ucs_status_t            status;    /* Operation status */
     uint16_t                flags;     /* Request flags */
+    ucp_addr_dn_h           addr_dn_h; /* Memory domain handle */
 
     union {
         struct {
diff --git a/src/ucp/core/ucp_worker.c b/src/ucp/core/ucp_worker.c
index 4f3270ae5d6..f5e1cf5f730 100644
--- a/src/ucp/core/ucp_worker.c
+++ b/src/ucp/core/ucp_worker.c
@@ -36,6 +36,88 @@ static ucs_stats_class_t ucp_worker_stats_class = {
 #endif
 
 
+static ucs_status_t ucp_mpool_dereg_mds(ucp_context_h context, ucp_mem_h memh)
+{
+    unsigned md_index, uct_index;
+    ucs_status_t status;
+
+    uct_index = 0;
+
+    for (md_index = 0; md_index < context->num_mds; ++md_index) {
+        if (!(memh->md_map & UCS_BIT(md_index))) {
+            continue;
+        }
+
+        status = uct_md_mem_dereg(context->tl_mds[md_index].md,
+                                  memh->uct[uct_index]);
+        if (status != UCS_OK) {
+            ucs_error("Failed to dereg address %p with md %s", memh->address,
+                      context->tl_mds[md_index].rsc.md_name);
+            return status;
+        }
+
+        ++uct_index;
+    }
+
+    return UCS_OK;
+}
+
+static ucs_status_t ucp_mpool_reg_mds(ucp_context_h context, ucp_mem_h memh)
+{
+    unsigned md_index, uct_memh_count;
+    ucs_status_t status;
+
+    uct_memh_count = 0;
+    memh->md_map   = 0;
+
+    for (md_index = 0; md_index < context->num_mds; ++md_index) {
+        if (context->tl_mds[md_index].attr.cap.flags & UCT_MD_FLAG_REG) {
+            status = uct_md_mem_reg(context->tl_mds[md_index].md,
+                                    memh->address, memh->length, 0,
+                                    &memh->uct[uct_memh_count]);
+            if (status != UCS_OK) {
+                ucs_error("Failed to register memory pool chunk %p with md %s",
+                          memh->address, context->tl_mds[md_index].rsc.md_name);
+                return status;
+            }
+
+            memh->md_map |= UCS_BIT(md_index);
+            uct_memh_count++;
+        }
+    }
+
+    return UCS_OK;
+}
+
+
+static ucs_status_t ucp_mpool_rndv_malloc(ucs_mpool_t *mp, size_t *size_p,
+                                          void **chunk_p)
+{
+    ucp_worker_h worker = ucs_container_of(mp, ucp_worker_t, rndv_frag_mp);
+    ucp_mem_desc_t *chunk_hdr;
+    ucs_status_t status;
+
+    status = ucp_mpool_malloc(mp, size_p, chunk_p);
+    if (status != UCS_OK) {
+        ucs_error("Failed to allocate memory pool chunk: %s",
+                  ucs_status_string(status));
+        return UCS_ERR_NO_MEMORY;
+    }
+
+    chunk_hdr = (ucp_mem_desc_t*)(*chunk_p) - 1;
+
+    status = ucp_mpool_reg_mds(worker->context, chunk_hdr->memh);
+    if (status != UCS_OK) {
+        ucp_mpool_dereg_mds(worker->context, chunk_hdr->memh);
+        return status;
+    }
+
+    return UCS_OK;
+}
+
+
+static void ucp_mpool_rndv_free(ucs_mpool_t *mp, void *chunk)
+{
+    ucp_worker_h worker =
+        ucs_container_of(mp, ucp_worker_t, rndv_frag_mp);
+    ucp_mem_desc_t *chunk_hdr = (ucp_mem_desc_t*)chunk - 1;
+
+    ucp_mpool_dereg_mds(worker->context, chunk_hdr->memh);
+    ucp_mpool_free(mp, chunk);
+}
+
+
 ucs_mpool_ops_t ucp_am_mpool_ops = {
     .chunk_alloc   = ucs_mpool_hugetlb_malloc,
     .chunk_release = ucs_mpool_hugetlb_free,
@@ -52,6 +134,14 @@ ucs_mpool_ops_t ucp_reg_mpool_ops = {
 };
 
 
+ucs_mpool_ops_t ucp_rndv_frag_mpool_ops = {
+    .chunk_alloc   = ucp_mpool_rndv_malloc,
+    .chunk_release = ucp_mpool_rndv_free,
+    .obj_init      = ucs_empty_function,
+    .obj_cleanup   = ucs_empty_function
+};
+
+
 void ucp_worker_iface_check_events(ucp_worker_iface_t *wiface, int force);
 
@@ -909,8 +999,19 @@ static ucs_status_t ucp_worker_init_mpools(ucp_worker_h worker,
         goto err_release_am_mpool;
     }
 
+
+    status = ucs_mpool_init(&worker->rndv_frag_mp, 0,
+                            context->config.ext.rndv_frag_size,
+                            0, 128, 128, UINT_MAX,
+                            &ucp_rndv_frag_mpool_ops, "ucp_rndv_frags");
+    if (status != UCS_OK) {
+        goto err_release_reg_mpool;
+    }
+
     return UCS_OK;
 
+err_release_reg_mpool:
+    ucs_mpool_cleanup(&worker->reg_mp, 0);
 err_release_am_mpool:
     ucs_mpool_cleanup(&worker->am_mp, 0);
 out:
@@ -1120,6 +1221,7 @@ void ucp_worker_destroy(ucp_worker_h worker)
     ucp_worker_destroy_eps(worker);
     ucs_mpool_cleanup(&worker->am_mp, 1);
     ucs_mpool_cleanup(&worker->reg_mp, 1);
+    ucs_mpool_cleanup(&worker->rndv_frag_mp, 1);
     ucp_worker_close_ifaces(worker);
     ucp_worker_wakeup_cleanup(worker);
     ucs_mpool_cleanup(&worker->req_mp, 1);
diff --git a/src/ucp/core/ucp_worker.h b/src/ucp/core/ucp_worker.h
index 916755910a5..202e5ffcd99 100644
--- a/src/ucp/core/ucp_worker.h
+++ b/src/ucp/core/ucp_worker.h
@@ -147,6 +147,7 @@ typedef struct ucp_worker {
     ucs_mpool_t          am_mp;        /* Memory pool for AM receives */
     ucs_mpool_t          reg_mp;       /* Registered memory pool */
     ucp_mt_lock_t        mt_lock;      /* Configuration of multi-threading support */
+    ucs_mpool_t          rndv_frag_mp; /* Memory pool for RNDV fragments */
 
     UCS_STATS_NODE_DECLARE(stats);
diff --git a/src/ucp/dt/dt.c b/src/ucp/dt/dt.c
index 418b9775446..bfa4fd4a020 100644
--- a/src/ucp/dt/dt.c
+++ b/src/ucp/dt/dt.c
@@ -5,6 +5,7 @@
  */
 
 #include "dt.h"
+#include 
 
 size_t ucp_dt_pack(ucp_datatype_t datatype, void *dest, const void *src,
@@ -44,3 +45,123 @@ size_t ucp_dt_pack(ucp_datatype_t datatype, void *dest, const void *src,
     state->offset += result_len;
     return result_len;
 }
+
+static UCS_F_ALWAYS_INLINE ucs_status_t
+ucp_dn_dt_unpack(ucp_request_t *req, void *buffer, size_t buffer_size,
+                 const void *recv_data, size_t recv_length)
+{
+    ucs_status_t status;
+    ucp_worker_h worker     = req->recv.worker;
+    ucp_context_h context   = worker->context;
+    ucp_ep_h ep             = ucp_worker_ep_find(worker, worker->uuid);
+    ucp_ep_config_t *config = ucp_ep_config(ep);
+    ucp_md_map_t dn_md_map  = req->addr_dn_h->md_map;
+    ucp_lane_index_t dn_lane;
+    ucp_rsc_index_t rsc_index;
+    uct_iface_attr_t *iface_attr;
+    unsigned md_index;
+    uct_mem_h memh;
+    uct_iov_t iov;
+
+    if (recv_length == 0) {
+        return UCS_OK;
+    }
+
+    while (1) {
+        dn_lane = ucp_config_find_domain_lane(config, config->key.domain_lanes,
+                                              dn_md_map);
+        if (dn_lane == UCP_NULL_LANE) {
+            ucs_error("Cannot find an address domain lane.");
+            return UCS_ERR_IO_ERROR;
+        }
+        rsc_index  = ucp_ep_get_rsc_index(ep, dn_lane);
+        iface_attr = &worker->ifaces[rsc_index].attr;
+        md_index   = config->key.lanes[dn_lane].dst_md_index;
+        if (!(iface_attr->cap.flags & UCT_IFACE_FLAG_PUT_ZCOPY)) {
+            dn_md_map &= ~UCS_BIT(md_index);
+            continue;
+        }
+        break;
+    }
+
+    status = uct_md_mem_reg(context->tl_mds[md_index].md, buffer, buffer_size,
+                            UCT_MD_MEM_ACCESS_REMOTE_PUT, &memh);
+    if (status != UCS_OK) {
+        ucs_error("Failed to reg address %p with md %s", buffer,
+                  context->tl_mds[md_index].rsc.md_name);
+        return status;
+    }
+
+    ucs_assert(buffer_size >= recv_length);
+    iov.buffer = (void*)recv_data;
+    iov.length = recv_length;
+    iov.count  = 1;
+    iov.memh   = UCT_MEM_HANDLE_NULL;
+
+    status = uct_ep_put_zcopy(ep->uct_eps[dn_lane], &iov, 1, (uint64_t)buffer,
+                              (uct_rkey_t)memh, NULL);
+    if (status != UCS_OK) {
+        uct_md_mem_dereg(context->tl_mds[md_index].md, memh);
+        ucs_error("Failed to perform uct_ep_put_zcopy to address %p", recv_data);
+        return status;
+    }
+
+    status = uct_md_mem_dereg(context->tl_mds[md_index].md, memh);
+    if (status != UCS_OK) {
+        ucs_error("Failed to dereg address %p with md %s", buffer,
+                  context->tl_mds[md_index].rsc.md_name);
+        return status;
+    }
+
+    return UCS_OK;
+}
+
+
+ucs_status_t ucp_dt_unpack(ucp_request_t *req, ucp_datatype_t datatype,
+                           void *buffer, size_t buffer_size,
+                           ucp_dt_state_t *state, const void *recv_data,
+                           size_t recv_length, int last)
+{
+    ucp_dt_generic_t *dt_gen;
+    size_t offset = state->offset;
+    ucs_status_t status;
+
+    if (ucs_unlikely((recv_length + offset) > buffer_size)) {
+        ucs_trace_req("message truncated: recv_length %zu offset %zu buffer_size %zu",
+                      recv_length, offset, buffer_size);
+        if (UCP_DT_IS_GENERIC(datatype) && last) {
+            ucp_dt_generic(datatype)->ops.finish(state->dt.generic.state);
+        }
+        return UCS_ERR_MESSAGE_TRUNCATED;
+    }
+
+    switch (datatype & UCP_DATATYPE_CLASS_MASK) {
+    case UCP_DATATYPE_CONTIG:
+        if (ucs_likely(UCP_IS_DEFAULT_ADDR_DOMAIN(req->addr_dn_h))) {
+            UCS_PROFILE_NAMED_CALL("memcpy_recv", memcpy, buffer + offset,
+                                   recv_data, recv_length);
+            return UCS_OK;
+        } else {
+            return ucp_dn_dt_unpack(req, buffer, buffer_size, recv_data,
+                                    recv_length);
+        }
+
+    case UCP_DATATYPE_IOV:
+        UCS_PROFILE_CALL(ucp_dt_iov_scatter, buffer, state->dt.iov.iovcnt,
+                         recv_data, recv_length, &state->dt.iov.iov_offset,
+                         &state->dt.iov.iovcnt_offset);
+        return UCS_OK;
+
+    case UCP_DATATYPE_GENERIC:
+        dt_gen = ucp_dt_generic(datatype);
+        status = UCS_PROFILE_NAMED_CALL("dt_unpack", dt_gen->ops.unpack,
+                                        state->dt.generic.state, offset,
+                                        recv_data, recv_length);
+        if (last) {
+            UCS_PROFILE_NAMED_CALL_VOID("dt_finish", dt_gen->ops.finish,
+                                        state->dt.generic.state);
+        }
+        return status;
+
+    default:
+        ucs_error("unexpected datatype=%lx", datatype);
+        return UCS_ERR_INVALID_PARAM;
+    }
+}
diff --git a/src/ucp/dt/dt.h b/src/ucp/dt/dt.h
index f35b03f75b3..82c83a84ce1 100644
--- a/src/ucp/dt/dt.h
+++ b/src/ucp/dt/dt.h
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 
 
 /**
@@ -72,51 +73,8 @@ size_t ucp_dt_length(ucp_datatype_t datatype, size_t count,
 size_t ucp_dt_pack(ucp_datatype_t datatype, void *dest, const void *src,
                    ucp_dt_state_t *state, size_t length);
 
-static UCS_F_ALWAYS_INLINE ucs_status_t
-ucp_dt_unpack(ucp_datatype_t datatype, void *buffer, size_t buffer_size,
-              ucp_dt_state_t *state, const void *recv_data,
-              size_t recv_length, int last)
-{
-    ucp_dt_generic_t *dt_gen;
-    size_t offset = state->offset;
-    ucs_status_t status;
-
-    if (ucs_unlikely((recv_length + offset) > buffer_size)) {
-        ucs_trace_req("message truncated: recv_length %zu offset %zu buffer_size %zu",
-                      recv_length, offset, buffer_size);
-        if (UCP_DT_IS_GENERIC(datatype) && last) {
-            ucp_dt_generic(datatype)->ops.finish(state->dt.generic.state);
-        }
-        return UCS_ERR_MESSAGE_TRUNCATED;
-    }
-
-    switch (datatype & UCP_DATATYPE_CLASS_MASK) {
-    case UCP_DATATYPE_CONTIG:
-        UCS_PROFILE_NAMED_CALL("memcpy_recv", memcpy, buffer + offset,
-                               recv_data, recv_length);
-        return UCS_OK;
-
-    case UCP_DATATYPE_IOV:
-        UCS_PROFILE_CALL(ucp_dt_iov_scatter, buffer, state->dt.iov.iovcnt,
-                         recv_data, recv_length, &state->dt.iov.iov_offset,
-                         &state->dt.iov.iovcnt_offset);
-        return UCS_OK;
-
-    case UCP_DATATYPE_GENERIC:
-        dt_gen = ucp_dt_generic(datatype);
-        status = UCS_PROFILE_NAMED_CALL("dt_unpack", dt_gen->ops.unpack,
-                                        state->dt.generic.state, offset,
-                                        recv_data, recv_length);
-        if (last) {
-            UCS_PROFILE_NAMED_CALL_VOID("dt_finish", dt_gen->ops.finish,
-                                        state->dt.generic.state);
-        }
-        return status;
-
-    default:
-        ucs_error("unexpected datatype=%lx", datatype);
-        return UCS_ERR_INVALID_PARAM;
-    }
-}
+ucs_status_t ucp_dt_unpack(ucp_request_t *req, ucp_datatype_t datatype,
+                           void *buffer, size_t buffer_size, ucp_dt_state_t *state,
+                           const void *recv_data, size_t recv_length, int last);
 
 #endif
diff --git a/src/ucp/tag/eager.h b/src/ucp/tag/eager.h
index 94202c2f477..2e1580cc326 100644
--- a/src/ucp/tag/eager.h
+++ b/src/ucp/tag/eager.h
@@ -101,7 +101,7 @@ static UCS_F_ALWAYS_INLINE ucs_status_t
 ucp_eager_unexp_match(ucp_worker_h worker, ucp_recv_desc_t *rdesc, ucp_tag_t tag,
                       unsigned flags, void *buffer, size_t count,
                       ucp_datatype_t datatype, ucp_dt_state_t *state,
-                      ucp_tag_recv_info_t *info)
+                      ucp_request_t *req, ucp_tag_recv_info_t *info)
 {
     size_t recv_len, hdr_len;
     ucs_status_t status;
@@ -110,7 +110,7 @@ ucp_eager_unexp_match(ucp_worker_h worker, ucp_recv_desc_t *rdesc, ucp_tag_t tag
     UCP_WORKER_STAT_EAGER_CHUNK(worker, UNEXP);
     hdr_len  = rdesc->hdr_len;
     recv_len = rdesc->length - hdr_len;
-    status   = ucp_dt_unpack(datatype, buffer, count, state, data + hdr_len,
+    status   = ucp_dt_unpack(req, datatype, buffer, count, state, data + hdr_len,
                              recv_len, flags & UCP_RECV_DESC_FLAG_LAST);
     state->offset += recv_len;
diff --git a/src/ucp/tag/eager_rcv.c b/src/ucp/tag/eager_rcv.c
index 742c43e18a0..1a2138aee4c 100644
--- a/src/ucp/tag/eager_rcv.c
+++ b/src/ucp/tag/eager_rcv.c
@@ -71,7 +71,7 @@ ucp_eager_handler(void *arg, void *data, size_t length, unsigned am_flags,
 
     if (req != NULL) {
         UCS_PROFILE_REQUEST_EVENT(req, "eager_recv", recv_len);
-        status = ucp_dt_unpack(req->recv.datatype, req->recv.buffer,
+        status = ucp_dt_unpack(req, req->recv.datatype, req->recv.buffer,
                                req->recv.length, &req->recv.state,
                                data + hdr_len, recv_len,
                                flags & UCP_RECV_DESC_FLAG_LAST);
diff --git a/src/ucp/tag/offload.c b/src/ucp/tag/offload.c
index 91142df2d6d..550c4f30e1e 100644
--- a/src/ucp/tag/offload.c
+++ b/src/ucp/tag/offload.c
@@ -64,7 +64,7 @@ void ucp_tag_offload_completed(uct_tag_context_t *self, uct_tag_t stag,
     }
 
     if (req->recv.rdesc != NULL) {
-        status = ucp_dt_unpack(req->recv.datatype, req->recv.buffer, req->recv.length,
+        status = ucp_dt_unpack(req, req->recv.datatype, req->recv.buffer, req->recv.length,
                                &req->recv.state, req->recv.rdesc + 1, length, 1);
         ucs_mpool_put_inline(req->recv.rdesc);
     } else {
diff --git a/src/ucp/tag/rndv.c b/src/ucp/tag/rndv.c
index 548d3c4c10a..05cad2ce386 100644
--- a/src/ucp/tag/rndv.c
+++ b/src/ucp/tag/rndv.c
@@ -731,7 +731,7 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_data_handler,
     }
 
     UCS_PROFILE_REQUEST_EVENT(rreq, "rndv_data_recv", recv_len);
-    status = ucp_dt_unpack(rreq->recv.datatype, rreq->recv.buffer,
+    status = ucp_dt_unpack(rreq, rreq->recv.datatype, rreq->recv.buffer,
                            rreq->recv.length, &rreq->recv.state,
                            data + hdr_len, recv_len, 0);
     if ((status == UCS_OK) || (status == UCS_INPROGRESS)) {
@@ -764,9 +764,9 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_data_last_handler,
         /* Check that total received length matches RTS->length */
         ucs_assert(rreq->recv.info.length == rreq->recv.state.offset + recv_len);
         UCS_PROFILE_REQUEST_EVENT(rreq, "rndv_data_last_recv", recv_len);
-        status = ucp_dt_unpack(rreq->recv.datatype, rreq->recv.buffer,
-                               rreq->recv.length, &rreq->recv.state,
-                               data + hdr_len, recv_len, 1);
+        status = ucp_dt_unpack(rreq, rreq->recv.datatype, rreq->recv.buffer,
+                               rreq->recv.length, &rreq->recv.state,
+                               data + hdr_len, recv_len, 1);
     } else {
         ucs_trace_data("drop last segment for rreq %p, length %zu, status %s",
                        rreq, recv_len, ucs_status_string(rreq->status));
diff --git a/src/ucp/tag/tag_recv.c b/src/ucp/tag/tag_recv.c
index e9413eefd11..900dbc1cc3a 100644
--- a/src/ucp/tag/tag_recv.c
+++ b/src/ucp/tag/tag_recv.c
@@ -85,7 +85,7 @@ ucp_tag_search_unexp(ucp_worker_h worker, void *buffer, size_t buffer_size,
             UCS_PROFILE_REQUEST_EVENT(req, "eager_match", 0);
             status = ucp_eager_unexp_match(worker, rdesc, recv_tag, flags,
                                            buffer, buffer_size, datatype,
-                                           &req->recv.state, info);
+                                           &req->recv.state, req, info);
             ucs_trace_req("release receive descriptor %p", rdesc);
             if (status != UCS_INPROGRESS) {
                 goto out_release_desc;
@@ -128,6 +128,8 @@ ucp_tag_recv_request_init(ucp_request_t *req, ucp_worker_h worker, void* buffer,
     req->recv.state.offset = 0;
     req->recv.worker       = worker;
 
+    ucp_addr_domain_detect_mds(worker->context, buffer, &req->addr_dn_h);
+
     switch (datatype & UCP_DATATYPE_CLASS_MASK) {
     case UCP_DATATYPE_IOV:
         req->recv.state.dt.iov.iov_offset    = 0;
diff --git a/src/ucp/tag/tag_send.c b/src/ucp/tag/tag_send.c
index 9b7326e3d98..979b11e2ffe 100644
--- a/src/ucp/tag/tag_send.c
+++ b/src/ucp/tag/tag_send.c
@@ -202,7 +202,8 @@ ucp_tag_send_req(ucp_request_t *req, size_t count, ssize_t max_short,
 
 static void ucp_tag_send_req_init(ucp_request_t* req, ucp_ep_h ep,
                                   const void* buffer, uintptr_t datatype,
-                                  ucp_tag_t tag, uint16_t flags)
+                                  ucp_tag_t tag, uint16_t flags,
+                                  ucp_addr_dn_h addr_dn_h)
 {
     req->flags             = flags;
     req->send.ep           = ep;
@@ -211,6 +212,7 @@ static void ucp_tag_send_req_init(ucp_request_t* req, ucp_ep_h ep,
     req->send.tag          = tag;
     req->send.reg_rsc      = UCP_NULL_RESOURCE;
     req->send.state.offset = 0;
+    req->addr_dn_h         = addr_dn_h;
 
     VALGRIND_MAKE_MEM_UNDEFINED(&req->send.uct_comp.count,
                                 sizeof(req->send.uct_comp.count));
@@ -228,13 +230,20 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_send_nb,
     ucp_request_t *req;
     size_t length;
     ucs_status_ptr_t ret;
+    ucp_addr_dn_h addr_dn_h;
 
     UCP_THREAD_CS_ENTER_CONDITIONAL(&ep->worker->mt_lock);
 
     ucs_trace_req("send_nb buffer %p count %zu tag %"PRIx64" to %s cb %p",
                   buffer, count, tag, ucp_ep_peer_name(ep), cb);
 
-    if (ucs_likely(UCP_DT_IS_CONTIG(datatype))) {
+    ucp_addr_domain_detect_mds(ep->worker->context, (void*)buffer, &addr_dn_h);
+    if (ucs_likely(!UCP_IS_DEFAULT_ADDR_DOMAIN(addr_dn_h))) {
+        ucp_ep_set_domain_lanes(ep, addr_dn_h);
+    }
+
+    if (ucs_likely(UCP_IS_DEFAULT_ADDR_DOMAIN(addr_dn_h)) &&
+        ucs_likely(UCP_DT_IS_CONTIG(datatype))) {
         length = ucp_contig_dt_length(datatype, count);
         if (ucs_likely((ssize_t)length <= ucp_ep_config(ep)->tag.eager.max_short)) {
             status = UCS_PROFILE_CALL(ucp_tag_send_eager_short, ep, tag, buffer,
@@ -253,11 +262,15 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_send_nb,
         goto out;
     }
 
-    ucp_tag_send_req_init(req, ep, buffer, datatype, tag, 0);
+    ucp_tag_send_req_init(req, ep, buffer, datatype, tag, 0, addr_dn_h);
 
     ret = ucp_tag_send_req(req, count,
-                           ucp_ep_config(ep)->tag.eager.max_short,
-                           ucp_ep_config(ep)->tag.eager.zcopy_thresh,
+                           ucs_likely(UCP_IS_DEFAULT_ADDR_DOMAIN(addr_dn_h)) ?
+                           ucp_ep_config(ep)->tag.eager.max_short :
+                           ucp_ep_config(ep)->domain[addr_dn_h->eager_lane].tag.eager.max_short,
+                           ucs_likely(UCP_IS_DEFAULT_ADDR_DOMAIN(addr_dn_h)) ?
+                           ucp_ep_config(ep)->tag.eager.zcopy_thresh :
+                           ucp_ep_config(ep)->domain[addr_dn_h->eager_lane].tag.eager.zcopy_thresh,
                            ucp_ep_config(ep)->tag.rndv.rma_thresh,
                            ucp_ep_config(ep)->tag.rndv.am_thresh,
                            cb, ucp_ep_config(ep)->tag.proto);
@@ -293,7 +306,9 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_send_sync_nb,
     /* Remote side needs to send reply, so have it connect to us */
     ucp_ep_connect_remote(ep);
 
-    ucp_tag_send_req_init(req, ep, buffer, datatype, tag, UCP_REQUEST_FLAG_SYNC);
+
+    ucp_tag_send_req_init(req, ep, buffer, datatype, tag, UCP_REQUEST_FLAG_SYNC,
+                          &ucp_addr_dn_dummy_handle);
 
     ret = ucp_tag_send_req(req, count,
                            -1, /* disable short method */
diff --git a/src/ucp/wireup/address.c b/src/ucp/wireup/address.c
index 43cdaca9de3..fd0cb51e141 100644
--- a/src/ucp/wireup/address.c
+++ b/src/ucp/wireup/address.c
@@ -61,9 +61,11 @@ typedef struct {
 #define UCP_ADDRESS_FLAG_EMPTY      0x80   /* Device without TL addresses */
 #define UCP_ADDRESS_FLAG_MD_ALLOC   0x40   /* MD can register */
 #define UCP_ADDRESS_FLAG_MD_REG     0x20   /* MD can allocate */
+#define UCP_ADDRESS_FLAG_MD_DOMAIN  0x10   /* address domain MD */
 #define UCP_ADDRESS_FLAG_MD_MASK    ~(UCP_ADDRESS_FLAG_EMPTY | \
                                       UCP_ADDRESS_FLAG_MD_ALLOC | \
-                                      UCP_ADDRESS_FLAG_MD_REG)
+                                      UCP_ADDRESS_FLAG_MD_REG | \
+                                      UCP_ADDRESS_FLAG_MD_DOMAIN)
 
 static size_t ucp_address_string_packed_size(const char *s)
 {
@@ -325,7 +327,8 @@ static ucs_status_t ucp_address_do_pack(ucp_worker_h worker, ucp_ep_h ep,
         *(uint8_t*)ptr = md_index |
                          ((dev->tl_bitmap == 0) ? UCP_ADDRESS_FLAG_EMPTY : 0) |
                          ((md_flags & UCT_MD_FLAG_ALLOC) ? UCP_ADDRESS_FLAG_MD_ALLOC : 0) |
-                         ((md_flags & UCT_MD_FLAG_REG) ? UCP_ADDRESS_FLAG_MD_REG : 0);
+                         ((md_flags & UCT_MD_FLAG_REG) ? UCP_ADDRESS_FLAG_MD_REG : 0) |
+                         ((md_flags & UCT_MD_FLAG_ADDR_DN) ? UCP_ADDRESS_FLAG_MD_DOMAIN : 0);
         ++ptr;
 
         /* Device address length */
@@ -564,6 +567,7 @@ ucs_status_t ucp_address_unpack(const void *buffer, uint64_t *remote_uuid_p,
         md_index  = md_byte & UCP_ADDRESS_FLAG_MD_MASK;
         md_flags  = (md_byte & UCP_ADDRESS_FLAG_MD_ALLOC) ? UCT_MD_FLAG_ALLOC : 0;
         md_flags |= (md_byte & UCP_ADDRESS_FLAG_MD_REG) ? UCT_MD_FLAG_REG : 0;
+        md_flags |= (md_byte & UCP_ADDRESS_FLAG_MD_DOMAIN) ?
+                    UCT_MD_FLAG_ADDR_DN : 0;
         empty_dev = md_byte & UCP_ADDRESS_FLAG_EMPTY;
         ++ptr;
diff --git a/src/ucp/wireup/select.c b/src/ucp/wireup/select.c
index 1f6ae4457b1..7da6c632104 100644
--- a/src/ucp/wireup/select.c
+++ b/src/ucp/wireup/select.c
@@ -10,17 +10,19 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
 #define UCP_WIREUP_RNDV_TEST_MSG_SIZE 262144
 
 enum {
-    UCP_WIREUP_LANE_USAGE_AM   = UCS_BIT(0),
-    UCP_WIREUP_LANE_USAGE_RMA  = UCS_BIT(1),
-    UCP_WIREUP_LANE_USAGE_AMO  = UCS_BIT(2),
-    UCP_WIREUP_LANE_USAGE_RNDV = UCS_BIT(3),
-    UCP_WIREUP_LANE_USAGE_TAG  = UCS_BIT(4)
+    UCP_WIREUP_LANE_USAGE_AM     = UCS_BIT(0),
+    UCP_WIREUP_LANE_USAGE_RMA    = UCS_BIT(1),
+    UCP_WIREUP_LANE_USAGE_AMO    = UCS_BIT(2),
+    UCP_WIREUP_LANE_USAGE_RNDV   = UCS_BIT(3),
+    UCP_WIREUP_LANE_USAGE_TAG    = UCS_BIT(4),
+    UCP_WIREUP_LANE_USAGE_DOMAIN = UCS_BIT(5)
 };
 
 
@@ -32,10 +34,12 @@ typedef struct {
     uint32_t         usage;
     double           rma_score;
     double           amo_score;
+    double           domain_score;
 } ucp_wireup_lane_desc_t;
 
 
 static const char *ucp_wireup_md_flags[] = {
+    [ucs_ilog2(UCT_MD_FLAG_ADDR_DN)] = "memory address domain",
     [ucs_ilog2(UCT_MD_FLAG_ALLOC)]   = "memory allocation",
    [ucs_ilog2(UCT_MD_FLAG_REG)]     = "memory registration",
 };
@@ -361,6 +365,7 @@ ucp_wireup_add_lane_desc(ucp_wireup_lane_desc_t *lane_descs,
     lane_desc->usage        = usage;
     lane_desc->rma_score    = 0.0;
     lane_desc->amo_score    = 0.0;
+    lane_desc->domain_score = 0.0;
 
 out_update_score:
     if (usage & UCP_WIREUP_LANE_USAGE_RMA) {
@@ -369,6 +374,9 @@ ucp_wireup_add_lane_desc(ucp_wireup_lane_desc_t *lane_descs,
     if (usage & UCP_WIREUP_LANE_USAGE_AMO) {
         lane_desc->amo_score = score;
     }
+    if (usage & UCP_WIREUP_LANE_USAGE_DOMAIN) {
+        lane_desc->domain_score = score;
+    }
 }
 
 #define UCP_WIREUP_COMPARE_SCORE(_elem1, _elem2, _arg, _token) \
@@ -396,6 +404,12 @@ static int ucp_wireup_compare_lane_amo_score(const void *elem1, const void *elem2,
     return UCP_WIREUP_COMPARE_SCORE(elem1, elem2, arg, amo);
 }
 
+static int ucp_wireup_compare_lane_domain_score(const void *elem1, const void *elem2,
+                                                void *arg)
+{
+    return UCP_WIREUP_COMPARE_SCORE(elem1, elem2, arg, domain);
+}
+
 static UCS_F_NOINLINE ucs_status_t
 ucp_wireup_add_memaccess_lanes(ucp_ep_h ep, unsigned address_count,
                                const ucp_address_entry_t *address_list,
@@ -548,6 +562,163 @@ static ucs_status_t ucp_wireup_add_rma_lanes(ucp_ep_h ep, const ucp_ep_params_t
                                           -1, UCP_WIREUP_LANE_USAGE_RMA);
 }
 
+ucp_lane_index_t ucp_config_find_domain_lane(const ucp_ep_config_t *config,
+                                             const ucp_lane_index_t *lanes,
+                                             ucp_md_map_t dn_md_map)
+{
+    ucp_md_index_t dst_md_index;
+    ucp_lane_index_t lane;
+    ucp_md_map_t dst_md_mask;
+    int prio;
+
+    for (prio = 0; prio < UCP_MAX_LANES; ++prio) {
+        lane = lanes[prio];
+        if (lane == UCP_NULL_LANE) {
+            return UCP_NULL_LANE; /* No more lanes */
+        }
+
+        dst_md_index = config->key.lanes[lane].dst_md_index;
+        dst_md_mask  = UCS_BIT(dst_md_index);
+        if (dn_md_map & dst_md_mask) {
+            return lane;
+        }
+    }
+    return UCP_NULL_LANE;
+}
+
+ucs_status_t ucp_ep_set_domain_lanes(ucp_ep_h ep, ucp_addr_dn_h addr_dn_h)
+{
+    ucp_rsc_index_t rsc_index;
+    uct_iface_attr_t *iface_attr;
+    ucp_md_map_t dn_md_map;
+    ucp_lane_index_t dn_lane;
+    ucp_md_index_t md_index;
+
+    dn_md_map             = addr_dn_h->md_map;
+    addr_dn_h->eager_lane = UCP_NULL_LANE;
+
+    while (1) {
+        dn_lane = ucp_config_find_domain_lane(ucp_ep_config(ep),
+                                              ucp_ep_config(ep)->key.domain_lanes,
+                                              dn_md_map);
+        if (dn_lane == UCP_NULL_LANE) {
+            ucs_error("Cannot find an address domain lane.");
+            return UCS_ERR_IO_ERROR;
+        }
+        rsc_index  = ucp_ep_get_rsc_index(ep, dn_lane);
+        iface_attr = &ep->worker->ifaces[rsc_index].attr;
+        md_index   =
+                     ucp_ep_config(ep)->key.lanes[dn_lane].dst_md_index;
+        if (iface_attr->cap.flags & UCT_IFACE_FLAG_PUT_ZCOPY) {
+            addr_dn_h->eager_lane = dn_lane;
+        }
+        /* TODO: revisit cap flags for the rndv lane */
+        /* if (iface_attr->cap.flags & UCT_IFACE_FLAG_GET_ZCOPY) {
+         *     addr_dn_h->rndv_lane = dn_lane;
+         * } */
+        dn_md_map &= ~UCS_BIT(md_index);
+        if ((addr_dn_h->eager_lane != UCP_NULL_LANE) || (dn_md_map == 0)) {
+            break;
+        }
+    }
+
+    return UCS_OK;
+}
+
+
+double ucp_wireup_addr_domain_score_func(ucp_context_h context,
+                                         const uct_md_attr_t *md_attr,
+                                         const uct_iface_attr_t *iface_attr,
+                                         const ucp_address_iface_attr_t *remote_iface_attr)
+{
+    /* best end-to-end latency */
+    return (1e-3 / (ucp_wireup_tl_iface_latency(context, iface_attr, remote_iface_attr) +
+                    iface_attr->overhead + remote_iface_attr->overhead));
+}
+
+static UCS_F_NOINLINE ucs_status_t
+ucp_wireup_add_addr_domain_lanes(ucp_ep_h ep, unsigned address_count,
+                                 const ucp_address_entry_t *address_list,
+                                 ucp_wireup_lane_desc_t *lane_descs,
+                                 ucp_lane_index_t *num_lanes_p,
+                                 const ucp_wireup_criteria_t *criteria,
+                                 uint64_t tl_bitmap, uint32_t usage)
+{
+    ucp_address_entry_t *address_list_copy;
+    ucp_rsc_index_t rsc_index, dst_md_index;
+    size_t address_list_size;
+    double score;
+    uint64_t remote_md_map;
+    unsigned addr_index;
+    ucs_status_t status;
+
+    remote_md_map = -1;
+
+    /* Create a copy of the address list */
+    address_list_size = sizeof(*address_list_copy) * address_count;
+    address_list_copy = ucs_malloc(address_list_size, "address domain list");
+    if (address_list_copy == NULL) {
+        status = UCS_ERR_NO_MEMORY;
+        goto out;
+    }
+
+    memcpy(address_list_copy, address_list, address_list_size);
+
+    status = ucp_wireup_select_transport(ep, address_list_copy, address_count,
+                                         criteria, tl_bitmap, remote_md_map,
+                                         0, &rsc_index, &addr_index, &score);
+    if (status != UCS_OK) {
+        goto out_free_address_list;
+    }
+
+    dst_md_index = address_list_copy[addr_index].md_index;
+
+    /* Add to the list of lanes and remove all occurrences of the remote md
+     * from the address list, to avoid selecting the same remote md again. */
+    ucp_wireup_add_lane_desc(lane_descs, num_lanes_p, rsc_index, addr_index,
+                             dst_md_index, score, usage, 0);
+    remote_md_map &= ~UCS_BIT(dst_md_index);
+
+    while (address_count > 0) {
+        status = ucp_wireup_select_transport(ep, address_list_copy, address_count,
+                                             criteria, tl_bitmap, remote_md_map,
+                                             0, &rsc_index, &addr_index, &score);
+        if (status != UCS_OK) {
+            break;
+        }
+
+        /* Add lane description and remove all occurrences of the remote md */
+        dst_md_index = address_list_copy[addr_index].md_index;
+        ucp_wireup_add_lane_desc(lane_descs, num_lanes_p, rsc_index, addr_index,
+                                 dst_md_index, score, usage, 0);
+        remote_md_map &= ~UCS_BIT(dst_md_index);
+    }
+
+    status = UCS_OK;
+
+out_free_address_list:
+    ucs_free(address_list_copy);
+out:
+    return status;
+}
+
+static ucs_status_t ucp_wireup_add_domain_lane(ucp_ep_h ep, const ucp_ep_params_t *params,
+                                               unsigned address_count,
+                                               const ucp_address_entry_t *address_list,
+                                               ucp_wireup_lane_desc_t *lane_descs,
+                                               ucp_lane_index_t *num_lanes_p)
+{
+    ucp_wireup_criteria_t criteria;
+
+    criteria.title              = "address domain";
+    criteria.local_md_flags     = UCT_MD_FLAG_ADDR_DN;
+    criteria.remote_md_flags    = UCT_MD_FLAG_ADDR_DN;
+    criteria.remote_iface_flags = UCT_IFACE_FLAG_CONNECT_TO_IFACE;
+    criteria.local_iface_flags  = criteria.remote_iface_flags;
+    criteria.calc_score         = ucp_wireup_addr_domain_score_func;
+    ucp_wireup_fill_ep_params_criteria(&criteria, params);
+
+    return
+           ucp_wireup_add_addr_domain_lanes(ep, address_count, address_list,
+                                            lane_descs, num_lanes_p, &criteria,
+                                            -1, UCP_WIREUP_LANE_USAGE_DOMAIN);
+}
+
 double ucp_wireup_amo_score_func(ucp_context_h context,
                                  const uct_md_attr_t *md_attr,
                                  const uct_iface_attr_t *iface_attr,
@@ -899,6 +1070,12 @@ ucs_status_t ucp_wireup_select_lanes(ucp_ep_h ep, const ucp_ep_params_t *params,
         return status;
     }
 
+    status = ucp_wireup_add_domain_lane(ep, params, address_count, address_list,
+                                        lane_descs, &key->num_lanes);
+    if (status != UCS_OK) {
+        return status;
+    }
+
     /* User should not create endpoints unless requested communication features */
     if (key->num_lanes == 0) {
         ucs_error("No transports selected to %s (features: 0x%lx)",
@@ -936,6 +1113,9 @@ ucs_status_t ucp_wireup_select_lanes(ucp_ep_h ep, const ucp_ep_params_t *params,
             ucs_assert(key->tag_lane == UCP_NULL_LANE);
             key->tag_lane = lane;
         }
+        if (lane_descs[lane].usage & UCP_WIREUP_LANE_USAGE_DOMAIN) {
+            key->domain_lanes[lane] = lane;
+        }
     }
 
     /* Sort RMA and AMO lanes according to score */
     ucs_qsort_r(key->rma_lanes, UCP_MAX_LANES, sizeof(ucp_lane_index_t),
                 ucp_wireup_compare_lane_rma_score, lane_descs);
     ucs_qsort_r(key->amo_lanes, UCP_MAX_LANES, sizeof(ucp_lane_index_t),
                 ucp_wireup_compare_lane_amo_score, lane_descs);
+    ucs_qsort_r(key->domain_lanes, UCP_MAX_LANES, sizeof(ucp_lane_index_t),
+                ucp_wireup_compare_lane_domain_score, lane_descs);
+
 
     /* Get all reachable MDs from full remote address list */
     key->reachable_md_map = ucp_wireup_get_reachable_mds(worker, address_count,
diff --git a/src/uct/Makefile.am b/src/uct/Makefile.am
index 780b494a843..97318ecaab5 100644
--- a/src/uct/Makefile.am
+++ b/src/uct/Makefile.am
@@ -195,14 +195,27 @@ endif
 
 if HAVE_CUDA
 noinst_HEADERS += \
-	cuda/cuda_md.h \
-	cuda/cuda_iface.h \
-	cuda/cuda_ep.h
+	cuda/cuda_copy/cuda_copy_md.h \
+	cuda/cuda_copy/cuda_copy_iface.h \
+	cuda/cuda_copy/cuda_copy_ep.h
 
 libuct_la_SOURCES += \
-	cuda/cuda_md.c \
-	cuda/cuda_iface.c \
-	cuda/cuda_ep.c
+	cuda/cuda_copy/cuda_copy_md.c \
+	cuda/cuda_copy/cuda_copy_iface.c \
+	cuda/cuda_copy/cuda_copy_ep.c
+
+if HAVE_GDR_COPY
+noinst_HEADERS += \
+	cuda/gdr_copy/gdr_copy_md.h \
+	cuda/gdr_copy/gdr_copy_iface.h \
+	cuda/gdr_copy/gdr_copy_ep.h
+
+libuct_la_SOURCES += \
+	cuda/gdr_copy/gdr_copy_md.c \
+	cuda/gdr_copy/gdr_copy_iface.c \
+	cuda/gdr_copy/gdr_copy_ep.c
+endif
+
 endif
 
 if HAVE_ROCM
diff --git a/src/uct/api/uct.h b/src/uct/api/uct.h
index 131fcd13715..583d74851aa 100644
--- a/src/uct/api/uct.h
+++ b/src/uct/api/uct.h
@@ -377,11 +377,24 @@ enum {
     UCT_MD_FLAG_RKEY_PTR   = UCS_BIT(6),  /**< MD supports direct access to
                                                remote memory via a pointer that
                                                is returned by @ref uct_rkey_ptr */
-    UCT_MD_FLAG_SOCKADDR   = UCS_BIT(7)   /**< MD support for client-server
+    UCT_MD_FLAG_SOCKADDR   = UCS_BIT(7),  /**< MD support for client-server
                                                connection establishment via
                                                sockaddr */
+    UCT_MD_FLAG_ADDR_DN    = UCS_BIT(8)   /**< MD supports memory address
+                                               domain detection */
 };
 
+/**
+ * @ingroup UCT_MD
+ * @brief Memory address domains
+ */
+typedef enum {
+    UCT_MD_ADDR_DOMAIN_CUDA = 0,  /**< NVIDIA CUDA domain */
+    UCT_MD_ADDR_DOMAIN_DEFAULT,   /**< Default system domain */
+    UCT_MD_ADDR_DOMAIN_LAST = UCT_MD_ADDR_DOMAIN_DEFAULT
+
+} uct_addr_domain_t;
+
 
 /**
  * @ingroup UCT_MD
@@ -630,6 +643,7 @@ struct uct_md_attr {
         size_t               max_alloc; /**< Maximal allocation size */
         size_t               max_reg;   /**< Maximal registration size */
         uint64_t             flags;     /**< UCT_MD_FLAG_xx */
+        uct_addr_domain_t    addr_dn;   /**< Supported address domain */
     } cap;
 
     uct_linear_growth_t      reg_cost;
                                         /**< Memory registration cost estimation
 
@@ -1413,6 +1427,18 @@ ucs_status_t uct_md_mem_reg(uct_md_h md, void *address, size_t length,
 ucs_status_t uct_md_mem_dereg(uct_md_h md, uct_mem_h memh);
 
 
+/**
+ * @ingroup UCT_MD
+ * @brief Detect memory on the memory domain.
+ *
+ * Returns UCS_OK if the address belongs to the MD's address domain.
+ *
+ * @param [in] md    Memory domain to detect the memory on.
+ * @param [in] addr  Memory address to detect.
+ */
+ucs_status_t uct_md_mem_detect(uct_md_h md, void *addr);
+
+
 /**
  * @ingroup UCT_MD
  * @brief Allocate memory for zero-copy communications and remote access.
diff --git a/src/uct/base/uct_md.c b/src/uct/base/uct_md.c
index 7876eb602da..f2a1f740f80 100644
--- a/src/uct/base/uct_md.c
+++ b/src/uct/base/uct_md.c
@@ -543,3 +543,8 @@ int uct_md_is_sockaddr_accessible(uct_md_h md, const ucs_sock_addr_t *sockaddr,
 {
     return md->ops->is_sockaddr_accessible(md, sockaddr, mode);
 }
+
+ucs_status_t uct_md_mem_detect(uct_md_h md, void *addr)
+{
+    return md->ops->mem_detect(md, addr);
+}
diff --git a/src/uct/base/uct_md.h b/src/uct/base/uct_md.h
index 1cf33670017..c1160841591 100644
--- a/src/uct/base/uct_md.h
+++ b/src/uct/base/uct_md.h
@@ -136,6 +136,7 @@ struct uct_md_ops {
     int          (*is_sockaddr_accessible)(uct_md_h md, const ucs_sock_addr_t *sockaddr,
                                            uct_sockaddr_accessibility_t mode);
 
+    ucs_status_t (*mem_detect)(uct_md_h md, void *addr);
 };
diff --git a/src/uct/cuda/cuda_copy/cuda_copy_ep.c b/src/uct/cuda/cuda_copy/cuda_copy_ep.c
new file mode 100644
index 00000000000..035a4d2a6ff
--- /dev/null
+++ b/src/uct/cuda/cuda_copy/cuda_copy_ep.c
@@ -0,0 +1,62 @@
+/**
+ * Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED.
+ * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED.
+ * See file LICENSE for terms.
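+ *
+ * The cuda_copy endpoint implements GET-zcopy as an asynchronous
+ * device-to-host cudaMemcpyAsync() on the iface's dedicated stream;
+ * completion is signaled by a CUDA event which the iface progress routine
+ * later polls with cudaEventQuery().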
+ */
+
+#include "cuda_copy_ep.h"
+#include "cuda_copy_iface.h"
+
+#include 
+#include 
+#include 
+
+
+static UCS_CLASS_INIT_FUNC(uct_cuda_copy_ep_t, uct_iface_t *tl_iface,
+                           const uct_device_addr_t *dev_addr,
+                           const uct_iface_addr_t *iface_addr)
+{
+    uct_cuda_copy_iface_t *iface = ucs_derived_of(tl_iface, uct_cuda_copy_iface_t);
+    UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super);
+    return UCS_OK;
+}
+
+static UCS_CLASS_CLEANUP_FUNC(uct_cuda_copy_ep_t)
+{
+}
+
+UCS_CLASS_DEFINE(uct_cuda_copy_ep_t, uct_base_ep_t)
+UCS_CLASS_DEFINE_NEW_FUNC(uct_cuda_copy_ep_t, uct_ep_t, uct_iface_t*,
+                          const uct_device_addr_t *, const uct_iface_addr_t *);
+UCS_CLASS_DEFINE_DELETE_FUNC(uct_cuda_copy_ep_t, uct_ep_t);
+
+
+ucs_status_t uct_cuda_copy_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov,
+                                        size_t iovcnt, uint64_t remote_addr,
+                                        uct_rkey_t rkey, uct_completion_t *comp)
+{
+    ucs_status_t status;
+    uct_cuda_copy_iface_t *iface = ucs_derived_of(tl_ep->iface,
+                                                  uct_cuda_copy_iface_t);
+    uct_cuda_copy_event_desc_t *cuda_event = ucs_mpool_get(&iface->cuda_event_desc);
+
+    status = CUDA_FUNC(cudaMemcpyAsync((void*)remote_addr, iov[0].buffer,
+                                       iov[0].length, cudaMemcpyDeviceToHost,
+                                       iface->stream_d2h));
+    if (UCS_OK != status) {
+        ucs_error("cudaMemcpyAsync failed");
+        return UCS_ERR_IO_ERROR;
+    }
+    status = CUDA_FUNC(cudaEventRecord(cuda_event->event, iface->stream_d2h));
+    if (UCS_OK != status) {
+        ucs_error("cudaEventRecord failed");
+        return UCS_ERR_IO_ERROR;
+    }
+    cuda_event->comp = comp;
+
+    ucs_queue_push(&iface->pending_event_q, &cuda_event->queue);
+
+    ucs_info("cuda async issued: %p buffer: %p len: %ld", cuda_event,
+             iov[0].buffer, iov[0].length);
+
+    return UCS_INPROGRESS;
+}
diff --git a/src/uct/cuda/cuda_copy/cuda_copy_ep.h b/src/uct/cuda/cuda_copy/cuda_copy_ep.h
new file mode 100644
index 00000000000..f46e92690ad
--- /dev/null
+++ b/src/uct/cuda/cuda_copy/cuda_copy_ep.h
@@ -0,0 +1,33 @@
+/**
+ * Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED.
+ * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED.
+ * See file LICENSE for terms.
+ */
+
+#ifndef UCT_CUDA_COPY_EP_H
+#define UCT_CUDA_COPY_EP_H
+
+#include 
+#include 
+#include 
+
+
+typedef struct uct_cuda_copy_ep_addr {
+    int ep_id;
+} uct_cuda_copy_ep_addr_t;
+
+typedef struct uct_cuda_copy_ep {
+    uct_base_ep_t            super;
+    struct uct_cuda_copy_ep  *next;
+} uct_cuda_copy_ep_t;
+
+UCS_CLASS_DECLARE_NEW_FUNC(uct_cuda_copy_ep_t, uct_ep_t, uct_iface_t*,
+                           const uct_device_addr_t *, const uct_iface_addr_t *);
+UCS_CLASS_DECLARE_DELETE_FUNC(uct_cuda_copy_ep_t, uct_ep_t);
+
+ucs_status_t uct_cuda_copy_ep_get_zcopy(uct_ep_h tl_ep,
+                                        const uct_iov_t *iov, size_t iovcnt,
+                                        uint64_t remote_addr, uct_rkey_t rkey,
+                                        uct_completion_t *comp);
+
+#endif
diff --git a/src/uct/cuda/cuda_copy/cuda_copy_iface.c b/src/uct/cuda/cuda_copy/cuda_copy_iface.c
new file mode 100644
index 00000000000..266f20d3e41
--- /dev/null
+++ b/src/uct/cuda/cuda_copy/cuda_copy_iface.c
@@ -0,0 +1,243 @@
+/**
+ * Copyright (c) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED.
+ * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED.
+ * See file LICENSE for terms.
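+ *
+ * The cuda_copy iface owns the shared transport state: a memory pool of
+ * CUDA event descriptors, two non-blocking streams (device-to-host and
+ * host-to-device), and pending_event_q. uct_cuda_copy_iface_progress()
+ * walks the queue, invokes the completion callback for every event whose
+ * cudaEventQuery() returns cudaSuccess, and recycles the descriptor.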
+ */ + +#include "cuda_copy_iface.h" +#include "cuda_copy_md.h" +#include "cuda_copy_ep.h" + +#include +#include + + +static ucs_config_field_t uct_cuda_copy_iface_config_table[] = { + + {"", "", NULL, + ucs_offsetof(uct_cuda_copy_iface_config_t, super), + UCS_CONFIG_TYPE_TABLE(uct_iface_config_table)}, + + {NULL} +}; + + +/* Forward declaration for the delete function */ +static void UCS_CLASS_DELETE_FUNC_NAME(uct_cuda_copy_iface_t)(uct_iface_t*); + + +static ucs_status_t uct_cuda_copy_iface_get_address(uct_iface_h tl_iface, + uct_iface_addr_t *iface_addr) +{ + int *cuda_copy_addr = (int*)iface_addr; + *cuda_copy_addr = 0; + return UCS_OK; +} + +static int uct_cuda_copy_iface_is_reachable(const uct_iface_h iface, const uct_device_addr_t *dev_addr, + const uct_iface_addr_t *iface_addr) +{ + return 1; +} + +static ucs_status_t uct_cuda_copy_iface_query(uct_iface_h iface, + uct_iface_attr_t *iface_attr) +{ + memset(iface_attr, 0, sizeof(uct_iface_attr_t)); + + /* FIXME all of these values */ + iface_attr->iface_addr_len = sizeof(int); + iface_attr->device_addr_len = 0; + iface_attr->ep_addr_len = 0; + iface_attr->cap.flags = UCT_IFACE_FLAG_CONNECT_TO_IFACE | + UCT_IFACE_FLAG_GET_ZCOPY | + UCT_IFACE_FLAG_PENDING; + + iface_attr->cap.put.max_short = 0; + iface_attr->cap.put.max_bcopy = 0; + iface_attr->cap.put.min_zcopy = 0; + iface_attr->cap.put.max_zcopy = 0; + iface_attr->cap.put.opt_zcopy_align = 1; + iface_attr->cap.put.align_mtu = iface_attr->cap.put.opt_zcopy_align; + iface_attr->cap.put.max_iov = 1; + + iface_attr->cap.get.max_bcopy = 0; + iface_attr->cap.get.min_zcopy = 0; + iface_attr->cap.get.max_zcopy = 0; + iface_attr->cap.get.opt_zcopy_align = 1; + iface_attr->cap.get.align_mtu = iface_attr->cap.get.opt_zcopy_align; + iface_attr->cap.get.max_iov = 1; + + iface_attr->cap.am.max_short = -1; + iface_attr->cap.am.max_bcopy = 0; + iface_attr->cap.am.min_zcopy = 0; + iface_attr->cap.am.max_zcopy = 0; + iface_attr->cap.am.opt_zcopy_align = 1; + iface_attr->cap.am.align_mtu = iface_attr->cap.am.opt_zcopy_align; + iface_attr->cap.am.max_hdr = 0; + iface_attr->cap.am.max_iov = 1; + + iface_attr->latency.overhead = 10e-6; + iface_attr->latency.growth = 0; + iface_attr->bandwidth = 6911 * 1024.0 * 1024.0; + iface_attr->overhead = 1000; + iface_attr->priority = 0; + + return UCS_OK; +} +static unsigned uct_cuda_copy_iface_progress(uct_iface_h tl_iface) +{ + uct_cuda_copy_iface_t *iface = ucs_derived_of(tl_iface, uct_cuda_copy_iface_t); + unsigned count = 0; + + ucs_queue_iter_t iter; + uct_cuda_copy_event_desc_t *cuda_event; + cudaError_t result = cudaSuccess; + + ucs_queue_for_each_safe(cuda_event, iter, &iface->pending_event_q, queue) + { + result = cudaEventQuery(cuda_event->event); + if (cudaSuccess == result) { + ucs_queue_del_iter(&iface->pending_event_q, iter); + cuda_event->comp->func(cuda_event->comp, UCS_OK); + ucs_info("Event Done :%p", cuda_event); + ucs_mpool_put(cuda_event); + count++; + } + } + return count; +} + +static uct_iface_ops_t uct_cuda_copy_iface_ops = { + .ep_get_zcopy = uct_cuda_copy_ep_get_zcopy, + .ep_pending_add = ucs_empty_function_return_busy, + .ep_pending_purge = ucs_empty_function, + .ep_flush = uct_base_ep_flush, + .ep_fence = uct_base_ep_fence, + .ep_create_connected = UCS_CLASS_NEW_FUNC_NAME(uct_cuda_copy_ep_t), + .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_cuda_copy_ep_t), + .iface_flush = uct_base_iface_flush, + .iface_fence = uct_base_iface_fence, + .iface_progress_enable = ucs_empty_function, + .iface_progress_disable = ucs_empty_function, + 
.iface_progress = uct_cuda_copy_iface_progress, + .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_cuda_copy_iface_t), + .iface_query = uct_cuda_copy_iface_query, + .iface_get_device_address = (void*)ucs_empty_function_return_success, + .iface_get_address = uct_cuda_copy_iface_get_address, + .iface_is_reachable = uct_cuda_copy_iface_is_reachable, +}; + +void uct_cuda_copy_event_desc_init(ucs_mpool_t *mp, void *obj, void *chunk) +{ + uct_cuda_copy_event_desc_t *base = (uct_cuda_copy_event_desc_t *) obj; + ucs_status_t status; + memset(base, 0 , sizeof(*base)); + status = CUDA_FUNC(cudaEventCreateWithFlags(&(base->event), cudaEventDisableTiming)); + if (UCS_OK != status) { + ucs_error("cudaEventCreateWithFlags Failed"); + } +} +void uct_cuda_copy_event_desc_cleanup(ucs_mpool_t *mp, void *obj) +{ + ucs_status_t status; + uct_cuda_copy_event_desc_t *base = (uct_cuda_copy_event_desc_t *) obj; + status = CUDA_FUNC(cudaEventDestroy(base->event)); + if (UCS_OK != status) { + ucs_error("cudaEventDestroy Failed"); + } +} + +static ucs_mpool_ops_t uct_cuda_copy_event_desc_mpool_ops = { + .chunk_alloc = ucs_mpool_hugetlb_malloc, + .chunk_release = ucs_mpool_hugetlb_free, + .obj_init = uct_cuda_copy_event_desc_init, + .obj_cleanup = uct_cuda_copy_event_desc_cleanup, +}; + +static UCS_CLASS_INIT_FUNC(uct_cuda_copy_iface_t, uct_md_h md, uct_worker_h worker, + const uct_iface_params_t *params, + const uct_iface_config_t *tl_config) +{ + ucs_status_t status; + UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_cuda_copy_iface_ops, md, worker, + params, tl_config UCS_STATS_ARG(params->stats_root) + UCS_STATS_ARG(UCT_CUDA_COPY_TL_NAME)); + + if (strcmp(params->mode.device.dev_name, UCT_CUDA_DEV_NAME) != 0) { + ucs_error("No device was found: %s", params->mode.device.dev_name); + return UCS_ERR_NO_DEVICE; + } + + + status = ucs_mpool_init(&self->cuda_event_desc, + 0, + sizeof(uct_cuda_copy_event_desc_t), + 0, /* alignment offset */ + UCS_SYS_CACHE_LINE_SIZE, /* alignment */ + 128, /* grow */ + 1024, /* max desc */ + &uct_cuda_copy_event_desc_mpool_ops, + "CUDA EVENT objects"); + + if (UCS_OK != status) { + ucs_error("Mpool creation failed"); + return UCS_ERR_IO_ERROR; + } + + status = CUDA_FUNC(cudaStreamCreateWithFlags(&self->stream_d2h, cudaStreamNonBlocking)); + if (UCS_OK != status) { + ucs_error("cudaStreamCreateWithFlags creation failed"); + return UCS_ERR_IO_ERROR; + } + status = CUDA_FUNC(cudaStreamCreateWithFlags(&self->stream_h2d, cudaStreamNonBlocking)); + if (UCS_OK != status) { + ucs_error("cudaStreamCreateWithFlags creation failed"); + return UCS_ERR_IO_ERROR; + } + + return UCS_OK; +} + +static UCS_CLASS_CLEANUP_FUNC(uct_cuda_copy_iface_t) +{ + ucs_mpool_cleanup(&self->cuda_event_desc, 1); +} + +UCS_CLASS_DEFINE(uct_cuda_copy_iface_t, uct_base_iface_t); +UCS_CLASS_DEFINE_NEW_FUNC(uct_cuda_copy_iface_t, uct_iface_t, uct_md_h, uct_worker_h, + const uct_iface_params_t*, const uct_iface_config_t*); +static UCS_CLASS_DEFINE_DELETE_FUNC(uct_cuda_copy_iface_t, uct_iface_t); + + +static ucs_status_t uct_cuda_copy_query_tl_resources(uct_md_h md, + uct_tl_resource_desc_t **resource_p, + unsigned *num_resources_p) +{ + uct_tl_resource_desc_t *resource; + + resource = ucs_calloc(1, sizeof(uct_tl_resource_desc_t), "resource desc"); + if (NULL == resource) { + ucs_error("Failed to allocate memory"); + return UCS_ERR_NO_MEMORY; + } + + ucs_snprintf_zero(resource->tl_name, sizeof(resource->tl_name), "%s", + UCT_CUDA_COPY_TL_NAME); + ucs_snprintf_zero(resource->dev_name, sizeof(resource->dev_name), "%s", + 
UCT_CUDA_DEV_NAME);
+    resource->dev_type = UCT_DEVICE_TYPE_ACC;
+
+    *num_resources_p = 1;
+    *resource_p      = resource;
+    return UCS_OK;
+}
+
+UCT_TL_COMPONENT_DEFINE(uct_cuda_copy_tl,
+                        uct_cuda_copy_query_tl_resources,
+                        uct_cuda_copy_iface_t,
+                        UCT_CUDA_COPY_TL_NAME,
+                        "CUDA_COPY_",
+                        uct_cuda_copy_iface_config_table,
+                        uct_cuda_copy_iface_config_t);
+UCT_MD_REGISTER_TL(&uct_cuda_copy_md_component, &uct_cuda_copy_tl);

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_iface.h b/src/uct/cuda/cuda_copy/cuda_copy_iface.h
new file mode 100644
index 00000000000..1e0374852b7
--- /dev/null
+++ b/src/uct/cuda/cuda_copy/cuda_copy_iface.h
@@ -0,0 +1,51 @@
+/**
+ * Copyright (c) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED.
+ * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED.
+ * See file LICENSE for terms.
+ */
+
+#ifndef UCT_CUDA_COPY_IFACE_H
+#define UCT_CUDA_COPY_IFACE_H
+
+#include
+#include
+#include
+#include
+
+
+#define UCT_CUDA_COPY_TL_NAME "cuda_copy"
+#define UCT_CUDA_DEV_NAME     "cudacopy0"
+
+/* Wrap a CUDA runtime call: log on failure and yield a ucs_status_t.
+ * All call sites in this transport pass runtime-API (cudaError_t) calls. */
+#define CUDA_FUNC(func) ({                                  \
+    ucs_status_t _status = UCS_OK;                          \
+    do {                                                    \
+        cudaError_t _result = (func);                       \
+        if (cudaSuccess != _result) {                       \
+            ucs_error("[%s:%d] cuda failed with %d (%s)",   \
+                      __FILE__, __LINE__, _result,          \
+                      cudaGetErrorString(_result));         \
+            _status = UCS_ERR_IO_ERROR;                     \
+        }                                                   \
+    } while (0);                                            \
+    _status;                                                \
+})
+
+typedef struct uct_cuda_copy_iface {
+    uct_base_iface_t super;
+    ucs_mpool_t      cuda_event_desc;
+    ucs_queue_head_t pending_event_q;
+    cudaStream_t     stream_d2h;
+    cudaStream_t     stream_h2d;
+} uct_cuda_copy_iface_t;
+
+
+typedef struct uct_cuda_copy_iface_config {
+    uct_iface_config_t super;
+} uct_cuda_copy_iface_config_t;
+
+typedef struct uct_cuda_copy_event_desc {
+    cudaEvent_t      event;
+    uct_completion_t *comp;
+    ucs_queue_elem_t queue;
+} uct_cuda_copy_event_desc_t;
+
+#endif

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c
new file mode 100644
index 00000000000..33b3b3982a7
--- /dev/null
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c
@@ -0,0 +1,165 @@
+/**
+ * Copyright (c) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED.
+ * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED.
+ * See file LICENSE for terms.
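One caveat worth recording: CUDA_FUNC above is typed for the runtime API. If a driver-API call (returning CUresult, e.g. the cuPointerGetAttribute used by mem_detect below) ever needs the same treatment, it would want a parallel wrapper. A sketch, with a macro name of my choosing:

    /* Hypothetical driver-API twin of CUDA_FUNC: CUresult, not cudaError_t. */
    #define CU_FUNC(func) ({                                \
        ucs_status_t _status = UCS_OK;                      \
        CUresult _result = (func);                          \
        if (CUDA_SUCCESS != _result) {                      \
            ucs_error("[%s:%d] cu call failed with %d",     \
                      __FILE__, __LINE__, _result);         \
            _status = UCS_ERR_IO_ERROR;                     \
        }                                                   \
        _status;                                            \
    })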
+ */ + +#include "cuda_copy_md.h" + +#include +#include +#include +#include +#include +#include +#include +#include + + +static ucs_config_field_t uct_cuda_copy_md_config_table[] = { + {"", "", NULL, + ucs_offsetof(uct_cuda_copy_md_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_md_config_table)}, + + {NULL} +}; + +static ucs_status_t uct_cuda_copy_md_query(uct_md_h md, uct_md_attr_t *md_attr) +{ + md_attr->cap.flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_ADDR_DN; + md_attr->cap.addr_dn = UCT_MD_ADDR_DOMAIN_CUDA; + md_attr->cap.max_alloc = 0; + md_attr->cap.max_reg = ULONG_MAX; + md_attr->rkey_packed_size = 0; + md_attr->reg_cost.overhead = 0; + md_attr->reg_cost.growth = 0; + memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); + return UCS_OK; +} + +static ucs_status_t uct_cuda_copy_mkey_pack(uct_md_h md, uct_mem_h memh, + void *rkey_buffer) +{ + return UCS_OK; +} + +static ucs_status_t uct_cuda_copy_rkey_unpack(uct_md_component_t *mdc, + const void *rkey_buffer, uct_rkey_t *rkey_p, + void **handle_p) +{ + *rkey_p = 0xdeadbeef; + *handle_p = NULL; + return UCS_OK; +} + +static ucs_status_t uct_cuda_copy_rkey_release(uct_md_component_t *mdc, uct_rkey_t rkey, + void *handle) +{ + return UCS_OK; +} + +static ucs_status_t uct_cuda_copy_mem_reg(uct_md_h md, void *address, size_t length, + unsigned flags, uct_mem_h *memh_p) +{ + cudaError_t cuerr = cudaSuccess; + + if(address == NULL) { + *memh_p = address; + return UCS_OK; + } + + cuerr = cudaHostRegister(address, length, cudaHostRegisterPortable); + if (cuerr != cudaSuccess) { + return UCS_ERR_IO_ERROR; + } + + *memh_p = address; + return UCS_OK; +} + +static ucs_status_t uct_cuda_copy_mem_dereg(uct_md_h md, uct_mem_h memh) +{ + void *address = (void *)memh; + cudaError_t cuerr = cudaSuccess; + if (address == NULL) { + return UCS_OK; + } + cuerr = cudaHostUnregister(address); + if (cuerr != cudaSuccess) { + return UCS_ERR_IO_ERROR; + } + + return UCS_OK; +} + +static ucs_status_t uct_cuda_copy_mem_detect(uct_md_h md, void *addr) +{ + int memory_type; + cudaError_t cuda_err = cudaSuccess; + struct cudaPointerAttributes attributes; + CUresult cu_err = CUDA_SUCCESS; + + if (addr == NULL) { + return UCS_ERR_INVALID_ADDR; + } + + cu_err = cuPointerGetAttribute(&memory_type, + CU_POINTER_ATTRIBUTE_MEMORY_TYPE, + (CUdeviceptr)addr); + if (cu_err != CUDA_SUCCESS) { + cuda_err = cudaPointerGetAttributes (&attributes, addr); + if (cuda_err == cudaSuccess) { + if (attributes.memoryType == cudaMemoryTypeDevice) { + return UCS_OK; + } + } + } else if (memory_type == CU_MEMORYTYPE_DEVICE) { + return UCS_OK; + } + return UCS_ERR_INVALID_ADDR; +} + +static ucs_status_t uct_cuda_copy_query_md_resources(uct_md_resource_desc_t **resources_p, + unsigned *num_resources_p) +{ + return uct_single_md_resource(&uct_cuda_copy_md_component, resources_p, num_resources_p); +} + +static void uct_cuda_copy_md_close(uct_md_h uct_md) { + uct_cuda_copy_md_t *md = ucs_derived_of(uct_md, uct_cuda_copy_md_t); + ucs_free(md); + +} +static uct_md_ops_t md_ops = { + .close = uct_cuda_copy_md_close, + .query = uct_cuda_copy_md_query, + .mkey_pack = uct_cuda_copy_mkey_pack, + .mem_reg = uct_cuda_copy_mem_reg, + .mem_dereg = uct_cuda_copy_mem_dereg, + .mem_detect = uct_cuda_copy_mem_detect +}; + +static ucs_status_t uct_cuda_copy_md_open(const char *md_name, const uct_md_config_t *uct_md_config, + uct_md_h *md_p) +{ + uct_cuda_copy_md_t *md; + // ucs_status_t status; + // const uct_cuda_copy_md_config_t *md_config = ucs_derived_of(uct_md_config, uct_cuda_copy_md_config_t); + + md = 
ucs_malloc(sizeof(uct_cuda_copy_md_t), "uct_cuda_copy_md_t");
+    if (NULL == md) {
+        ucs_error("Failed to allocate memory for uct_cuda_copy_md_t");
+        return UCS_ERR_NO_MEMORY;
+    }
+
+    md->super.ops       = &md_ops;
+    md->super.component = &uct_cuda_copy_md_component;
+
+    *md_p = (uct_md_h) md;
+    return UCS_OK;
+}
+
+UCT_MD_COMPONENT_DEFINE(uct_cuda_copy_md_component, UCT_CUDA_COPY_MD_NAME,
+                        uct_cuda_copy_query_md_resources, uct_cuda_copy_md_open, NULL,
+                        uct_cuda_copy_rkey_unpack, uct_cuda_copy_rkey_release, "CUDA_COPY_",
+                        uct_cuda_copy_md_config_table, uct_cuda_copy_md_config_t);

diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.h b/src/uct/cuda/cuda_copy/cuda_copy_md.h
new file mode 100644
index 00000000000..cd94ac74492
--- /dev/null
+++ b/src/uct/cuda/cuda_copy/cuda_copy_md.h
@@ -0,0 +1,31 @@
+/**
+ * Copyright (c) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED.
+ * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED.
+ * See file LICENSE for terms.
+ */
+
+#ifndef UCT_CUDA_COPY_H
+#define UCT_CUDA_COPY_H
+
+#include
+
+#define UCT_CUDA_COPY_MD_NAME "cuda_cpy"
+
+extern uct_md_component_t uct_cuda_copy_md_component;
+
+/**
+ * @brief cuda_copy MD descriptor
+ */
+typedef struct uct_cuda_copy_md {
+    struct uct_md super;   /**< Domain info */
+} uct_cuda_copy_md_t;
+
+/**
+ * cuda_copy domain configuration.
+ */
+typedef struct uct_cuda_copy_md_config {
+    uct_md_config_t super;
+} uct_cuda_copy_md_config_t;
+
+#endif

diff --git a/src/uct/cuda/cuda_ep.c b/src/uct/cuda/cuda_ep.c
deleted file mode 100644
index 7150a26ca71..00000000000
--- a/src/uct/cuda/cuda_ep.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/**
- * Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED.
- * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED.
- * See file LICENSE for terms.
- */
-
-#include "cuda_ep.h"
-#include "cuda_iface.h"
-
-#include
-#include
-#include
-
-
-static UCS_CLASS_INIT_FUNC(uct_cuda_ep_t, uct_iface_t *tl_iface,
-                           const uct_device_addr_t *dev_addr,
-                           const uct_iface_addr_t *iface_addr)
-{
-    uct_cuda_iface_t *iface = ucs_derived_of(tl_iface, uct_cuda_iface_t);
-    UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super)
-    return UCS_OK;
-}
-
-static UCS_CLASS_CLEANUP_FUNC(uct_cuda_ep_t)
-{
-}
-
-UCS_CLASS_DEFINE(uct_cuda_ep_t, uct_base_ep_t)
-UCS_CLASS_DEFINE_NEW_FUNC(uct_cuda_ep_t, uct_ep_t, uct_iface_t*,
-                          const uct_device_addr_t *, const uct_iface_addr_t *);
-UCS_CLASS_DEFINE_DELETE_FUNC(uct_cuda_ep_t, uct_ep_t);
-
-
-ucs_status_t uct_cuda_ep_put_short(uct_ep_h tl_ep, const void *buffer,
-                                   unsigned length, uint64_t remote_addr,
-                                   uct_rkey_t rkey)
-{
-    /* Code for PUT here */
-    return UCS_ERR_UNSUPPORTED;
-}
-
-ucs_status_t uct_cuda_ep_am_short(uct_ep_h ep, uint8_t id, uint64_t header,
-                                  const void *payload, unsigned length)
-{
-    return UCS_ERR_UNSUPPORTED;
-}

diff --git a/src/uct/cuda/cuda_ep.h b/src/uct/cuda/cuda_ep.h
deleted file mode 100644
index 686d7a65455..00000000000
--- a/src/uct/cuda/cuda_ep.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/**
-* Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED.
-* Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED.
-* See file LICENSE for terms.
-*/ - -#ifndef UCT_SYSV_EP_H -#define UCT_SYSV_EP_H - -#include -#include -#include - - -typedef struct uct_cuda_ep_addr { - int ep_id; -} uct_cuda_ep_addr_t; - -typedef struct uct_cuda_ep { - uct_base_ep_t super; - struct uct_cuda_ep *next; -} uct_cuda_ep_t; - -UCS_CLASS_DECLARE_NEW_FUNC(uct_cuda_ep_t, uct_ep_t, uct_iface_t*, - const uct_device_addr_t *, const uct_iface_addr_t *); -UCS_CLASS_DECLARE_DELETE_FUNC(uct_cuda_ep_t, uct_ep_t); - -ucs_status_t uct_cuda_ep_put_short(uct_ep_h tl_ep, const void *buffer, unsigned length, - uint64_t remote_addr, uct_rkey_t rkey); -ucs_status_t uct_cuda_ep_am_short(uct_ep_h ep, uint8_t id, uint64_t header, - const void *payload, unsigned length); -#endif diff --git a/src/uct/cuda/cuda_md.c b/src/uct/cuda/cuda_md.c deleted file mode 100644 index 19596500f93..00000000000 --- a/src/uct/cuda/cuda_md.c +++ /dev/null @@ -1,102 +0,0 @@ -/** - * Copyright (c) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED. - * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. - * See file LICENSE for terms. - */ - -#include "cuda_md.h" - -#include -#include -#include -#include -#include -#include - - -static ucs_status_t uct_cuda_md_query(uct_md_h md, uct_md_attr_t *md_attr) -{ - md_attr->cap.flags = UCT_MD_FLAG_REG; - md_attr->cap.max_alloc = 0; - md_attr->cap.max_reg = ULONG_MAX; - md_attr->rkey_packed_size = 0; - md_attr->reg_cost.overhead = 0; - md_attr->reg_cost.growth = 0; - memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); - return UCS_OK; -} - -static ucs_status_t uct_cuda_mkey_pack(uct_md_h md, uct_mem_h memh, - void *rkey_buffer) -{ - return UCS_OK; -} - -static ucs_status_t uct_cuda_rkey_unpack(uct_md_component_t *mdc, - const void *rkey_buffer, uct_rkey_t *rkey_p, - void **handle_p) -{ - *rkey_p = 0xdeadbeef; - *handle_p = NULL; - return UCS_OK; -} - -static ucs_status_t uct_cuda_rkey_release(uct_md_component_t *mdc, uct_rkey_t rkey, - void *handle) -{ - return UCS_OK; -} - -static ucs_status_t uct_cuda_mem_reg(uct_md_h md, void *address, size_t length, - unsigned flags, uct_mem_h *memh_p) -{ - ucs_status_t rc; - uct_mem_h * mem_hndl = NULL; - mem_hndl = ucs_malloc(sizeof(void *), "cuda handle for test passing"); - if (NULL == mem_hndl) { - ucs_error("Failed to allocate memory for gni_mem_handle_t"); - rc = UCS_ERR_NO_MEMORY; - goto mem_err; - } - *memh_p = mem_hndl; - return UCS_OK; - mem_err: - return rc; -} - -static ucs_status_t uct_cuda_mem_dereg(uct_md_h md, uct_mem_h memh) -{ - ucs_free(memh); - return UCS_OK; -} - -static ucs_status_t uct_cuda_query_md_resources(uct_md_resource_desc_t **resources_p, - unsigned *num_resources_p) -{ - return uct_single_md_resource(&uct_cuda_md, resources_p, num_resources_p); -} - -static ucs_status_t uct_cuda_md_open(const char *md_name, const uct_md_config_t *md_config, - uct_md_h *md_p) -{ - static uct_md_ops_t md_ops = { - .close = (void*)ucs_empty_function, - .query = uct_cuda_md_query, - .mkey_pack = uct_cuda_mkey_pack, - .mem_reg = uct_cuda_mem_reg, - .mem_dereg = uct_cuda_mem_dereg - }; - static uct_md_t md = { - .ops = &md_ops, - .component = &uct_cuda_md - }; - - *md_p = &md; - return UCS_OK; -} - -UCT_MD_COMPONENT_DEFINE(uct_cuda_md, UCT_CUDA_MD_NAME, - uct_cuda_query_md_resources, uct_cuda_md_open, NULL, - uct_cuda_rkey_unpack, uct_cuda_rkey_release, "CUDA_", - uct_md_config_table, uct_md_config_t); - diff --git a/src/uct/cuda/cuda_md.h b/src/uct/cuda/cuda_md.h deleted file mode 100644 index 4c947e7d5af..00000000000 --- a/src/uct/cuda/cuda_md.h +++ /dev/null @@ -1,16 +0,0 @@ 
-/**
- * Copyright (c) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED.
- * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED.
- * See file LICENSE for terms.
- */
-
-#ifndef UCT_CUDA_CONTEXT_H
-#define UCT_CUDA_CONTEXT_H
-
-#include
-
-#define UCT_CUDA_MD_NAME "gpu"
-
-extern uct_md_component_t uct_cuda_md;
-
-#endif

diff --git a/src/uct/cuda/gdr_copy/gdr_copy_ep.c b/src/uct/cuda/gdr_copy/gdr_copy_ep.c
new file mode 100644
index 00000000000..58f85050eca
--- /dev/null
+++ b/src/uct/cuda/gdr_copy/gdr_copy_ep.c
@@ -0,0 +1,56 @@
+/**
+ * Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED.
+ * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED.
+ * See file LICENSE for terms.
+ */
+
+#include "gdr_copy_ep.h"
+#include "gdr_copy_md.h"
+#include "gdr_copy_iface.h"
+
+#include
+#include
+#include
+
+
+static UCS_CLASS_INIT_FUNC(uct_gdr_copy_ep_t, uct_iface_t *tl_iface,
+                           const uct_device_addr_t *dev_addr,
+                           const uct_iface_addr_t *iface_addr)
+{
+    uct_gdr_copy_iface_t *iface = ucs_derived_of(tl_iface, uct_gdr_copy_iface_t);
+    UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super)
+    return UCS_OK;
+}
+
+static UCS_CLASS_CLEANUP_FUNC(uct_gdr_copy_ep_t)
+{
+}
+
+UCS_CLASS_DEFINE(uct_gdr_copy_ep_t, uct_base_ep_t)
+UCS_CLASS_DEFINE_NEW_FUNC(uct_gdr_copy_ep_t, uct_ep_t, uct_iface_t*,
+                          const uct_device_addr_t *, const uct_iface_addr_t *);
+UCS_CLASS_DEFINE_DELETE_FUNC(uct_gdr_copy_ep_t, uct_ep_t);
+
+
+ucs_status_t uct_gdr_copy_ep_put_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iovcnt,
+                                       uint64_t remote_addr, uct_rkey_t rkey,
+                                       uct_completion_t *comp)
+{
+    uct_gdr_copy_iface_t *iface  = ucs_derived_of(tl_ep->iface, uct_gdr_copy_iface_t);
+    uct_gdr_copy_md_t *md        = (uct_gdr_copy_md_t *)iface->super.md;
+    uct_gdr_copy_mem_t *mem_hndl = (uct_gdr_copy_mem_t *) rkey;
+    gdr_info_t gdr_info;
+    size_t bar_off;
+
+    assert(iovcnt == 1);
+
+    if (gdr_get_info(md->gdrcpy_ctx, mem_hndl->mh, &gdr_info) != 0) {
+        ucs_error("gdr_get_info failed");
+        return UCS_ERR_IO_ERROR;
+    }
+    /* offset of the target address inside the pinned, BAR-mapped region */
+    bar_off = remote_addr - gdr_info.va;
+
+    gdr_copy_to_bar((mem_hndl->bar_ptr + bar_off), iov[0].buffer, iov[0].length);
+
+    return UCS_OK;
+}
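put_zcopy above only resolves the offset and copies; the expensive pin/map half lives in the MD's mem_reg (further below). An end-to-end sketch of the gdrcopy call sequence this transport splits across the two paths; the function name and error handling are mine, the sketch assumes dptr is GPU-page aligned (which the MD code guarantees via GPU_PAGE_MASK), and gdr_copy_to_bar matches the gdrcopy revision this patch builds against:

    #include <gdrapi.h>
    #include <cuda.h>

    /* Write 'len' host bytes into GPU memory at 'dptr' through the BAR. */
    static int gdr_write_to_gpu(gdr_t g, CUdeviceptr dptr, const void *host, size_t len)
    {
        gdr_mh_t mh;
        gdr_info_t info;
        void *bar_ptr;

        /* pin + map once: what uct_gdr_copy_mem_reg_internal does */
        if (gdr_pin_buffer(g, dptr, len, 0, 0, &mh) != 0) {
            return -1;
        }
        if (gdr_map(g, mh, &bar_ptr, len) != 0) {
            gdr_unpin_buffer(g, mh);
            return -1;
        }

        /* per-operation part: what put_zcopy does. The mapping may start
         * below dptr due to page alignment, hence the info.va offset. */
        if (gdr_get_info(g, mh, &info) == 0) {
            gdr_copy_to_bar((char *)bar_ptr + (dptr - info.va), host, len);
        }

        gdr_unmap(g, mh, bar_ptr, len);
        gdr_unpin_buffer(g, mh);
        return 0;
    }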
diff --git a/src/uct/cuda/gdr_copy/gdr_copy_ep.h b/src/uct/cuda/gdr_copy/gdr_copy_ep.h
new file mode 100644
index 00000000000..3ffedf8ea94
--- /dev/null
+++ b/src/uct/cuda/gdr_copy/gdr_copy_ep.h
@@ -0,0 +1,33 @@
+/**
+* Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED.
+* Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED.
+* See file LICENSE for terms.
+*/
+
+#ifndef UCT_GDR_COPY_EP_H
+#define UCT_GDR_COPY_EP_H
+
+#include
+#include
+#include
+
+
+typedef struct uct_gdr_copy_ep_addr {
+    int ep_id;
+} uct_gdr_copy_ep_addr_t;
+
+typedef struct uct_gdr_copy_ep {
+    uct_base_ep_t super;
+    struct uct_gdr_copy_ep *next;
+} uct_gdr_copy_ep_t;
+
+UCS_CLASS_DECLARE_NEW_FUNC(uct_gdr_copy_ep_t, uct_ep_t, uct_iface_t*,
+                           const uct_device_addr_t *, const uct_iface_addr_t *);
+UCS_CLASS_DECLARE_DELETE_FUNC(uct_gdr_copy_ep_t, uct_ep_t);
+
+ucs_status_t uct_gdr_copy_ep_put_zcopy(uct_ep_h tl_ep,
+                                       const uct_iov_t *iov, size_t iovcnt,
+                                       uint64_t remote_addr, uct_rkey_t rkey,
+                                       uct_completion_t *comp);
+
+#endif

diff --git a/src/uct/cuda/cuda_iface.c b/src/uct/cuda/gdr_copy/gdr_copy_iface.c
similarity index 59%
rename from src/uct/cuda/cuda_iface.c
rename to src/uct/cuda/gdr_copy/gdr_copy_iface.c
index ee6441709e8..975a753c740 100644
--- a/src/uct/cuda/cuda_iface.c
+++ b/src/uct/cuda/gdr_copy/gdr_copy_iface.c
@@ -4,18 +4,18 @@
  * See file LICENSE for terms.
  */

-#include "cuda_iface.h"
-#include "cuda_md.h"
-#include "cuda_ep.h"
+#include "gdr_copy_iface.h"
+#include "gdr_copy_md.h"
+#include "gdr_copy_ep.h"

 #include
 #include

-static ucs_config_field_t uct_cuda_iface_config_table[] = {
+static ucs_config_field_t uct_gdr_copy_iface_config_table[] = {

     {"", "", NULL,
-     ucs_offsetof(uct_cuda_iface_config_t, super),
+     ucs_offsetof(uct_gdr_copy_iface_config_t, super),
      UCS_CONFIG_TYPE_TABLE(uct_iface_config_table)},

     {NULL}
@@ -23,24 +23,24 @@ static ucs_config_field_t uct_cuda_iface_config_table[] = {

 /* Forward declaration for the delete function */
-static void UCS_CLASS_DELETE_FUNC_NAME(uct_cuda_iface_t)(uct_iface_t*);
+static void UCS_CLASS_DELETE_FUNC_NAME(uct_gdr_copy_iface_t)(uct_iface_t*);

-static ucs_status_t uct_cuda_iface_get_address(uct_iface_h tl_iface,
+static ucs_status_t uct_gdr_copy_iface_get_address(uct_iface_h tl_iface,
                                                uct_iface_addr_t *iface_addr)
 {
-    int *cuda_addr = (int*)iface_addr;
-    *cuda_addr = 0;
+    int *gdr_copy_addr = (int*)iface_addr;
+    *gdr_copy_addr = 0;
     return UCS_OK;
 }

-static int uct_cuda_iface_is_reachable(const uct_iface_h iface, const uct_device_addr_t *dev_addr,
+static int uct_gdr_copy_iface_is_reachable(const uct_iface_h iface, const uct_device_addr_t *dev_addr,
                                        const uct_iface_addr_t *iface_addr)
 {
-    return 0;
+    return 1;
 }

-static ucs_status_t uct_cuda_iface_query(uct_iface_h iface,
+static ucs_status_t uct_gdr_copy_iface_query(uct_iface_h iface,
                                          uct_iface_attr_t *iface_attr)
 {
     memset(iface_attr, 0, sizeof(uct_iface_attr_t));
@@ -49,8 +49,9 @@ static ucs_status_t uct_cuda_iface_query(uct_iface_h iface,
     iface_attr->iface_addr_len  = sizeof(int);
     iface_attr->device_addr_len = 0;
     iface_attr->ep_addr_len     = 0;
-    iface_attr->max_conn_priv   = 0;
-    iface_attr->cap.flags       = 0;
+    iface_attr->cap.flags       = UCT_IFACE_FLAG_CONNECT_TO_IFACE |
+                                  UCT_IFACE_FLAG_PUT_ZCOPY |
+                                  UCT_IFACE_FLAG_PENDING;

     iface_attr->cap.put.max_short = 0;
     iface_attr->cap.put.max_bcopy = 0;
@@ -67,7 +68,7 @@ static ucs_status_t uct_cuda_iface_query(uct_iface_h iface,
     iface_attr->cap.get.align_mtu = iface_attr->cap.get.opt_zcopy_align;
     iface_attr->cap.get.max_iov   = 1;

-    iface_attr->cap.am.max_short  = 0;
+    iface_attr->cap.am.max_short  = -1;
     iface_attr->cap.am.max_bcopy  = 0;
     iface_attr->cap.am.min_zcopy  = 0;
     iface_attr->cap.am.max_zcopy  = 0;
@@ -76,41 +77,42 @@ static ucs_status_t uct_cuda_iface_query(uct_iface_h iface,
     iface_attr->cap.am.max_hdr    = 0;
     iface_attr->cap.am.max_iov    = 1;

-    iface_attr->latency.overhead  = 1e-9;
+    iface_attr->latency.overhead  = 2e-6;
iface_attr->latency.growth = 0; iface_attr->bandwidth = 6911 * 1024.0 * 1024.0; - iface_attr->overhead = 0; + iface_attr->overhead = 100; iface_attr->priority = 0; return UCS_OK; } -static uct_iface_ops_t uct_cuda_iface_ops = { - .ep_put_short = uct_cuda_ep_put_short, - .ep_am_short = uct_cuda_ep_am_short, +static uct_iface_ops_t uct_gdr_copy_iface_ops = { + .ep_put_zcopy = uct_gdr_copy_ep_put_zcopy, + .ep_pending_add = ucs_empty_function_return_busy, + .ep_pending_purge = ucs_empty_function, .ep_flush = uct_base_ep_flush, .ep_fence = uct_base_ep_fence, - .ep_create_connected = UCS_CLASS_NEW_FUNC_NAME(uct_cuda_ep_t), - .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_cuda_ep_t), + .ep_create_connected = UCS_CLASS_NEW_FUNC_NAME(uct_gdr_copy_ep_t), + .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_gdr_copy_ep_t), .iface_flush = uct_base_iface_flush, .iface_fence = uct_base_iface_fence, .iface_progress_enable = ucs_empty_function, .iface_progress_disable = ucs_empty_function, .iface_progress = ucs_empty_function_return_zero, - .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_cuda_iface_t), - .iface_query = uct_cuda_iface_query, + .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_gdr_copy_iface_t), + .iface_query = uct_gdr_copy_iface_query, .iface_get_device_address = (void*)ucs_empty_function_return_success, - .iface_get_address = uct_cuda_iface_get_address, - .iface_is_reachable = uct_cuda_iface_is_reachable, + .iface_get_address = uct_gdr_copy_iface_get_address, + .iface_is_reachable = uct_gdr_copy_iface_is_reachable, }; -static UCS_CLASS_INIT_FUNC(uct_cuda_iface_t, uct_md_h md, uct_worker_h worker, +static UCS_CLASS_INIT_FUNC(uct_gdr_copy_iface_t, uct_md_h md, uct_worker_h worker, const uct_iface_params_t *params, const uct_iface_config_t *tl_config) { - UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_cuda_iface_ops, md, worker, + UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_gdr_copy_iface_ops, md, worker, params, tl_config UCS_STATS_ARG(params->stats_root) - UCS_STATS_ARG(UCT_CUDA_TL_NAME)); + UCS_STATS_ARG(UCT_GDR_COPY_TL_NAME)); if (strcmp(params->mode.device.dev_name, UCT_CUDA_DEV_NAME) != 0) { ucs_error("No device was found: %s", params->mode.device.dev_name); @@ -120,18 +122,18 @@ static UCS_CLASS_INIT_FUNC(uct_cuda_iface_t, uct_md_h md, uct_worker_h worker, return UCS_OK; } -static UCS_CLASS_CLEANUP_FUNC(uct_cuda_iface_t) +static UCS_CLASS_CLEANUP_FUNC(uct_gdr_copy_iface_t) { /* tasks to tear down the domain */ } -UCS_CLASS_DEFINE(uct_cuda_iface_t, uct_base_iface_t); -UCS_CLASS_DEFINE_NEW_FUNC(uct_cuda_iface_t, uct_iface_t, uct_md_h, uct_worker_h, +UCS_CLASS_DEFINE(uct_gdr_copy_iface_t, uct_base_iface_t); +UCS_CLASS_DEFINE_NEW_FUNC(uct_gdr_copy_iface_t, uct_iface_t, uct_md_h, uct_worker_h, const uct_iface_params_t*, const uct_iface_config_t*); -static UCS_CLASS_DEFINE_DELETE_FUNC(uct_cuda_iface_t, uct_iface_t); +static UCS_CLASS_DEFINE_DELETE_FUNC(uct_gdr_copy_iface_t, uct_iface_t); -static ucs_status_t uct_cuda_query_tl_resources(uct_md_h md, +static ucs_status_t uct_gdr_copy_query_tl_resources(uct_md_h md, uct_tl_resource_desc_t **resource_p, unsigned *num_resources_p) { @@ -144,7 +146,7 @@ static ucs_status_t uct_cuda_query_tl_resources(uct_md_h md, } ucs_snprintf_zero(resource->tl_name, sizeof(resource->tl_name), "%s", - UCT_CUDA_TL_NAME); + UCT_GDR_COPY_TL_NAME); ucs_snprintf_zero(resource->dev_name, sizeof(resource->dev_name), "%s", UCT_CUDA_DEV_NAME); resource->dev_type = UCT_DEVICE_TYPE_ACC; @@ -154,11 +156,11 @@ static ucs_status_t uct_cuda_query_tl_resources(uct_md_h md, return 
UCS_OK; } -UCT_TL_COMPONENT_DEFINE(uct_cuda_tl, - uct_cuda_query_tl_resources, - uct_cuda_iface_t, - UCT_CUDA_TL_NAME, - "CUDA_", - uct_cuda_iface_config_table, - uct_cuda_iface_config_t); -UCT_MD_REGISTER_TL(&uct_cuda_md, &uct_cuda_tl); +UCT_TL_COMPONENT_DEFINE(uct_gdr_copy_tl, + uct_gdr_copy_query_tl_resources, + uct_gdr_copy_iface_t, + UCT_GDR_COPY_TL_NAME, + "GDR_COPY_", + uct_gdr_copy_iface_config_table, + uct_gdr_copy_iface_config_t); +UCT_MD_REGISTER_TL(&uct_gdr_copy_md_component, &uct_gdr_copy_tl); diff --git a/src/uct/cuda/cuda_iface.h b/src/uct/cuda/gdr_copy/gdr_copy_iface.h similarity index 61% rename from src/uct/cuda/cuda_iface.h rename to src/uct/cuda/gdr_copy/gdr_copy_iface.h index 4c6bc352352..7e0e4d32f4c 100644 --- a/src/uct/cuda/cuda_iface.h +++ b/src/uct/cuda/gdr_copy/gdr_copy_iface.h @@ -10,18 +10,17 @@ #include -#define UCT_CUDA_TL_NAME "cuda" -#define UCT_CUDA_DEV_NAME "gpu0" +#define UCT_GDR_COPY_TL_NAME "gdr_copy" +#define UCT_CUDA_DEV_NAME "gdrcopy0" -typedef struct uct_cuda_iface { +typedef struct uct_gdr_copy_iface { uct_base_iface_t super; -} uct_cuda_iface_t; +} uct_gdr_copy_iface_t; -typedef struct uct_cuda_iface_config { +typedef struct uct_gdr_copy_iface_config { uct_iface_config_t super; -} uct_cuda_iface_config_t; - +} uct_gdr_copy_iface_config_t; #endif diff --git a/src/uct/cuda/gdr_copy/gdr_copy_md.c b/src/uct/cuda/gdr_copy/gdr_copy_md.c new file mode 100644 index 00000000000..8e629002d63 --- /dev/null +++ b/src/uct/cuda/gdr_copy/gdr_copy_md.c @@ -0,0 +1,368 @@ +/** + * Copyright (c) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + +#include "gdr_copy_md.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#define UCT_GDR_COPY_MD_RCACHE_DEFAULT_ALIGN 4096 + +static ucs_config_field_t uct_gdr_copy_md_config_table[] = { + {"", "", NULL, + ucs_offsetof(uct_gdr_copy_md_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_md_config_table)}, + + {"RCACHE", "try", "Enable using memory registration cache", + ucs_offsetof(uct_gdr_copy_md_config_t, rcache.enable), UCS_CONFIG_TYPE_TERNARY}, + + {"RCACHE_ADDR_ALIGN", UCS_PP_MAKE_STRING(UCT_GDR_COPY_MD_RCACHE_DEFAULT_ALIGN), + "Registration cache address alignment, must be power of 2\n" + "between "UCS_PP_MAKE_STRING(UCS_PGT_ADDR_ALIGN)"and system page size", + ucs_offsetof(uct_gdr_copy_md_config_t, rcache.alignment), UCS_CONFIG_TYPE_UINT}, + + {"RCACHE_MEM_PRIO", "1000", "Registration cache memory event priority", + ucs_offsetof(uct_gdr_copy_md_config_t, rcache.event_prio), UCS_CONFIG_TYPE_UINT}, + + {"RCACHE_OVERHEAD", "90ns", "Registration cache lookup overhead", + ucs_offsetof(uct_gdr_copy_md_config_t, rcache.overhead), UCS_CONFIG_TYPE_TIME}, + + {"MEM_REG_OVERHEAD", "16us", "Memory registration overhead", /* TODO take default from device */ + ucs_offsetof(uct_gdr_copy_md_config_t, uc_reg_cost.overhead), UCS_CONFIG_TYPE_TIME}, + + {"MEM_REG_GROWTH", "0.06ns", "Memory registration growth rate", /* TODO take default from device */ + ucs_offsetof(uct_gdr_copy_md_config_t, uc_reg_cost.growth), UCS_CONFIG_TYPE_TIME}, + + {NULL} +}; + +static ucs_status_t uct_gdr_copy_md_query(uct_md_h md, uct_md_attr_t *md_attr) +{ + md_attr->cap.flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_ADDR_DN; + md_attr->cap.addr_dn = UCT_MD_ADDR_DOMAIN_CUDA; + md_attr->cap.max_alloc = 0; + md_attr->cap.max_reg = ULONG_MAX; + md_attr->rkey_packed_size = 0; + md_attr->reg_cost.overhead = 0; + 
md_attr->reg_cost.growth = 0;
+    memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus));
+    return UCS_OK;
+}
+
+static ucs_status_t uct_gdr_copy_mkey_pack(uct_md_h md, uct_mem_h memh,
+                                           void *rkey_buffer)
+{
+    return UCS_OK;
+}
+
+static ucs_status_t uct_gdr_copy_rkey_unpack(uct_md_component_t *mdc,
+                                             const void *rkey_buffer, uct_rkey_t *rkey_p,
+                                             void **handle_p)
+{
+    *rkey_p   = 0xdeadbeef;
+    *handle_p = NULL;
+    return UCS_OK;
+}
+
+static ucs_status_t uct_gdr_copy_rkey_release(uct_md_component_t *mdc, uct_rkey_t rkey,
+                                              void *handle)
+{
+    return UCS_OK;
+}
+
+static ucs_status_t uct_gdr_copy_mem_reg_internal(uct_md_h uct_md, void *address, size_t length,
+                                                  unsigned flags, uct_gdr_copy_mem_t *mem_hndl)
+{
+    uct_gdr_copy_md_t *md = ucs_derived_of(uct_md, uct_gdr_copy_md_t);
+    CUdeviceptr d_ptr = ((CUdeviceptr)(char *)address);
+    gdr_mh_t mh;
+    void *bar_ptr;
+
+    if (gdr_pin_buffer(md->gdrcpy_ctx, d_ptr, length, 0, 0, &mh) != 0) {
+        ucs_error("gdr_pin_buffer failed. length: %lu", length);
+        return UCS_ERR_IO_ERROR;
+    }
+    if (mh == 0) {
+        ucs_error("gdr_pin_buffer returned an invalid handle. length: %lu", length);
+        return UCS_ERR_IO_ERROR;
+    }
+
+    if (gdr_map(md->gdrcpy_ctx, mh, &bar_ptr, length) != 0) {
+        ucs_error("gdr_map failed. length: %lu", length);
+        gdr_unpin_buffer(md->gdrcpy_ctx, mh);
+        return UCS_ERR_IO_ERROR;
+    }
+
+    mem_hndl->mh       = mh;
+    mem_hndl->bar_ptr  = bar_ptr;
+    mem_hndl->reg_size = length;
+
+    return UCS_OK;
+}
+
+static ucs_status_t uct_gdr_copy_mem_dereg_internal(uct_md_h uct_md, uct_gdr_copy_mem_t *mem_hndl)
+{
+    uct_gdr_copy_md_t *md = ucs_derived_of(uct_md, uct_gdr_copy_md_t);
+
+    if (gdr_unmap(md->gdrcpy_ctx, mem_hndl->mh, mem_hndl->bar_ptr, mem_hndl->reg_size) != 0) {
+        ucs_error("gdr_unmap failed. unpin_size: %lu", mem_hndl->reg_size);
+        return UCS_ERR_IO_ERROR;
+    }
+    if (gdr_unpin_buffer(md->gdrcpy_ctx, mem_hndl->mh) != 0) {
+        ucs_error("gdr_unpin_buffer failed");
+        return UCS_ERR_IO_ERROR;
+    }
+    return UCS_OK;
+}
+
+static ucs_status_t uct_gdr_copy_mem_reg(uct_md_h uct_md, void *address, size_t length,
+                                         unsigned flags, uct_mem_h *memh_p)
+{
+    uct_gdr_copy_mem_t *mem_hndl = NULL;
+    size_t reg_size;
+    void *ptr;
+    ucs_status_t status;
+
+    mem_hndl = ucs_malloc(sizeof(uct_gdr_copy_mem_t), "gdr_copy handle");
+    if (NULL == mem_hndl) {
+        ucs_error("Failed to allocate memory for uct_gdr_copy_mem_t");
+        return UCS_ERR_NO_MEMORY;
+    }
+
+    /* pin whole GPU pages: round the start down, and cover the intra-page
+     * offset of 'address' when rounding the size up */
+    ptr      = (void *)((uintptr_t)address & GPU_PAGE_MASK);
+    reg_size = (((uintptr_t)address - (uintptr_t)ptr) + length + GPU_PAGE_SIZE - 1) &
+               GPU_PAGE_MASK;
+
+    status = uct_gdr_copy_mem_reg_internal(uct_md, ptr, reg_size, 0, mem_hndl);
+    if (status != UCS_OK) {
+        ucs_free(mem_hndl);
+        return status;
+    }
+
+    *memh_p = mem_hndl;
+    return UCS_OK;
+}
+
+static ucs_status_t uct_gdr_copy_mem_dereg(uct_md_h uct_md, uct_mem_h memh)
+{
+    uct_gdr_copy_mem_t *mem_hndl = memh;
+    ucs_status_t status;
+
+    status = uct_gdr_copy_mem_dereg_internal(uct_md, mem_hndl);
+    ucs_free(mem_hndl);
+    return status;
+}
+
+static ucs_status_t uct_gdr_copy_mem_detect(uct_md_h md, void *addr)
+{
+    int memory_type;
+    cudaError_t cuda_err = cudaSuccess;
+    struct cudaPointerAttributes attributes;
+    CUresult cu_err = CUDA_SUCCESS;
+
+    if (addr == NULL) {
+        return UCS_ERR_INVALID_ADDR;
+    }
+
+    cu_err = cuPointerGetAttribute(&memory_type,
+                                   CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
+                                   (CUdeviceptr)addr);
+    if (cu_err != CUDA_SUCCESS) {
+        cuda_err = cudaPointerGetAttributes(&attributes, addr);
+        if (cuda_err == cudaSuccess) {
+            if (attributes.memoryType == cudaMemoryTypeDevice) {
+                return UCS_OK;
+            }
+        }
+    } else if (memory_type == CU_MEMORYTYPE_DEVICE) {
+        return UCS_OK;
+    }
+
+    return UCS_ERR_INVALID_ADDR;
+}
+
+static ucs_status_t uct_gdr_copy_query_md_resources(uct_md_resource_desc_t **resources_p,
                                                    unsigned *num_resources_p)
+{
+    return uct_single_md_resource(&uct_gdr_copy_md_component, resources_p, num_resources_p);
+}
+
+static void uct_gdr_copy_md_close(uct_md_h uct_md)
+{
+    uct_gdr_copy_md_t *md = ucs_derived_of(uct_md, uct_gdr_copy_md_t);
+
+    if (md->rcache != NULL) {
+        ucs_rcache_destroy(md->rcache);
+    }
+
+    if (gdr_close(md->gdrcpy_ctx) != 0) {
+        ucs_error("Failed to close gdrcopy");
+    }
+
+    ucs_free(md);
+}
+
+static uct_md_ops_t md_ops = {
+    .close      = uct_gdr_copy_md_close,
+    .query      = uct_gdr_copy_md_query,
+    .mkey_pack  = uct_gdr_copy_mkey_pack,
+    .mem_reg    = uct_gdr_copy_mem_reg,
+    .mem_dereg  = uct_gdr_copy_mem_dereg,
+    .mem_detect = uct_gdr_copy_mem_detect
+};
+
+static inline uct_gdr_copy_rcache_region_t* uct_gdr_copy_rcache_region_from_memh(uct_mem_h memh)
+{
+    return ucs_container_of(memh, uct_gdr_copy_rcache_region_t, memh);
+}
+
+static ucs_status_t uct_gdr_copy_mem_rcache_reg(uct_md_h uct_md, void *address,
+                                                size_t length, unsigned flags,
+                                                uct_mem_h *memh_p)
+{
+    uct_gdr_copy_md_t *md = ucs_derived_of(uct_md, uct_gdr_copy_md_t);
+    ucs_rcache_region_t *rregion;
+    ucs_status_t status;
+    uct_gdr_copy_mem_t *memh;
+
+    status = ucs_rcache_get(md->rcache, address, length, PROT_READ|PROT_WRITE,
+                            &flags, &rregion);
+    if (status != UCS_OK) {
+        return status;
+    }
+
+    ucs_assert(rregion->refcount > 0);
+    memh    = &ucs_derived_of(rregion, uct_gdr_copy_rcache_region_t)->memh;
+    *memh_p = memh;
+    return UCS_OK;
+}
+
+static ucs_status_t uct_gdr_copy_mem_rcache_dereg(uct_md_h uct_md, uct_mem_h memh)
+{
+    uct_gdr_copy_md_t *md = ucs_derived_of(uct_md, uct_gdr_copy_md_t);
+    uct_gdr_copy_rcache_region_t *region = uct_gdr_copy_rcache_region_from_memh(memh);
+
+    ucs_rcache_region_put(md->rcache, &region->super);
+    return UCS_OK;
+}
+
+static uct_md_ops_t md_rcache_ops = {
+    .close      = uct_gdr_copy_md_close,
+    .query      = uct_gdr_copy_md_query,
+    .mkey_pack  = uct_gdr_copy_mkey_pack,
+    .mem_reg    = uct_gdr_copy_mem_rcache_reg,
+    .mem_dereg  = uct_gdr_copy_mem_rcache_dereg,
+    .mem_detect = uct_gdr_copy_mem_detect
+};
+
+static ucs_status_t uct_gdr_copy_rcache_mem_reg_cb(void *context, ucs_rcache_t *rcache,
+                                                   void *arg, ucs_rcache_region_t *rregion)
+{
+    uct_gdr_copy_rcache_region_t *region = ucs_derived_of(rregion, uct_gdr_copy_rcache_region_t);
+    uct_gdr_copy_md_t *md = context;
+    int *flags = arg;
+
+    return uct_gdr_copy_mem_reg_internal(&md->super, (void*)region->super.super.start,
+                                         region->super.super.end - region->super.super.start,
+                                         *flags, &region->memh);
+}
+
+static void uct_gdr_copy_rcache_mem_dereg_cb(void *context, ucs_rcache_t *rcache,
+                                             ucs_rcache_region_t *rregion)
+{
+    uct_gdr_copy_rcache_region_t *region = ucs_derived_of(rregion, uct_gdr_copy_rcache_region_t);
+    uct_gdr_copy_md_t *md = context;
+
+    (void)uct_gdr_copy_mem_dereg_internal(&md->super, &region->memh);
+}
+
+static void uct_gdr_copy_rcache_dump_region_cb(void *context, ucs_rcache_t *rcache,
+                                               ucs_rcache_region_t *rregion, char *buf,
+                                               size_t max)
+{
+}
+
+static ucs_rcache_ops_t uct_gdr_copy_rcache_ops = {
+    .mem_reg     = uct_gdr_copy_rcache_mem_reg_cb,
+    .mem_dereg   = uct_gdr_copy_rcache_mem_dereg_cb,
+    .dump_region = uct_gdr_copy_rcache_dump_region_cb
+};
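The rcache wiring above follows the usual UCX shape: ucs_rcache_get() either returns a cached region or calls back into mem_reg_cb to pin a new one, and the memh handed to the user is the memh field embedded in the region, recovered later via container_of. The practical payoff is that repeated registration of the same GPU buffer pins only once; a hypothetical hot loop (buffer and loop names are mine, not from the patch):

    /* With md_rcache_ops installed, only the first iteration pays
     * gdr_pin_buffer + gdr_map; later ones resolve from the cache. */
    static void reg_dereg_loop(uct_md_h md, void *gpu_buf, size_t len)
    {
        uct_mem_h memh;
        int i;

        for (i = 0; i < 1000; ++i) {
            if (uct_md_mem_reg(md, gpu_buf, len, 0, &memh) != UCS_OK) {
                return;
            }
            /* ... a put_zcopy against memh would go here ... */
            uct_md_mem_dereg(md, memh); /* refcount put; region stays cached */
        }
    }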
+static ucs_status_t uct_gdr_copy_md_open(const char *md_name, const uct_md_config_t *uct_md_config,
+                                         uct_md_h *md_p)
+{
+    ucs_status_t status;
+    uct_gdr_copy_md_t *md;
+    const uct_gdr_copy_md_config_t *md_config = ucs_derived_of(uct_md_config, uct_gdr_copy_md_config_t);
+    ucs_rcache_params_t rcache_params;
+
+    md = ucs_malloc(sizeof(uct_gdr_copy_md_t), "uct_gdr_copy_md_t");
+    if (NULL == md) {
+        ucs_error("Failed to allocate memory for uct_gdr_copy_md_t");
+        return UCS_ERR_NO_MEMORY;
+    }
+
+    md->super.ops       = &md_ops;
+    md->super.component = &uct_gdr_copy_md_component;
+    md->rcache          = NULL;
+    md->reg_cost        = md_config->uc_reg_cost;
+
+    md->gdrcpy_ctx = gdr_open();
+    if (md->gdrcpy_ctx == NULL) {
+        ucs_error("Failed to open gdrcopy");
+        ucs_free(md);
+        return UCS_ERR_IO_ERROR;
+    }
+
+    if (md_config->rcache.enable != UCS_NO) {
+        // UCS_STATIC_ASSERT(UCS_PGT_ADDR_ALIGN >= UCT_GDR_COPY_MD_RCACHE_DEFAULT_ALIGN);
+        rcache_params.region_struct_size = sizeof(uct_gdr_copy_rcache_region_t);
+        rcache_params.alignment          = md_config->rcache.alignment;
+        rcache_params.ucm_event_priority = md_config->rcache.event_prio;
+        rcache_params.context            = md;
+        rcache_params.ops                = &uct_gdr_copy_rcache_ops;
+        status = ucs_rcache_create(&rcache_params, "gdr_copy" UCS_STATS_ARG(NULL), &md->rcache);
+        if (status == UCS_OK) {
+            md->super.ops         = &md_rcache_ops;
+            md->reg_cost.overhead = 0;
+            md->reg_cost.growth   = 0; /* It's close enough to 0 */
+        } else {
+            ucs_assert(md->rcache == NULL);
+            if (md_config->rcache.enable == UCS_YES) {
+                ucs_error("Failed to create registration cache: %s",
+                          ucs_status_string(status));
+                ucs_free(md);
+                return UCS_ERR_IO_ERROR;
+            } else {
+                ucs_debug("Could not create registration cache: %s",
+                          ucs_status_string(status));
+            }
+        }
+    }
+
+    *md_p = (uct_md_h) md;
+    return UCS_OK;
+}
+
+UCT_MD_COMPONENT_DEFINE(uct_gdr_copy_md_component, UCT_GDR_COPY_MD_NAME,
+                        uct_gdr_copy_query_md_resources, uct_gdr_copy_md_open, NULL,
+                        uct_gdr_copy_rkey_unpack, uct_gdr_copy_rkey_release, "GDR_COPY_",
+                        uct_gdr_copy_md_config_table, uct_gdr_copy_md_config_t);

diff --git a/src/uct/cuda/gdr_copy/gdr_copy_md.h b/src/uct/cuda/gdr_copy/gdr_copy_md.h
new file mode 100644
index 00000000000..be0ab5964d3
--- /dev/null
+++ b/src/uct/cuda/gdr_copy/gdr_copy_md.h
@@ -0,0 +1,65 @@
+/**
+ * Copyright (c) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED.
+ * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED.
+ * See file LICENSE for terms.
+ */
+
+#ifndef UCT_GDR_COPY_MD_H
+#define UCT_GDR_COPY_MD_H
+
+#include
+#include
+#include "gdrapi.h"
+
+#define UCT_GDR_COPY_MD_NAME "gdr_copy"
+
+extern uct_md_component_t uct_gdr_copy_md_component;
+
+/**
+ * @brief gdr_copy MD descriptor
+ */
+typedef struct uct_gdr_copy_md {
+    struct uct_md       super;      /**< Domain info */
+    gdr_t               gdrcpy_ctx; /**< gdr copy context */
+    ucs_rcache_t        *rcache;    /**< Registration cache (can be NULL) */
+    uct_linear_growth_t reg_cost;   /**< Memory registration cost */
+} uct_gdr_copy_md_t;
+
+/**
+ * gdr copy domain configuration.
+ */
+typedef struct uct_gdr_copy_md_config {
+    uct_md_config_t super;
+    struct {
+        ucs_ternary_value_t enable;     /**< Enable registration cache */
+        size_t              alignment;  /**< Force address alignment */
+        unsigned            event_prio; /**< Memory events priority */
+        double              overhead;   /**< Lookup overhead estimation */
+    } rcache;
+
+    uct_linear_growth_t uc_reg_cost;    /**< Memory registration cost estimation
+                                             without using the cache */
+} uct_gdr_copy_md_config_t;
+
+
+/**
+ * @brief gdr copy mem handle
+ */
+typedef struct uct_gdr_copy_mem {
+    gdr_mh_t mh;
+    void     *bar_ptr;
+    size_t   reg_size;
+} uct_gdr_copy_mem_t;
+
+/**
+ * cuda memory region in the registration cache.
+ */ +typedef struct uct_gdr_copy_rcache_region { + ucs_rcache_region_t super; + uct_gdr_copy_mem_t memh; /**< mr exposed to the user as the memh */ +} uct_gdr_copy_rcache_region_t; + + +#endif diff --git a/src/uct/ib/base/ib_md.c b/src/uct/ib/base/ib_md.c index 7811bcece3d..161ed70ca46 100644 --- a/src/uct/ib/base/ib_md.c +++ b/src/uct/ib/base/ib_md.c @@ -158,6 +158,7 @@ static ucs_status_t uct_ib_md_query(uct_md_h uct_md, uct_md_attr_t *md_attr) UCT_MD_FLAG_NEED_MEMH | UCT_MD_FLAG_NEED_RKEY | UCT_MD_FLAG_ADVISE; + md_attr->cap.addr_dn = UCT_MD_ADDR_DOMAIN_DEFAULT; md_attr->rkey_packed_size = sizeof(uint64_t); if (md->config.enable_contig_pages && @@ -866,6 +867,7 @@ static uct_md_ops_t uct_ib_md_ops = { .mem_alloc = uct_ib_mem_alloc, .mem_free = uct_ib_mem_free, .mem_reg = uct_ib_mem_reg, + .mem_detect = ucs_empty_function_return_success, .mem_dereg = uct_ib_mem_dereg, .mem_advise = uct_ib_mem_advise, .mkey_pack = uct_ib_mkey_pack, diff --git a/src/uct/rocm/rocm_cma_md.c b/src/uct/rocm/rocm_cma_md.c index 3ba3d6ad10c..7d70152b41a 100644 --- a/src/uct/rocm/rocm_cma_md.c +++ b/src/uct/rocm/rocm_cma_md.c @@ -30,6 +30,7 @@ static ucs_status_t uct_rocm_cma_md_query(uct_md_h md, uct_md_attr_t *md_attr) md_attr->rkey_packed_size = sizeof(uct_rocm_cma_key_t); md_attr->cap.flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_NEED_RKEY; + md_attr->cap.addr_dn = UCT_MD_ADDR_DOMAIN_DEFAULT; md_attr->cap.max_alloc = 0; md_attr->cap.max_reg = ULONG_MAX; @@ -211,6 +212,7 @@ static ucs_status_t uct_rocm_cma_md_open(const char *md_name, .query = uct_rocm_cma_md_query, .mkey_pack = uct_rocm_cma_rkey_pack, .mem_reg = uct_rocm_cma_mem_reg, + .mem_detect = ucs_empty_function_return_success, .mem_dereg = uct_rocm_cma_mem_dereg }; diff --git a/src/uct/sm/cma/cma_md.c b/src/uct/sm/cma/cma_md.c index 71676b4d6bd..b5b4503f0f5 100644 --- a/src/uct/sm/cma/cma_md.c +++ b/src/uct/sm/cma/cma_md.c @@ -59,7 +59,8 @@ static ucs_status_t uct_cma_md_open(const char *md_name, const uct_md_config_t * .mem_free = (void*)ucs_empty_function_return_success, .mkey_pack = (void*)ucs_empty_function_return_success, .mem_reg = uct_cma_mem_reg, - .mem_dereg = (void*)ucs_empty_function_return_success + .mem_dereg = (void*)ucs_empty_function_return_success, + .mem_detect = ucs_empty_function_return_success, }; static uct_md_t md = { .ops = &md_ops, @@ -80,6 +81,7 @@ ucs_status_t uct_cma_md_query(uct_md_h md, uct_md_attr_t *md_attr) { md_attr->rkey_packed_size = 0; md_attr->cap.flags = UCT_MD_FLAG_REG; + md_attr->cap.addr_dn = UCT_MD_ADDR_DOMAIN_DEFAULT; md_attr->cap.max_alloc = 0; md_attr->cap.max_reg = ULONG_MAX; md_attr->reg_cost.overhead = 9e-9; diff --git a/src/uct/sm/knem/knem_md.c b/src/uct/sm/knem/knem_md.c index 4a1cb24ffe8..8592a9fa55a 100644 --- a/src/uct/sm/knem/knem_md.c +++ b/src/uct/sm/knem/knem_md.c @@ -13,6 +13,7 @@ ucs_status_t uct_knem_md_query(uct_md_h md, uct_md_attr_t *md_attr) md_attr->rkey_packed_size = sizeof(uct_knem_key_t); md_attr->cap.flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_NEED_RKEY; + md_attr->cap.addr_dn = UCT_MD_ADDR_DOMAIN_DEFAULT; md_attr->cap.max_alloc = 0; md_attr->cap.max_reg = ULONG_MAX; md_attr->reg_cost.overhead = 1200.0e-9; @@ -186,7 +187,8 @@ static ucs_status_t uct_knem_md_open(const char *md_name, const uct_md_config_t .mem_free = (void*)ucs_empty_function_return_success, .mkey_pack = uct_knem_rkey_pack, .mem_reg = uct_knem_mem_reg, - .mem_dereg = uct_knem_mem_dereg + .mem_dereg = uct_knem_mem_dereg, + .mem_detect = ucs_empty_function_return_success }; knem_md = ucs_malloc(sizeof(uct_knem_md_t), 
"uct_knem_md_t"); diff --git a/src/uct/sm/mm/mm_md.c b/src/uct/sm/mm/mm_md.c index c849dfd6412..1cb173d3d59 100644 --- a/src/uct/sm/mm/mm_md.c +++ b/src/uct/sm/mm/mm_md.c @@ -124,6 +124,7 @@ ucs_status_t uct_mm_md_query(uct_md_h md, uct_md_attr_t *md_attr) md_attr->reg_cost.growth = 0.007e-9; } md_attr->cap.flags |= UCT_MD_FLAG_NEED_RKEY; + md_attr->cap.addr_dn = UCT_MD_ADDR_DOMAIN_DEFAULT; /* all mm md(s) support fixed memory alloc */ md_attr->cap.flags |= UCT_MD_FLAG_FIXED; md_attr->cap.max_alloc = ULONG_MAX; @@ -227,6 +228,7 @@ uct_md_ops_t uct_mm_md_ops = { .mem_free = uct_mm_mem_free, .mem_reg = uct_mm_mem_reg, .mem_dereg = uct_mm_mem_dereg, + .mem_detect = ucs_empty_function_return_success, .mkey_pack = uct_mm_mkey_pack, }; diff --git a/src/uct/sm/self/self_md.c b/src/uct/sm/self/self_md.c index ede57407be5..9e5a0386b14 100644 --- a/src/uct/sm/self/self_md.c +++ b/src/uct/sm/self/self_md.c @@ -10,6 +10,7 @@ static ucs_status_t uct_self_md_query(uct_md_h md, uct_md_attr_t *attr) { /* Dummy memory registration provided. No real memory handling exists */ attr->cap.flags = UCT_MD_FLAG_REG; + attr->cap.addr_dn = UCT_MD_ADDR_DOMAIN_DEFAULT; attr->cap.max_alloc = 0; attr->cap.max_reg = ULONG_MAX; attr->rkey_packed_size = 0; /* uct_md_query adds UCT_MD_COMPONENT_NAME_MAX to this */ @@ -41,7 +42,8 @@ static ucs_status_t uct_self_md_open(const char *md_name, const uct_md_config_t .query = uct_self_md_query, .mkey_pack = ucs_empty_function_return_success, .mem_reg = uct_self_mem_reg, - .mem_dereg = ucs_empty_function_return_success + .mem_dereg = ucs_empty_function_return_success, + .mem_detect = ucs_empty_function_return_success }; static uct_md_t md = { .ops = &md_ops, diff --git a/src/uct/tcp/tcp_md.c b/src/uct/tcp/tcp_md.c index 008ea5f1d60..63fd3e639e5 100644 --- a/src/uct/tcp/tcp_md.c +++ b/src/uct/tcp/tcp_md.c @@ -32,7 +32,8 @@ static ucs_status_t uct_tcp_md_open(const char *md_name, const uct_md_config_t * .query = uct_tcp_md_query, .mkey_pack = ucs_empty_function_return_unsupported, .mem_reg = ucs_empty_function_return_unsupported, - .mem_dereg = ucs_empty_function_return_unsupported + .mem_dereg = ucs_empty_function_return_unsupported, + .mem_detect = ucs_empty_function_return_success }; static uct_md_t md = { .ops = &md_ops, diff --git a/src/uct/ugni/base/ugni_md.c b/src/uct/ugni/base/ugni_md.c index a0b671544f7..065c339680c 100644 --- a/src/uct/ugni/base/ugni_md.c +++ b/src/uct/ugni/base/ugni_md.c @@ -34,6 +34,7 @@ static ucs_status_t uct_ugni_md_query(uct_md_h md, uct_md_attr_t *md_attr) md_attr->cap.flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_NEED_MEMH | UCT_MD_FLAG_NEED_RKEY; + md_attr->cap.addr_dn = UCT_MD_ADDR_DOMAIN_DEFAULT; md_attr->cap.max_alloc = 0; md_attr->cap.max_reg = ULONG_MAX; md_attr->reg_cost.overhead = 1000.0e-9; @@ -180,6 +181,7 @@ static ucs_status_t uct_ugni_md_open(const char *md_name, const uct_md_config_t .mem_free = (void*)ucs_empty_function, .mem_reg = uct_ugni_mem_reg, .mem_dereg = uct_ugni_mem_dereg, + .mem_detect = ucs_empty_function_return_success, .mkey_pack = uct_ugni_rkey_pack };