bureddy · bureddy · Sep 26, 2017 · Sep 26, 2017 · Sep 26, 2017 · Sep 26, 2017
diff --git a/config/m4/cuda.m4 b/config/m4/cuda.m4
@@ -1 +1,84 @@
-AM_CONDITIONAL([HAVE_CUDA], [true])
+#
+# Copyright (C) Mellanox Technologies Ltd. 2001-2017.  ALL RIGHTS RESERVED.
+# See file LICENSE for terms.
+#
+
+#
+# Check for CUDA support
+#
+cuda_happy="no"
+
+AC_ARG_WITH([cuda],
+           [AS_HELP_STRING([--with-cuda=(DIR)], [Enable the use of CUDA (default is no).])],
+           [], [with_cuda=no])
+
+AS_IF([test "x$with_cuda" != "xno"],
+
+      [AS_IF([test "x$with_cuda" == "x" || test "x$with_cuda" == "xguess" || test "x$with_cuda" == "xyes"],
+             [
+              AC_MSG_NOTICE([CUDA path was not specified. Guessing ...])
+              with_cuda=/usr/local/cuda
+              ],
+              [:])
+        AC_CHECK_HEADERS([$with_cuda/include/cuda.h $with_cuda/include/cuda_runtime.h],
+                       [AC_CHECK_DECLS([cuPointerGetAttribute],
+                           [cuda_happy="yes"],
+                           [AC_MSG_WARN([CUDA runtime not detected. Disable.])
+                            cuda_happy="no"],
+                            [#include <$with_cuda/include/cuda.h>])
+                           AS_IF([test "x$cuda_happy" == "xyes"],
+                            [AC_DEFINE([HAVE_CUDA], 1, [Enable CUDA support])
+                            AC_SUBST(CUDA_CPPFLAGS, "-I$with_cuda/include/ ")
+                            AC_SUBST(CUDA_CFLAGS, "-I$with_cuda/include/ ")
+                            AC_SUBST(CUDA_LDFLAGS, "-lcudart -lcuda -L$with_cuda/lib64")
+                            CFLAGS="$CFLAGS $CUDA_CFLAGS"
+                            CPPFLAGS="$CPPFLAGS $CUDA_CPPFLAGS"
+                            LDFLAGS="$LDFLAGS $CUDA_LDFLAGS"],
+                        [])],
+                       [AC_MSG_WARN([CUDA not found])
+                        AC_DEFINE([HAVE_CUDA], [0], [Disable the use of CUDA])])],
+      [AC_MSG_WARN([CUDA was explicitly disabled])
+      AC_DEFINE([HAVE_CUDA], [0], [Disable the use of CUDA])]
+)
+
+
+AM_CONDITIONAL([HAVE_CUDA], [test "x$cuda_happy" != xno])
+
+AC_ARG_WITH([gdrcopy],
+           [AS_HELP_STRING([--with-gdrcopy=(DIR)], [Enable the use of GDR_COPY (default is no).])],
+           [], [with_gdrcopy=no])
+
+AS_IF([test "x$with_gdrcopy" != "xno"],
+
+      [AS_IF([test "x$with_gdrcopy" == "x" || test "x$with_gdrcopy" == "xguess" || test "x$with_gdrcopy" == "xyes"],
+             [
+              AC_MSG_NOTICE([GDR_COPY path was not specified. Guessing ...])
+              with_gdrcopy=/usr/local/gdrcopy
+              ],
+              [:])
+        AC_CHECK_HEADERS([$with_gdrcopy/include/gdrapi.h],
+                       [AC_CHECK_DECLS([gdr_pin_buffer],
+                           [gdrcopy_happy="yes"],
+                           [AC_MSG_WARN([GDR_COPY runtime not detected. Disable.])
+                            gdrcopy_happy="no"],
+                            [#include <$with_gdrcopy/include/gdrapi.h>])
+                           AS_IF([test "x$gdrcopy_happy" == "xyes"],
+                            [AC_DEFINE([HAVE_GDR_COPY], 1, [Enable GDR_COPY support])
+                            AC_SUBST(GDR_COPY_CPPFLAGS, "-I$with_gdrcopy/include/ ")
+                            AC_SUBST(GDR_COPY_CFLAGS, "-I$with_gdrcopy/include/ ")
+                            AC_SUBST(GDR_COPY_LDFLAGS, "-lgdrapi -L$with_gdrcopy/lib64")
+                            CFLAGS="$CFLAGS $GDR_COPY_CFLAGS"
+                            CPPFLAGS="$CPPFLAGS $GDR_COPY_CPPFLAGS"
+                            LDFLAGS="$LDFLAGS $GDR_COPY_LDFLAGS"],
+                        [])],
+                       [AC_MSG_WARN([GDR_COPY not found])
+                        AC_DEFINE([HAVE_GDR_COPY], [0], [Disable the use of GDR_COPY])])],
+      [AC_MSG_WARN([GDR_COPY was explicitly disabled])
+      AC_DEFINE([HAVE_GDR_COPY], [0], [Disable the use of GDR_COPY])]
+)
+
+
+AM_CONDITIONAL([HAVE_GDR_COPY], [test "x$gdrcopy_happy" != xno])
+
+AC_DEFINE([HAVE_CUDA_GDR], [1], [Eanble GPU Direct RDMA])]
+AM_CONDITIONAL([HAVE_CUDA_GDR], [1])
diff --git a/src/ucm/Makefile.am b/src/ucm/Makefile.am
@@ -47,6 +47,13 @@ libucm_la_SOURCES = \
 	util/reloc.c \
 	util/sys.c
 
+if HAVE_CUDA
+libucm_la_SOURCES += \
+    cuda/install.c \
+    cuda/replace.c
+
+endif
+
 if HAVE_UCM_PTMALLOC283
 libucm_la_CPPFLAGS += \
     -I$(srcdir)/ptmalloc283/sysdeps/pthread \

diff --git a/src/ucm/api/ucm.h b/src/ucm/api/ucm.h
@@ -9,7 +9,10 @@
 #define UCM_H_
 
 #include <ucs/sys/compiler_def.h>
-
+#if HAVE_CUDA
+#include <cuda_runtime.h>
+#include <cuda.h>
+#endif
 BEGIN_C_DECLS
 
 #include <ucs/config/types.h>
@@ -32,6 +35,8 @@ typedef enum ucm_event_type {
     UCM_EVENT_SHMAT           = UCS_BIT(3),
     UCM_EVENT_SHMDT           = UCS_BIT(4),
     UCM_EVENT_SBRK            = UCS_BIT(5),
+    /* cuda events */
+    UCM_EVENT_CUDAFREE        = UCS_BIT(6),
 
     /* Aggregate events */
     UCM_EVENT_VM_MAPPED       = UCS_BIT(16),
@@ -113,6 +118,17 @@ typedef union ucm_event {
         intptr_t       increment;
     } sbrk;
 
+#if HAVE_CUDA
+    /*
+     * UCM_EVENT_CUDAFREE
+     * cudaFree() is called.
+     */
+    struct {
+        int      result;
+        void     *address;
+    } cudaFree;
+#endif
+
     /*
      * UCM_EVENT_VM_MAPPED, UCM_EVENT_VM_UNMAPPED
      *
@@ -351,6 +367,23 @@ int ucm_shmdt(const void *shmaddr);
  */
 void *ucm_sbrk(intptr_t increment);
 
+#if HAVE_CUDA
+
+/**
+ * @brief Call the original implementation of @ref cudaFree without triggering events.
+ */
+cudaError_t ucm_orig_cudaFree(void *address);
+
+
+/**
+ * @brief Call the original implementation of @ref cudaFree and all handlers
+ * associated with it.
+ */
+cudaError_t ucm_cudaFree(void *address);
+
+#endif
+
+
 
 END_C_DECLS
 

diff --git a/src/ucm/cuda/cudamem.h b/src/ucm/cuda/cudamem.h
@@ -0,0 +1,22 @@
+/**
+ * Copyright (C) Mellanox Technologies Ltd. 2001-2015.  ALL RIGHTS RESERVED.
+ *
+ * See file LICENSE for terms.
+ */
+
+#ifndef UCM_CUDAMEM_H_
+#define UCM_CUDAMEM_H_
+
+#include <ucm/api/ucm.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+ucs_status_t ucm_cudamem_install(int events);
+
+void ucm_cudamem_event_test_callback(ucm_event_type_t event_type,
+                                  ucm_event_t *event, void *arg);
+
+
+cudaError_t ucm_override_cudaFree(void *addr);
+
+#endif
diff --git a/src/ucm/cuda/install.c b/src/ucm/cuda/install.c
@@ -0,0 +1,154 @@
+/**
+ * Copyright (C) Mellanox Technologies Ltd. 2001-2015.  ALL RIGHTS RESERVED.
+ * Copyright (C) ARM Ltd. 2016.  ALL RIGHTS RESERVED.
+ *
+ * See file LICENSE for terms.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include "config.h"
+#endif
+
+#include "cudamem.h"
+
+#include <ucm/api/ucm.h>
+#include <ucm/event/event.h>
+#include <ucm/util/log.h>
+#include <ucm/util/reloc.h>
+#include <ucm/util/ucm_config.h>
+#include <ucs/sys/math.h>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <unistd.h>
+#include <pthread.h>
+
+
+typedef struct ucm_cudamem_func {
+    ucm_reloc_patch_t   patch;
+    ucm_event_type_t    event_type;
+} ucm_cudamem_func_t;
+
+static ucm_cudamem_func_t ucm_cudamem_funcs[] = {
+    { {"cudaFree",   ucm_override_cudaFree},   UCM_EVENT_CUDAFREE},
+    { {NULL, NULL}, 0}
+};
+
+void ucm_cudamem_event_test_callback(ucm_event_type_t event_type,
+                                  ucm_event_t *event, void *arg)
+{
+    int *out_events = arg;
+    *out_events |= event_type;
+}
+
+/* Called with lock held */
+static ucs_status_t ucm_cudamem_test(int events)
+{
+    static int installed_events = 0;
+    ucm_event_handler_t handler;
+    int out_events = 0;
+    void *p;
+
+    if (ucs_test_all_flags(installed_events, events)) {
+        /* All requested events are already installed */
+        return UCS_OK;
+    }
+
+    /* Install a temporary event handler which will add the supported event
+     * type to out_events bitmap.
+     */
+    handler.events   = events;
+    handler.priority = -1;
+    handler.cb       = ucm_cudamem_event_test_callback;
+    handler.arg      = &out_events;
+    out_events       = 0;
+
+    ucm_event_handler_add(&handler);
+
+    if (events & (UCM_EVENT_CUDAFREE)) {
+        if (cudaSuccess != cudaMalloc(&p, 64)) {
+            ucm_error("cudaMalloc failed");
+            return UCS_ERR_UNSUPPORTED;
+        }
+        cudaFree(p);
+    }
+
+
+    ucm_event_handler_remove(&handler);
+
+    /* TODO check address / stop all threads */
+    installed_events |= out_events;
+    ucm_debug("cudamem test: got 0x%x out of 0x%x, total: 0x%x", out_events, events,
+              installed_events);
+
+    /* Return success iff we caught all wanted events */
+    if (!ucs_test_all_flags(out_events, events)) {
+        return UCS_ERR_UNSUPPORTED;
+    }
+
+    return UCS_OK;
+}
+
+/* Called with lock held */
+static ucs_status_t ucs_cudamem_install_reloc(int events)
+{
+    static int installed_events = 0;
+    ucm_cudamem_func_t *entry;
+    ucs_status_t status;
+
+    if (!ucm_global_config.enable_cuda_hooks) {
+        ucm_debug("installing cudamem relocations is disabled by configuration");
+        return UCS_ERR_UNSUPPORTED;
+    }
+
+    for (entry = ucm_cudamem_funcs; entry->patch.symbol != NULL; ++entry) {
+        if (!(entry->event_type & events)) {
+            /* Not required */
+            continue;
+        }
+
+        if (entry->event_type & installed_events) {
+            /* Already installed */
+            continue;
+        }
+
+        ucm_debug("cudamem: installing relocation table entry for %s = %p for event 0x%x",
+                  entry->patch.symbol, entry->patch.value, entry->event_type);
+
+        status = ucm_reloc_modify(&entry->patch);
+        if (status != UCS_OK) {
+            ucm_warn("failed to install relocation table entry for '%s'",
+                     entry->patch.symbol);
+            return status;
+        }
+
+        installed_events |= entry->event_type;
+    }
+
+    return UCS_OK;
+}
+
+ucs_status_t ucm_cudamem_install(int events)
+{
+    static pthread_mutex_t install_mutex = PTHREAD_MUTEX_INITIALIZER;
+    ucs_status_t status;
+
+    pthread_mutex_lock(&install_mutex);
+
+    status = ucm_cudamem_test(events);
+    if (status == UCS_OK) {
+        goto out_unlock;
+    }
+
+    status = ucs_cudamem_install_reloc(events);
+    if (status != UCS_OK) {
+        ucm_debug("failed to install relocations for cudamem");
+        goto out_unlock;
+    }
+
+    status = ucm_cudamem_test(events);
+
+out_unlock:
+    pthread_mutex_unlock(&install_mutex);
+    return status;
+}