Provide a user level locking mechanism for FFTW (#548)

* Provide a user level locking mechanism for FFTW This commit provides a basic interface for the user to provide a lock around `fftw` calls. This is because any plan manipulation functions in `fftw` (make, destroy) are not thread-safe. See example code here: ```c++ #include <vector> #include <mutex> #include <fftw3.h> #include <finufft.h> #include <omp.h> using namespace std; #define N 65384 void locker(void *lck) { reinterpret_cast<recursive_mutex *>(lck)->lock(); } void unlocker(void *lck) { reinterpret_cast<recursive_mutex *>(lck)->unlock(); } int main() { int64_t Ns[3]; // guru describes mode array by vector [N1,N2..] Ns[0] = N; recursive_mutex lck; finufft_opts opts; finufft_default_opts(&opts); opts.nthreads = 1; opts.debug = 0; opts.fftw_lock_fun = locker; opts.fftw_unlock_fun = unlocker; opts.fftw_lock_data = reinterpret_cast<void *>(&lck); // random nonuniform points (x) and complex strengths (c)... vector<complex<double>> c(N); // init FFTW threads fftw_init_threads(); // FFTW and FINUFFT execution using OpenMP parallelization #pragma omp parallel for for (int j = 0; j < 100; ++j) { // allocate output array for FFTW... vector<complex<double>> F1(N); // FFTW plan lck.lock(); fftw_plan_with_nthreads(1); fftw_plan plan = fftw_plan_dft_1d(N, reinterpret_cast<fftw_complex*>(c.data()), reinterpret_cast<fftw_complex*>(F1.data()), FFTW_FORWARD, FFTW_ESTIMATE); fftw_destroy_plan(plan); lck.unlock(); // FINUFFT plan finufft_plan nufftplan; finufft_makeplan(1, 1, Ns, 1, 1, 1e-6, &nufftplan, &opts); finufft_destroy(nufftplan); } return 0; } ``` * fftw_lock: fix issue when null opts passed * fftw_lock: add new opts to python bindings * threads: fix lock guard issue in osx arm64 clang * fftw_lock: add info to changelog and user documentation * fftw_lock: add test for new fftw_lock * fftw_lock: share lock and init between float/double * Revert "fftw_lock: share lock and init between float/double" This reverts commit 8594b65. * fftw_lock_test: add comments to source describing why --------- Co-authored-by: Robert Blackwell <[email protected]>
flatironinstitute · Sep 18, 2024 · b98fd1d · b98fd1d
1 parent 1842601
commit b98fd1d
Show file tree

Hide file tree

Showing 8 changed files with 201 additions and 5 deletions.
diff --git a/CHANGELOG b/CHANGELOG
@@ -14,6 +14,9 @@ Master (9/10/24)
   - Minor fixes on the GPU code:
     a) removed memory leaks in case of errors
     b) renamed maxbatchsize to batchsize
+* Add options for user-provided FFTW locker (PR548, Blackwell). These options can be be
+used to prevent crashes when a user is creating/destroying FFTW plans and
+FINUFFT plans in threads simultaneously.
 
 V 2.3.0 (9/5/24)
 

diff --git a/docs/opts.rst b/docs/opts.rst
@@ -189,3 +189,77 @@ Here ``0`` makes an automatic choice. If you are unhappy with this, then for sma
 **spread_nthr_atomic**: if non-negative: for numbers of threads up to this value, an OMP critical block for ``add_wrapped_subgrid`` is used in spreading (type 1 transforms). Above this value, instead OMP atomic writes are used, which scale better for large thread numbers. If negative, the heuristic default in the spreader is used, set in ``src/spreadinterp.cpp:setup_spreader()``.
 
 **spread_max_sp_size**: if positive, overrides the maximum subproblem (chunking) size for multithreaded spreading (type 1 transforms). Otherwise the default in the spreader is used, set in ``src/spreadinterp.cpp:setup_spreader()``, which we believe is a decent heuristic for Intel i7 and xeon machines.
+
+
+Thread safety options (advanced)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+By default, with FFTW as the FFT library, FINUFFT is thread safe so long as no other threads are calling FFTW plan creation/destruction routines independently of FINUFFT. If these FFTW routines are called outside of FINUFFT, then the program is liable to crash. In most cases, the calling program can simply call the FFTW routine ``fftw_make_planner_thread_safe()`` before threading out and thread safety will be maintained. However, in instances where this is less desirable, we provide a means to provide your own FFTW locking mechanism. The following example code should exercise FFTW thread safety, and can be built with ``c++ thread_test.cpp -o thread_test -lfinufft -lfftw3_threads -lfftw3 -fopenmp -std=c++11``, assuming the finufft include and library paths are set.
+
+.. code-block:: C++
+
+
+  // thread_test.cpp
+  #include <vector>
+  #include <mutex>
+  #include <complex>
+
+  #include <fftw3.h>
+  #include <finufft.h>
+  #include <omp.h>
+
+  using namespace std;
+
+  constexpr int N = 65384;
+
+  void locker(void *lck) { reinterpret_cast<recursive_mutex *>(lck)->lock(); }
+  void unlocker(void *lck) { reinterpret_cast<recursive_mutex *>(lck)->unlock(); }
+
+  int main() {
+    int64_t Ns[3]; // guru describes mode array by vector [N1,N2..]
+    Ns[0] = N;
+    recursive_mutex lck;
+
+    finufft_opts opts;
+    finufft_default_opts(&opts);
+    opts.nthreads = 1;
+    opts.debug = 0;
+    opts.fftw_lock_fun = locker;
+    opts.fftw_unlock_fun = unlocker;
+    opts.fftw_lock_data = reinterpret_cast<void *>(&lck);
+
+    // random nonuniform points (x) and complex strengths (c)
+    vector<complex<double>> c(N);
+
+    // init FFTW threads
+    fftw_init_threads();
+
+    // FFTW and FINUFFT execution using OpenMP parallelization
+    #pragma omp parallel for
+    for (int j = 0; j < 100; ++j) {
+      // allocate output array for FFTW...
+      vector<complex<double>> F1(N);
+
+      // FFTW plan
+      lck.lock();
+      fftw_plan_with_nthreads(1);
+      fftw_plan plan = fftw_plan_dft_1d(N, reinterpret_cast<fftw_complex*>(c.data()),
+                                        reinterpret_cast<fftw_complex*>(F1.data()),
+                                        FFTW_FORWARD, FFTW_ESTIMATE);
+      fftw_destroy_plan(plan);
+      lck.unlock();
+
+      // FINUFFT plan
+      finufft_plan nufftplan;
+      finufft_makeplan(1, 1, Ns, 1, 1, 1e-6, &nufftplan, &opts);
+      finufft_destroy(nufftplan);
+    }
+
+    return 0;
+  }
+
+**fftw_lock_fun**:  ``void (fun*)(void *)`` C-style callback function to lock calls to FFTW plan manipulation routines. A ``nullptr`` or ``0`` value will be ignored. If non-null, ``fftw_unlock_fun`` must also be set.
+
+**fftw_unlock_fun**: ``void (fun*)(void *)`` C-style callback function to unlock calls to FFTW plan manipulation routines. A ``nullptr`` or ``0`` value will be ignored. If non-null, ``fftw_lock_fun`` must also be set.
+
+**fftw_lock_data**:  ``void *data`` pointer, typically to the lock object itself. Pointer will be passed to ``fftw_lock_fun`` and ``fftw_unlock_fun`` if they are set.
diff --git a/include/finufft_errors.h b/include/finufft_errors.h
@@ -24,6 +24,7 @@ enum {
   FINUFFT_ERR_BINSIZE_NOTVALID       = 18,
   FINUFFT_ERR_INSUFFICIENT_SHMEM     = 19,
   FINUFFT_ERR_NUM_NU_PTS_INVALID     = 20,
-  FINUFFT_ERR_INVALID_ARGUMENT       = 21
+  FINUFFT_ERR_INVALID_ARGUMENT       = 21,
+  FINUFFT_ERR_LOCK_FUNS_INVALID      = 22
 };
 #endif
diff --git a/include/finufft_opts.h b/include/finufft_opts.h
@@ -32,6 +32,12 @@ typedef struct finufft_opts { // defaults see finufft.cpp:finufft_default_opts()
                           // atomic
   int spread_max_sp_size; // if >0, overrides spreader (dir=1) max subproblem size
                           // sphinx tag (don't remove): @opts_end
+
+  // User can provide their own FFTW planner lock functions for thread safety
+  // Null values ignored and use a default lock function (both or neither must be set)
+  void (*fftw_lock_fun)(void *);   // Function ptr that locks the FFTW planner
+  void (*fftw_unlock_fun)(void *); // Function ptr that unlocks the FFTW planner
+  void *fftw_lock_data;            // Data to pass to the lock functions (e.g. a mutex)
 } finufft_opts;
 
 // Those of the above of the form spread_* indicate pass through to finufft_spread_opts

diff --git a/python/finufft/finufft/_finufft.py b/python/finufft/finufft/_finufft.py
@@ -85,7 +85,10 @@ class FinufftOpts(ctypes.Structure):
                       ('spread_thread', c_int),
                       ('maxbatchsize', c_int),
                       ('spread_nthr_atomic', c_int),
-                      ('spread_max_sp_size', c_int)]
+                      ('spread_max_sp_size', c_int),
+                      ('fftw_lock_fun', c_void_p),
+                      ('fftw_unlock_fun', c_void_p),
+                      ('fftw_lock_data', c_void_p)]
 
 
 FinufftPlan = c_void_p

diff --git a/src/finufft.cpp b/src/finufft.cpp
@@ -89,6 +89,29 @@ namespace common {
 // Technically global state...
 // Needs to be static to avoid name collision with SINGLE/DOUBLE
 static std::mutex fftw_lock;
+
+class FFTWLockGuard {
+public:
+  FFTWLockGuard(void (*lock_fun)(void *), void (*unlock_fun)(void *), void *lock_data)
+      : unlock_fun_(unlock_fun), lock_data_(lock_data), fftw_lock_(fftw_lock) {
+    if (lock_fun)
+      lock_fun(lock_data_);
+    else
+      fftw_lock_.lock();
+  }
+  ~FFTWLockGuard() {
+    if (unlock_fun_)
+      unlock_fun_(lock_data_);
+    else
+      fftw_lock_.unlock();
+  }
+
+private:
+  void (*unlock_fun_)(void *);
+  void *lock_data_;
+  std::mutex &fftw_lock_;
+};
+
 #endif
 
 static int set_nf_type12(BIGINT ms, finufft_opts opts, finufft_spread_opts spopts,
@@ -527,6 +550,9 @@ void FINUFFT_DEFAULT_OPTS(finufft_opts *o)
   o->maxbatchsize       = 0;
   o->spread_nthr_atomic = -1;
   o->spread_max_sp_size = 0;
+  o->fftw_lock_fun      = nullptr;
+  o->fftw_unlock_fun    = nullptr;
+  o->fftw_lock_data     = nullptr;
   // sphinx tag (don't remove): @defopts_end
 }
 
@@ -564,6 +590,12 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans,
     fprintf(stderr, "[%s] ntrans (%d) should be at least 1.\n", __func__, ntrans);
     return FINUFFT_ERR_NTRANS_NOTVALID;
   }
+  if (!p->opts.fftw_lock_fun != !p->opts.fftw_unlock_fun) {
+    fprintf(stderr, "[%s] fftw_(un)lock functions should be both null or both set\n",
+            __func__);
+    return FINUFFT_ERR_LOCK_FUNS_INVALID;
+    ;
+  }
 
   // get stuff from args...
   p->type    = type;
@@ -657,7 +689,8 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans,
     // thread-safe (can be called inside OMP)
     {
       static bool did_fftw_init = false; // the only global state of FINUFFT
-      std::lock_guard<std::mutex> lock(fftw_lock);
+      FFTWLockGuard lock(p->opts.fftw_lock_fun, p->opts.fftw_unlock_fun,
+                         p->opts.fftw_lock_data);
       if (!did_fftw_init) {
         FFTW_INIT();          // setup FFTW global state; should only do once
         did_fftw_init = true; // ensure other FINUFFT threads don't clash
@@ -750,7 +783,8 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans,
     // fftw_plan_many_dft args: rank, gridsize/dim, howmany, in, inembed, istride,
     // idist, ot, onembed, ostride, odist, sign, flags
     {
-      std::lock_guard<std::mutex> lock(fftw_lock);
+      FFTWLockGuard lock(p->opts.fftw_lock_fun, p->opts.fftw_unlock_fun,
+                         p->opts.fftw_lock_data);
       // FFTW_PLAN_TH sets all future fftw_plan calls to use nthr_fft threads.
       // FIXME: Since this might override what the user wants for fftw, we'd like to
       // set it just for our one plan and then revert to the user value.
@@ -1220,7 +1254,8 @@ int FINUFFT_DESTROY(FINUFFT_PLAN p)
   if (p->type == 1 || p->type == 2) {
 #ifndef FINUFFT_USE_DUCC0
     {
-      std::lock_guard<std::mutex> lock(fftw_lock);
+      FFTWLockGuard lock(p->opts.fftw_lock_fun, p->opts.fftw_unlock_fun,
+                         p->opts.fftw_lock_data);
       FFTW_DE(p->fftwPlan);
     }
 #endif

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
@@ -37,6 +37,17 @@ add_test(
   COMMAND testutils
   WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
 
+if(NOT FINUFFT_USE_DUCC0)
+  add_executable(fftw_lock_test fftw_lock_test.cpp)
+  target_compile_features(fftw_lock_test PRIVATE cxx_std_17)
+  finufft_link_test(fftw_lock_test)
+
+  add_test(
+    NAME run_fftw_lock_test
+    COMMAND fftw_lock_test
+    WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
+endif()
+
 # Add ctest definitions that run at both precisions...
 function(add_tests_with_prec PREC REQ_TOL CHECK_TOL SUFFIX)
   # All of the following should be run at OMP_NUM_THREADS=4 or something small,

diff --git a/test/fftw_lock_test.cpp b/test/fftw_lock_test.cpp
@@ -0,0 +1,63 @@
+#include <complex>
+#include <mutex>
+#include <vector>
+
+#include <fftw3.h>
+#include <finufft.h>
+#include <omp.h>
+
+// This file tests the user locking mechanism for multi-threaded FFTW. This
+// demonstrates a user lock to prevent FFTW plan calls from interfering with
+// finufft plan calls (make/destroy).
+// Robert Blackwell. Based on bug identified by Jonas Krimmer (9/17/24)
+// See discussion at https://github.com/ludvigak/FINUFFT.jl/issues/62
+
+constexpr int N = 65384;
+
+// Example user lock functions
+void locker(void *lck) { reinterpret_cast<std::mutex *>(lck)->lock(); }
+void unlocker(void *lck) { reinterpret_cast<std::mutex *>(lck)->unlock(); }
+
+int main() {
+  int64_t Ns[3]; // guru describes mode array by vector [N1,N2..]
+  Ns[0] = N;
+  std::mutex lck;
+
+  finufft_opts opts;
+  finufft_default_opts(&opts);
+  opts.nthreads        = 1;
+  opts.debug           = 0;
+  opts.fftw_lock_fun   = locker;
+  opts.fftw_unlock_fun = unlocker;
+  opts.fftw_lock_data  = reinterpret_cast<void *>(&lck);
+
+  // random nonuniform points (x) and complex strengths (c)...
+  std::vector<std::complex<double>> c(N);
+
+  omp_set_num_threads(8);
+
+  // init FFTW threads
+  fftw_init_threads();
+
+// FFTW and FINUFFT execution using OpenMP parallelization
+#pragma omp parallel for
+  for (int j = 0; j < 100; ++j) {
+    // allocate output array for FFTW...
+    std::vector<std::complex<double>> F1(N);
+
+    // FFTW plan
+    lck.lock();
+    fftw_plan_with_nthreads(1);
+    fftw_plan plan = fftw_plan_dft_1d(N, reinterpret_cast<fftw_complex *>(c.data()),
+                                      reinterpret_cast<fftw_complex *>(F1.data()),
+                                      FFTW_FORWARD, FFTW_ESTIMATE);
+    fftw_destroy_plan(plan);
+    lck.unlock();
+
+    // FINUFFT plan
+    finufft_plan nufftplan;
+    finufft_makeplan(1, 1, Ns, 1, 1, 1e-6, &nufftplan, &opts);
+    finufft_destroy(nufftplan);
+  }
+  return 0;
+}