diff --git a/CHANGELOG b/CHANGELOG index bcbb61e1..37f4c9c8 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -14,6 +14,9 @@ Master (9/10/24) - Minor fixes on the GPU code: a) removed memory leaks in case of errors b) renamed maxbatchsize to batchsize +* Add options for user-provided FFTW locker (PR548, Blackwell). These options can be be +used to prevent crashes when a user is creating/destroying FFTW plans and +FINUFFT plans in threads simultaneously. V 2.3.0 (9/5/24) diff --git a/docs/opts.rst b/docs/opts.rst index 89b7be8c..d659b8b2 100644 --- a/docs/opts.rst +++ b/docs/opts.rst @@ -189,3 +189,77 @@ Here ``0`` makes an automatic choice. If you are unhappy with this, then for sma **spread_nthr_atomic**: if non-negative: for numbers of threads up to this value, an OMP critical block for ``add_wrapped_subgrid`` is used in spreading (type 1 transforms). Above this value, instead OMP atomic writes are used, which scale better for large thread numbers. If negative, the heuristic default in the spreader is used, set in ``src/spreadinterp.cpp:setup_spreader()``. **spread_max_sp_size**: if positive, overrides the maximum subproblem (chunking) size for multithreaded spreading (type 1 transforms). Otherwise the default in the spreader is used, set in ``src/spreadinterp.cpp:setup_spreader()``, which we believe is a decent heuristic for Intel i7 and xeon machines. + + +Thread safety options (advanced) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +By default, with FFTW as the FFT library, FINUFFT is thread safe so long as no other threads are calling FFTW plan creation/destruction routines independently of FINUFFT. If these FFTW routines are called outside of FINUFFT, then the program is liable to crash. In most cases, the calling program can simply call the FFTW routine ``fftw_make_planner_thread_safe()`` before threading out and thread safety will be maintained. However, in instances where this is less desirable, we provide a means to provide your own FFTW locking mechanism. The following example code should exercise FFTW thread safety, and can be built with ``c++ thread_test.cpp -o thread_test -lfinufft -lfftw3_threads -lfftw3 -fopenmp -std=c++11``, assuming the finufft include and library paths are set. + +.. code-block:: C++ + + + // thread_test.cpp + #include + #include + #include + + #include + #include + #include + + using namespace std; + + constexpr int N = 65384; + + void locker(void *lck) { reinterpret_cast(lck)->lock(); } + void unlocker(void *lck) { reinterpret_cast(lck)->unlock(); } + + int main() { + int64_t Ns[3]; // guru describes mode array by vector [N1,N2..] + Ns[0] = N; + recursive_mutex lck; + + finufft_opts opts; + finufft_default_opts(&opts); + opts.nthreads = 1; + opts.debug = 0; + opts.fftw_lock_fun = locker; + opts.fftw_unlock_fun = unlocker; + opts.fftw_lock_data = reinterpret_cast(&lck); + + // random nonuniform points (x) and complex strengths (c) + vector> c(N); + + // init FFTW threads + fftw_init_threads(); + + // FFTW and FINUFFT execution using OpenMP parallelization + #pragma omp parallel for + for (int j = 0; j < 100; ++j) { + // allocate output array for FFTW... + vector> F1(N); + + // FFTW plan + lck.lock(); + fftw_plan_with_nthreads(1); + fftw_plan plan = fftw_plan_dft_1d(N, reinterpret_cast(c.data()), + reinterpret_cast(F1.data()), + FFTW_FORWARD, FFTW_ESTIMATE); + fftw_destroy_plan(plan); + lck.unlock(); + + // FINUFFT plan + finufft_plan nufftplan; + finufft_makeplan(1, 1, Ns, 1, 1, 1e-6, &nufftplan, &opts); + finufft_destroy(nufftplan); + } + + return 0; + } + +**fftw_lock_fun**: ``void (fun*)(void *)`` C-style callback function to lock calls to FFTW plan manipulation routines. A ``nullptr`` or ``0`` value will be ignored. If non-null, ``fftw_unlock_fun`` must also be set. + +**fftw_unlock_fun**: ``void (fun*)(void *)`` C-style callback function to unlock calls to FFTW plan manipulation routines. A ``nullptr`` or ``0`` value will be ignored. If non-null, ``fftw_lock_fun`` must also be set. + +**fftw_lock_data**: ``void *data`` pointer, typically to the lock object itself. Pointer will be passed to ``fftw_lock_fun`` and ``fftw_unlock_fun`` if they are set. diff --git a/include/finufft_errors.h b/include/finufft_errors.h index fb827557..10465aa1 100644 --- a/include/finufft_errors.h +++ b/include/finufft_errors.h @@ -24,6 +24,7 @@ enum { FINUFFT_ERR_BINSIZE_NOTVALID = 18, FINUFFT_ERR_INSUFFICIENT_SHMEM = 19, FINUFFT_ERR_NUM_NU_PTS_INVALID = 20, - FINUFFT_ERR_INVALID_ARGUMENT = 21 + FINUFFT_ERR_INVALID_ARGUMENT = 21, + FINUFFT_ERR_LOCK_FUNS_INVALID = 22 }; #endif diff --git a/include/finufft_opts.h b/include/finufft_opts.h index 0435b8c4..0d482f73 100644 --- a/include/finufft_opts.h +++ b/include/finufft_opts.h @@ -32,6 +32,12 @@ typedef struct finufft_opts { // defaults see finufft.cpp:finufft_default_opts() // atomic int spread_max_sp_size; // if >0, overrides spreader (dir=1) max subproblem size // sphinx tag (don't remove): @opts_end + + // User can provide their own FFTW planner lock functions for thread safety + // Null values ignored and use a default lock function (both or neither must be set) + void (*fftw_lock_fun)(void *); // Function ptr that locks the FFTW planner + void (*fftw_unlock_fun)(void *); // Function ptr that unlocks the FFTW planner + void *fftw_lock_data; // Data to pass to the lock functions (e.g. a mutex) } finufft_opts; // Those of the above of the form spread_* indicate pass through to finufft_spread_opts diff --git a/python/finufft/finufft/_finufft.py b/python/finufft/finufft/_finufft.py index 99969a38..10af6829 100644 --- a/python/finufft/finufft/_finufft.py +++ b/python/finufft/finufft/_finufft.py @@ -85,7 +85,10 @@ class FinufftOpts(ctypes.Structure): ('spread_thread', c_int), ('maxbatchsize', c_int), ('spread_nthr_atomic', c_int), - ('spread_max_sp_size', c_int)] + ('spread_max_sp_size', c_int), + ('fftw_lock_fun', c_void_p), + ('fftw_unlock_fun', c_void_p), + ('fftw_lock_data', c_void_p)] FinufftPlan = c_void_p diff --git a/src/finufft.cpp b/src/finufft.cpp index 6c592396..9ed3f7d2 100644 --- a/src/finufft.cpp +++ b/src/finufft.cpp @@ -89,6 +89,29 @@ namespace common { // Technically global state... // Needs to be static to avoid name collision with SINGLE/DOUBLE static std::mutex fftw_lock; + +class FFTWLockGuard { +public: + FFTWLockGuard(void (*lock_fun)(void *), void (*unlock_fun)(void *), void *lock_data) + : unlock_fun_(unlock_fun), lock_data_(lock_data), fftw_lock_(fftw_lock) { + if (lock_fun) + lock_fun(lock_data_); + else + fftw_lock_.lock(); + } + ~FFTWLockGuard() { + if (unlock_fun_) + unlock_fun_(lock_data_); + else + fftw_lock_.unlock(); + } + +private: + void (*unlock_fun_)(void *); + void *lock_data_; + std::mutex &fftw_lock_; +}; + #endif static int set_nf_type12(BIGINT ms, finufft_opts opts, finufft_spread_opts spopts, @@ -527,6 +550,9 @@ void FINUFFT_DEFAULT_OPTS(finufft_opts *o) o->maxbatchsize = 0; o->spread_nthr_atomic = -1; o->spread_max_sp_size = 0; + o->fftw_lock_fun = nullptr; + o->fftw_unlock_fun = nullptr; + o->fftw_lock_data = nullptr; // sphinx tag (don't remove): @defopts_end } @@ -564,6 +590,12 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans, fprintf(stderr, "[%s] ntrans (%d) should be at least 1.\n", __func__, ntrans); return FINUFFT_ERR_NTRANS_NOTVALID; } + if (!p->opts.fftw_lock_fun != !p->opts.fftw_unlock_fun) { + fprintf(stderr, "[%s] fftw_(un)lock functions should be both null or both set\n", + __func__); + return FINUFFT_ERR_LOCK_FUNS_INVALID; + ; + } // get stuff from args... p->type = type; @@ -657,7 +689,8 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans, // thread-safe (can be called inside OMP) { static bool did_fftw_init = false; // the only global state of FINUFFT - std::lock_guard lock(fftw_lock); + FFTWLockGuard lock(p->opts.fftw_lock_fun, p->opts.fftw_unlock_fun, + p->opts.fftw_lock_data); if (!did_fftw_init) { FFTW_INIT(); // setup FFTW global state; should only do once did_fftw_init = true; // ensure other FINUFFT threads don't clash @@ -750,7 +783,8 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans, // fftw_plan_many_dft args: rank, gridsize/dim, howmany, in, inembed, istride, // idist, ot, onembed, ostride, odist, sign, flags { - std::lock_guard lock(fftw_lock); + FFTWLockGuard lock(p->opts.fftw_lock_fun, p->opts.fftw_unlock_fun, + p->opts.fftw_lock_data); // FFTW_PLAN_TH sets all future fftw_plan calls to use nthr_fft threads. // FIXME: Since this might override what the user wants for fftw, we'd like to // set it just for our one plan and then revert to the user value. @@ -1220,7 +1254,8 @@ int FINUFFT_DESTROY(FINUFFT_PLAN p) if (p->type == 1 || p->type == 2) { #ifndef FINUFFT_USE_DUCC0 { - std::lock_guard lock(fftw_lock); + FFTWLockGuard lock(p->opts.fftw_lock_fun, p->opts.fftw_unlock_fun, + p->opts.fftw_lock_data); FFTW_DE(p->fftwPlan); } #endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 0e1a1730..3736c2ca 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -37,6 +37,17 @@ add_test( COMMAND testutils WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) +if(NOT FINUFFT_USE_DUCC0) + add_executable(fftw_lock_test fftw_lock_test.cpp) + target_compile_features(fftw_lock_test PRIVATE cxx_std_17) + finufft_link_test(fftw_lock_test) + + add_test( + NAME run_fftw_lock_test + COMMAND fftw_lock_test + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) +endif() + # Add ctest definitions that run at both precisions... function(add_tests_with_prec PREC REQ_TOL CHECK_TOL SUFFIX) # All of the following should be run at OMP_NUM_THREADS=4 or something small, diff --git a/test/fftw_lock_test.cpp b/test/fftw_lock_test.cpp new file mode 100644 index 00000000..45e14886 --- /dev/null +++ b/test/fftw_lock_test.cpp @@ -0,0 +1,63 @@ +#include +#include +#include + +#include +#include +#include + +// This file tests the user locking mechanism for multi-threaded FFTW. This +// demonstrates a user lock to prevent FFTW plan calls from interfering with +// finufft plan calls (make/destroy). +// Robert Blackwell. Based on bug identified by Jonas Krimmer (9/17/24) +// See discussion at https://github.com/ludvigak/FINUFFT.jl/issues/62 + +constexpr int N = 65384; + +// Example user lock functions +void locker(void *lck) { reinterpret_cast(lck)->lock(); } +void unlocker(void *lck) { reinterpret_cast(lck)->unlock(); } + +int main() { + int64_t Ns[3]; // guru describes mode array by vector [N1,N2..] + Ns[0] = N; + std::mutex lck; + + finufft_opts opts; + finufft_default_opts(&opts); + opts.nthreads = 1; + opts.debug = 0; + opts.fftw_lock_fun = locker; + opts.fftw_unlock_fun = unlocker; + opts.fftw_lock_data = reinterpret_cast(&lck); + + // random nonuniform points (x) and complex strengths (c)... + std::vector> c(N); + + omp_set_num_threads(8); + + // init FFTW threads + fftw_init_threads(); + +// FFTW and FINUFFT execution using OpenMP parallelization +#pragma omp parallel for + for (int j = 0; j < 100; ++j) { + // allocate output array for FFTW... + std::vector> F1(N); + + // FFTW plan + lck.lock(); + fftw_plan_with_nthreads(1); + fftw_plan plan = fftw_plan_dft_1d(N, reinterpret_cast(c.data()), + reinterpret_cast(F1.data()), + FFTW_FORWARD, FFTW_ESTIMATE); + fftw_destroy_plan(plan); + lck.unlock(); + + // FINUFFT plan + finufft_plan nufftplan; + finufft_makeplan(1, 1, Ns, 1, 1, 1e-6, &nufftplan, &opts); + finufft_destroy(nufftplan); + } + return 0; +}