diff --git a/.github/workflows/python_build_wheels.yml b/.github/workflows/python_build_wheels.yml index 455764213..ad16b0373 100644 --- a/.github/workflows/python_build_wheels.yml +++ b/.github/workflows/python_build_wheels.yml @@ -21,10 +21,10 @@ jobs: CIBW_BEFORE_ALL_MACOS: | # In order to reinstall a version of GCC compatible with older versions of macOS, we need to first uninstall the existing version. brew uninstall gcc - pkg=$(brew fetch --force --bottle-tag=monterey gcc | grep 'Downloaded to' | cut -d' ' -f3) + pkg=$(brew fetch --force --bottle-tag=monterey gcc | grep 'Downloaded to.*monterey.*' | cut -d' ' -f3) brew install $pkg - pkg=$(brew fetch --force --bottle-tag=monterey fftw | grep 'Downloaded to' | cut -d' ' -f3) + pkg=$(brew fetch --force --bottle-tag=monterey fftw | grep 'Downloaded to.*monterey.*' | cut -d' ' -f3) brew install $pkg CIBW_ARCHS_MACOS: "x86_64" # Need following versions of GCC for compatibility with fftw @@ -56,10 +56,10 @@ jobs: CIBW_BEFORE_ALL_MACOS: | # In order to reinstall a version of GCC compatible with older versions of macOS, we need to first uninstall the existing version. brew uninstall gcc - pkg=$(brew fetch --force --bottle-tag=arm64_monterey gcc | grep 'Downloaded to' | cut -d' ' -f3) + pkg=$(brew fetch --force --bottle-tag=arm64_monterey gcc | grep 'Downloaded to.*monterey.*' | cut -d' ' -f3) brew install $pkg - pkg=$(brew fetch --force --bottle-tag=arm64_monterey fftw | grep 'Downloaded to' | cut -d' ' -f3) + pkg=$(brew fetch --force --bottle-tag=arm64_monterey fftw | grep 'Downloaded to.*monterey.*' | cut -d' ' -f3) brew install $pkg CIBW_ENVIRONMENT_MACOS: > CC=gcc-14 diff --git a/CHANGELOG b/CHANGELOG index fe852d418..37f4c9c80 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,15 +2,21 @@ List of features / changes made / release notes, in reverse chronological order. If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately). Master (9/10/24) + * reduced roundoff error in a[n] phase calc in CPU onedim_fseries_kernel(). #534 (Barnett). -* Support for type 3 in 1D, 2D, and 3D in the GPU library cufinufft (PR #517). - - Removed the CPU fseries computation (only used for benchmark no longer needed). - - Added complex arithmetic support for cuda_complex type - - Added tests for type 3 in 1D, 2D, and 3D and cuda_complex arithmetic - - Minor fixes on the GPU code: - a) removed memory leaks in case of errors - b) renamed maxbatchsize to batchsize +* GPU code type 1,2 also reduced round-off error in phases, to match CPU code; + rationalized onedim_{fseries,nuft}_* GPU codes to match CPU (Barbone, Barnett) +* Added type 3 in 1D, 2D, and 3D, in the GPU library cufinufft. PR #517, Barbone + - Removed the CPU fseries computation (used for benchmark, no longer needed) + - Added complex arithmetic support for cuda_complex type + - Added tests for type 3 in 1D, 2D, and 3D and cuda_complex arithmetic + - Minor fixes on the GPU code: + a) removed memory leaks in case of errors + b) renamed maxbatchsize to batchsize +* Add options for user-provided FFTW locker (PR548, Blackwell). These options can be be +used to prevent crashes when a user is creating/destroying FFTW plans and +FINUFFT plans in threads simultaneously. V 2.3.0 (9/5/24) diff --git a/docs/opts.rst b/docs/opts.rst index 89b7be8cd..d659b8b20 100644 --- a/docs/opts.rst +++ b/docs/opts.rst @@ -189,3 +189,77 @@ Here ``0`` makes an automatic choice. If you are unhappy with this, then for sma **spread_nthr_atomic**: if non-negative: for numbers of threads up to this value, an OMP critical block for ``add_wrapped_subgrid`` is used in spreading (type 1 transforms). Above this value, instead OMP atomic writes are used, which scale better for large thread numbers. If negative, the heuristic default in the spreader is used, set in ``src/spreadinterp.cpp:setup_spreader()``. **spread_max_sp_size**: if positive, overrides the maximum subproblem (chunking) size for multithreaded spreading (type 1 transforms). Otherwise the default in the spreader is used, set in ``src/spreadinterp.cpp:setup_spreader()``, which we believe is a decent heuristic for Intel i7 and xeon machines. + + +Thread safety options (advanced) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +By default, with FFTW as the FFT library, FINUFFT is thread safe so long as no other threads are calling FFTW plan creation/destruction routines independently of FINUFFT. If these FFTW routines are called outside of FINUFFT, then the program is liable to crash. In most cases, the calling program can simply call the FFTW routine ``fftw_make_planner_thread_safe()`` before threading out and thread safety will be maintained. However, in instances where this is less desirable, we provide a means to provide your own FFTW locking mechanism. The following example code should exercise FFTW thread safety, and can be built with ``c++ thread_test.cpp -o thread_test -lfinufft -lfftw3_threads -lfftw3 -fopenmp -std=c++11``, assuming the finufft include and library paths are set. + +.. code-block:: C++ + + + // thread_test.cpp + #include + #include + #include + + #include + #include + #include + + using namespace std; + + constexpr int N = 65384; + + void locker(void *lck) { reinterpret_cast(lck)->lock(); } + void unlocker(void *lck) { reinterpret_cast(lck)->unlock(); } + + int main() { + int64_t Ns[3]; // guru describes mode array by vector [N1,N2..] + Ns[0] = N; + recursive_mutex lck; + + finufft_opts opts; + finufft_default_opts(&opts); + opts.nthreads = 1; + opts.debug = 0; + opts.fftw_lock_fun = locker; + opts.fftw_unlock_fun = unlocker; + opts.fftw_lock_data = reinterpret_cast(&lck); + + // random nonuniform points (x) and complex strengths (c) + vector> c(N); + + // init FFTW threads + fftw_init_threads(); + + // FFTW and FINUFFT execution using OpenMP parallelization + #pragma omp parallel for + for (int j = 0; j < 100; ++j) { + // allocate output array for FFTW... + vector> F1(N); + + // FFTW plan + lck.lock(); + fftw_plan_with_nthreads(1); + fftw_plan plan = fftw_plan_dft_1d(N, reinterpret_cast(c.data()), + reinterpret_cast(F1.data()), + FFTW_FORWARD, FFTW_ESTIMATE); + fftw_destroy_plan(plan); + lck.unlock(); + + // FINUFFT plan + finufft_plan nufftplan; + finufft_makeplan(1, 1, Ns, 1, 1, 1e-6, &nufftplan, &opts); + finufft_destroy(nufftplan); + } + + return 0; + } + +**fftw_lock_fun**: ``void (fun*)(void *)`` C-style callback function to lock calls to FFTW plan manipulation routines. A ``nullptr`` or ``0`` value will be ignored. If non-null, ``fftw_unlock_fun`` must also be set. + +**fftw_unlock_fun**: ``void (fun*)(void *)`` C-style callback function to unlock calls to FFTW plan manipulation routines. A ``nullptr`` or ``0`` value will be ignored. If non-null, ``fftw_lock_fun`` must also be set. + +**fftw_lock_data**: ``void *data`` pointer, typically to the lock object itself. Pointer will be passed to ``fftw_lock_fun`` and ``fftw_unlock_fun`` if they are set. diff --git a/docs/users.rst b/docs/users.rst index 3cfe8c3b4..8c439bff9 100644 --- a/docs/users.rst +++ b/docs/users.rst @@ -35,7 +35,7 @@ and also add them to GitHub's Used By feature): #. `EM-Align `_: Aligning rotation, reflection, and translation between volumes (desntiy maps) in cryo-electron microscopy, from Shkolnisky Lab at Tel Aviv. #. `spinifel `_: Uses the multitiered iterative phasing (M-TIP) algorithm for single particle X-ray diffraction imaging, on CPU/GPU, from the ExaFEL project at LBNL/DOE. - + #. `sinctransform `_: C++ and MATLAB codes to evaluate sums of the sinc and sinc^2 kernels between arbitrary nonuniform points in 1,2, or 3 dimensions, by Hannah Lawrence (2017 summer intern at Flatiron). #. `fsinc `_: Gaute Hope's fast sinc transform and interpolation Python package. @@ -46,18 +46,22 @@ and also add them to GitHub's Used By feature): #. `TRIQS CTINT `_: continous time interaction-expansion solver, by N. Wentzell and O. Parcollet (Flatiron Institute, part of platform for interacting quantum systems). +#. `cunuSHT `_: GPU accelerated spherical harmonic transforms from nonuniform samples (arbitrary pixelizations), by S. Belkner and coauthors. https://arxiv.org/abs/2406.14542 + +#. `FReSCO `_: Fast reciprocal-space correlator, by Aaron Shih, Mathias Kasiulis, and Stefano Martiani. This uses thousands of calls to all three transform types in 2D or 3D, to iteratively adjust nonuniform points until their Fourier transforms match a desired function. Physics Mag. article and movie: https://physics.aps.org/articles/v17/134 + Other wrappers to (cu)FINUFFT ------------------------------ - + #. `FINUFFT.jl `_: a `julia `_ language wrapper by Ludvig af Klinteberg, Libin Lu, and others, now using pure Julia, and fully featured (rather than via Python). This is itself wrapped by `AbstractNFFTs.jl` in `NFFT.jl `_. #. `TensorFlow NUFFT `_: a wrapper to the differentiable machine learning Python tool TensorFlow, for the CPU (via FINUFFT) and GPU (via cuFINUFFT). By Javier Montalt Tordera (UCL). #. `JAX bindings to (cu)FINUFFT `_: a wrapper to the differentiable machine learning Python tool JAX. Directly exposes the FINUFFT library to JAX's XLA backend, as well as implementing differentiation rules for the transforms. By Dan Foreman-Mackey (CCA). - + #. `PyTorch wrapper to (cu)FINUFFT `_: a wrapper to the differentiable machine learning Python tool PyTorch. By Michael Eickenberg and Brian Ward (CCM). - + Research output using (cu)FINUFFT --------------------------------- @@ -92,14 +96,14 @@ For the latest see: Google Scholar `FINUFFT citations +#include // --------------- Private data types for compilation in either prec --------- // Devnote: must match those in relevant prec of public finufft.h interface! @@ -258,7 +259,7 @@ typedef struct FINUFFT_PLAN_S { // the main plan object, fully C++ FINUFFT_PLAN innerT2plan; // ptr used for type 2 in step 2 of type 3 // other internal structs; each is C-compatible of course - Finufft_FFT_plan fftPlan; + std::unique_ptr> fftPlan; finufft_opts opts; // this and spopts could be made ptrs finufft_spread_opts spopts; diff --git a/include/finufft/fft.h b/include/finufft/fft.h index 87e34d657..7f47814a0 100644 --- a/include/finufft/fft.h +++ b/include/finufft/fft.h @@ -8,6 +8,8 @@ template class Finufft_FFT_plan { public: + Finufft_FFT_plan(void (*)(void *) = nullptr, void (*)(void *) = nullptr, + void * = nullptr) {} void plan(const std::vector & /*dims*/, size_t /*batchSize*/, std::complex * /*ptr*/, int /*sign*/, int /*options*/, int /*nthreads*/) {} static std::complex *alloc_complex(size_t N) { return new std::complex[N]; } @@ -36,9 +38,19 @@ template<> struct Finufft_FFT_plan { } fftwf_plan plan_; + void (*fftw_lock_fun)(void *); // Function ptr that locks the FFTW planner + void (*fftw_unlock_fun)(void *); // Function ptr that unlocks the FFTW planner + void *lock_data; + void lock() { fftw_lock_fun ? fftw_lock_fun(lock_data) : mut().lock(); } + void unlock() { fftw_lock_fun ? fftw_unlock_fun(lock_data) : mut().unlock(); } + public: - Finufft_FFT_plan() : plan_(nullptr) { - std::lock_guard lock(mut()); + Finufft_FFT_plan(void (*fftw_lock_fun_)(void *) = nullptr, + void (*fftw_unlock_fun_)(void *) = nullptr, + void *lock_data_ = nullptr) + : plan_(nullptr), fftw_lock_fun(fftw_lock_fun_), fftw_unlock_fun(fftw_unlock_fun_), + lock_data(lock_data_) { + lock(); #ifdef _OPENMP static bool initialized = false; if (!initialized) { @@ -46,17 +58,19 @@ template<> struct Finufft_FFT_plan { initialized = true; } #endif + unlock(); } ~Finufft_FFT_plan() { - std::lock_guard lock(mut()); + lock(); fftwf_destroy_plan(plan_); + unlock(); } void plan(const std::vector &dims, size_t batchSize, std::complex *ptr, int sign, int options, int nthreads) { uint64_t nf = 1; for (auto i : dims) nf *= i; - std::lock_guard lock(mut()); + lock(); #ifdef _OPENMP fftwf_plan_with_nthreads(nthreads); #endif @@ -64,6 +78,7 @@ template<> struct Finufft_FFT_plan { reinterpret_cast(ptr), nullptr, 1, nf, reinterpret_cast(ptr), nullptr, 1, nf, sign, options); + unlock(); } static std::complex *alloc_complex(size_t N) { return reinterpret_cast *>(fftwf_alloc_complex(N)); @@ -74,17 +89,20 @@ template<> struct Finufft_FFT_plan { void execute() { fftwf_execute(plan_); } static void forget_wisdom() { - std::lock_guard lock(mut()); + // lock(); fftwf_forget_wisdom(); + // unlock(); } static void cleanup() { - std::lock_guard lock(mut()); + // lock(); fftwf_cleanup(); + // unlock(); } static void cleanup_threads() { #ifdef _OPENMP - std::lock_guard lock(mut()); + // lock(); fftwf_cleanup_threads(); +// unlock(); #endif } }; @@ -97,9 +115,19 @@ template<> struct Finufft_FFT_plan { } fftw_plan plan_; + void (*fftw_lock_fun)(void *); // Function ptr that locks the FFTW planner + void (*fftw_unlock_fun)(void *); // Function ptr that unlocks the FFTW planner + void *lock_data; + void lock() { fftw_lock_fun ? fftw_lock_fun(lock_data) : mut().lock(); } + void unlock() { fftw_lock_fun ? fftw_unlock_fun(lock_data) : mut().unlock(); } + public: - Finufft_FFT_plan() : plan_(nullptr) { - std::lock_guard lock(mut()); + Finufft_FFT_plan(void (*fftw_lock_fun_)(void *) = nullptr, + void (*fftw_unlock_fun_)(void *) = nullptr, + void *lock_data_ = nullptr) + : plan_(nullptr), fftw_lock_fun(fftw_lock_fun_), fftw_unlock_fun(fftw_unlock_fun_), + lock_data(lock_data_) { + lock(); #ifdef _OPENMP static bool initialized = false; if (!initialized) { @@ -107,17 +135,19 @@ template<> struct Finufft_FFT_plan { initialized = true; } #endif + unlock(); } ~Finufft_FFT_plan() { - std::lock_guard lock(mut()); + lock(); fftw_destroy_plan(plan_); + unlock(); } void plan(const std::vector &dims, size_t batchSize, std::complex *ptr, int sign, int options, int nthreads) { uint64_t nf = 1; for (auto i : dims) nf *= i; - std::lock_guard lock(mut()); + lock(); #ifdef _OPENMP fftw_plan_with_nthreads(nthreads); #endif @@ -125,6 +155,7 @@ template<> struct Finufft_FFT_plan { reinterpret_cast(ptr), nullptr, 1, nf, reinterpret_cast(ptr), nullptr, 1, nf, sign, options); + unlock(); } static std::complex *alloc_complex(size_t N) { return reinterpret_cast *>(fftw_alloc_complex(N)); @@ -135,17 +166,20 @@ template<> struct Finufft_FFT_plan { void execute() { fftw_execute(plan_); } static void forget_wisdom() { - std::lock_guard lock(mut()); + // lock(); fftw_forget_wisdom(); + // unlock(); } static void cleanup() { - std::lock_guard lock(mut()); + // lock(); fftw_cleanup(); + // unlock(); } static void cleanup_threads() { #ifdef _OPENMP - std::lock_guard lock(mut()); + // lock(); fftw_cleanup_threads(); +// unlock(); #endif } }; diff --git a/include/finufft_errors.h b/include/finufft_errors.h index fb827557c..10465aa13 100644 --- a/include/finufft_errors.h +++ b/include/finufft_errors.h @@ -24,6 +24,7 @@ enum { FINUFFT_ERR_BINSIZE_NOTVALID = 18, FINUFFT_ERR_INSUFFICIENT_SHMEM = 19, FINUFFT_ERR_NUM_NU_PTS_INVALID = 20, - FINUFFT_ERR_INVALID_ARGUMENT = 21 + FINUFFT_ERR_INVALID_ARGUMENT = 21, + FINUFFT_ERR_LOCK_FUNS_INVALID = 22 }; #endif diff --git a/include/finufft_opts.h b/include/finufft_opts.h index 0435b8c41..0d482f732 100644 --- a/include/finufft_opts.h +++ b/include/finufft_opts.h @@ -32,6 +32,12 @@ typedef struct finufft_opts { // defaults see finufft.cpp:finufft_default_opts() // atomic int spread_max_sp_size; // if >0, overrides spreader (dir=1) max subproblem size // sphinx tag (don't remove): @opts_end + + // User can provide their own FFTW planner lock functions for thread safety + // Null values ignored and use a default lock function (both or neither must be set) + void (*fftw_lock_fun)(void *); // Function ptr that locks the FFTW planner + void (*fftw_unlock_fun)(void *); // Function ptr that unlocks the FFTW planner + void *fftw_lock_data; // Data to pass to the lock functions (e.g. a mutex) } finufft_opts; // Those of the above of the form spread_* indicate pass through to finufft_spread_opts diff --git a/python/finufft/finufft/_finufft.py b/python/finufft/finufft/_finufft.py index 99969a38c..10af6829c 100644 --- a/python/finufft/finufft/_finufft.py +++ b/python/finufft/finufft/_finufft.py @@ -85,7 +85,10 @@ class FinufftOpts(ctypes.Structure): ('spread_thread', c_int), ('maxbatchsize', c_int), ('spread_nthr_atomic', c_int), - ('spread_max_sp_size', c_int)] + ('spread_max_sp_size', c_int), + ('fftw_lock_fun', c_void_p), + ('fftw_unlock_fun', c_void_p), + ('fftw_lock_data', c_void_p)] FinufftPlan = c_void_p diff --git a/src/fft.cpp b/src/fft.cpp index c2c8ddf82..2a1e268e0 100644 --- a/src/fft.cpp +++ b/src/fft.cpp @@ -103,6 +103,6 @@ void do_fft(FINUFFT_PLAN p) { } #endif #else - p->fftPlan.execute(); // if thisBatchSizefftPlan->execute(); // if thisBatchSize #include #include +#include #include using namespace std; @@ -520,6 +521,9 @@ void FINUFFT_DEFAULT_OPTS(finufft_opts *o) o->maxbatchsize = 0; o->spread_nthr_atomic = -1; o->spread_max_sp_size = 0; + o->fftw_lock_fun = nullptr; + o->fftw_unlock_fun = nullptr; + o->fftw_lock_data = nullptr; // sphinx tag (don't remove): @defopts_end } @@ -545,6 +549,9 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans, printf("[%s] new plan: FINUFFT version " FINUFFT_VER " .................\n", __func__); + p->fftPlan = std::make_unique>( + p->opts.fftw_lock_fun, p->opts.fftw_unlock_fun, p->opts.fftw_lock_data); + if ((type != 1) && (type != 2) && (type != 3)) { fprintf(stderr, "[%s] Invalid type (%d), should be 1, 2 or 3.\n", __func__, type); return FINUFFT_ERR_TYPE_NOTVALID; @@ -557,6 +564,12 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans, fprintf(stderr, "[%s] ntrans (%d) should be at least 1.\n", __func__, ntrans); return FINUFFT_ERR_NTRANS_NOTVALID; } + if (!p->opts.fftw_lock_fun != !p->opts.fftw_unlock_fun) { + fprintf(stderr, "[%s] fftw_(un)lock functions should be both null or both set\n", + __func__); + return FINUFFT_ERR_LOCK_FUNS_INVALID; + ; + } // get stuff from args... p->type = type; @@ -708,7 +721,7 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans, } timer.restart(); - p->fwBatch = p->fftPlan.alloc_complex(p->nf * p->batchSize); // the big workspace + p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // the big workspace if (p->opts.debug) printf("[%s] fwBatch %.2fGB alloc: \t%.3g s\n", __func__, (double)1E-09 * sizeof(CPX) * p->nf * p->batchSize, timer.elapsedsec()); @@ -723,7 +736,7 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans, timer.restart(); // plan the FFTW auto ns = gridsize_for_fft(p); - p->fftPlan.plan(ns, p->batchSize, p->fwBatch, p->fftSign, p->opts.fftw, nthr_fft); + p->fftPlan->plan(ns, p->batchSize, p->fwBatch, p->fftSign, p->opts.fftw, nthr_fft); if (p->opts.debug) printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, p->opts.fftw, nthr_fft, timer.elapsedsec()); @@ -850,8 +863,8 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT __func__); return FINUFFT_ERR_MAXNALLOC; } - p->fftPlan.free(p->fwBatch); - p->fwBatch = p->fftPlan.alloc_complex(p->nf * p->batchSize); // maybe big workspace + p->fftPlan->free(p->fwBatch); + p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // maybe big workspace // (note FFTW_ALLOC is not needed over malloc, but matches its type) if (p->CpBatch) free(p->CpBatch); @@ -1166,7 +1179,7 @@ int FINUFFT_DESTROY(FINUFFT_PLAN p) if (!p) // NULL ptr, so not a ptr to a plan, report error return 1; - p->fftPlan.free(p->fwBatch); // free the big FFTW (or t3 spread) working array + p->fftPlan->free(p->fwBatch); // free the big FFTW (or t3 spread) working array free(p->sortIndices); if (p->type == 1 || p->type == 2) { free(p->phiHat1); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 0e1a1730f..3736c2caa 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -37,6 +37,17 @@ add_test( COMMAND testutils WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) +if(NOT FINUFFT_USE_DUCC0) + add_executable(fftw_lock_test fftw_lock_test.cpp) + target_compile_features(fftw_lock_test PRIVATE cxx_std_17) + finufft_link_test(fftw_lock_test) + + add_test( + NAME run_fftw_lock_test + COMMAND fftw_lock_test + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) +endif() + # Add ctest definitions that run at both precisions... function(add_tests_with_prec PREC REQ_TOL CHECK_TOL SUFFIX) # All of the following should be run at OMP_NUM_THREADS=4 or something small, diff --git a/test/fftw_lock_test.cpp b/test/fftw_lock_test.cpp new file mode 100644 index 000000000..45e148860 --- /dev/null +++ b/test/fftw_lock_test.cpp @@ -0,0 +1,63 @@ +#include +#include +#include + +#include +#include +#include + +// This file tests the user locking mechanism for multi-threaded FFTW. This +// demonstrates a user lock to prevent FFTW plan calls from interfering with +// finufft plan calls (make/destroy). +// Robert Blackwell. Based on bug identified by Jonas Krimmer (9/17/24) +// See discussion at https://github.com/ludvigak/FINUFFT.jl/issues/62 + +constexpr int N = 65384; + +// Example user lock functions +void locker(void *lck) { reinterpret_cast(lck)->lock(); } +void unlocker(void *lck) { reinterpret_cast(lck)->unlock(); } + +int main() { + int64_t Ns[3]; // guru describes mode array by vector [N1,N2..] + Ns[0] = N; + std::mutex lck; + + finufft_opts opts; + finufft_default_opts(&opts); + opts.nthreads = 1; + opts.debug = 0; + opts.fftw_lock_fun = locker; + opts.fftw_unlock_fun = unlocker; + opts.fftw_lock_data = reinterpret_cast(&lck); + + // random nonuniform points (x) and complex strengths (c)... + std::vector> c(N); + + omp_set_num_threads(8); + + // init FFTW threads + fftw_init_threads(); + +// FFTW and FINUFFT execution using OpenMP parallelization +#pragma omp parallel for + for (int j = 0; j < 100; ++j) { + // allocate output array for FFTW... + std::vector> F1(N); + + // FFTW plan + lck.lock(); + fftw_plan_with_nthreads(1); + fftw_plan plan = fftw_plan_dft_1d(N, reinterpret_cast(c.data()), + reinterpret_cast(F1.data()), + FFTW_FORWARD, FFTW_ESTIMATE); + fftw_destroy_plan(plan); + lck.unlock(); + + // FINUFFT plan + finufft_plan nufftplan; + finufft_makeplan(1, 1, Ns, 1, 1, 1e-6, &nufftplan, &opts); + finufft_destroy(nufftplan); + } + return 0; +}