diff --git a/.github/workflows/python_build_wheels.yml b/.github/workflows/python_build_wheels.yml
index 455764213..ad16b0373 100644
--- a/.github/workflows/python_build_wheels.yml
+++ b/.github/workflows/python_build_wheels.yml
@@ -21,10 +21,10 @@ jobs:
           CIBW_BEFORE_ALL_MACOS: |
             # In order to reinstall a version of GCC compatible with older versions of macOS, we need to first uninstall the existing version.
             brew uninstall gcc
-            pkg=$(brew fetch --force --bottle-tag=monterey gcc | grep 'Downloaded to' | cut -d' ' -f3)
+            pkg=$(brew fetch --force --bottle-tag=monterey gcc | grep 'Downloaded to.*monterey.*' | cut -d' ' -f3)
             brew install $pkg
 
-            pkg=$(brew fetch --force --bottle-tag=monterey fftw | grep 'Downloaded to' | cut -d' ' -f3)
+            pkg=$(brew fetch --force --bottle-tag=monterey fftw | grep 'Downloaded to.*monterey.*' | cut -d' ' -f3)
             brew install $pkg
           CIBW_ARCHS_MACOS: "x86_64"
           # Need following versions of GCC for compatibility with fftw
@@ -56,10 +56,10 @@ jobs:
           CIBW_BEFORE_ALL_MACOS: |
             # In order to reinstall a version of GCC compatible with older versions of macOS, we need to first uninstall the existing version.
             brew uninstall gcc
-            pkg=$(brew fetch --force --bottle-tag=arm64_monterey gcc | grep 'Downloaded to' | cut -d' ' -f3)
+            pkg=$(brew fetch --force --bottle-tag=arm64_monterey gcc | grep 'Downloaded to.*monterey.*' | cut -d' ' -f3)
             brew install $pkg
 
-            pkg=$(brew fetch --force --bottle-tag=arm64_monterey fftw | grep 'Downloaded to' | cut -d' ' -f3)
+            pkg=$(brew fetch --force --bottle-tag=arm64_monterey fftw | grep 'Downloaded to.*monterey.*' | cut -d' ' -f3)
             brew install $pkg
           CIBW_ENVIRONMENT_MACOS: >
             CC=gcc-14
diff --git a/CHANGELOG b/CHANGELOG
index fe852d418..37f4c9c80 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,15 +2,21 @@ List of features / changes made / release notes, in reverse chronological order.
 If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).
 
 Master (9/10/24)
+
 * reduced roundoff error in a[n] phase calc in CPU onedim_fseries_kernel().
    #534 (Barnett).
-* Support for type 3 in 1D, 2D, and 3D in the GPU library cufinufft (PR #517).
-    - Removed the CPU fseries computation (only used for benchmark no longer needed).
-    - Added complex arithmetic support for cuda_complex type
-    - Added tests for type 3 in 1D, 2D, and 3D and cuda_complex arithmetic
-    - Minor fixes on the GPU code:
-        a) removed memory leaks in case of errors
-        b) renamed maxbatchsize to batchsize
+* GPU code type 1,2 also reduced round-off error in phases, to match CPU code;
+  rationalized onedim_{fseries,nuft}_* GPU codes to match CPU (Barbone, Barnett)
+* Added type 3 in 1D, 2D, and 3D, in the GPU library cufinufft. PR #517, Barbone
+  - Removed the CPU fseries computation (used for benchmark, no longer needed)
+  - Added complex arithmetic support for cuda_complex type
+  - Added tests for type 3 in 1D, 2D, and 3D and cuda_complex arithmetic
+  - Minor fixes on the GPU code:
+    a) removed memory leaks in case of errors
+    b) renamed maxbatchsize to batchsize
+* Add options for user-provided FFTW locker (PR548, Blackwell). These options can be be
+used to prevent crashes when a user is creating/destroying FFTW plans and
+FINUFFT plans in threads simultaneously.
 
 V 2.3.0 (9/5/24)
 
diff --git a/docs/opts.rst b/docs/opts.rst
index 89b7be8cd..d659b8b20 100644
--- a/docs/opts.rst
+++ b/docs/opts.rst
@@ -189,3 +189,77 @@ Here ``0`` makes an automatic choice. If you are unhappy with this, then for sma
 **spread_nthr_atomic**: if non-negative: for numbers of threads up to this value, an OMP critical block for ``add_wrapped_subgrid`` is used in spreading (type 1 transforms). Above this value, instead OMP atomic writes are used, which scale better for large thread numbers. If negative, the heuristic default in the spreader is used, set in ``src/spreadinterp.cpp:setup_spreader()``.
 
 **spread_max_sp_size**: if positive, overrides the maximum subproblem (chunking) size for multithreaded spreading (type 1 transforms). Otherwise the default in the spreader is used, set in ``src/spreadinterp.cpp:setup_spreader()``, which we believe is a decent heuristic for Intel i7 and xeon machines.
+
+
+Thread safety options (advanced)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+By default, with FFTW as the FFT library, FINUFFT is thread safe so long as no other threads are calling FFTW plan creation/destruction routines independently of FINUFFT. If these FFTW routines are called outside of FINUFFT, then the program is liable to crash. In most cases, the calling program can simply call the FFTW routine ``fftw_make_planner_thread_safe()`` before threading out and thread safety will be maintained. However, in instances where this is less desirable, we provide a means to provide your own FFTW locking mechanism. The following example code should exercise FFTW thread safety, and can be built with ``c++ thread_test.cpp -o thread_test -lfinufft -lfftw3_threads -lfftw3 -fopenmp -std=c++11``, assuming the finufft include and library paths are set.
+
+.. code-block:: C++
+
+
+  // thread_test.cpp
+  #include <vector>
+  #include <mutex>
+  #include <complex>
+
+  #include <fftw3.h>
+  #include <finufft.h>
+  #include <omp.h>
+
+  using namespace std;
+
+  constexpr int N = 65384;
+
+  void locker(void *lck) { reinterpret_cast<recursive_mutex *>(lck)->lock(); }
+  void unlocker(void *lck) { reinterpret_cast<recursive_mutex *>(lck)->unlock(); }
+
+  int main() {
+    int64_t Ns[3]; // guru describes mode array by vector [N1,N2..]
+    Ns[0] = N;
+    recursive_mutex lck;
+
+    finufft_opts opts;
+    finufft_default_opts(&opts);
+    opts.nthreads = 1;
+    opts.debug = 0;
+    opts.fftw_lock_fun = locker;
+    opts.fftw_unlock_fun = unlocker;
+    opts.fftw_lock_data = reinterpret_cast<void *>(&lck);
+
+    // random nonuniform points (x) and complex strengths (c)
+    vector<complex<double>> c(N);
+
+    // init FFTW threads
+    fftw_init_threads();
+
+    // FFTW and FINUFFT execution using OpenMP parallelization
+    #pragma omp parallel for
+    for (int j = 0; j < 100; ++j) {
+      // allocate output array for FFTW...
+      vector<complex<double>> F1(N);
+
+      // FFTW plan
+      lck.lock();
+      fftw_plan_with_nthreads(1);
+      fftw_plan plan = fftw_plan_dft_1d(N, reinterpret_cast<fftw_complex*>(c.data()),
+                                        reinterpret_cast<fftw_complex*>(F1.data()),
+                                        FFTW_FORWARD, FFTW_ESTIMATE);
+      fftw_destroy_plan(plan);
+      lck.unlock();
+
+      // FINUFFT plan
+      finufft_plan nufftplan;
+      finufft_makeplan(1, 1, Ns, 1, 1, 1e-6, &nufftplan, &opts);
+      finufft_destroy(nufftplan);
+    }
+
+    return 0;
+  }
+
+**fftw_lock_fun**:  ``void (fun*)(void *)`` C-style callback function to lock calls to FFTW plan manipulation routines. A ``nullptr`` or ``0`` value will be ignored. If non-null, ``fftw_unlock_fun`` must also be set.
+
+**fftw_unlock_fun**: ``void (fun*)(void *)`` C-style callback function to unlock calls to FFTW plan manipulation routines. A ``nullptr`` or ``0`` value will be ignored. If non-null, ``fftw_lock_fun`` must also be set.
+
+**fftw_lock_data**:  ``void *data`` pointer, typically to the lock object itself. Pointer will be passed to ``fftw_lock_fun`` and ``fftw_unlock_fun`` if they are set.
diff --git a/docs/users.rst b/docs/users.rst
index 3cfe8c3b4..8c439bff9 100644
--- a/docs/users.rst
+++ b/docs/users.rst
@@ -35,7 +35,7 @@ and also add them to GitHub's Used By feature):
 #. `EM-Align <https://github.com/ShkolniskyLab/emalign>`_: Aligning rotation, reflection, and translation between volumes (desntiy maps) in cryo-electron microscopy, from Shkolnisky Lab at Tel Aviv.
 
 #. `spinifel <https://gitlab.osti.gov/mtip/spinifel>`_: Uses the multitiered iterative phasing (M-TIP) algorithm for single particle X-ray diffraction imaging, on CPU/GPU, from the ExaFEL project at LBNL/DOE.
-   
+
 #. `sinctransform <https://github.com/hannahlawrence/sinctransform>`_: C++ and MATLAB codes to evaluate sums of the sinc and sinc^2 kernels between arbitrary nonuniform points in 1,2, or 3 dimensions, by Hannah Lawrence (2017 summer intern at Flatiron).
 
 #. `fsinc <https://github.com/gauteh/fsinc>`_:  Gaute Hope's fast sinc transform and interpolation Python package.
@@ -46,18 +46,22 @@ and also add them to GitHub's Used By feature):
 
 #. `TRIQS CTINT <https://github.com/TRIQS/ctint>`_: continous time interaction-expansion solver, by N. Wentzell and O. Parcollet (Flatiron Institute, part of platform for interacting quantum systems).
 
+#. `cunuSHT <https://github.com/Sebastian-Belkner/cunuSHT>`_: GPU accelerated spherical harmonic transforms from nonuniform samples (arbitrary pixelizations), by S. Belkner and coauthors. https://arxiv.org/abs/2406.14542
+
+#. `FReSCO <https://github.com/martiniani-lab/FReSCo>`_: Fast reciprocal-space correlator, by Aaron Shih, Mathias Kasiulis, and Stefano Martiani. This uses thousands of calls to all three transform types in 2D or 3D, to iteratively adjust nonuniform points until their Fourier transforms match a desired function. Physics Mag. article and movie: https://physics.aps.org/articles/v17/134
+
 
 Other wrappers to (cu)FINUFFT
 ------------------------------
-   
+
 #. `FINUFFT.jl <https://github.com/ludvigak/FINUFFT.jl>`_: a `julia <https://julialang.org/>`_ language wrapper by Ludvig af Klinteberg, Libin Lu, and others, now using pure Julia, and fully featured (rather than via Python). This is itself wrapped by `AbstractNFFTs.jl` in `NFFT.jl <https://juliamath.github.io/NFFT.jl/dev/performance/>`_.
 
 #. `TensorFlow NUFFT <https://github.com/mrphys/tensorflow-nufft>`_: a wrapper to the differentiable machine learning Python tool TensorFlow, for the CPU (via FINUFFT) and GPU (via cuFINUFFT). By Javier Montalt Tordera (UCL).
 
 #. `JAX bindings to (cu)FINUFFT <https://github.com/dfm/jax-finufft>`_: a wrapper to the differentiable machine learning Python tool JAX. Directly exposes the FINUFFT library to JAX's XLA backend, as well as implementing differentiation rules for the transforms. By Dan Foreman-Mackey (CCA).
-   
+
 #. `PyTorch wrapper to (cu)FINUFFT <https://flatironinstitute.github.io/pytorch-finufft>`_:  a wrapper to the differentiable machine learning Python tool PyTorch. By Michael Eickenberg and Brian Ward (CCM).
-   
+
 
 Research output using (cu)FINUFFT
 ---------------------------------
@@ -92,14 +96,14 @@ For the latest see: Google Scholar `FINUFFT citations <https://scholar.google.co
 #. A. Harness, S. Shaklan, P. Willems, N. J. Kasdin, K. Balasubramanian, V. White, K. Yee, P. Dumont, R. Muller, S. Vuong, M. Galvin,
    "Optical experiments and model validation of perturbed starshade designs," Proc. SPIE 11823, Techniques and Instrumentation for Detection of Exoplanets X, 1182312 (1 September 2021); https://doi.org/10.1117/12.2595409
 
-#. Chang, P., Pienaar, E., & Gebbie, T. (2020). "Malliavin--Mancino Estimators Implemented with Nonuniform Fast Fourier Transforms." SIAM J. Sci. Comput. 42(6), B1378–B1403. https://doi.org/10.1137/20m1325903 
+#. Chang, P., Pienaar, E., & Gebbie, T. (2020). "Malliavin--Mancino Estimators Implemented with Nonuniform Fast Fourier Transforms." SIAM J. Sci. Comput. 42(6), B1378–B1403. https://doi.org/10.1137/20m1325903
 
 #. Heisenberg voxelization (HVOX) for inteferometry of spherical sky maps in radio-astronomy, by Kashani, Simeoni, et al. (2023) https://arxiv.org/abs/2306.06007 https://github.com/matthieumeo/hvox
 
 #. Sriramkrishnan Muralikrishnan at the Jülich Supercomputing Centre is running cufinufft on 6144 A100 GPUs (the NERSC-9 supercomputer), for a particle-in-Fourier method for plasma simulations. https://pasc23.pasc-conference.org/presentation/?id=msa167&sess=sess154
 
 #. Related to that, FINUFFT is being used for a better-converging Fourier approach to the Immersed Boundary method of Peskin and his group at NYU. Zhe Chen and Charles Peskin, https://arxiv.org/abs/2302.08694
-   
+
 #. Pei R, Askham T, Greengard L, Jiang S (2023). "A fast method for imposing periodic boundary conditions on arbitrarily-shaped lattices in two dimensions." J. Comput. Phys. 474, 111792. https://doi.org/10.1016/j.jcp.2022.111792 Uses FINUFFT for plane wave sums.
 
 #. Dylan Green, JR Jamora, and Anne Gelb (2023). "Leveraging joint sparsity in 3D synthetic aperture radar imaging," Appl. Math. Modern Chall. 1, 61-86. https://doi.org/10.3934/ammc.2023005 Uses 3D transforms between $N=201^3$ modes (voxels) and $M=313300$ data points. As they state, "...the computational cost of each method heavily depends on the NUFFT algorithm used."
@@ -119,8 +123,8 @@ Papers influenced by other aspects of FINUFFT:
 
 1. NFFT.jl: Generic and Fast Julia Implementation of the Nonequidistant Fast Fourier Transform, by Tobias Knopp, Marija Boberg, Mirco Grosser (2022). https://arxiv.org/abs/2208.00049  They use our blocked spreading and piecewise polynomial ideas, and beat our type 1 and 2 performance by a factor of up to 1.7 in multithreaded cases. Code is dimension-independent but very abstract (two levels of meta-programming, I believe).
 
-   
-   
+
+
 Some citations to FINUFFT that do not appear to be actual users
 ---------------------------------------------------------------
 
@@ -133,5 +137,5 @@ Some citations to FINUFFT that do not appear to be actual users
 #. https://arxiv.org/abs/1912.09746
 
 #. https://arxiv.org/abs/2010.05295
-      
+
 Now too many to track by hand... please see Google Scholar search linked above.
diff --git a/include/finufft/defs.h b/include/finufft/defs.h
index 76ac0c545..37eb85d34 100644
--- a/include/finufft/defs.h
+++ b/include/finufft/defs.h
@@ -18,6 +18,7 @@
 // public header gives access to f_opts, f_spread_opts, f_plan...
 // (and clobbers FINUFFT* macros; watch out!)
 #include <finufft.h>
+#include <memory>
 
 // --------------- Private data types for compilation in either prec ---------
 // Devnote: must match those in relevant prec of public finufft.h interface!
@@ -258,7 +259,7 @@ typedef struct FINUFFT_PLAN_S { // the main plan object, fully C++
   FINUFFT_PLAN innerT2plan; // ptr used for type 2 in step 2 of type 3
 
   // other internal structs; each is C-compatible of course
-  Finufft_FFT_plan<FLT> fftPlan;
+  std::unique_ptr<Finufft_FFT_plan<FLT>> fftPlan;
   finufft_opts opts; // this and spopts could be made ptrs
   finufft_spread_opts spopts;
 
diff --git a/include/finufft/fft.h b/include/finufft/fft.h
index 87e34d657..7f47814a0 100644
--- a/include/finufft/fft.h
+++ b/include/finufft/fft.h
@@ -8,6 +8,8 @@
 
 template<typename T> class Finufft_FFT_plan {
 public:
+  Finufft_FFT_plan(void (*)(void *) = nullptr, void (*)(void *) = nullptr,
+                   void * = nullptr) {}
   void plan(const std::vector<int> & /*dims*/, size_t /*batchSize*/,
             std::complex<T> * /*ptr*/, int /*sign*/, int /*options*/, int /*nthreads*/) {}
   static std::complex<T> *alloc_complex(size_t N) { return new std::complex<T>[N]; }
@@ -36,9 +38,19 @@ template<> struct Finufft_FFT_plan<float> {
   }
   fftwf_plan plan_;
 
+  void (*fftw_lock_fun)(void *);   // Function ptr that locks the FFTW planner
+  void (*fftw_unlock_fun)(void *); // Function ptr that unlocks the FFTW planner
+  void *lock_data;
+  void lock() { fftw_lock_fun ? fftw_lock_fun(lock_data) : mut().lock(); }
+  void unlock() { fftw_lock_fun ? fftw_unlock_fun(lock_data) : mut().unlock(); }
+
 public:
-  Finufft_FFT_plan() : plan_(nullptr) {
-    std::lock_guard<std::mutex> lock(mut());
+  Finufft_FFT_plan(void (*fftw_lock_fun_)(void *)   = nullptr,
+                   void (*fftw_unlock_fun_)(void *) = nullptr,
+                   void *lock_data_                 = nullptr)
+      : plan_(nullptr), fftw_lock_fun(fftw_lock_fun_), fftw_unlock_fun(fftw_unlock_fun_),
+        lock_data(lock_data_) {
+    lock();
 #ifdef _OPENMP
     static bool initialized = false;
     if (!initialized) {
@@ -46,17 +58,19 @@ template<> struct Finufft_FFT_plan<float> {
       initialized = true;
     }
 #endif
+    unlock();
   }
   ~Finufft_FFT_plan() {
-    std::lock_guard<std::mutex> lock(mut());
+    lock();
     fftwf_destroy_plan(plan_);
+    unlock();
   }
 
   void plan(const std::vector<int> &dims, size_t batchSize, std::complex<float> *ptr,
             int sign, int options, int nthreads) {
     uint64_t nf = 1;
     for (auto i : dims) nf *= i;
-    std::lock_guard<std::mutex> lock(mut());
+    lock();
 #ifdef _OPENMP
     fftwf_plan_with_nthreads(nthreads);
 #endif
@@ -64,6 +78,7 @@ template<> struct Finufft_FFT_plan<float> {
                                 reinterpret_cast<fftwf_complex *>(ptr), nullptr, 1, nf,
                                 reinterpret_cast<fftwf_complex *>(ptr), nullptr, 1, nf,
                                 sign, options);
+    unlock();
   }
   static std::complex<float> *alloc_complex(size_t N) {
     return reinterpret_cast<std::complex<float> *>(fftwf_alloc_complex(N));
@@ -74,17 +89,20 @@ template<> struct Finufft_FFT_plan<float> {
   void execute() { fftwf_execute(plan_); }
 
   static void forget_wisdom() {
-    std::lock_guard<std::mutex> lock(mut());
+    //    lock();
     fftwf_forget_wisdom();
+    //    unlock();
   }
   static void cleanup() {
-    std::lock_guard<std::mutex> lock(mut());
+    //    lock();
     fftwf_cleanup();
+    //    unlock();
   }
   static void cleanup_threads() {
 #ifdef _OPENMP
-    std::lock_guard<std::mutex> lock(mut());
+    //    lock();
     fftwf_cleanup_threads();
+//    unlock();
 #endif
   }
 };
@@ -97,9 +115,19 @@ template<> struct Finufft_FFT_plan<double> {
   }
   fftw_plan plan_;
 
+  void (*fftw_lock_fun)(void *);   // Function ptr that locks the FFTW planner
+  void (*fftw_unlock_fun)(void *); // Function ptr that unlocks the FFTW planner
+  void *lock_data;
+  void lock() { fftw_lock_fun ? fftw_lock_fun(lock_data) : mut().lock(); }
+  void unlock() { fftw_lock_fun ? fftw_unlock_fun(lock_data) : mut().unlock(); }
+
 public:
-  Finufft_FFT_plan() : plan_(nullptr) {
-    std::lock_guard<std::mutex> lock(mut());
+  Finufft_FFT_plan(void (*fftw_lock_fun_)(void *)   = nullptr,
+                   void (*fftw_unlock_fun_)(void *) = nullptr,
+                   void *lock_data_                 = nullptr)
+      : plan_(nullptr), fftw_lock_fun(fftw_lock_fun_), fftw_unlock_fun(fftw_unlock_fun_),
+        lock_data(lock_data_) {
+    lock();
 #ifdef _OPENMP
     static bool initialized = false;
     if (!initialized) {
@@ -107,17 +135,19 @@ template<> struct Finufft_FFT_plan<double> {
       initialized = true;
     }
 #endif
+    unlock();
   }
   ~Finufft_FFT_plan() {
-    std::lock_guard<std::mutex> lock(mut());
+    lock();
     fftw_destroy_plan(plan_);
+    unlock();
   }
 
   void plan(const std::vector<int> &dims, size_t batchSize, std::complex<double> *ptr,
             int sign, int options, int nthreads) {
     uint64_t nf = 1;
     for (auto i : dims) nf *= i;
-    std::lock_guard<std::mutex> lock(mut());
+    lock();
 #ifdef _OPENMP
     fftw_plan_with_nthreads(nthreads);
 #endif
@@ -125,6 +155,7 @@ template<> struct Finufft_FFT_plan<double> {
                                reinterpret_cast<fftw_complex *>(ptr), nullptr, 1, nf,
                                reinterpret_cast<fftw_complex *>(ptr), nullptr, 1, nf,
                                sign, options);
+    unlock();
   }
   static std::complex<double> *alloc_complex(size_t N) {
     return reinterpret_cast<std::complex<double> *>(fftw_alloc_complex(N));
@@ -135,17 +166,20 @@ template<> struct Finufft_FFT_plan<double> {
   void execute() { fftw_execute(plan_); }
 
   static void forget_wisdom() {
-    std::lock_guard<std::mutex> lock(mut());
+    //    lock();
     fftw_forget_wisdom();
+    //    unlock();
   }
   static void cleanup() {
-    std::lock_guard<std::mutex> lock(mut());
+    //    lock();
     fftw_cleanup();
+    //    unlock();
   }
   static void cleanup_threads() {
 #ifdef _OPENMP
-    std::lock_guard<std::mutex> lock(mut());
+    //    lock();
     fftw_cleanup_threads();
+//    unlock();
 #endif
   }
 };
diff --git a/include/finufft_errors.h b/include/finufft_errors.h
index fb827557c..10465aa13 100644
--- a/include/finufft_errors.h
+++ b/include/finufft_errors.h
@@ -24,6 +24,7 @@ enum {
   FINUFFT_ERR_BINSIZE_NOTVALID       = 18,
   FINUFFT_ERR_INSUFFICIENT_SHMEM     = 19,
   FINUFFT_ERR_NUM_NU_PTS_INVALID     = 20,
-  FINUFFT_ERR_INVALID_ARGUMENT       = 21
+  FINUFFT_ERR_INVALID_ARGUMENT       = 21,
+  FINUFFT_ERR_LOCK_FUNS_INVALID      = 22
 };
 #endif
diff --git a/include/finufft_opts.h b/include/finufft_opts.h
index 0435b8c41..0d482f732 100644
--- a/include/finufft_opts.h
+++ b/include/finufft_opts.h
@@ -32,6 +32,12 @@ typedef struct finufft_opts { // defaults see finufft.cpp:finufft_default_opts()
                           // atomic
   int spread_max_sp_size; // if >0, overrides spreader (dir=1) max subproblem size
                           // sphinx tag (don't remove): @opts_end
+
+  // User can provide their own FFTW planner lock functions for thread safety
+  // Null values ignored and use a default lock function (both or neither must be set)
+  void (*fftw_lock_fun)(void *);   // Function ptr that locks the FFTW planner
+  void (*fftw_unlock_fun)(void *); // Function ptr that unlocks the FFTW planner
+  void *fftw_lock_data;            // Data to pass to the lock functions (e.g. a mutex)
 } finufft_opts;
 
 // Those of the above of the form spread_* indicate pass through to finufft_spread_opts
diff --git a/python/finufft/finufft/_finufft.py b/python/finufft/finufft/_finufft.py
index 99969a38c..10af6829c 100644
--- a/python/finufft/finufft/_finufft.py
+++ b/python/finufft/finufft/_finufft.py
@@ -85,7 +85,10 @@ class FinufftOpts(ctypes.Structure):
                       ('spread_thread', c_int),
                       ('maxbatchsize', c_int),
                       ('spread_nthr_atomic', c_int),
-                      ('spread_max_sp_size', c_int)]
+                      ('spread_max_sp_size', c_int),
+                      ('fftw_lock_fun', c_void_p),
+                      ('fftw_unlock_fun', c_void_p),
+                      ('fftw_lock_data', c_void_p)]
 
 
 FinufftPlan = c_void_p
diff --git a/src/fft.cpp b/src/fft.cpp
index c2c8ddf82..2a1e268e0 100644
--- a/src/fft.cpp
+++ b/src/fft.cpp
@@ -103,6 +103,6 @@ void do_fft(FINUFFT_PLAN p) {
   }
 #endif
 #else
-  p->fftPlan.execute(); // if thisBatchSize<batchSize it wastes some flops
+  p->fftPlan->execute(); // if thisBatchSize<batchSize it wastes some flops
 #endif
 }
diff --git a/src/finufft.cpp b/src/finufft.cpp
index 6709dbf87..c4fc56a75 100644
--- a/src/finufft.cpp
+++ b/src/finufft.cpp
@@ -15,6 +15,7 @@
 #include <cstdlib>
 #include <iomanip>
 #include <iostream>
+#include <memory>
 #include <vector>
 
 using namespace std;
@@ -520,6 +521,9 @@ void FINUFFT_DEFAULT_OPTS(finufft_opts *o)
   o->maxbatchsize       = 0;
   o->spread_nthr_atomic = -1;
   o->spread_max_sp_size = 0;
+  o->fftw_lock_fun      = nullptr;
+  o->fftw_unlock_fun    = nullptr;
+  o->fftw_lock_data     = nullptr;
   // sphinx tag (don't remove): @defopts_end
 }
 
@@ -545,6 +549,9 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans,
     printf("[%s] new plan: FINUFFT version " FINUFFT_VER " .................\n",
            __func__);
 
+  p->fftPlan = std::make_unique<Finufft_FFT_plan<FLT>>(
+      p->opts.fftw_lock_fun, p->opts.fftw_unlock_fun, p->opts.fftw_lock_data);
+
   if ((type != 1) && (type != 2) && (type != 3)) {
     fprintf(stderr, "[%s] Invalid type (%d), should be 1, 2 or 3.\n", __func__, type);
     return FINUFFT_ERR_TYPE_NOTVALID;
@@ -557,6 +564,12 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans,
     fprintf(stderr, "[%s] ntrans (%d) should be at least 1.\n", __func__, ntrans);
     return FINUFFT_ERR_NTRANS_NOTVALID;
   }
+  if (!p->opts.fftw_lock_fun != !p->opts.fftw_unlock_fun) {
+    fprintf(stderr, "[%s] fftw_(un)lock functions should be both null or both set\n",
+            __func__);
+    return FINUFFT_ERR_LOCK_FUNS_INVALID;
+    ;
+  }
 
   // get stuff from args...
   p->type    = type;
@@ -708,7 +721,7 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans,
     }
 
     timer.restart();
-    p->fwBatch = p->fftPlan.alloc_complex(p->nf * p->batchSize); // the big workspace
+    p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // the big workspace
     if (p->opts.debug)
       printf("[%s] fwBatch %.2fGB alloc:   \t%.3g s\n", __func__,
              (double)1E-09 * sizeof(CPX) * p->nf * p->batchSize, timer.elapsedsec());
@@ -723,7 +736,7 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans,
 
     timer.restart(); // plan the FFTW
     auto ns = gridsize_for_fft(p);
-    p->fftPlan.plan(ns, p->batchSize, p->fwBatch, p->fftSign, p->opts.fftw, nthr_fft);
+    p->fftPlan->plan(ns, p->batchSize, p->fwBatch, p->fftSign, p->opts.fftw, nthr_fft);
     if (p->opts.debug)
       printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, p->opts.fftw,
              nthr_fft, timer.elapsedsec());
@@ -850,8 +863,8 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT
               __func__);
       return FINUFFT_ERR_MAXNALLOC;
     }
-    p->fftPlan.free(p->fwBatch);
-    p->fwBatch = p->fftPlan.alloc_complex(p->nf * p->batchSize); // maybe big workspace
+    p->fftPlan->free(p->fwBatch);
+    p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // maybe big workspace
 
     // (note FFTW_ALLOC is not needed over malloc, but matches its type)
     if (p->CpBatch) free(p->CpBatch);
@@ -1166,7 +1179,7 @@ int FINUFFT_DESTROY(FINUFFT_PLAN p)
   if (!p) // NULL ptr, so not a ptr to a plan, report error
     return 1;
 
-  p->fftPlan.free(p->fwBatch); // free the big FFTW (or t3 spread) working array
+  p->fftPlan->free(p->fwBatch); // free the big FFTW (or t3 spread) working array
   free(p->sortIndices);
   if (p->type == 1 || p->type == 2) {
     free(p->phiHat1);
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 0e1a1730f..3736c2caa 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -37,6 +37,17 @@ add_test(
   COMMAND testutils
   WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
 
+if(NOT FINUFFT_USE_DUCC0)
+  add_executable(fftw_lock_test fftw_lock_test.cpp)
+  target_compile_features(fftw_lock_test PRIVATE cxx_std_17)
+  finufft_link_test(fftw_lock_test)
+
+  add_test(
+    NAME run_fftw_lock_test
+    COMMAND fftw_lock_test
+    WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
+endif()
+
 # Add ctest definitions that run at both precisions...
 function(add_tests_with_prec PREC REQ_TOL CHECK_TOL SUFFIX)
   # All of the following should be run at OMP_NUM_THREADS=4 or something small,
diff --git a/test/fftw_lock_test.cpp b/test/fftw_lock_test.cpp
new file mode 100644
index 000000000..45e148860
--- /dev/null
+++ b/test/fftw_lock_test.cpp
@@ -0,0 +1,63 @@
+#include <complex>
+#include <mutex>
+#include <vector>
+
+#include <fftw3.h>
+#include <finufft.h>
+#include <omp.h>
+
+// This file tests the user locking mechanism for multi-threaded FFTW. This
+// demonstrates a user lock to prevent FFTW plan calls from interfering with
+// finufft plan calls (make/destroy).
+// Robert Blackwell. Based on bug identified by Jonas Krimmer (9/17/24)
+// See discussion at https://github.com/ludvigak/FINUFFT.jl/issues/62
+
+constexpr int N = 65384;
+
+// Example user lock functions
+void locker(void *lck) { reinterpret_cast<std::mutex *>(lck)->lock(); }
+void unlocker(void *lck) { reinterpret_cast<std::mutex *>(lck)->unlock(); }
+
+int main() {
+  int64_t Ns[3]; // guru describes mode array by vector [N1,N2..]
+  Ns[0] = N;
+  std::mutex lck;
+
+  finufft_opts opts;
+  finufft_default_opts(&opts);
+  opts.nthreads        = 1;
+  opts.debug           = 0;
+  opts.fftw_lock_fun   = locker;
+  opts.fftw_unlock_fun = unlocker;
+  opts.fftw_lock_data  = reinterpret_cast<void *>(&lck);
+
+  // random nonuniform points (x) and complex strengths (c)...
+  std::vector<std::complex<double>> c(N);
+
+  omp_set_num_threads(8);
+
+  // init FFTW threads
+  fftw_init_threads();
+
+// FFTW and FINUFFT execution using OpenMP parallelization
+#pragma omp parallel for
+  for (int j = 0; j < 100; ++j) {
+    // allocate output array for FFTW...
+    std::vector<std::complex<double>> F1(N);
+
+    // FFTW plan
+    lck.lock();
+    fftw_plan_with_nthreads(1);
+    fftw_plan plan = fftw_plan_dft_1d(N, reinterpret_cast<fftw_complex *>(c.data()),
+                                      reinterpret_cast<fftw_complex *>(F1.data()),
+                                      FFTW_FORWARD, FFTW_ESTIMATE);
+    fftw_destroy_plan(plan);
+    lck.unlock();
+
+    // FINUFFT plan
+    finufft_plan nufftplan;
+    finufft_makeplan(1, 1, Ns, 1, 1, 1e-6, &nufftplan, &opts);
+    finufft_destroy(nufftplan);
+  }
+  return 0;
+}