merge master

flatironinstitute · Sep 20, 2024 · 49c2548 · 49c2548
2 parents 068a5c0 + b98fd1d
commit 49c2548
Show file tree

Hide file tree

Showing 13 changed files with 259 additions and 43 deletions.
diff --git a/.github/workflows/python_build_wheels.yml b/.github/workflows/python_build_wheels.yml
@@ -21,10 +21,10 @@ jobs:
           CIBW_BEFORE_ALL_MACOS: |
             # In order to reinstall a version of GCC compatible with older versions of macOS, we need to first uninstall the existing version.
             brew uninstall gcc
-            pkg=$(brew fetch --force --bottle-tag=monterey gcc | grep 'Downloaded to' | cut -d' ' -f3)
+            pkg=$(brew fetch --force --bottle-tag=monterey gcc | grep 'Downloaded to.*monterey.*' | cut -d' ' -f3)
             brew install $pkg
 
-            pkg=$(brew fetch --force --bottle-tag=monterey fftw | grep 'Downloaded to' | cut -d' ' -f3)
+            pkg=$(brew fetch --force --bottle-tag=monterey fftw | grep 'Downloaded to.*monterey.*' | cut -d' ' -f3)
             brew install $pkg
           CIBW_ARCHS_MACOS: "x86_64"
           # Need following versions of GCC for compatibility with fftw
@@ -56,10 +56,10 @@ jobs:
           CIBW_BEFORE_ALL_MACOS: |
             # In order to reinstall a version of GCC compatible with older versions of macOS, we need to first uninstall the existing version.
             brew uninstall gcc
-            pkg=$(brew fetch --force --bottle-tag=arm64_monterey gcc | grep 'Downloaded to' | cut -d' ' -f3)
+            pkg=$(brew fetch --force --bottle-tag=arm64_monterey gcc | grep 'Downloaded to.*monterey.*' | cut -d' ' -f3)
             brew install $pkg
 
-            pkg=$(brew fetch --force --bottle-tag=arm64_monterey fftw | grep 'Downloaded to' | cut -d' ' -f3)
+            pkg=$(brew fetch --force --bottle-tag=arm64_monterey fftw | grep 'Downloaded to.*monterey.*' | cut -d' ' -f3)
             brew install $pkg
           CIBW_ENVIRONMENT_MACOS: >
             CC=gcc-14

diff --git a/CHANGELOG b/CHANGELOG
@@ -2,15 +2,21 @@ List of features / changes made / release notes, in reverse chronological order.
 If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).
 
 Master (9/10/24)
+
 * reduced roundoff error in a[n] phase calc in CPU onedim_fseries_kernel().
    #534 (Barnett).
-* Support for type 3 in 1D, 2D, and 3D in the GPU library cufinufft (PR #517).
-    - Removed the CPU fseries computation (only used for benchmark no longer needed).
-    - Added complex arithmetic support for cuda_complex type
-    - Added tests for type 3 in 1D, 2D, and 3D and cuda_complex arithmetic
-    - Minor fixes on the GPU code:
-        a) removed memory leaks in case of errors
-        b) renamed maxbatchsize to batchsize
+* GPU code type 1,2 also reduced round-off error in phases, to match CPU code;
+  rationalized onedim_{fseries,nuft}_* GPU codes to match CPU (Barbone, Barnett)
+* Added type 3 in 1D, 2D, and 3D, in the GPU library cufinufft. PR #517, Barbone
+  - Removed the CPU fseries computation (used for benchmark, no longer needed)
+  - Added complex arithmetic support for cuda_complex type
+  - Added tests for type 3 in 1D, 2D, and 3D and cuda_complex arithmetic
+  - Minor fixes on the GPU code:
+    a) removed memory leaks in case of errors
+    b) renamed maxbatchsize to batchsize
+* Add options for user-provided FFTW locker (PR548, Blackwell). These options can be
+used to prevent crashes when a user is creating/destroying FFTW plans and
+FINUFFT plans in threads simultaneously.
 
 V 2.3.0 (9/5/24)
 

diff --git a/docs/opts.rst b/docs/opts.rst
@@ -189,3 +189,77 @@ Here ``0`` makes an automatic choice. If you are unhappy with this, then for sma
 **spread_nthr_atomic**: if non-negative: for numbers of threads up to this value, an OMP critical block for ``add_wrapped_subgrid`` is used in spreading (type 1 transforms). Above this value, instead OMP atomic writes are used, which scale better for large thread numbers. If negative, the heuristic default in the spreader is used, set in ``src/spreadinterp.cpp:setup_spreader()``.
 
 **spread_max_sp_size**: if positive, overrides the maximum subproblem (chunking) size for multithreaded spreading (type 1 transforms). Otherwise the default in the spreader is used, set in ``src/spreadinterp.cpp:setup_spreader()``, which we believe is a decent heuristic for Intel i7 and xeon machines.
+
+
+Thread safety options (advanced)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+By default, with FFTW as the FFT library, FINUFFT is thread safe so long as no other threads are calling FFTW plan creation/destruction routines independently of FINUFFT. If these FFTW routines are called concurrently from other threads outside of FINUFFT, then the program is liable to crash. In most cases, the calling program can simply call the FFTW routine ``fftw_make_planner_thread_safe()`` before spawning threads, and thread safety will be maintained. However, in instances where this is less desirable, we provide a way to supply your own FFTW locking mechanism. The following example code should exercise FFTW thread safety, and can be built with ``c++ thread_test.cpp -o thread_test -lfinufft -lfftw3_threads -lfftw3 -fopenmp -std=c++11``, assuming the finufft include and library paths are set.
+
+.. code-block:: C++
+
+
+  // thread_test.cpp
+  #include <vector>
+  #include <mutex>
+  #include <complex>
+
+  #include <fftw3.h>
+  #include <finufft.h>
+  #include <omp.h>
+
+  using namespace std;
+
+  constexpr int N = 65384;
+
+  void locker(void *lck) { reinterpret_cast<recursive_mutex *>(lck)->lock(); }
+  void unlocker(void *lck) { reinterpret_cast<recursive_mutex *>(lck)->unlock(); }
+
+  int main() {
+    int64_t Ns[3]; // guru describes mode array by vector [N1,N2..]
+    Ns[0] = N;
+    recursive_mutex lck;
+
+    finufft_opts opts;
+    finufft_default_opts(&opts);
+    opts.nthreads = 1;
+    opts.debug = 0;
+    opts.fftw_lock_fun = locker;
+    opts.fftw_unlock_fun = unlocker;
+    opts.fftw_lock_data = reinterpret_cast<void *>(&lck);
+
+    // complex array (c), used below as the input of the FFTW plan
+    vector<complex<double>> c(N);
+
+    // init FFTW threads
+    fftw_init_threads();
+
+    // FFTW and FINUFFT execution using OpenMP parallelization
+    #pragma omp parallel for
+    for (int j = 0; j < 100; ++j) {
+      // allocate output array for FFTW...
+      vector<complex<double>> F1(N);
+
+      // FFTW plan
+      lck.lock();
+      fftw_plan_with_nthreads(1);
+      fftw_plan plan = fftw_plan_dft_1d(N, reinterpret_cast<fftw_complex*>(c.data()),
+                                        reinterpret_cast<fftw_complex*>(F1.data()),
+                                        FFTW_FORWARD, FFTW_ESTIMATE);
+      fftw_destroy_plan(plan);
+      lck.unlock();
+
+      // FINUFFT plan
+      finufft_plan nufftplan;
+      finufft_makeplan(1, 1, Ns, 1, 1, 1e-6, &nufftplan, &opts);
+      finufft_destroy(nufftplan);
+    }
+
+    return 0;
+  }
+
+**fftw_lock_fun**:  ``void (*fun)(void *)`` C-style callback function to lock calls to FFTW plan manipulation routines. A ``nullptr`` or ``0`` value will be ignored. If non-null, ``fftw_unlock_fun`` must also be set.
+
+**fftw_unlock_fun**: ``void (*fun)(void *)`` C-style callback function to unlock calls to FFTW plan manipulation routines. A ``nullptr`` or ``0`` value will be ignored. If non-null, ``fftw_lock_fun`` must also be set.
+
+**fftw_lock_data**:  ``void *data`` pointer, typically to the lock object itself. Pointer will be passed to ``fftw_lock_fun`` and ``fftw_unlock_fun`` if they are set.
diff --git a/docs/users.rst b/docs/users.rst
@@ -35,7 +35,7 @@ and also add them to GitHub's Used By feature):
 #. `EM-Align <https://github.com/ShkolniskyLab/emalign>`_: Aligning rotation, reflection, and translation between volumes (density maps) in cryo-electron microscopy, from Shkolnisky Lab at Tel Aviv.
 
 #. `spinifel <https://gitlab.osti.gov/mtip/spinifel>`_: Uses the multitiered iterative phasing (M-TIP) algorithm for single particle X-ray diffraction imaging, on CPU/GPU, from the ExaFEL project at LBNL/DOE.
-   
+
 #. `sinctransform <https://github.com/hannahlawrence/sinctransform>`_: C++ and MATLAB codes to evaluate sums of the sinc and sinc^2 kernels between arbitrary nonuniform points in 1,2, or 3 dimensions, by Hannah Lawrence (2017 summer intern at Flatiron).
 
 #. `fsinc <https://github.com/gauteh/fsinc>`_:  Gaute Hope's fast sinc transform and interpolation Python package.
@@ -46,18 +46,22 @@ and also add them to GitHub's Used By feature):
 
 #. `TRIQS CTINT <https://github.com/TRIQS/ctint>`_: continuous time interaction-expansion solver, by N. Wentzell and O. Parcollet (Flatiron Institute, part of platform for interacting quantum systems).
 
+#. `cunuSHT <https://github.com/Sebastian-Belkner/cunuSHT>`_: GPU accelerated spherical harmonic transforms from nonuniform samples (arbitrary pixelizations), by S. Belkner and coauthors. https://arxiv.org/abs/2406.14542
+
+#. `FReSCo <https://github.com/martiniani-lab/FReSCo>`_: Fast reciprocal-space correlator, by Aaron Shih, Mathias Kasiulis, and Stefano Martiniani. This uses thousands of calls to all three transform types in 2D or 3D, to iteratively adjust nonuniform points until their Fourier transforms match a desired function. Physics Mag. article and movie: https://physics.aps.org/articles/v17/134
+
 
 Other wrappers to (cu)FINUFFT
 ------------------------------
-   
+
 #. `FINUFFT.jl <https://github.com/ludvigak/FINUFFT.jl>`_: a `julia <https://julialang.org/>`_ language wrapper by Ludvig af Klinteberg, Libin Lu, and others, now using pure Julia, and fully featured (rather than via Python). This is itself wrapped by `AbstractNFFTs.jl` in `NFFT.jl <https://juliamath.github.io/NFFT.jl/dev/performance/>`_.
 
 #. `TensorFlow NUFFT <https://github.com/mrphys/tensorflow-nufft>`_: a wrapper to the differentiable machine learning Python tool TensorFlow, for the CPU (via FINUFFT) and GPU (via cuFINUFFT). By Javier Montalt Tordera (UCL).
 
 #. `JAX bindings to (cu)FINUFFT <https://github.com/dfm/jax-finufft>`_: a wrapper to the differentiable machine learning Python tool JAX. Directly exposes the FINUFFT library to JAX's XLA backend, as well as implementing differentiation rules for the transforms. By Dan Foreman-Mackey (CCA).
-   
+
 #. `PyTorch wrapper to (cu)FINUFFT <https://flatironinstitute.github.io/pytorch-finufft>`_:  a wrapper to the differentiable machine learning Python tool PyTorch. By Michael Eickenberg and Brian Ward (CCM).
-   
+
 
 Research output using (cu)FINUFFT
 ---------------------------------
@@ -92,14 +96,14 @@ For the latest see: Google Scholar `FINUFFT citations <https://scholar.google.co
 #. A. Harness, S. Shaklan, P. Willems, N. J. Kasdin, K. Balasubramanian, V. White, K. Yee, P. Dumont, R. Muller, S. Vuong, M. Galvin,
    "Optical experiments and model validation of perturbed starshade designs," Proc. SPIE 11823, Techniques and Instrumentation for Detection of Exoplanets X, 1182312 (1 September 2021); https://doi.org/10.1117/12.2595409
 
-#. Chang, P., Pienaar, E., & Gebbie, T. (2020). "Malliavin--Mancino Estimators Implemented with Nonuniform Fast Fourier Transforms." SIAM J. Sci. Comput. 42(6), B1378–B1403. https://doi.org/10.1137/20m1325903 
+#. Chang, P., Pienaar, E., & Gebbie, T. (2020). "Malliavin--Mancino Estimators Implemented with Nonuniform Fast Fourier Transforms." SIAM J. Sci. Comput. 42(6), B1378–B1403. https://doi.org/10.1137/20m1325903
 
 #. Heisenberg voxelization (HVOX) for interferometry of spherical sky maps in radio-astronomy, by Kashani, Simeoni, et al. (2023) https://arxiv.org/abs/2306.06007 https://github.com/matthieumeo/hvox
 
 #. Sriramkrishnan Muralikrishnan at the Jülich Supercomputing Centre is running cufinufft on 6144 A100 GPUs (the NERSC-9 supercomputer), for a particle-in-Fourier method for plasma simulations. https://pasc23.pasc-conference.org/presentation/?id=msa167&sess=sess154
 
 #. Related to that, FINUFFT is being used for a better-converging Fourier approach to the Immersed Boundary method of Peskin and his group at NYU. Zhe Chen and Charles Peskin, https://arxiv.org/abs/2302.08694
-   
+
 #. Pei R, Askham T, Greengard L, Jiang S (2023). "A fast method for imposing periodic boundary conditions on arbitrarily-shaped lattices in two dimensions." J. Comput. Phys. 474, 111792. https://doi.org/10.1016/j.jcp.2022.111792 Uses FINUFFT for plane wave sums.
 
 #. Dylan Green, JR Jamora, and Anne Gelb (2023). "Leveraging joint sparsity in 3D synthetic aperture radar imaging," Appl. Math. Modern Chall. 1, 61-86. https://doi.org/10.3934/ammc.2023005 Uses 3D transforms between $N=201^3$ modes (voxels) and $M=313300$ data points. As they state, "...the computational cost of each method heavily depends on the NUFFT algorithm used."
@@ -119,8 +123,8 @@ Papers influenced by other aspects of FINUFFT:
 
 1. NFFT.jl: Generic and Fast Julia Implementation of the Nonequidistant Fast Fourier Transform, by Tobias Knopp, Marija Boberg, Mirco Grosser (2022). https://arxiv.org/abs/2208.00049  They use our blocked spreading and piecewise polynomial ideas, and beat our type 1 and 2 performance by a factor of up to 1.7 in multithreaded cases. Code is dimension-independent but very abstract (two levels of meta-programming, I believe).
 
-   
-   
+
+
 Some citations to FINUFFT that do not appear to be actual users
 ---------------------------------------------------------------
 
@@ -133,5 +137,5 @@ Some citations to FINUFFT that do not appear to be actual users
 #. https://arxiv.org/abs/1912.09746
 
 #. https://arxiv.org/abs/2010.05295
-      
+
 Now too many to track by hand... please see Google Scholar search linked above.
diff --git a/include/finufft/defs.h b/include/finufft/defs.h
@@ -18,6 +18,7 @@
 // public header gives access to f_opts, f_spread_opts, f_plan...
 // (and clobbers FINUFFT* macros; watch out!)
 #include <finufft.h>
+#include <memory>
 
 // --------------- Private data types for compilation in either prec ---------
 // Devnote: must match those in relevant prec of public finufft.h interface!
@@ -258,7 +259,7 @@ typedef struct FINUFFT_PLAN_S { // the main plan object, fully C++
   FINUFFT_PLAN innerT2plan; // ptr used for type 2 in step 2 of type 3
 
   // other internal structs; each is C-compatible of course
-  Finufft_FFT_plan<FLT> fftPlan;
+  std::unique_ptr<Finufft_FFT_plan<FLT>> fftPlan;
   finufft_opts opts; // this and spopts could be made ptrs
   finufft_spread_opts spopts;
 

diff --git a/include/finufft/fft.h b/include/finufft/fft.h
@@ -8,6 +8,8 @@
 
 template<typename T> class Finufft_FFT_plan {
 public:
+  Finufft_FFT_plan(void (*)(void *) = nullptr, void (*)(void *) = nullptr,
+                   void * = nullptr) {}
   void plan(const std::vector<int> & /*dims*/, size_t /*batchSize*/,
             std::complex<T> * /*ptr*/, int /*sign*/, int /*options*/, int /*nthreads*/) {}
   static std::complex<T> *alloc_complex(size_t N) { return new std::complex<T>[N]; }
@@ -36,34 +38,47 @@ template<> struct Finufft_FFT_plan<float> {
   }
   fftwf_plan plan_;
 
+  void (*fftw_lock_fun)(void *);   // Function ptr that locks the FFTW planner
+  void (*fftw_unlock_fun)(void *); // Function ptr that unlocks the FFTW planner
+  void *lock_data;
+  void lock() { fftw_lock_fun ? fftw_lock_fun(lock_data) : mut().lock(); }
+  void unlock() { fftw_lock_fun ? fftw_unlock_fun(lock_data) : mut().unlock(); }
+
 public:
-  Finufft_FFT_plan() : plan_(nullptr) {
-    std::lock_guard<std::mutex> lock(mut());
+  Finufft_FFT_plan(void (*fftw_lock_fun_)(void *)   = nullptr,
+                   void (*fftw_unlock_fun_)(void *) = nullptr,
+                   void *lock_data_                 = nullptr)
+      : plan_(nullptr), fftw_lock_fun(fftw_lock_fun_), fftw_unlock_fun(fftw_unlock_fun_),
+        lock_data(lock_data_) {
+    lock();
 #ifdef _OPENMP
     static bool initialized = false;
     if (!initialized) {
       fftwf_init_threads();
       initialized = true;
     }
 #endif
+    unlock();
   }
   ~Finufft_FFT_plan() {
-    std::lock_guard<std::mutex> lock(mut());
+    lock();
     fftwf_destroy_plan(plan_);
+    unlock();
   }
 
   void plan(const std::vector<int> &dims, size_t batchSize, std::complex<float> *ptr,
             int sign, int options, int nthreads) {
     uint64_t nf = 1;
     for (auto i : dims) nf *= i;
-    std::lock_guard<std::mutex> lock(mut());
+    lock();
 #ifdef _OPENMP
     fftwf_plan_with_nthreads(nthreads);
 #endif
     plan_ = fftwf_plan_many_dft(dims.size(), dims.data(), batchSize,
                                 reinterpret_cast<fftwf_complex *>(ptr), nullptr, 1, nf,
                                 reinterpret_cast<fftwf_complex *>(ptr), nullptr, 1, nf,
                                 sign, options);
+    unlock();
   }
   static std::complex<float> *alloc_complex(size_t N) {
     return reinterpret_cast<std::complex<float> *>(fftwf_alloc_complex(N));
@@ -74,17 +89,20 @@ template<> struct Finufft_FFT_plan<float> {
   void execute() { fftwf_execute(plan_); }
 
   static void forget_wisdom() {
-    std::lock_guard<std::mutex> lock(mut());
+    //    lock();
     fftwf_forget_wisdom();
+    //    unlock();
   }
   static void cleanup() {
-    std::lock_guard<std::mutex> lock(mut());
+    //    lock();
     fftwf_cleanup();
+    //    unlock();
   }
   static void cleanup_threads() {
 #ifdef _OPENMP
-    std::lock_guard<std::mutex> lock(mut());
+    //    lock();
     fftwf_cleanup_threads();
+//    unlock();
 #endif
   }
 };
@@ -97,34 +115,47 @@ template<> struct Finufft_FFT_plan<double> {
   }
   fftw_plan plan_;
 
+  void (*fftw_lock_fun)(void *);   // Function ptr that locks the FFTW planner
+  void (*fftw_unlock_fun)(void *); // Function ptr that unlocks the FFTW planner
+  void *lock_data;
+  void lock() { fftw_lock_fun ? fftw_lock_fun(lock_data) : mut().lock(); }
+  void unlock() { fftw_lock_fun ? fftw_unlock_fun(lock_data) : mut().unlock(); }
+
 public:
-  Finufft_FFT_plan() : plan_(nullptr) {
-    std::lock_guard<std::mutex> lock(mut());
+  Finufft_FFT_plan(void (*fftw_lock_fun_)(void *)   = nullptr,
+                   void (*fftw_unlock_fun_)(void *) = nullptr,
+                   void *lock_data_                 = nullptr)
+      : plan_(nullptr), fftw_lock_fun(fftw_lock_fun_), fftw_unlock_fun(fftw_unlock_fun_),
+        lock_data(lock_data_) {
+    lock();
 #ifdef _OPENMP
     static bool initialized = false;
     if (!initialized) {
       fftw_init_threads();
       initialized = true;
     }
 #endif
+    unlock();
   }
   ~Finufft_FFT_plan() {
-    std::lock_guard<std::mutex> lock(mut());
+    lock();
     fftw_destroy_plan(plan_);
+    unlock();
   }
 
   void plan(const std::vector<int> &dims, size_t batchSize, std::complex<double> *ptr,
             int sign, int options, int nthreads) {
     uint64_t nf = 1;
     for (auto i : dims) nf *= i;
-    std::lock_guard<std::mutex> lock(mut());
+    lock();
 #ifdef _OPENMP
     fftw_plan_with_nthreads(nthreads);
 #endif
     plan_ = fftw_plan_many_dft(dims.size(), dims.data(), batchSize,
                                reinterpret_cast<fftw_complex *>(ptr), nullptr, 1, nf,
                                reinterpret_cast<fftw_complex *>(ptr), nullptr, 1, nf,
                                sign, options);
+    unlock();
   }
   static std::complex<double> *alloc_complex(size_t N) {
     return reinterpret_cast<std::complex<double> *>(fftw_alloc_complex(N));
@@ -135,17 +166,20 @@ template<> struct Finufft_FFT_plan<double> {
   void execute() { fftw_execute(plan_); }
 
   static void forget_wisdom() {
-    std::lock_guard<std::mutex> lock(mut());
+    //    lock();
     fftw_forget_wisdom();
+    //    unlock();
   }
   static void cleanup() {
-    std::lock_guard<std::mutex> lock(mut());
+    //    lock();
     fftw_cleanup();
+    //    unlock();
   }
   static void cleanup_threads() {
 #ifdef _OPENMP
-    std::lock_guard<std::mutex> lock(mut());
+    //    lock();
     fftw_cleanup_threads();
+//    unlock();
 #endif
   }
 };

diff --git a/include/finufft_errors.h b/include/finufft_errors.h
@@ -24,6 +24,7 @@ enum {
   FINUFFT_ERR_BINSIZE_NOTVALID       = 18,
   FINUFFT_ERR_INSUFFICIENT_SHMEM     = 19,
   FINUFFT_ERR_NUM_NU_PTS_INVALID     = 20,
-  FINUFFT_ERR_INVALID_ARGUMENT       = 21
+  FINUFFT_ERR_INVALID_ARGUMENT       = 21,
+  FINUFFT_ERR_LOCK_FUNS_INVALID      = 22
 };
 #endif