Cuda array interface (clone) #333

Closed
wants to merge 35 commits into from
Commits (35)
7b07d0f
python: add hackish way to support more than pycuda arrays
blackwer Jul 25, 2023
71fa58a
cuda-python: remove unused import
blackwer Jul 25, 2023
98efe74
cuda-python: make gpu array handling slightly less hacky
blackwer Jul 25, 2023
6dd63cb
cuda: revert changes to exception strings
blackwer Jul 26, 2023
e2cbfa2
cuda-python: check if array is none rather than its truthiness
blackwer Jul 26, 2023
f8b04e3
cuda-python: handle torch arrays with gradient enabled
blackwer Jul 26, 2023
6f984cb
cuda-python: simplify compatibility layer
janden Aug 22, 2023
1c437c2
cuda-python: factor out compatibility layer
janden Aug 22, 2023
926b3ad
cuda-python: missed torch import
janden Aug 22, 2023
be2411d
cuda-python: multiple frameworks in unit tests
janden Aug 22, 2023
ae39257
tests: add other frameworks to Jenkinsfile
janden Aug 22, 2023
0154868
tests: install all CUDA libraries
janden Aug 22, 2023
9eebe55
tests: hardcode compute capability
janden Aug 22, 2023
4244540
tests: install latest torch version for cu110
janden Aug 22, 2023
15e3d0f
cuda-python: use `torch.as_tensor`
janden Aug 22, 2023
8371776
cuda-python: copy if not contiguous
janden Aug 23, 2023
8df027a
cuda-python: fix bug in Plan
janden Aug 23, 2023
ab64862
tests: test non-contiguous arrays
janden Aug 23, 2023
baa863b
tests: add `--framework` opt to pytest
janden Aug 24, 2023
593abf0
tests: update Jenkins to run different frameworks
janden Aug 24, 2023
79df4a3
tests: check for ordering error in pycuda
janden Aug 24, 2023
a21dcca
tests: remove `test_type2_ordering`
janden Aug 24, 2023
680d293
tests: move `transfer_funcs` into `util`
janden Aug 24, 2023
6aaea6b
tests: parametrize `test_opts` by framework
janden Aug 24, 2023
d046c75
tests: `test_type1_ordering` for other frameworks
janden Aug 24, 2023
9387e02
cuda-python: fix bug in torch dimension check
janden Aug 24, 2023
709791e
tests: introduce `to_gpu` and `to_cpu` fixtures
janden Aug 24, 2023
f31cfd0
tests: parametrize `test_error_checks`
janden Aug 24, 2023
3e32e01
tests: parametrize `test_multi`
janden Aug 24, 2023
d9c3036
cuda-python: rename examples
janden Aug 24, 2023
abb43c3
tests: only run examples from approved frameworks
janden Aug 24, 2023
97b0e21
cuda-python: make `_simple` torch-compatible
janden Aug 24, 2023
eb153f8
tests: fix wrong dtype check
janden Aug 24, 2023
56fad53
tests: parametrize simple interfaces tests
janden Aug 24, 2023
3911ebc
cuda-python: remove pycuda from requirements
janden Aug 24, 2023
11 changes: 9 additions & 2 deletions Jenkinsfile
@@ -27,7 +27,9 @@ pipeline {
 echo $HOME
 '''
 sh '''#!/bin/bash -ex
-cuda_arch=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader|head -n 1| sed "s/\\.//")
+# Oldest card in the Jenkins pool is a K40
+cuda_arch="35"
+
 cmake -B build . -DFINUFFT_USE_CUDA=ON \
 -DFINUFFT_USE_CPU=OFF \
 -DFINUFFT_BUILD_TESTS=ON \
@@ -44,9 +46,14 @@ pipeline {
 sh '''#!/bin/bash -ex
 source $HOME/bin/activate
 python3 -m pip install --upgrade pip
+python3 -m pip install --upgrade pycuda cupy-cuda110 numba
+python3 -m pip install torch==1.7.1+cu110 -f https://download.pytorch.org/whl/torch_stable.html
 python3 -m pip install -e python/cufinufft
 python3 -m pip install pytest
-python3 -m pytest python/cufinufft
+python3 -m pytest --framework=pycuda python/cufinufft
+python3 -m pytest --framework=numba python/cufinufft
+python3 -m pytest --framework=cupy python/cufinufft
+python3 -m pytest --framework=torch python/cufinufft
 '''
 }
 }
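For local verification, the same matrix can be driven programmatically. A hedged sketch (hypothetical helper, not part of this PR; it relies only on the `--framework` option registered by the `conftest.py` added below):

```python
# Hypothetical runner mirroring the Jenkins test matrix above.
import sys

import pytest


def run_all_frameworks():
    # Framework list taken from the Jenkinsfile; the --framework option is
    # registered by python/cufinufft/tests/conftest.py (see below).
    for framework in ("pycuda", "numba", "cupy", "torch"):
        code = pytest.main(["--framework", framework, "python/cufinufft"])
        if code != 0:
            sys.exit(code)


if __name__ == "__main__":
    run_all_frameworks()
```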
106 changes: 106 additions & 0 deletions python/cufinufft/cufinufft/_compat.py
@@ -0,0 +1,106 @@
import inspect

import numpy as np


def get_array_ptr(data):
try:
return data.__cuda_array_interface__['data'][0]
except RuntimeError:
# Handle torch with gradient enabled
# https://github.com/flatironinstitute/finufft/pull/326#issuecomment-1652212770
return data.data_ptr()
    except AttributeError:
        raise TypeError("Invalid GPU array implementation. The object must "
                        "implement the standard CUDA array interface.")


def get_array_module(obj):
module_name = inspect.getmodule(type(obj)).__name__

if module_name.startswith("numba.cuda"):
return "numba"
elif module_name.startswith("torch"):
return "torch"
elif module_name.startswith("pycuda"):
return "pycuda"
else:
return "generic"


def get_array_size(obj):
array_module = get_array_module(obj)

if array_module == "torch":
return len(obj)
else:
return obj.size


def get_array_dtype(obj):
array_module = get_array_module(obj)

if array_module == "torch":
dtype_str = str(obj.dtype)
dtype_str = dtype_str[len("torch."):]
return np.dtype(dtype_str)
else:
return obj.dtype


def is_array_contiguous(obj):
array_module = get_array_module(obj)

if array_module == "numba":
return obj.is_c_contiguous()
elif array_module == "torch":
return obj.is_contiguous()
else:
return obj.flags.c_contiguous


def array_can_contiguous(obj):
array_module = get_array_module(obj)

if array_module == "pycuda":
return False
else:
return True


def array_contiguous(obj):
    array_module = get_array_module(obj)

    if array_module == "numba":
        import numba.cuda
        # Allocate a fresh C-contiguous device array and copy into it.
        ret = numba.cuda.device_array(obj.shape, obj.dtype, stream=obj.stream)
        ret[:] = obj[:]
        return ret
    elif array_module == "torch":
        return obj.contiguous()
    else:
        return obj.copy(order="C")


def array_empty_like(obj, *args, **kwargs):
module_name = get_array_module(obj)

if module_name == "numba":
import numba.cuda
return numba.cuda.device_array(*args, **kwargs)
elif module_name == "torch":
import torch
if "shape" in kwargs:
kwargs["size"] = kwargs.pop("shape")
if "dtype" in kwargs:
dtype = kwargs.pop("dtype")
if dtype == np.complex64:
dtype = torch.complex64
elif dtype == np.complex128:
dtype = torch.complex128
kwargs["dtype"] = dtype
if "device" not in kwargs:
kwargs["device"] = obj.device

return torch.empty(*args, **kwargs)
else:
return type(obj)(*args, **kwargs)
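Taken together, these helpers form a small dispatch layer keyed on the array type's module name. A hedged usage sketch (illustrative, not part of the diff; assumes a CUDA-enabled torch build — the `requires_grad` tensor exercises the `RuntimeError` fallback in `get_array_ptr`):

```python
import torch

from cufinufft import _compat

# A torch tensor with gradients enabled refuses __cuda_array_interface__,
# so get_array_ptr falls back to data_ptr().
x = torch.zeros(8, dtype=torch.complex64, device="cuda", requires_grad=True)

assert _compat.get_array_module(x) == "torch"
ptr = _compat.get_array_ptr(x)        # raw device pointer via data_ptr()
dtype = _compat.get_array_dtype(x)    # numpy dtype (complex64)
assert _compat.is_array_contiguous(x)

# Allocate an output in the same framework (and on the same device) as x.
out = _compat.array_empty_like(x, (4, 4), dtype=dtype)
assert out.shape == (4, 4)
```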
2 changes: 0 additions & 2 deletions python/cufinufft/cufinufft/_cufinufft.py
Original file line number Diff line number Diff line change
@@ -17,8 +17,6 @@
 warnings.filterwarnings("ignore", category=DeprecationWarning)
 import imp
 
-import numpy as np
-
 from ctypes import c_double
 from ctypes import c_int
 from ctypes import c_int64
47 changes: 21 additions & 26 deletions python/cufinufft/cufinufft/_plan.py
Original file line number Diff line number Diff line change
@@ -25,7 +25,7 @@
 from cufinufft._cufinufft import _destroy_plan
 from cufinufft._cufinufft import _destroy_planf
 
-from pycuda.gpuarray import GPUArray
+from cufinufft import _compat
 
 
 # If we are shutting down python, we don't need to run __del__
@@ -206,7 +206,7 @@ def setpts(self, x, y=None, z=None, s=None, t=None, u=None):
 
         _x, _y, _z = _ensure_valid_pts(_x, _y, _z, self.dim)
 
-        M = _x.size
+        M = _compat.get_array_size(_x)
 
         # Because FINUFFT/cufinufft are internally column major,
         # we will reorder the pts axes. Reordering references
@@ -217,17 +217,17 @@
         # (x, y, None) ~> (y, x, None)
         # (x, y, z) ~> (z, y, x)
         # Via code, we push each dimension onto a stack of axis
-        fpts_axes = [_x.ptr, None, None]
+        fpts_axes = [_compat.get_array_ptr(_x), None, None]
 
         # We will also store references to these arrays.
         # This keeps python from prematurely cleaning them up.
         self._references.append(_x)
         if self.dim >= 2:
-            fpts_axes.insert(0, _y.ptr)
+            fpts_axes.insert(0, _compat.get_array_ptr(_y))
             self._references.append(_y)
 
         if self.dim >= 3:
-            fpts_axes.insert(0, _z.ptr)
+            fpts_axes.insert(0, _compat.get_array_ptr(_z))
             self._references.append(_z)
 
         # Then take three items off the stack as our reordered axis.
@@ -278,14 +278,16 @@ def execute(self, data, out=None):
             req_out_shape = batch_shape + req_out_shape
 
         if out is None:
-            _out = GPUArray(req_out_shape, dtype=self.dtype)
+            _out = _compat.array_empty_like(_data, req_out_shape, dtype=self.dtype)
         else:
             _out = _ensure_array_shape(_out, "out", req_out_shape)
 
         if self.type == 1:
-            ier = self._exec_plan(self._plan, data.ptr, _out.ptr)
+            ier = self._exec_plan(self._plan, _compat.get_array_ptr(_data),
+                                  _compat.get_array_ptr(_out))
         elif self.type == 2:
-            ier = self._exec_plan(self._plan, _out.ptr, data.ptr)
+            ier = self._exec_plan(self._plan, _compat.get_array_ptr(_out),
+                                  _compat.get_array_ptr(_data))
 
         if ier != 0:
             raise RuntimeError('Error executing plan.')
@@ -315,27 +317,21 @@
 
 def _ensure_array_type(x, name, dtype, output=False):
     if x is None:
-        return GPUArray(0, dtype=dtype, order="C")
+        return None
 
-    if x.dtype != dtype:
+    if _compat.get_array_dtype(x) != dtype:
         raise TypeError(f"Argument `{name}` does not have the correct dtype: "
                         f"{x.dtype} was given, but {dtype} was expected.")
 
-    if not x.flags.c_contiguous:
-        if output:
+    if not _compat.is_array_contiguous(x):
+        if output or not _compat.array_can_contiguous(x):
             raise TypeError(f"Argument `{name}` does not satisfy the "
                             f"following requirement: C")
         else:
-            raise TypeError(f"Argument `{name}` does not satisfy the "
-                            f"following requirement: C")
-
-            # Ideally we'd copy the array into the correct ordering here, but
-            # this does not seem possible as of pycuda 2022.2.2.
-
-            # warnings.warn(f"Argument `{name}` does not satisfy the "
-            #               f"following requirement: C. Copying array (this may
-            #               reduce performance)")
-            # x = gpuarray.GPUArray(x, dtype=dtype, order="C")
+            warnings.warn(f"Argument `{name}` does not satisfy the "
+                          f"following requirement: C. Copying array "
+                          f"(this may reduce performance)")
+            x = _compat.array_contiguous(x)
 
     return x

@@ -354,22 +350,21 @@ def _ensure_array_shape(x, name, shape, allow_reshape=False):
     else:
         return x
 
+
 def _ensure_valid_pts(x, y, z, dim):
     if x.ndim != 1:
         raise TypeError(f"Argument `x` must be a vector")
 
-    M = x.size
-
     if dim >= 2:
         y = _ensure_array_shape(y, "y", x.shape)
 
     if dim >= 3:
         z = _ensure_array_shape(z, "z", x.shape)
 
-    if dim < 3 and z.size > 0:
+    if dim < 3 and z is not None and _compat.get_array_size(z) > 0:
         raise TypeError(f"Plan dimension is {dim}, but `z` was specified")
 
-    if dim < 2 and y.size > 0:
+    if dim < 2 and y is not None and _compat.get_array_size(y) > 0:
         raise TypeError(f"Plan dimension is {dim}, but `y` was specified")
 
     return x, y, z
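With the pointer and allocation logic routed through `_compat`, `Plan` no longer assumes pycuda arrays. A minimal sketch of a type-1 transform driven by torch tensors (illustrative values, assuming a CUDA-enabled torch build; parameters mirror the tests below):

```python
import math

import numpy as np
import torch

from cufinufft import Plan

M, shape = 4096, (16, 16, 16)

# Nonuniform points in [-pi, pi) and complex strengths, both on the GPU.
k = 2 * math.pi * torch.rand((3, M), device="cuda") - math.pi
c = torch.randn(M, dtype=torch.complex64, device="cuda")

plan = Plan(1, shape, eps=1e-3, dtype=np.complex64)
plan.setpts(k[0], k[1], k[2])

# The output is allocated via _compat.array_empty_like, so it comes back
# as a torch tensor on the same device as the input.
f = plan.execute(c)
```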
4 changes: 2 additions & 2 deletions python/cufinufft/cufinufft/_simple.py
@@ -1,4 +1,4 @@
-from cufinufft import Plan
+from cufinufft import Plan, _compat
 
 def nufft1d1(x, data, n_modes=None, out=None, eps=1e-6, isign=1, **kwargs):
     return _invoke_plan(1, 1, x, None, None, data, out, isign, eps, n_modes,
@@ -24,7 +24,7 @@ def nufft3d2(x, y, z, data, out=None, eps=1e-6, isign=-1, **kwargs):
 
 def _invoke_plan(dim, nufft_type, x, y, z, data, out, isign, eps,
                  n_modes=None, kwargs=None):
-    dtype = data.dtype
+    dtype = _compat.get_array_dtype(data)
 
     n_trans = _get_ntrans(dim, nufft_type, data)
 
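Because `_invoke_plan` now reads the dtype through `_compat`, the simple interface should accept arrays from any of the supported frameworks. A hedged sketch with cupy (illustrative sizes; assumes the `cupy-cuda110` build installed in the Jenkinsfile above and that `nufft1d1` is exported at package level):

```python
import cupy as cp

from cufinufft import nufft1d1

M, N = 4096, 64

# Nonuniform points and complex strengths as cupy arrays.
x = cp.random.uniform(-cp.pi, cp.pi, size=M).astype(cp.float32)
c = (cp.random.standard_normal(M)
     + 1j * cp.random.standard_normal(M)).astype(cp.complex64)

# Type 1: M nonuniform points to N Fourier modes; the result stays on the GPU.
f = nufft1d1(x, c, (N,), eps=1e-6)
assert f.shape == (N,)
```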
1 change: 0 additions & 1 deletion python/cufinufft/requirements.txt
@@ -1,3 +1,2 @@
 numpy
-pycuda
 six
24 changes: 24 additions & 0 deletions python/cufinufft/tests/conftest.py
@@ -0,0 +1,24 @@
import pytest

import utils


def pytest_addoption(parser):
parser.addoption("--framework", action="append", default=[], help="List of frameworks")

def pytest_generate_tests(metafunc):
if "framework" in metafunc.fixturenames:
metafunc.parametrize("framework", metafunc.config.getoption("framework"))

@pytest.fixture
def to_gpu(framework):
to_gpu, _ = utils.transfer_funcs(framework)

return to_gpu


@pytest.fixture
def to_cpu(framework):
_, to_cpu = utils.transfer_funcs(framework)

return to_cpu
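These fixtures defer to `utils.transfer_funcs` (moved into the shared test utilities by commit 680d293; not shown in this diff). A hypothetical sketch of the shape of that helper — one `(to_gpu, to_cpu)` pair per framework:

```python
# Hypothetical sketch of utils.transfer_funcs; the real implementation
# lives in tests/utils.py and is not part of this diff.
def transfer_funcs(framework):
    if framework == "pycuda":
        import pycuda.autoinit  # noqa: F401  (creates a CUDA context)
        import pycuda.gpuarray as gpuarray
        return gpuarray.to_gpu, lambda arr: arr.get()
    if framework == "cupy":
        import cupy as cp
        return cp.asarray, cp.asnumpy
    if framework == "numba":
        import numba.cuda
        return numba.cuda.to_device, lambda arr: arr.copy_to_host()
    if framework == "torch":
        import torch
        return (lambda arr: torch.as_tensor(arr, device="cuda"),
                lambda arr: arr.cpu().numpy())
    raise ValueError(f"Unknown framework: {framework}")
```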
46 changes: 6 additions & 40 deletions python/cufinufft/tests/test_array_ordering.py
@@ -2,60 +2,26 @@
 
 import numpy as np
 
-import pycuda.autoinit # NOQA:401
-import pycuda.gpuarray as gpuarray
-
-from cufinufft import Plan
+from cufinufft import Plan, _compat
 
 import utils
 
-def test_type2_ordering(dtype=np.float32, shape=(16, 16, 16), M=4096, tol=1e-3):
-    complex_dtype = utils._complex_dtype(dtype)
-
-    k = utils.gen_nu_pts(M).astype(dtype)
-    fk = utils.gen_uniform_data(shape).astype(complex_dtype)
-
-    fkTT = fk.T.copy().T
-
-    k_gpu = gpuarray.to_gpu(k)
-    fk_gpu = gpuarray.to_gpu(fk)
-    fkTT_gpu = gpuarray.to_gpu(fkTT)
-
-    plan = Plan(2, shape, eps=tol, dtype=complex_dtype)
-
-    plan.setpts(k_gpu[0], k_gpu[1], k_gpu[2])
-
-    c_gpu = plan.execute(fk_gpu)
-
-    with pytest.raises(TypeError, match="following requirement: C") as err:
-        cTT_gpu = plan.execute(fkTT_gpu)
-
-    # Ideally, it should be possible to get this to align with true output,
-    # but corrently does not look like it.
-
-    # c = c_gpu.get()
-    # cTT = cTT_gpu.get()
-
-    # assert np.allclose(c, cTT, rtol=1e-2)
-
 
-def test_type1_ordering(dtype=np.float32, shape=(16, 16, 16), M=4096, tol=1e-3):
+def test_type1_ordering(to_gpu, to_cpu, dtype=np.float32, shape=(16, 16, 16), M=4096, tol=1e-3):
     complex_dtype = utils._complex_dtype(dtype)
 
     k, c = utils.type1_problem(dtype, shape, M)
 
-    k_gpu = gpuarray.to_gpu(k)
-    c_gpu = gpuarray.to_gpu(c)
+    k_gpu = to_gpu(k)
+    c_gpu = to_gpu(c)
 
     plan = Plan(1, shape, eps=tol, dtype=complex_dtype)
 
     plan.setpts(*k_gpu)
 
-    out_gpu = gpuarray.GPUArray(shape, dtype=complex_dtype)
-
-    plan.execute(c_gpu, out=out_gpu)
+    out = np.empty(shape, dtype=complex_dtype, order="F")
 
-    out_gpu = gpuarray.GPUArray(shape, dtype=complex_dtype, order="F")
+    out_gpu = to_gpu(out)
 
     with pytest.raises(TypeError, match="following requirement: C") as err:
         plan.execute(c_gpu, out=out_gpu)
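For reference, the C-ordering requirement exercised above reduces to a plain contiguity check for non-torch/numba arrays. A small numpy-only illustration (not part of the diff):

```python
import numpy as np

# An F-ordered array is not C-contiguous, which is exactly what
# _compat.is_array_contiguous reports in the generic case.
a = np.empty((16, 16, 16), dtype=np.complex64, order="F")
assert not a.flags.c_contiguous
assert a.T.flags.c_contiguous  # transposing flips the memory order
```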