diff --git a/Jenkinsfile b/Jenkinsfile
index bc7c64f02..0638dbf7c 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -27,7 +27,9 @@ pipeline {
                     echo $HOME
                 '''
                 sh '''#!/bin/bash -ex
-                    cuda_arch=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader|head -n 1| sed "s/\\.//")
+                    # Oldest card in the Jenkins pool is a K40
+                    cuda_arch="35"
+
                     cmake -B build . -DFINUFFT_USE_CUDA=ON \
                                      -DFINUFFT_USE_CPU=OFF \
                                      -DFINUFFT_BUILD_TESTS=ON \
@@ -44,9 +46,14 @@ pipeline {
                 sh '''#!/bin/bash -ex
                     source $HOME/bin/activate
                     python3 -m pip install --upgrade pip
+                    python3 -m pip install --upgrade pycuda cupy-cuda110 numba
+                    python3 -m pip install torch==1.7.1+cu110 -f https://download.pytorch.org/whl/torch_stable.html
                     python3 -m pip install -e python/cufinufft
                     python3 -m pip install pytest
-                    python3 -m pytest python/cufinufft
+                    python3 -m pytest --framework=pycuda python/cufinufft
+                    python3 -m pytest --framework=numba python/cufinufft
+                    python3 -m pytest --framework=cupy python/cufinufft
+                    python3 -m pytest --framework=torch python/cufinufft
                 '''
             }
         }
diff --git a/python/cufinufft/cufinufft/_compat.py b/python/cufinufft/cufinufft/_compat.py
new file mode 100644
index 000000000..04e066a1a
--- /dev/null
+++ b/python/cufinufft/cufinufft/_compat.py
@@ -0,0 +1,106 @@
+import inspect
+
+import numpy as np
+
+
+def get_array_ptr(data):
+    try:
+        return data.__cuda_array_interface__['data'][0]
+    except RuntimeError:
+        # Handle torch with gradient enabled
+        # https://github.com/flatironinstitute/finufft/pull/326#issuecomment-1652212770
+        return data.data_ptr()
+    except AttributeError:
+        raise TypeError("Invalid GPU array implementation. Implementation must implement the standard cuda array interface.")
+
+
+def get_array_module(obj):
+    module_name = inspect.getmodule(type(obj)).__name__
+
+    if module_name.startswith("numba.cuda"):
+        return "numba"
+    elif module_name.startswith("torch"):
+        return "torch"
+    elif module_name.startswith("pycuda"):
+        return "pycuda"
+    else:
+        return "generic"
+
+
+def get_array_size(obj):
+    array_module = get_array_module(obj)
+
+    if array_module == "torch":
+        return len(obj)
+    else:
+        return obj.size
+
+
+def get_array_dtype(obj):
+    array_module = get_array_module(obj)
+
+    if array_module == "torch":
+        dtype_str = str(obj.dtype)
+        dtype_str = dtype_str[len("torch."):]
+        return np.dtype(dtype_str)
+    else:
+        return obj.dtype
+
+
+def is_array_contiguous(obj):
+    array_module = get_array_module(obj)
+
+    if array_module == "numba":
+        return obj.is_c_contiguous()
+    elif array_module == "torch":
+        return obj.is_contiguous()
+    else:
+        return obj.flags.c_contiguous
+
+
+def array_can_contiguous(obj):
+    array_module = get_array_module(obj)
+
+    if array_module == "pycuda":
+        return False
+    else:
+        return True
+
+
+def array_contiguous(obj):
+    array_module = get_array_module(obj)
+
+    if array_module == "numba":
+        import numba
+        ret = numba.cuda.device_array(obj.shape, obj.dtype, stream=obj.stream)
+        ret[:] = obj[:]
+        return ret
+    if array_module == "torch":
+        return obj.contiguous()
+    else:
+        return obj.copy(order="C")
+
+
+def array_empty_like(obj, *args, **kwargs):
+    module_name = get_array_module(obj)
+
+    if module_name == "numba":
+        import numba.cuda
+        return numba.cuda.device_array(*args, **kwargs)
+    elif module_name == "torch":
+        import torch
+        if "shape" in kwargs:
+            kwargs["size"] = kwargs.pop("shape")
+        if "dtype" in kwargs:
+            dtype = kwargs.pop("dtype")
+            if dtype == np.complex64:
+                dtype = torch.complex64
+            elif dtype == np.complex128:
+                dtype = torch.complex128
+            kwargs["dtype"] = dtype
+        if "device" not in kwargs:
+            kwargs["device"] = obj.device
+
+        return torch.empty(*args, **kwargs)
+    else:
+        return type(obj)(*args, **kwargs)
diff --git a/python/cufinufft/cufinufft/_cufinufft.py b/python/cufinufft/cufinufft/_cufinufft.py
index b36b60b74..beaeb8b8c 100644
--- a/python/cufinufft/cufinufft/_cufinufft.py
+++ b/python/cufinufft/cufinufft/_cufinufft.py
@@ -17,8 +17,6 @@ warnings.filterwarnings("ignore", category=DeprecationWarning)
 
 import imp
 
-import numpy as np
-
 from ctypes import c_double
 from ctypes import c_int
 from ctypes import c_int64
diff --git a/python/cufinufft/cufinufft/_plan.py b/python/cufinufft/cufinufft/_plan.py
index 084619990..4231a145a 100644
--- a/python/cufinufft/cufinufft/_plan.py
+++ b/python/cufinufft/cufinufft/_plan.py
@@ -25,7 +25,7 @@
 from cufinufft._cufinufft import _destroy_plan
 from cufinufft._cufinufft import _destroy_planf
 
-from pycuda.gpuarray import GPUArray
+from cufinufft import _compat
 
 
 # If we are shutting down python, we don't need to run __del__
@@ -206,7 +206,7 @@ def setpts(self, x, y=None, z=None, s=None, t=None, u=None):
 
         _x, _y, _z = _ensure_valid_pts(_x, _y, _z, self.dim)
 
-        M = _x.size
+        M = _compat.get_array_size(_x)
 
         # Because FINUFFT/cufinufft are internally column major,
         # we will reorder the pts axes. Reordering references
@@ -217,17 +217,17 @@
         # (x, y, None) ~> (y, x, None)
         # (x, y, z) ~> (z, y, x)
         # Via code, we push each dimension onto a stack of axis
-        fpts_axes = [_x.ptr, None, None]
+        fpts_axes = [_compat.get_array_ptr(_x), None, None]
 
         # We will also store references to these arrays.
         # This keeps python from prematurely cleaning them up.
         self._references.append(_x)
         if self.dim >= 2:
-            fpts_axes.insert(0, _y.ptr)
+            fpts_axes.insert(0, _compat.get_array_ptr(_y))
             self._references.append(_y)
 
         if self.dim >= 3:
-            fpts_axes.insert(0, _z.ptr)
+            fpts_axes.insert(0, _compat.get_array_ptr(_z))
             self._references.append(_z)
 
         # Then take three items off the stack as our reordered axis.
@@ -278,14 +278,16 @@ def execute(self, data, out=None):
             req_out_shape = batch_shape + req_out_shape
 
         if out is None:
-            _out = GPUArray(req_out_shape, dtype=self.dtype)
+            _out = _compat.array_empty_like(_data, req_out_shape, dtype=self.dtype)
         else:
             _out = _ensure_array_shape(_out, "out", req_out_shape)
 
         if self.type == 1:
-            ier = self._exec_plan(self._plan, data.ptr, _out.ptr)
+            ier = self._exec_plan(self._plan, _compat.get_array_ptr(_data),
+                                  _compat.get_array_ptr(_out))
         elif self.type == 2:
-            ier = self._exec_plan(self._plan, _out.ptr, data.ptr)
+            ier = self._exec_plan(self._plan, _compat.get_array_ptr(_out),
+                                  _compat.get_array_ptr(_data))
 
         if ier != 0:
             raise RuntimeError('Error executing plan.')
@@ -315,27 +317,21 @@ def __del__(self):
 
 
 def _ensure_array_type(x, name, dtype, output=False):
     if x is None:
-        return GPUArray(0, dtype=dtype, order="C")
+        return None
 
-    if x.dtype != dtype:
+    if _compat.get_array_dtype(x) != dtype:
         raise TypeError(f"Argument `{name}` does not have the correct dtype: "
                         f"{x.dtype} was given, but {dtype} was expected.")
 
-    if not x.flags.c_contiguous:
-        if output:
+    if not _compat.is_array_contiguous(x):
+        if output or not _compat.array_can_contiguous(x):
             raise TypeError(f"Argument `{name}` does not satisfy the "
                             f"following requirement: C")
         else:
-            raise TypeError(f"Argument `{name}` does not satisfy the "
-                            f"following requirement: C")
-
-            # Ideally we'd copy the array into the correct ordering here, but
-            # this does not seem possible as of pycuda 2022.2.2.
-
-            # warnings.warn(f"Argument `{name}` does not satisfy the "
-            #               f"following requirement: C. Copying array (this may
-            #               reduce performance)")
-            # x = gpuarray.GPUArray(x, dtype=dtype, order="C")
+            warnings.warn(f"Argument `{name}` does not satisfy the "
+                          f"following requirement: C. Copying array "
+                          f"(this may reduce performance)")
+            x = _compat.array_contiguous(x)
 
     return x
@@ -354,22 +350,21 @@ def _ensure_array_shape(x, name, shape, allow_reshape=False):
     else:
         return x
 
+
 def _ensure_valid_pts(x, y, z, dim):
     if x.ndim != 1:
         raise TypeError(f"Argument `x` must be a vector")
 
-    M = x.size
-
     if dim >= 2:
         y = _ensure_array_shape(y, "y", x.shape)
 
     if dim >= 3:
         z = _ensure_array_shape(z, "z", x.shape)
 
-    if dim < 3 and z.size > 0:
+    if dim < 3 and z is not None and _compat.get_array_size(z) > 0:
         raise TypeError(f"Plan dimension is {dim}, but `z` was specified")
 
-    if dim < 2 and y.size > 0:
+    if dim < 2 and y is not None and _compat.get_array_size(y) > 0:
         raise TypeError(f"Plan dimension is {dim}, but `y` was specified")
 
     return x, y, z
diff --git a/python/cufinufft/cufinufft/_simple.py b/python/cufinufft/cufinufft/_simple.py
index 2b42c9d25..ac36e90ab 100644
--- a/python/cufinufft/cufinufft/_simple.py
+++ b/python/cufinufft/cufinufft/_simple.py
@@ -1,4 +1,4 @@
-from cufinufft import Plan
+from cufinufft import Plan, _compat
 
 def nufft1d1(x, data, n_modes=None, out=None, eps=1e-6, isign=1, **kwargs):
     return _invoke_plan(1, 1, x, None, None, data, out, isign, eps, n_modes,
@@ -24,7 +24,7 @@ def nufft3d2(x, y, z, data, out=None, eps=1e-6, isign=-1, **kwargs):
 
 def _invoke_plan(dim, nufft_type, x, y, z, data, out, isign, eps,
                  n_modes=None, kwargs=None):
-    dtype = data.dtype
+    dtype = _compat.get_array_dtype(data)
 
     n_trans = _get_ntrans(dim, nufft_type, data)
 
diff --git a/python/cufinufft/examples/example2d1many.py b/python/cufinufft/examples/example2d1_pycuda.py
similarity index 100%
rename from python/cufinufft/examples/example2d1many.py
rename to python/cufinufft/examples/example2d1_pycuda.py
diff --git a/python/cufinufft/examples/example2d2many.py b/python/cufinufft/examples/example2d2_pycuda.py
similarity index 100%
rename from python/cufinufft/examples/example2d2many.py
rename to python/cufinufft/examples/example2d2_pycuda.py
diff --git a/python/cufinufft/examples/getting_started.py b/python/cufinufft/examples/getting_started_pycuda.py
similarity index 100%
rename from python/cufinufft/examples/getting_started.py
rename to python/cufinufft/examples/getting_started_pycuda.py
diff --git a/python/cufinufft/requirements.txt b/python/cufinufft/requirements.txt
index fcbec6659..bc2cbbd1c 100644
--- a/python/cufinufft/requirements.txt
+++ b/python/cufinufft/requirements.txt
@@ -1,3 +1,2 @@
 numpy
-pycuda
 six
diff --git a/python/cufinufft/tests/conftest.py b/python/cufinufft/tests/conftest.py
new file mode 100644
index 000000000..56528681f
--- /dev/null
+++ b/python/cufinufft/tests/conftest.py
@@ -0,0 +1,24 @@
+import pytest
+
+import utils
+
+
+def pytest_addoption(parser):
+    parser.addoption("--framework", action="append", default=[], help="List of frameworks")
+
+def pytest_generate_tests(metafunc):
+    if "framework" in metafunc.fixturenames:
+        metafunc.parametrize("framework", metafunc.config.getoption("framework"))
+
+@pytest.fixture
+def to_gpu(framework):
+    to_gpu, _ = utils.transfer_funcs(framework)
+
+    return to_gpu
+
+
+@pytest.fixture
+def to_cpu(framework):
+    _, to_cpu = utils.transfer_funcs(framework)
+
+    return to_cpu
diff --git a/python/cufinufft/tests/test_array_ordering.py b/python/cufinufft/tests/test_array_ordering.py
index d42fd8fa7..0fba8f8f5 100644
--- a/python/cufinufft/tests/test_array_ordering.py
+++ b/python/cufinufft/tests/test_array_ordering.py
@@ -2,60 +2,26 @@
 
 import numpy as np
 
-import pycuda.autoinit # NOQA:401
-import pycuda.gpuarray as gpuarray
-
-from cufinufft import Plan
+from cufinufft import Plan, _compat
 
 import utils
 
 
-def test_type2_ordering(dtype=np.float32, shape=(16, 16, 16), M=4096, tol=1e-3):
-    complex_dtype = utils._complex_dtype(dtype)
-
-    k = utils.gen_nu_pts(M).astype(dtype)
-    fk = utils.gen_uniform_data(shape).astype(complex_dtype)
-
-    fkTT = fk.T.copy().T
-
-    k_gpu = gpuarray.to_gpu(k)
-    fk_gpu = gpuarray.to_gpu(fk)
-    fkTT_gpu = gpuarray.to_gpu(fkTT)
-
-    plan = Plan(2, shape, eps=tol, dtype=complex_dtype)
-
-    plan.setpts(k_gpu[0], k_gpu[1], k_gpu[2])
-
-    c_gpu = plan.execute(fk_gpu)
-    with pytest.raises(TypeError, match="following requirement: C") as err:
-        cTT_gpu = plan.execute(fkTT_gpu)
-
-    # Ideally, it should be possible to get this to align with true output,
-    # but corrently does not look like it.
-
-    # c = c_gpu.get()
-    # cTT = cTT_gpu.get()
-
-    # assert np.allclose(c, cTT, rtol=1e-2)
-
-
-def test_type1_ordering(dtype=np.float32, shape=(16, 16, 16), M=4096, tol=1e-3):
+def test_type1_ordering(to_gpu, to_cpu, dtype=np.float32, shape=(16, 16, 16), M=4096, tol=1e-3):
     complex_dtype = utils._complex_dtype(dtype)
 
     k, c = utils.type1_problem(dtype, shape, M)
 
-    k_gpu = gpuarray.to_gpu(k)
-    c_gpu = gpuarray.to_gpu(c)
+    k_gpu = to_gpu(k)
+    c_gpu = to_gpu(c)
 
     plan = Plan(1, shape, eps=tol, dtype=complex_dtype)
 
     plan.setpts(*k_gpu)
 
-    out_gpu = gpuarray.GPUArray(shape, dtype=complex_dtype)
-
-    plan.execute(c_gpu, out=out_gpu)
+    out = np.empty(shape, dtype=complex_dtype, order="F")
 
-    out_gpu = gpuarray.GPUArray(shape, dtype=complex_dtype, order="F")
+    out_gpu = to_gpu(out)
 
     with pytest.raises(TypeError, match="following requirement: C") as err:
         plan.execute(c_gpu, out=out_gpu)
diff --git a/python/cufinufft/tests/test_basic.py b/python/cufinufft/tests/test_basic.py
index 7d0bccc90..d63c23a7a 100644
--- a/python/cufinufft/tests/test_basic.py
+++ b/python/cufinufft/tests/test_basic.py
@@ -2,10 +2,7 @@
 
 import numpy as np
 
-import pycuda.autoinit # NOQA:401
-import pycuda.gpuarray as gpuarray
-
-from cufinufft import Plan
+from cufinufft import Plan, _compat
 
 import utils
 
@@ -16,19 +13,21 @@
 MS = [256, 1024, 4096]
 TOLS = [1e-2, 1e-3]
 OUTPUT_ARGS = [False, True]
+CONTIGUOUS = [False, True]
+
 
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("shape", SHAPES)
 @pytest.mark.parametrize("M", MS)
 @pytest.mark.parametrize("tol", TOLS)
 @pytest.mark.parametrize("output_arg", OUTPUT_ARGS)
-def test_type1(dtype, shape, M, tol, output_arg):
+def test_type1(to_gpu, to_cpu, dtype, shape, M, tol, output_arg):
     complex_dtype = utils._complex_dtype(dtype)
 
     k, c = utils.type1_problem(dtype, shape, M)
 
-    k_gpu = gpuarray.to_gpu(k)
-    c_gpu = gpuarray.to_gpu(c)
+    k_gpu = to_gpu(k)
+    c_gpu = to_gpu(c)
 
     plan = Plan(1, shape, eps=tol, dtype=complex_dtype)
 
@@ -38,12 +37,12 @@ def test_type1(to_gpu, to_cpu, dtype, shape, M, tol, output_arg):
     plan.setpts(*k_gpu)
 
     if output_arg:
-        fk_gpu = gpuarray.GPUArray(shape, dtype=complex_dtype)
+        fk_gpu = _compat.array_empty_like(c_gpu, shape, dtype=complex_dtype)
         plan.execute(c_gpu, out=fk_gpu)
     else:
         fk_gpu = plan.execute(c_gpu)
 
-    fk = fk_gpu.get()
+    fk = to_cpu(fk_gpu)
 
     utils.verify_type1(k, c, fk, tol)
 
@@ -53,39 +52,61 @@
 @pytest.mark.parametrize("M", MS)
 @pytest.mark.parametrize("tol", TOLS)
 @pytest.mark.parametrize("output_arg", OUTPUT_ARGS)
-def test_type2(dtype, shape, M, tol, output_arg):
+@pytest.mark.parametrize("contiguous", CONTIGUOUS)
+def test_type2(to_gpu, to_cpu, dtype, shape, M, tol, output_arg, contiguous):
     complex_dtype = utils._complex_dtype(dtype)
 
     k, fk = utils.type2_problem(dtype, shape, M)
 
-    k_gpu = gpuarray.to_gpu(k)
-    fk_gpu = gpuarray.to_gpu(fk)
-
     plan = Plan(2, shape, eps=tol, dtype=complex_dtype)
 
+    check_result = True
+
+    if not contiguous and len(shape) > 1:
+        fk = fk.copy(order="F")
+
+        if _compat.array_can_contiguous(to_gpu(np.empty(1))):
+            def _execute(*args, **kwargs):
+                with pytest.warns(UserWarning, match="requirement: C. Copying"):
+                    return plan.execute(*args, **kwargs)
+        else:
+            check_result = False
+
+            def _execute(*args, **kwargs):
+                with pytest.raises(TypeError, match="requirement: C"):
+                    plan.execute(*args, **kwargs)
+
+    else:
+        def _execute(*args, **kwargs):
+            return plan.execute(*args, **kwargs)
+
+    k_gpu = to_gpu(k)
+    fk_gpu = to_gpu(fk)
+
     plan.setpts(*k_gpu)
 
     if output_arg:
-        c_gpu = gpuarray.GPUArray(shape=(M,), dtype=complex_dtype)
-        plan.execute(fk_gpu, out=c_gpu)
+        c_gpu = _compat.array_empty_like(fk_gpu, (M,), dtype=complex_dtype)
+        _execute(fk_gpu, out=c_gpu)
     else:
-        c_gpu = plan.execute(fk_gpu)
+        c_gpu = _execute(fk_gpu)
 
-    c = c_gpu.get()
+    if check_result:
+        c = to_cpu(c_gpu)
 
-    utils.verify_type2(k, fk, c, tol)
+        utils.verify_type2(k, fk, c, tol)
 
 
-def test_opts(shape=(8, 8, 8), M=32, tol=1e-3):
+def test_opts(to_gpu, to_cpu, shape=(8, 8, 8), M=32, tol=1e-3):
     dtype = np.float32
 
     complex_dtype = utils._complex_dtype(dtype)
 
     k, c = utils.type1_problem(dtype, shape, M)
 
-    k_gpu = gpuarray.to_gpu(k)
-    c_gpu = gpuarray.to_gpu(c)
-    fk_gpu = gpuarray.GPUArray(shape, dtype=complex_dtype)
+    k_gpu = to_gpu(k)
+    c_gpu = to_gpu(c)
+    fk_gpu = _compat.array_empty_like(c_gpu, shape, dtype=complex_dtype)
 
     plan = Plan(1, shape, eps=tol, dtype=complex_dtype,
                 gpu_sort=False, gpu_maxsubprobsize=10)
@@ -94,7 +115,6 @@ def test_opts(shape=(8, 8, 8), M=32, tol=1e-3):
 
     plan.execute(c_gpu, fk_gpu)
 
-    fk = fk_gpu.get()
+    fk = to_cpu(fk_gpu)
 
     utils.verify_type1(k, c, fk, tol)
-
diff --git a/python/cufinufft/tests/test_error_checks.py b/python/cufinufft/tests/test_error_checks.py
index 59798e7ca..6a9a6b4aa 100644
--- a/python/cufinufft/tests/test_error_checks.py
+++ b/python/cufinufft/tests/test_error_checks.py
@@ -1,15 +1,11 @@
 import numpy as np
 import pytest
 
-import pycuda.autoinit # NOQA:401
-import pycuda.gpuarray as gpuarray
-
-from cufinufft import Plan
+from cufinufft import Plan, _compat
 
 import utils
 
-
-def test_set_nu_raises_on_dtype():
+def test_set_nu_raises_on_dtype(to_gpu):
     dtype = np.complex64
 
     M = 4096
@@ -19,10 +15,10 @@
 
     kxyz = utils.gen_nu_pts(M, dim=dim).astype(dtype)
 
-    kxyz_gpu = gpuarray.to_gpu(kxyz)
+    kxyz_gpu = to_gpu(kxyz)
 
     # Here we'll intentionally contruct an incorrect array dtype.
-    kxyz_gpu_wrong_type = gpuarray.to_gpu(kxyz.real.astype(np.float64))
+    kxyz_gpu_wrong_type = to_gpu(kxyz.real.astype(np.float64))
 
     plan = Plan(1, shape, eps=tol, dtype=dtype)
 
@@ -40,7 +36,7 @@
                     kxyz_gpu_wrong_type[1], kxyz_gpu_wrong_type[2])
 
 
-def test_set_pts_raises_on_size():
+def test_set_pts_raises_on_size(to_gpu):
     dtype = np.float32
     complex_dtype = np.complex64
 
@@ -51,7 +47,7 @@
 
     kxyz = utils.gen_nu_pts(M, dim=dim).astype(dtype)
 
-    kxyz_gpu = gpuarray.to_gpu(kxyz)
+    kxyz_gpu = to_gpu(kxyz)
 
     plan = Plan(1, shape, eps=tol, dtype=complex_dtype)
 
@@ -62,7 +58,7 @@
         plan.setpts(kxyz_gpu[0], kxyz_gpu[1], kxyz_gpu[2][:4])
 
 
-def test_set_pts_raises_on_nonvector():
+def test_set_pts_raises_on_nonvector(to_gpu):
     dtype = np.float32
     complex_dtype = np.complex64
 
@@ -73,7 +69,7 @@
 
     kxyz = utils.gen_nu_pts(M, dim=dim).astype(dtype)
 
-    kxyz_gpu = gpuarray.to_gpu(kxyz)
+    kxyz_gpu = to_gpu(kxyz)
 
     plan = Plan(1, shape, eps=tol, dtype=complex_dtype)
 
@@ -81,7 +77,7 @@
         plan.setpts(kxyz)
 
 
-def test_set_pts_raises_on_number_of_args():
+def test_set_pts_raises_on_number_of_args(to_gpu):
     dtype = np.float32
     complex_dtype = np.complex64
 
@@ -92,7 +88,7 @@
 
     kxyz = utils.gen_nu_pts(M, dim=3).astype(dtype)
 
-    kxyz_gpu = gpuarray.to_gpu(kxyz)
+    kxyz_gpu = to_gpu(kxyz)
 
     plan = Plan(1, shape, eps=tol, dtype=complex_dtype)
 
@@ -112,7 +108,7 @@ def test_wrong_field_names():
         plan = Plan(1, (8, 8), foo="bar")
 
 
-def test_exec_raises_on_dtype():
+def test_exec_raises_on_dtype(to_gpu):
    dtype = np.float32
    complex_dtype = np.complex64
 
@@ -123,14 +119,17 @@
 
     kxyz = utils.gen_nu_pts(M, dim=dim).astype(dtype)
     c = utils.gen_nonuniform_data(M).astype(complex_dtype)
-    c_gpu = gpuarray.to_gpu(c)
     # Using c.real gives us wrong dtype here...
-    c_gpu_wrong_dtype = gpuarray.to_gpu(c.real)
+    c_gpu = to_gpu(c)
+    # Using c.real gives us wrong dtype here...
+    # Need contiguous here since numba does not allow transfers of
+    # non-contiguous arrays.
+    c_gpu_wrong_dtype = to_gpu(np.ascontiguousarray(c.real))
 
-    kxyz_gpu = gpuarray.to_gpu(kxyz)
-    fk_gpu = gpuarray.GPUArray(shape, dtype=complex_dtype)
+    kxyz_gpu = to_gpu(kxyz)
+    fk_gpu = _compat.array_empty_like(kxyz_gpu, shape, dtype=complex_dtype)
 
     # Here we'll intentionally contruct an incorrect array dtype.
-    fk_gpu_wrong_dtype = gpuarray.GPUArray(shape, dtype=np.complex128)
+    fk_gpu_wrong_dtype = _compat.array_empty_like(fk_gpu, shape,
+                                                  dtype=np.complex128)
 
     plan = Plan(1, shape, eps=tol, dtype=complex_dtype)
diff --git a/python/cufinufft/tests/test_examples.py b/python/cufinufft/tests/test_examples.py
index 34fe610a8..c6fb5dd45 100644
--- a/python/cufinufft/tests/test_examples.py
+++ b/python/cufinufft/tests/test_examples.py
@@ -17,5 +17,11 @@
         scripts.append(os.path.join(examples_dir, filename))
 
 @pytest.mark.parametrize("filename", scripts)
-def test_example(filename):
-    subprocess.check_call([sys.executable, filename])
+def test_example(filename, request):
+    # Extract framework from format `example_framework.py`.
+    framework = Path(filename).stem.split("_")[-1]
+
+    if framework in request.config.getoption("framework"):
+        subprocess.check_call([sys.executable, filename])
+    else:
+        pytest.skip("Example not in list of frameworks")
diff --git a/python/cufinufft/tests/test_multi.py b/python/cufinufft/tests/test_multi.py
index 9115ca484..a8e392fed 100644
--- a/python/cufinufft/tests/test_multi.py
+++ b/python/cufinufft/tests/test_multi.py
@@ -1,16 +1,18 @@
 import pytest
 
 import numpy as np
-
-import pycuda.driver as drv
-import pycuda.gpuarray as gpuarray
-
 from cufinufft import Plan
 
 import utils
 
 
-def test_multi_type1(dtype=np.float32, shape=(16, 16, 16), M=4096, tol=1e-3):
+def test_multi_type1(framework, dtype=np.float32, shape=(16, 16, 16), M=4096, tol=1e-3):
+    if framework == "pycuda":
+        import pycuda.driver as drv
+        import pycuda.gpuarray as gpuarray
+    else:
+        pytest.skip("Multi-GPU support only tested for pycuda")
+
     complex_dtype = utils._complex_dtype(dtype)
 
     drv.init()
diff --git a/python/cufinufft/tests/test_simple.py b/python/cufinufft/tests/test_simple.py
index b1a9d319a..f51a137f0 100644
--- a/python/cufinufft/tests/test_simple.py
+++ b/python/cufinufft/tests/test_simple.py
@@ -2,10 +2,8 @@
 
 import numpy as np
 
-import pycuda.autoinit
-import pycuda.gpuarray as gpuarray
-
 import cufinufft
+from cufinufft import _compat
 
 import utils
 
@@ -22,7 +20,7 @@
 @pytest.mark.parametrize("M", MS)
 @pytest.mark.parametrize("tol", TOLS)
 @pytest.mark.parametrize("output_arg", OUTPUT_ARGS)
-def test_simple_type1(dtype, shape, n_trans, M, tol, output_arg):
+def test_simple_type1(to_gpu, to_cpu, dtype, shape, n_trans, M, tol, output_arg):
     real_dtype = dtype
     complex_dtype = utils._complex_dtype(dtype)
 
@@ -35,20 +33,21 @@ def test_simple_type1(to_gpu, to_cpu, dtype, shape, n_trans, M, tol, output_arg):
 
     k, c = utils.type1_problem(dtype, shape, M, n_trans=n_trans)
 
-    k_gpu = gpuarray.to_gpu(k)
-    c_gpu = gpuarray.to_gpu(c)
+    k_gpu = to_gpu(k)
+    c_gpu = to_gpu(c)
 
     if output_arg:
         # Ensure that output array has proper shape i.e., (N1, ...) for no
         # batch, (1, N1, ...) for batch of size one, and (n, N1, ...) for
         # batch of size n.
-        fk_gpu = gpuarray.GPUArray(n_trans + shape, dtype=complex_dtype)
+        fk_gpu = _compat.array_empty_like(c_gpu, n_trans + shape,
+                                          dtype=complex_dtype)
 
         fun(*k_gpu, c_gpu, out=fk_gpu, eps=tol)
     else:
         fk_gpu = fun(*k_gpu, c_gpu, shape, eps=tol)
 
-    fk = fk_gpu.get()
+    fk = to_cpu(fk_gpu)
 
     utils.verify_type1(k, c, fk, tol)
 
@@ -59,7 +58,7 @@
 @pytest.mark.parametrize("M", MS)
 @pytest.mark.parametrize("tol", TOLS)
 @pytest.mark.parametrize("output_arg", OUTPUT_ARGS)
-def test_simple_type2(dtype, shape, n_trans, M, tol, output_arg):
+def test_simple_type2(to_gpu, to_cpu, dtype, shape, n_trans, M, tol, output_arg):
     real_dtype = dtype
     complex_dtype = utils._complex_dtype(dtype)
 
@@ -71,16 +70,17 @@ def test_simple_type2(to_gpu, to_cpu, dtype, shape, n_trans, M, tol, output_arg):
 
     k, fk = utils.type2_problem(dtype, shape, M, n_trans=n_trans)
 
-    k_gpu = gpuarray.to_gpu(k)
-    fk_gpu = gpuarray.to_gpu(fk)
+    k_gpu = to_gpu(k)
+    fk_gpu = to_gpu(fk)
 
     if output_arg:
-        c_gpu = gpuarray.GPUArray(n_trans + (M,), dtype=complex_dtype)
+        c_gpu = _compat.array_empty_like(fk_gpu, n_trans + (M,),
+                                         dtype=complex_dtype)
 
         fun(*k_gpu, fk_gpu, eps=tol, out=c_gpu)
     else:
         c_gpu = fun(*k_gpu, fk_gpu, eps=tol)
 
-    c = c_gpu.get()
+    c = to_cpu(c_gpu)
 
     utils.verify_type2(k, fk, c, tol)
diff --git a/python/cufinufft/tests/utils.py b/python/cufinufft/tests/utils.py
index 5bace0d71..9ea3281f3 100644
--- a/python/cufinufft/tests/utils.py
+++ b/python/cufinufft/tests/utils.py
@@ -126,3 +126,32 @@ def verify_type2(k, fk, c, tol):
     type2_rel_err = np.linalg.norm(c_target - c_est) / np.linalg.norm(c_target)
 
     assert type2_rel_err < 25 * tol
+
+
+def transfer_funcs(module_name):
+    if module_name == "pycuda":
+        import pycuda.autoinit # NOQA:401
+        from pycuda.gpuarray import to_gpu
+        def to_cpu(obj):
+            return obj.get()
+    elif module_name == "cupy":
+        import cupy
+        def to_gpu(obj):
+            return cupy.array(obj)
+        def to_cpu(obj):
+            return obj.get()
+    elif module_name == "numba":
+        import numba.cuda
+        to_gpu = numba.cuda.to_device
+        def to_cpu(obj):
+            return obj.copy_to_host()
+    elif module_name == "torch":
+        import torch
+        def to_gpu(obj):
+            return torch.as_tensor(obj, device=torch.device("cuda"))
+        def to_cpu(obj):
+            return obj.cpu().numpy()
+    else:
+        raise TypeError(f"Unsupported framework: {module_name}")
+
+    return to_gpu, to_cpu
diff --git a/tools/cufinufft/docker/cuda11.0/Dockerfile-x86_64 b/tools/cufinufft/docker/cuda11.0/Dockerfile-x86_64
index 4b09fc392..29954df1d 100644
--- a/tools/cufinufft/docker/cuda11.0/Dockerfile-x86_64
+++ b/tools/cufinufft/docker/cuda11.0/Dockerfile-x86_64
@@ -5,8 +5,7 @@
 ENV CUDA_MAJOR 11
 ENV CUDA_MINOR 0
 ENV CUDART_VERSION 11.0.171
-ENV CUFFT_VERSION 10.2.1.245
-ENV CURAND_VERSION 10.2.1.245
+ENV CUDA_LIBRARIES_VERSION 11.0.3
 ENV NVPROF_VERSION 11.0.221
 ENV NVTX_VERSION 11.0.167
 ENV NVCC_VERSION 11.0.221
@@ -43,16 +42,14 @@ ENV NVIDIA_REQUIRE_CUDA "cuda>=${CUDA_DOT_VERSION} brand=tesla,driver>=418,drive
 
 # runtime
 RUN yum install -y \
-        libcufft-${CUDA_DASH_VERSION}-${CUFFT_VERSION}-1 \
-        libcurand-${CUDA_DASH_VERSION}-${CURAND_VERSION}-1 \
+        cuda-libraries-${CUDA_DASH_VERSION}-${CUDA_LIBRARIES_VERSION}-1 \
         cuda-nvtx-${CUDA_DASH_VERSION}-${NVTX_VERSION}-1 && \
     rm -rf /var/cache/yum/*
 
 # devel
 RUN yum install -y \
         cuda-cudart-devel-${CUDA_DASH_VERSION}-${CUDART_VERSION}-1 \
-        libcufft-devel-${CUDA_DASH_VERSION}-${CUFFT_VERSION}-1 \
-        libcurand-devel-${CUDA_DASH_VERSION}-${CURAND_VERSION}-1 \
+        cuda-libraries-devel-${CUDA_DASH_VERSION}-${CUDA_LIBRARIES_VERSION}-1 \
        cuda-nvprof-${CUDA_DASH_VERSION}-${NVPROF_VERSION}-1 \
         cuda-nvcc-${CUDA_DASH_VERSION}-${NVCC_VERSION}-1 && \
     rm -rf /var/cache/yum/*
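
Usage sketch (illustrative, not part of the patch). With pycuda no longer a hard
dependency, any GPU array exposing `__cuda_array_interface__` can be passed
straight to the plan API. A minimal CuPy example, assuming a CUDA device and a
cupy build matching the local toolkit; sizes and tolerance are arbitrary:

    import numpy as np
    import cupy as cp

    import cufinufft

    M, shape = 4096, (16, 16, 16)

    # Random nonuniform points in [-pi, pi)^3 and complex strengths on the GPU.
    k = cp.asarray(np.random.uniform(-np.pi, np.pi, (3, M)).astype(np.float32))
    c = cp.asarray((np.random.standard_normal(M)
                    + 1j * np.random.standard_normal(M)).astype(np.complex64))

    plan = cufinufft.Plan(1, shape, eps=1e-3, dtype=np.complex64)
    plan.setpts(k[0], k[1], k[2])

    # The result is allocated via _compat.array_empty_like, so it comes back
    # as a CuPy array as well.
    fk = plan.execute(c)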
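On the `get_array_ptr` fallback (illustrative, not part of the patch): reading
`__cuda_array_interface__` on a CUDA torch tensor with `requires_grad=True`
raises a RuntimeError (see the issue comment linked in _compat.py), which is
why the helper retries with `data_ptr()`. Assuming a CUDA build of PyTorch:

    import torch

    t = torch.zeros(8, device="cuda", requires_grad=True)

    try:
        ptr = t.__cuda_array_interface__["data"][0]
    except RuntimeError:
        # The same fallback _compat.get_array_ptr takes for such tensors.
        ptr = t.data_ptr()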
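Since conftest.py registers `--framework` with `action="append"`, the flag can
also be repeated to cover several backends in a single pytest invocation,
instead of the four separate runs in the Jenkinsfile, e.g.:

    python3 -m pytest --framework=pycuda --framework=cupy python/cufinufft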