diff --git a/tests/influence/_core/test_arnoldi_influence.py b/tests/influence/_core/test_arnoldi_influence.py
index 875c604ba..af31bb233 100644
--- a/tests/influence/_core/test_arnoldi_influence.py
+++ b/tests/influence/_core/test_arnoldi_influence.py
@@ -1,5 +1,5 @@
 import tempfile
-from typing import Callable, List, Tuple, Union
+from typing import Callable, List, Optional, Tuple
 
 import torch
 
@@ -27,8 +27,8 @@
     generate_assymetric_matrix_given_eigenvalues,
     generate_symmetric_matrix_given_eigenvalues,
     get_random_model_and_data,
+    is_gpu,
     UnpackDataset,
-    USE_GPU_LIST,
 )
 from torch import Tensor
 from torch.utils.data import DataLoader
@@ -229,6 +229,17 @@ def _param_matmul(params: Tuple[Tensor]):
             "max",
         )
 
+    # TODO: for some unknown reason, this test and the test below do not work
+    # on the `cuda_data_parallel` setting. We need to investigate why.
+    # Use a local version of the setting list for these two tests for now,
+    # since we have changed the default setting list to include all options.
+    # (This is also used in many other tests, which also need to be unified later.)
+    gpu_setting_list = (
+        ["", "cuda"]
+        if torch.cuda.is_available() and torch.cuda.device_count() != 0
+        else [""]
+    )
+
     @parameterized.expand(
         [
             (
@@ -237,9 +248,9 @@ def _param_matmul(params: Tuple[Tensor]):
                 delta,
                 mode,
                 unpack_inputs,
-                use_gpu,
+                gpu_setting,
             )
-            for use_gpu in USE_GPU_LIST
+            for gpu_setting in gpu_setting_list
             for (influence_constructor_1, influence_constructor_2, delta) in [
                 # compare implementations, when considering only 1 layer
                 (
@@ -247,7 +258,7 @@
                     NaiveInfluenceFunction,
                     layers=(
                         ["module.linear1"]
-                        if use_gpu == "cuda_dataparallel"
+                        if gpu_setting == "cuda_dataparallel"
                         else ["linear1"]
                     ),
                     projection_dim=5,
@@ -258,7 +269,7 @@
                     ArnoldiInfluenceFunction,
                     layers=(
                         ["module.linear1"]
-                        if use_gpu == "cuda_dataparallel"
+                        if gpu_setting == "cuda_dataparallel"
                        else ["linear1"]
                     ),
                     arnoldi_dim=50,
@@ -314,7 +325,7 @@ def test_compare_implementations_trained_NN_model_and_data(
         delta: float,
         mode: str,
         unpack_inputs: bool,
-        use_gpu: Union[bool, str],
+        gpu_setting: Optional[str],
     ) -> None:
         """
         this compares 2 influence implementations on a trained 2-layer NN model.
@@ -329,7 +340,7 @@ def test_compare_implementations_trained_NN_model_and_data(
             delta,
             mode,
             unpack_inputs,
-            use_gpu,
+            gpu_setting,
         )
 
     # this compares `ArnoldiInfluenceFunction` and `NaiveInfluenceFunction` on randomly
@@ -337,6 +348,7 @@
     # can also compare the intermediate quantities. we do not compare with
     # `NaiveInfluence` because on randomly generated data, it is not comparable,
     # conceptually, with the other implementations, due to numerical issues.
+
     @parameterized.expand(
         [
             (
@@ -345,16 +357,16 @@ def test_compare_implementations_trained_NN_model_and_data(
                 delta,
                 mode,
                 unpack_inputs,
-                use_gpu,
+                gpu_setting,
             )
-            for use_gpu in USE_GPU_LIST
+            for gpu_setting in gpu_setting_list
             for (influence_constructor_1, influence_constructor_2, delta) in [
                 (
                     DataInfluenceConstructor(
                         NaiveInfluenceFunction,
                         layers=(
                             ["module.linear1"]
-                            if use_gpu == "cuda_dataparallel"
+                            if gpu_setting == "cuda_dataparallel"
                             else ["linear1"]
                         ),
                         show_progress=False,
@@ -364,7 +376,7 @@
                         ArnoldiInfluenceFunction,
                         layers=(
                             ["module.linear1"]
-                            if use_gpu == "cuda_dataparallel"
+                            if gpu_setting == "cuda_dataparallel"
                             else ["linear1"]
                         ),
                         show_progress=False,
@@ -397,7 +409,7 @@ def test_compare_implementations_random_model_and_data(
         delta: float,
         mode: str,
         unpack_inputs: bool,
-        use_gpu: Union[bool, str],
+        gpu_setting: Optional[str],
     ) -> None:
         """
         this compares 2 influence implementations on a trained 2-layer NN model.
@@ -412,7 +424,7 @@ def test_compare_implementations_random_model_and_data(
             delta,
             mode,
             unpack_inputs,
-            use_gpu,
+            gpu_setting,
         )
 
     def _test_compare_implementations(
@@ -423,7 +435,7 @@ def _test_compare_implementations(
         delta: float,
         mode: str,
         unpack_inputs: bool,
-        use_gpu: Union[bool, str],
+        gpu_setting: Optional[str],
     ) -> None:
         """
         checks that 2 implementations of `InfluenceFunctionBase` return the same
@@ -444,13 +456,14 @@ def _test_compare_implementations(
                 tmpdir,
                 unpack_inputs,
                 return_test_data=True,
-                use_gpu=use_gpu,
+                gpu_setting=gpu_setting,
                 return_hessian_data=True,
                 model_type=model_type,
             )
 
             train_dataset = DataLoader(train_dataset, batch_size=5)
 
+            use_gpu = is_gpu(gpu_setting)
             hessian_dataset = (
                 ExplicitDataset(hessian_samples, hessian_labels, use_gpu)
                 if not unpack_inputs
diff --git a/tests/influence/_core/test_naive_influence.py b/tests/influence/_core/test_naive_influence.py
index b48a1ffaa..ddcb70994 100644
--- a/tests/influence/_core/test_naive_influence.py
+++ b/tests/influence/_core/test_naive_influence.py
@@ -1,5 +1,5 @@
 import tempfile
-from typing import Callable, List, Tuple, Union
+from typing import Callable, List, Optional, Tuple
 
 import torch
 
@@ -21,9 +21,9 @@
     DataInfluenceConstructor,
     ExplicitDataset,
     get_random_model_and_data,
+    is_gpu,
     Linear,
     UnpackDataset,
-    USE_GPU_LIST,
 )
 from torch.utils.data import DataLoader
 
@@ -51,6 +51,17 @@ def test_flatten_unflattener(self, param_shapes: List[Tuple[int, ...]]) -> None:
             mode="max",
         )
 
+    # TODO: for some unknown reason, this test does not work
+    # on the `cuda_data_parallel` setting. We need to investigate why.
+    # Use a local version of the setting list for this test for now,
+    # since we have changed the default setting list to include all options.
+    # (This is also used in many other tests, which also need to be unified later.)
+    gpu_setting_list = (
+        ["", "cuda"]
+        if torch.cuda.is_available() and torch.cuda.device_count() != 0
+        else [""]
+    )
+
     @parameterized.expand(
         [
             (
@@ -59,17 +70,17 @@ def test_flatten_unflattener(self, param_shapes: List[Tuple[int, ...]]) -> None:
                 delta,
                 mode,
                 unpack_inputs,
-                use_gpu,
+                gpu_setting,
             )
             for reduction in ["none", "sum", "mean"]
-            for use_gpu in USE_GPU_LIST
+            for gpu_setting in gpu_setting_list
             for (influence_constructor, delta) in [
                 (
                     DataInfluenceConstructor(
                         NaiveInfluenceFunction,
                         layers=(
                             ["module.linear"]
-                            if use_gpu == "cuda_dataparallel"
+                            if gpu_setting == "cuda_dataparallel"
                             else ["linear"]
                         ),
                         projection_dim=None,
@@ -109,7 +120,7 @@ def test_matches_linear_regression(
         delta: float,
         mode: str,
         unpack_inputs: bool,
-        use_gpu: Union[bool, str],
+        gpu_setting: Optional[str],
     ) -> None:
         """
         this tests that `NaiveInfluence`, the simplest implementation, agree with the
@@ -129,13 +140,14 @@ def test_matches_linear_regression(
                 tmpdir,
                 unpack_inputs,
                 return_test_data=True,
-                use_gpu=use_gpu,
+                gpu_setting=gpu_setting,
                 return_hessian_data=True,
                 model_type="trained_linear",
             )
 
             train_dataset = DataLoader(train_dataset, batch_size=5)
 
+            use_gpu = is_gpu(gpu_setting)
             hessian_dataset = (
                 ExplicitDataset(hessian_samples, hessian_labels, use_gpu)
                 if not unpack_inputs
diff --git a/tests/influence/_core/test_tracin_k_most_influential.py b/tests/influence/_core/test_tracin_k_most_influential.py
index 8d4b38c36..00a62314b 100644
--- a/tests/influence/_core/test_tracin_k_most_influential.py
+++ b/tests/influence/_core/test_tracin_k_most_influential.py
@@ -1,5 +1,5 @@
 import tempfile
-from typing import Callable, Union
+from typing import Callable, Optional
 
 import torch
 import torch.nn as nn
@@ -13,22 +13,17 @@
     build_test_name_func,
     DataInfluenceConstructor,
     get_random_model_and_data,
+    GPU_SETTING_LIST,
+    is_gpu,
 )
 
 
 class TestTracInGetKMostInfluential(BaseTest):
-
-    use_gpu_list = (
-        [False, "cuda", "cuda_data_parallel"]
-        if torch.cuda.is_available() and torch.cuda.device_count() != 0
-        else [False]
-    )
-
     param_list = []
     for batch_size, k in [(4, 7), (7, 4), (40, 5), (5, 40), (40, 45)]:
         for unpack_inputs in [True, False]:
             for proponents in [True, False]:
-                for use_gpu in use_gpu_list:
+                for gpu_setting in GPU_SETTING_LIST:
                     for reduction, constr, aggregate in [
                         (
                             "none",
@@ -51,7 +46,7 @@ class TestTracInGetKMostInfluential(BaseTest):
                                 name="linear2",
                                 layers=(
                                     ["module.linear2"]
-                                    if use_gpu == "cuda_data_parallel"
+                                    if gpu_setting == "cuda_data_parallel"
                                     else ["linear2"]
                                 ),
                             ),
@@ -61,7 +56,7 @@ class TestTracInGetKMostInfluential(BaseTest):
                         if not (
                             "sample_wise_grads_per_batch" in constr.kwargs
                             and constr.kwargs["sample_wise_grads_per_batch"]
-                            and use_gpu
+                            and is_gpu(gpu_setting)
                         ):
                             param_list.append(
                                 (
@@ -71,7 +66,7 @@ class TestTracInGetKMostInfluential(BaseTest):
                                     proponents,
                                     batch_size,
                                     k,
-                                    use_gpu,
+                                    gpu_setting,
                                     aggregate,
                                 )
                             )
@@ -88,7 +83,7 @@ def test_tracin_k_most_influential(
         proponents: bool,
         batch_size: int,
         k: int,
-        use_gpu: Union[bool, str],
+        gpu_setting: Optional[str],
         aggregate: bool,
     ) -> None:
         """
@@ -107,7 +102,7 @@ def test_tracin_k_most_influential(
                 tmpdir,
                 unpack_inputs,
                 True,
-                use_gpu,
+                gpu_setting,
             )
 
             self.assertTrue(isinstance(reduction, str))
diff --git a/tests/influence/_core/test_tracin_self_influence.py b/tests/influence/_core/test_tracin_self_influence.py
index 7af3a3d61..6ea79a071 100644
--- a/tests/influence/_core/test_tracin_self_influence.py
+++ b/tests/influence/_core/test_tracin_self_influence.py
@@ -1,5 +1,5 @@
 import tempfile
-from typing import Callable, Union
+from typing import Callable, Optional
 
 import torch
 import torch.nn as nn
@@ -15,6 +15,8 @@
     build_test_name_func,
     DataInfluenceConstructor,
     get_random_model_and_data,
+    GPU_SETTING_LIST,
+    is_gpu,
 )
 from torch.utils.data import DataLoader
 
@@ -27,14 +29,9 @@ class TestTracInSelfInfluence(BaseTest):
     # implementations separately, because the latter does not support `DataParallel`
 
     # add tests for `TracInCPBase` implementations
 
-    use_gpu_list = (
-        [False, "cuda", "cuda_data_parallel"]
-        if torch.cuda.is_available() and torch.cuda.device_count() != 0
-        else [False]
-    )
     for unpack_inputs in [True, False]:
-        for use_gpu in use_gpu_list:
+        for gpu_setting in GPU_SETTING_LIST:
             for reduction, constructor in [
                 (
                     "none",
@@ -47,7 +44,7 @@ class TestTracInSelfInfluence(BaseTest):
                         name="TracInCP_linear1",
                         layers=(
                             ["module.linear1"]
-                            if use_gpu == "cuda_data_parallel"
+                            if gpu_setting == "cuda_data_parallel"
                             else ["linear1"]
                         ),
                     ),
@@ -59,7 +56,7 @@
                         name="TracInCP_linear1_linear2",
                         layers=(
                             ["module.linear1", "module.linear2"]
-                            if use_gpu == "cuda_data_parallel"
+                            if gpu_setting == "cuda_data_parallel"
                             else ["linear1", "linear2"]
                         ),
                     ),
@@ -88,19 +85,21 @@ class TestTracInSelfInfluence(BaseTest):
                 if not (
                     "sample_wise_grads_per_batch" in constructor.kwargs
                     and constructor.kwargs["sample_wise_grads_per_batch"]
-                    and use_gpu
+                    and is_gpu(gpu_setting)
                 ):
-                    param_list.append((reduction, constructor, unpack_inputs, use_gpu))
+                    param_list.append(
+                        (reduction, constructor, unpack_inputs, gpu_setting)
+                    )
 
     # add tests for `InfluenceFunctionBase` implementations
-    use_gpu_list = (
-        [False, "cuda"]
+    gpu_setting_list = (
+        ["", "cuda"]
         if torch.cuda.is_available() and torch.cuda.device_count() != 0
-        else [False]
+        else [""]
     )
 
     for unpack_inputs in [True, False]:
-        for use_gpu in use_gpu_list:
+        for gpu_setting in gpu_setting_list:
             for reduction, constructor in [
                 (
                     "none",
@@ -115,7 +114,7 @@
                         name="NaiveInfluenceFunction_linear1",
                         layers=(
                             ["module.linear1"]
-                            if use_gpu == "cuda_data_parallel"
+                            if gpu_setting == "cuda_data_parallel"
                             else ["linear1"]
                         ),
                     ),
@@ -134,7 +133,7 @@
                        name="ArnoldiInfluenceFunction_linear1",
                        layers=(
                            ["module.linear1"]
-                            if use_gpu == "cuda_data_parallel"
+                            if gpu_setting == "cuda_data_parallel"
                            else ["linear1"]
                        ),
                    ),
@@ -143,9 +142,11 @@
             if not (
                 "sample_wise_grads_per_batch" in constructor.kwargs
                 and constructor.kwargs["sample_wise_grads_per_batch"]
-                and use_gpu
+                and is_gpu(gpu_setting)
             ):
-                param_list.append((reduction, constructor, unpack_inputs, use_gpu))
+                param_list.append(
+                    (reduction, constructor, unpack_inputs, gpu_setting)
+                )
 
     @parameterized.expand(
         param_list,
@@ -156,7 +157,7 @@ def test_tracin_self_influence(
         reduction: str,
         tracin_constructor: Callable,
         unpack_inputs: bool,
-        use_gpu: Union[bool, str],
+        gpu_setting: Optional[str],
     ) -> None:
         with tempfile.TemporaryDirectory() as tmpdir:
             (
@@ -166,7 +167,7 @@
                 tmpdir,
                 unpack_inputs,
                 False,
-                use_gpu,
+                gpu_setting,
             )
 
             # compute tracin_scores of training data on training data
diff --git a/tests/influence/_core/test_tracin_xor.py b/tests/influence/_core/test_tracin_xor.py
index 9f583245c..093639327 100644
--- a/tests/influence/_core/test_tracin_xor.py
+++ b/tests/influence/_core/test_tracin_xor.py
@@ -23,7 +23,7 @@ class TestTracInXOR(BaseTest):
 
     # TODO: Move test setup to use setUp and tearDown method overrides.
     def _test_tracin_xor_setup(self, tmpdir: str, use_gpu: bool = False):
-        net = BasicLinearNet(2, 2, 1)
+        net = BasicLinearNet(in_features=2, hidden_nodes=2, out_features=1)
 
         state = OrderedDict(
             [
diff --git a/tests/influence/_utils/common.py b/tests/influence/_utils/common.py
index e8c9d9c9a..0f825d79d 100644
--- a/tests/influence/_utils/common.py
+++ b/tests/influence/_utils/common.py
@@ -293,11 +293,41 @@ def get_random_data(
     return (train_dataset, hessian_dataset, test_dataset)
 
 
+def _adjust_model(model: Module, gpu_setting: Optional[str]) -> Module:
+    """
+    Given a model, returns the model on GPU based on the provided `gpu_setting`,
+    or the original model on CPU if no valid setting is provided.
+
+    Two valid settings are supported for now:
+    - `'cuda'`: the returned model is on GPU
+    - `'cuda_data_parallel'`: the returned model is a `DataParallel` model,
+      and on GPU
+
+    The need to differentiate between `'cuda'` and `'cuda_data_parallel'`
+    is that sometimes we may want to test a model that is on GPU, but is *not*
+    wrapped in `DataParallel`.
+    """
+    if gpu_setting == "cuda_data_parallel":
+        return _wrap_model_in_dataparallel(model)
+    elif gpu_setting == "cuda":
+        return model.cuda()
+    else:
+        return model
+
+
+def is_gpu(gpu_setting: Optional[str]) -> bool:
+    """
+    Returns whether the model should be on GPU based on the given `gpu_setting` str.
+    """
+    return gpu_setting == "cuda_data_parallel" or gpu_setting == "cuda"
+
+
 def get_random_model_and_data(
     tmpdir,
     unpack_inputs,
     return_test_data=True,
-    use_gpu=False,
+    gpu_setting: Optional[str] = None,
     return_hessian_data=False,
     model_type="random",
 ):
@@ -330,16 +360,12 @@ def get_random_model_and_data(
     `InfluenceFunctionBase` can be more easily compared, due to lack of numerical
     issues.
 
-    `use_gpu` can either be
-    - `False`: returned model is on cpu
-    - `'cuda'`: returned model is on gpu
-    - `'cuda_data_parallel``: returned model is a `DataParallel` model, and on cpu
-    The need to differentiate between `'cuda'` and `'cuda_data_parallel'`
-    is that sometimes we may want to test a model that is on cpu, but is *not*
-    wrapped in `DataParallel`.
+    `gpu_setting` specifies whether the model is on GPU and whether it is a
+    `DataParallel` model. See the `_adjust_model` API for more details.
     """
     in_features, hidden_nodes = 5, 4
     num_inputs = 2
+    use_gpu = is_gpu(gpu_setting)
 
     # generate data. regardless the model, the data is always generated the same way
     # the only exception is if the `model_type` is 'trained_linear', i.e. a simple
@@ -367,22 +393,18 @@ def get_random_model_and_data(
 
         num_checkpoints = 5
         for i in range(num_checkpoints):
-            net.linear1.weight.data = torch.normal(
+            net.linear1.weight.data = torch.normal(  # type: ignore
                 3, 4, (hidden_nodes, in_features)
             ).double()
-            net.linear2.weight.data = torch.normal(
+            net.linear2.weight.data = torch.normal(  # type: ignore
                 5, 6, (out_features, hidden_nodes)
             ).double()
             if unpack_inputs:
-                net.pre.weight.data = torch.normal(
+                net.pre.weight.data = torch.normal(  # type: ignore
                     3, 4, (in_features, in_features * num_inputs)
                 ).double()
             checkpoint_name = "-".join(["checkpoint-reg", str(i + 1) + ".pt"])
-            net_adjusted = (
-                _wrap_model_in_dataparallel(net)
-                if use_gpu == "cuda_data_parallel"
-                else (net.to(device="cuda") if use_gpu == "cuda" else net)
-            )
+            net_adjusted = _adjust_model(net, gpu_setting)
             torch.save(net_adjusted.state_dict(), os.path.join(tmpdir, checkpoint_name))
 
     elif model_type == "trained_linear":
@@ -418,12 +440,8 @@ def get_random_model_and_data(
 
         # save that trained parameter as a checkpoint
         checkpoint_name = "checkpoint-final.pt"
-        net.linear.weight.data = theta.contiguous()
-        net_adjusted = (
-            _wrap_model_in_dataparallel(net)
-            if use_gpu == "cuda_data_parallel"
-            else (net.to(device="cuda") if use_gpu == "cuda" else net)
-        )
+        net.linear.weight.data = theta.contiguous()  # type: ignore
+        net_adjusted = _adjust_model(net, gpu_setting)
         torch.save(net_adjusted.state_dict(), os.path.join(tmpdir, checkpoint_name))
 
     elif model_type == "trained_NN":
@@ -433,11 +451,7 @@ def get_random_model_and_data(
             else MultLinearNet(in_features, hidden_nodes, out_features, num_inputs)
         ).double()
 
-        net_adjusted = (
-            _wrap_model_in_dataparallel(net)
-            if use_gpu == "cuda_data_parallel"
-            else (net.to(device="cuda") if use_gpu == "cuda" else net)
-        )
+        net_adjusted = _adjust_model(net, gpu_setting)
 
         # train model using several optimization steps on Hessian data
         batch = next(iter(DataLoader(hessian_dataset, batch_size=len(hessian_dataset))))
@@ -454,9 +468,6 @@ def get_random_model_and_data(
 
         # save that trained parameter as a checkpoint
         checkpoint_name = "checkpoint-final.pt"
-        net_adjusted = (
-            _wrap_model_in_dataparallel(net) if use_gpu == "cuda_data_parallel" else net
-        )
         torch.save(net_adjusted.state_dict(), os.path.join(tmpdir, checkpoint_name))
 
         training_data = (
@@ -680,8 +691,8 @@ def _format_batch_into_tuple(
         return (inputs, targets)
 
 
-USE_GPU_LIST = (
-    [False, "cuda"]
+GPU_SETTING_LIST = (
+    ["", "cuda", "cuda_data_parallel"]
     if torch.cuda.is_available() and torch.cuda.device_count() != 0
-    else [False]
+    else [""]
 )
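
Reviewer note on the diff above: the refactor replaces the boolean-or-string `use_gpu` flag with a single `Optional[str]` `gpu_setting` (`""`/`None` for CPU, `"cuda"` for a plain GPU model, `"cuda_data_parallel"` for a `DataParallel`-wrapped GPU model). The following minimal, self-contained sketch illustrates that convention by mirroring the helpers added to `tests/influence/_utils/common.py`; the body of `_wrap_model_in_dataparallel` is not shown in the diff, so the one-liner below is an assumption about what that existing helper does.

```python
from typing import Optional

import torch
import torch.nn as nn


def _wrap_model_in_dataparallel(model: nn.Module) -> nn.Module:
    # Assumed stand-in for the existing helper in tests/influence/_utils/common.py:
    # move the model to GPU and wrap it in DataParallel.
    return nn.DataParallel(model.cuda())


def _adjust_model(model: nn.Module, gpu_setting: Optional[str]) -> nn.Module:
    # Mirrors the helper added in the diff: "cuda_data_parallel" -> DataParallel
    # on GPU, "cuda" -> plain GPU, anything else (None or "") -> CPU.
    if gpu_setting == "cuda_data_parallel":
        return _wrap_model_in_dataparallel(model)
    elif gpu_setting == "cuda":
        return model.cuda()
    return model


def is_gpu(gpu_setting: Optional[str]) -> bool:
    # Mirrors the helper added in the diff: both GPU settings imply a GPU model.
    return gpu_setting == "cuda_data_parallel" or gpu_setting == "cuda"


# Mirrors GPU_SETTING_LIST from the diff: enumerate all settings on a GPU
# machine, and only the CPU setting otherwise.
GPU_SETTING_LIST = (
    ["", "cuda", "cuda_data_parallel"]
    if torch.cuda.is_available() and torch.cuda.device_count() != 0
    else [""]
)

if __name__ == "__main__":
    # Usage as in the parameterized tests: sweep the settings, adjust a model,
    # and derive the boolean the datasets still need.
    for gpu_setting in GPU_SETTING_LIST:
        model = _adjust_model(nn.Linear(5, 4), gpu_setting)
        print(
            repr(gpu_setting),
            "->",
            "gpu" if is_gpu(gpu_setting) else "cpu",
            type(model).__name__,
        )
```

Collapsing the setting into one string lets `GPU_SETTING_LIST` enumerate every supported configuration in one place, while tests that cannot yet run under `DataParallel` (such as the two Arnoldi/Naive comparisons above) can substitute a local `["", "cuda"]` list or filter with `is_gpu`.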