From a94fc0f708efe9d71b6023bb9e3d24e3775cf266 Mon Sep 17 00:00:00 2001
From: "Chen Chen (AI Infra)"
Date: Fri, 22 Mar 2024 14:09:56 -0700
Subject: [PATCH] Split the get_random_model_and_data() method [1/n] (#1252)

Summary:
The get_random_model_and_data() method is used to construct test models and
data for the influence tests, and flake8 reports it as too complex
(https://www.flake8rules.com/rules/C901.html). This series of diffs will
split the method up and abstract out the common parts.

This diff isolates the model setup for the different GPU usage settings. It
also eliminates the mixed usage of bool and str by replacing the `use_gpu`
flag with a single str-valued `gpu_setting`.

Differential Revision: D55165054
---
 .../influence/_core/test_arnoldi_influence.py | 45 +++++++----
 tests/influence/_core/test_naive_influence.py | 26 +++++--
 .../_core/test_tracin_k_most_influential.py   | 23 +++---
 .../_core/test_tracin_self_influence.py       | 43 ++++++-----
 tests/influence/_core/test_tracin_xor.py      |  2 +-
 tests/influence/_utils/common.py              | 77 +++++++++++--------
 6 files changed, 124 insertions(+), 92 deletions(-)

diff --git a/tests/influence/_core/test_arnoldi_influence.py b/tests/influence/_core/test_arnoldi_influence.py
index 875c604ba..af31bb233 100644
--- a/tests/influence/_core/test_arnoldi_influence.py
+++ b/tests/influence/_core/test_arnoldi_influence.py
@@ -1,5 +1,5 @@
 import tempfile
-from typing import Callable, List, Tuple, Union
+from typing import Callable, List, Optional, Tuple
 
 import torch
@@ -27,8 +27,8 @@
     generate_assymetric_matrix_given_eigenvalues,
     generate_symmetric_matrix_given_eigenvalues,
     get_random_model_and_data,
+    is_gpu,
     UnpackDataset,
-    USE_GPU_LIST,
 )
 from torch import Tensor
 from torch.utils.data import DataLoader
@@ -229,6 +229,17 @@ def _param_matmul(params: Tuple[Tensor]):
             "max",
         )
 
+    # TODO: for some unknown reason, this test and the test below do not work
+    # on the `cuda_data_parallel` setting. We need to investigate why.
+    # Use a local version of the setting list for these two tests for now,
+    # since we have changed the default setting list to include all options.
+    # (This is also used in many other tests, which also need to be unified later.)
+    gpu_setting_list = (
+        ["", "cuda"]
+        if torch.cuda.is_available() and torch.cuda.device_count() != 0
+        else [""]
+    )
+
     @parameterized.expand(
         [
             (
@@ -237,9 +248,9 @@
                 delta,
                 mode,
                 unpack_inputs,
-                use_gpu,
+                gpu_setting,
             )
-            for use_gpu in USE_GPU_LIST
+            for gpu_setting in gpu_setting_list
             for (influence_constructor_1, influence_constructor_2, delta) in [
                 # compare implementations, when considering only 1 layer
                 (
@@ -247,7 +258,7 @@
                     NaiveInfluenceFunction,
                     layers=(
                         ["module.linear1"]
-                        if use_gpu == "cuda_dataparallel"
+                        if gpu_setting == "cuda_dataparallel"
                         else ["linear1"]
                     ),
                     projection_dim=5,
@@ -258,7 +269,7 @@
                     ArnoldiInfluenceFunction,
                     layers=(
                         ["module.linear1"]
-                        if use_gpu == "cuda_dataparallel"
+                        if gpu_setting == "cuda_dataparallel"
                        else ["linear1"]
                     ),
                     arnoldi_dim=50,
@@ -314,7 +325,7 @@ def test_compare_implementations_trained_NN_model_and_data(
         delta: float,
         mode: str,
         unpack_inputs: bool,
-        use_gpu: Union[bool, str],
+        gpu_setting: Optional[str],
     ) -> None:
         """
         this compares 2 influence implementations on a trained 2-layer NN model.
@@ -329,7 +340,7 @@ def test_compare_implementations_trained_NN_model_and_data(
             delta,
             mode,
             unpack_inputs,
-            use_gpu,
+            gpu_setting,
         )
 
     # this compares `ArnoldiInfluenceFunction` and `NaiveInfluenceFunction` on randomly
@@ -337,6 +348,7 @@
     # can also compare the intermediate quantities. we do not compare with
     # `NaiveInfluence` because on randomly generated data, it is not comparable,
     # conceptually, with the other implementations, due to numerical issues.
+
     @parameterized.expand(
         [
             (
@@ -345,16 +357,16 @@
                 delta,
                 mode,
                 unpack_inputs,
-                use_gpu,
+                gpu_setting,
             )
-            for use_gpu in USE_GPU_LIST
+            for gpu_setting in gpu_setting_list
             for (influence_constructor_1, influence_constructor_2, delta) in [
                 (
                     DataInfluenceConstructor(
                         NaiveInfluenceFunction,
                         layers=(
                             ["module.linear1"]
-                            if use_gpu == "cuda_dataparallel"
+                            if gpu_setting == "cuda_dataparallel"
                             else ["linear1"]
                         ),
                         show_progress=False,
@@ -364,7 +376,7 @@
                         ArnoldiInfluenceFunction,
                         layers=(
                             ["module.linear1"]
-                            if use_gpu == "cuda_dataparallel"
+                            if gpu_setting == "cuda_dataparallel"
                             else ["linear1"]
                         ),
                         show_progress=False,
@@ -397,7 +409,7 @@ def test_compare_implementations_random_model_and_data(
         delta: float,
         mode: str,
         unpack_inputs: bool,
-        use_gpu: Union[bool, str],
+        gpu_setting: Optional[str],
     ) -> None:
         """
         this compares 2 influence implementations on a trained 2-layer NN model.
@@ -412,7 +424,7 @@
             delta,
             mode,
             unpack_inputs,
-            use_gpu,
+            gpu_setting,
         )
 
     def _test_compare_implementations(
@@ -423,7 +435,7 @@
         delta: float,
         mode: str,
         unpack_inputs: bool,
-        use_gpu: Union[bool, str],
+        gpu_setting: Optional[str],
     ) -> None:
         """
         checks that 2 implementations of `InfluenceFunctionBase` return the same
@@ -444,13 +456,14 @@
             tmpdir,
             unpack_inputs,
             return_test_data=True,
-            use_gpu=use_gpu,
+            gpu_setting=gpu_setting,
             return_hessian_data=True,
             model_type=model_type,
         )
 
         train_dataset = DataLoader(train_dataset, batch_size=5)
 
+        use_gpu = is_gpu(gpu_setting)
         hessian_dataset = (
             ExplicitDataset(hessian_samples, hessian_labels, use_gpu)
             if not unpack_inputs
diff --git a/tests/influence/_core/test_naive_influence.py b/tests/influence/_core/test_naive_influence.py
index b48a1ffaa..ddcb70994 100644
--- a/tests/influence/_core/test_naive_influence.py
+++ b/tests/influence/_core/test_naive_influence.py
@@ -1,5 +1,5 @@
 import tempfile
-from typing import Callable, List, Tuple, Union
+from typing import Callable, List, Optional, Tuple
 
 import torch
@@ -21,9 +21,9 @@
     DataInfluenceConstructor,
     ExplicitDataset,
     get_random_model_and_data,
+    is_gpu,
     Linear,
     UnpackDataset,
-    USE_GPU_LIST,
 )
 from torch.utils.data import DataLoader
@@ -51,6 +51,17 @@ def test_flatten_unflattener(self, param_shapes: List[Tuple[int, ...]]) -> None:
             mode="max",
         )
 
+    # TODO: for some unknown reason, this test does not work
+    # on the `cuda_data_parallel` setting. We need to investigate why.
+    # Use a local version of the setting list for this test for now,
+    # since we have changed the default setting list to include all options.
+    # (This is also used in many other tests, which also need to be unified later.)
+    gpu_setting_list = (
+        ["", "cuda"]
+        if torch.cuda.is_available() and torch.cuda.device_count() != 0
+        else [""]
+    )
+
     @parameterized.expand(
         [
             (
@@ -59,17 +70,17 @@ def test_flatten_unflattener(self, param_shapes: List[Tuple[int, ...]]) -> None:
                 delta,
                 mode,
                 unpack_inputs,
-                use_gpu,
+                gpu_setting,
             )
             for reduction in ["none", "sum", "mean"]
-            for use_gpu in USE_GPU_LIST
+            for gpu_setting in gpu_setting_list
             for (influence_constructor, delta) in [
                 (
                     DataInfluenceConstructor(
                         NaiveInfluenceFunction,
                         layers=(
                             ["module.linear"]
-                            if use_gpu == "cuda_dataparallel"
+                            if gpu_setting == "cuda_dataparallel"
                             else ["linear"]
                         ),
                         projection_dim=None,
@@ -109,7 +120,7 @@ def test_matches_linear_regression(
         delta: float,
         mode: str,
         unpack_inputs: bool,
-        use_gpu: Union[bool, str],
+        gpu_setting: Optional[str],
     ) -> None:
         """
         this tests that `NaiveInfluence`, the simplest implementation, agree with the
@@ -129,13 +140,14 @@ def test_matches_linear_regression(
             tmpdir,
             unpack_inputs,
             return_test_data=True,
-            use_gpu=use_gpu,
+            gpu_setting=gpu_setting,
             return_hessian_data=True,
             model_type="trained_linear",
         )
 
         train_dataset = DataLoader(train_dataset, batch_size=5)
 
+        use_gpu = is_gpu(gpu_setting)
         hessian_dataset = (
             ExplicitDataset(hessian_samples, hessian_labels, use_gpu)
             if not unpack_inputs
diff --git a/tests/influence/_core/test_tracin_k_most_influential.py b/tests/influence/_core/test_tracin_k_most_influential.py
index 8d4b38c36..00a62314b 100644
--- a/tests/influence/_core/test_tracin_k_most_influential.py
+++ b/tests/influence/_core/test_tracin_k_most_influential.py
@@ -1,5 +1,5 @@
 import tempfile
-from typing import Callable, Union
+from typing import Callable, Optional
 
 import torch
 import torch.nn as nn
@@ -13,22 +13,17 @@
     build_test_name_func,
     DataInfluenceConstructor,
     get_random_model_and_data,
+    GPU_SETTING_LIST,
+    is_gpu,
 )
 
 
 class TestTracInGetKMostInfluential(BaseTest):
-
-    use_gpu_list = (
-        [False, "cuda", "cuda_data_parallel"]
-        if torch.cuda.is_available() and torch.cuda.device_count() != 0
-        else [False]
-    )
-
     param_list = []
     for batch_size, k in [(4, 7), (7, 4), (40, 5), (5, 40), (40, 45)]:
         for unpack_inputs in [True, False]:
             for proponents in [True, False]:
-                for use_gpu in use_gpu_list:
+                for gpu_setting in GPU_SETTING_LIST:
                     for reduction, constr, aggregate in [
                         (
                             "none",
@@ -51,7 +46,7 @@ class TestTracInGetKMostInfluential(BaseTest):
                             name="linear2",
                             layers=(
                                 ["module.linear2"]
-                                if use_gpu == "cuda_data_parallel"
+                                if gpu_setting == "cuda_data_parallel"
                                 else ["linear2"]
                             ),
                         ),
@@ -61,7 +56,7 @@ class TestTracInGetKMostInfluential(BaseTest):
                         if not (
                             "sample_wise_grads_per_batch" in constr.kwargs
                             and constr.kwargs["sample_wise_grads_per_batch"]
-                            and use_gpu
+                            and is_gpu(gpu_setting)
                         ):
                             param_list.append(
                                 (
@@ -71,7 +66,7 @@ class TestTracInGetKMostInfluential(BaseTest):
                                     proponents,
                                     batch_size,
                                     k,
-                                    use_gpu,
+                                    gpu_setting,
                                     aggregate,
                                 )
                             )
@@ -88,7 +83,7 @@ def test_tracin_k_most_influential(
         proponents: bool,
         batch_size: int,
         k: int,
-        use_gpu: Union[bool, str],
+        gpu_setting: Optional[str],
         aggregate: bool,
     ) -> None:
         """
@@ -107,7 +102,7 @@ def test_tracin_k_most_influential(
                 tmpdir,
                 unpack_inputs,
                 True,
-                use_gpu,
+                gpu_setting,
             )
 
             self.assertTrue(isinstance(reduction, str))
diff --git a/tests/influence/_core/test_tracin_self_influence.py b/tests/influence/_core/test_tracin_self_influence.py
index 7af3a3d61..6ea79a071 100644
--- a/tests/influence/_core/test_tracin_self_influence.py
+++ b/tests/influence/_core/test_tracin_self_influence.py
@@ -1,5 +1,5 @@
 import tempfile
-from typing import Callable, Union
+from typing import Callable, Optional
 
 import torch
 import torch.nn as nn
@@ -15,6 +15,8 @@
     build_test_name_func,
     DataInfluenceConstructor,
     get_random_model_and_data,
+    GPU_SETTING_LIST,
+    is_gpu,
 )
 from torch.utils.data import DataLoader
@@ -27,14 +29,9 @@ class TestTracInSelfInfluence(BaseTest):
     # implementations separately, because the latter does not support `DataParallel`
 
     # add tests for `TracInCPBase` implementations
-    use_gpu_list = (
-        [False, "cuda", "cuda_data_parallel"]
-        if torch.cuda.is_available() and torch.cuda.device_count() != 0
-        else [False]
-    )
 
     for unpack_inputs in [True, False]:
-        for use_gpu in use_gpu_list:
+        for gpu_setting in GPU_SETTING_LIST:
             for reduction, constructor in [
                 (
                     "none",
@@ -47,7 +44,7 @@
                         name="TracInCP_linear1",
                         layers=(
                             ["module.linear1"]
-                            if use_gpu == "cuda_data_parallel"
+                            if gpu_setting == "cuda_data_parallel"
                             else ["linear1"]
                         ),
                     ),
@@ -59,7 +56,7 @@
                         name="TracInCP_linear1_linear2",
                         layers=(
                             ["module.linear1", "module.linear2"]
-                            if use_gpu == "cuda_data_parallel"
+                            if gpu_setting == "cuda_data_parallel"
                             else ["linear1", "linear2"]
                         ),
                     ),
@@ -88,19 +85,21 @@
             if not (
                 "sample_wise_grads_per_batch" in constructor.kwargs
                 and constructor.kwargs["sample_wise_grads_per_batch"]
-                and use_gpu
+                and is_gpu(gpu_setting)
             ):
-                param_list.append((reduction, constructor, unpack_inputs, use_gpu))
+                param_list.append(
+                    (reduction, constructor, unpack_inputs, gpu_setting)
+                )
 
     # add tests for `InfluenceFunctionBase` implementations
-    use_gpu_list = (
-        [False, "cuda"]
+    gpu_setting_list = (
+        ["", "cuda"]
         if torch.cuda.is_available() and torch.cuda.device_count() != 0
-        else [False]
+        else [""]
     )
 
     for unpack_inputs in [True, False]:
-        for use_gpu in use_gpu_list:
+        for gpu_setting in gpu_setting_list:
             for reduction, constructor in [
                 (
                     "none",
@@ -115,7 +114,7 @@
                         name="NaiveInfluenceFunction_linear1",
                         layers=(
                             ["module.linear1"]
-                            if use_gpu == "cuda_data_parallel"
+                            if gpu_setting == "cuda_data_parallel"
                             else ["linear1"]
                         ),
                     ),
@@ -134,7 +133,7 @@
                         name="ArnoldiInfluenceFunction_linear1",
                         layers=(
                             ["module.linear1"]
-                            if use_gpu == "cuda_data_parallel"
+                            if gpu_setting == "cuda_data_parallel"
                             else ["linear1"]
                         ),
                     ),
@@ -143,9 +142,11 @@
             if not (
                 "sample_wise_grads_per_batch" in constructor.kwargs
                 and constructor.kwargs["sample_wise_grads_per_batch"]
-                and use_gpu
+                and is_gpu(gpu_setting)
             ):
-                param_list.append((reduction, constructor, unpack_inputs, use_gpu))
+                param_list.append(
+                    (reduction, constructor, unpack_inputs, gpu_setting)
+                )
 
     @parameterized.expand(
         param_list,
@@ -156,7 +157,7 @@ def test_tracin_self_influence(
         reduction: str,
         tracin_constructor: Callable,
         unpack_inputs: bool,
-        use_gpu: Union[bool, str],
+        gpu_setting: Optional[str],
     ) -> None:
         with tempfile.TemporaryDirectory() as tmpdir:
             (
@@ -166,7 +167,7 @@
                 tmpdir,
                 unpack_inputs,
                 False,
-                use_gpu,
+                gpu_setting,
             )
 
             # compute tracin_scores of training data on training data
diff --git a/tests/influence/_core/test_tracin_xor.py b/tests/influence/_core/test_tracin_xor.py
index 9f583245c..093639327 100644
--- a/tests/influence/_core/test_tracin_xor.py
+++ b/tests/influence/_core/test_tracin_xor.py
@@ -23,7 +23,7 @@ class TestTracInXOR(BaseTest):
     # TODO: Move test setup to use setUp and tearDown method overrides.
 
     def _test_tracin_xor_setup(self, tmpdir: str, use_gpu: bool = False):
-        net = BasicLinearNet(2, 2, 1)
+        net = BasicLinearNet(in_features=2, hidden_nodes=2, out_features=1)
 
         state = OrderedDict(
             [
diff --git a/tests/influence/_utils/common.py b/tests/influence/_utils/common.py
index e8c9d9c9a..0f825d79d 100644
--- a/tests/influence/_utils/common.py
+++ b/tests/influence/_utils/common.py
@@ -293,11 +293,41 @@ def get_random_data(
     return (train_dataset, hessian_dataset, test_dataset)
 
 
+def _adjust_model(model: Module, gpu_setting: Optional[str]) -> Module:
+    """
+    Given a model, returns the model on GPU based on the provided
+    `gpu_setting`, or returns the original model on CPU if no valid
+    setting is provided.
+
+    Two valid settings are supported for now:
+    - `'cuda'`: returned model is on gpu
+    - `'cuda_data_parallel'`: returned model is a `DataParallel` model,
+      and on gpu
+
+    We differentiate between `'cuda'` and `'cuda_data_parallel'` because
+    sometimes we want to test a model that is on gpu, but is *not*
+    wrapped in `DataParallel`.
+    """
+    if gpu_setting == "cuda_data_parallel":
+        return _wrap_model_in_dataparallel(model)
+    elif gpu_setting == "cuda":
+        return model.cuda()
+    else:
+        return model
+
+
+def is_gpu(gpu_setting: Optional[str]) -> bool:
+    """
+    Returns whether the model should be on gpu based on the given `gpu_setting` str.
+    """
+    return gpu_setting == "cuda_data_parallel" or gpu_setting == "cuda"
+
+
 def get_random_model_and_data(
     tmpdir,
     unpack_inputs,
     return_test_data=True,
-    use_gpu=False,
+    gpu_setting: Optional[str] = None,
     return_hessian_data=False,
     model_type="random",
 ):
@@ -330,16 +360,12 @@
     `InfluenceFunctionBase` can be more easily compared, due to lack of
     numerical issues.
 
-    `use_gpu` can either be
-    - `False`: returned model is on cpu
-    - `'cuda'`: returned model is on gpu
-    - `'cuda_data_parallel``: returned model is a `DataParallel` model, and on cpu
-    The need to differentiate between `'cuda'` and `'cuda_data_parallel'`
-    is that sometimes we may want to test a model that is on cpu, but is *not*
-    wrapped in `DataParallel`.
+    `gpu_setting` specifies whether the model is on gpu and whether it is a
+    `DataParallel` model. See the `_adjust_model` helper for more details.
     """
     in_features, hidden_nodes = 5, 4
     num_inputs = 2
+    use_gpu = is_gpu(gpu_setting)
 
     # generate data. regardless the model, the data is always generated the same way
     # the only exception is if the `model_type` is 'trained_linear', i.e. a simple
@@ -367,22 +393,18 @@
         num_checkpoints = 5
 
         for i in range(num_checkpoints):
-            net.linear1.weight.data = torch.normal(
+            net.linear1.weight.data = torch.normal(  # type: ignore
                 3, 4, (hidden_nodes, in_features)
             ).double()
-            net.linear2.weight.data = torch.normal(
+            net.linear2.weight.data = torch.normal(  # type: ignore
                 5, 6, (out_features, hidden_nodes)
             ).double()
             if unpack_inputs:
-                net.pre.weight.data = torch.normal(
+                net.pre.weight.data = torch.normal(  # type: ignore
                     3, 4, (in_features, in_features * num_inputs)
                 ).double()
             checkpoint_name = "-".join(["checkpoint-reg", str(i + 1) + ".pt"])
-            net_adjusted = (
-                _wrap_model_in_dataparallel(net)
-                if use_gpu == "cuda_data_parallel"
-                else (net.to(device="cuda") if use_gpu == "cuda" else net)
-            )
+            net_adjusted = _adjust_model(net, gpu_setting)
             torch.save(net_adjusted.state_dict(), os.path.join(tmpdir, checkpoint_name))
 
     elif model_type == "trained_linear":
@@ -418,12 +440,8 @@
 
         # save that trained parameter as a checkpoint
         checkpoint_name = "checkpoint-final.pt"
-        net.linear.weight.data = theta.contiguous()
-        net_adjusted = (
-            _wrap_model_in_dataparallel(net)
-            if use_gpu == "cuda_data_parallel"
-            else (net.to(device="cuda") if use_gpu == "cuda" else net)
-        )
+        net.linear.weight.data = theta.contiguous()  # type: ignore
+        net_adjusted = _adjust_model(net, gpu_setting)
         torch.save(net_adjusted.state_dict(), os.path.join(tmpdir, checkpoint_name))
 
     elif model_type == "trained_NN":
@@ -433,11 +451,7 @@
             else MultLinearNet(in_features, hidden_nodes, out_features, num_inputs)
         ).double()
 
-        net_adjusted = (
-            _wrap_model_in_dataparallel(net)
-            if use_gpu == "cuda_data_parallel"
-            else (net.to(device="cuda") if use_gpu == "cuda" else net)
-        )
+        net_adjusted = _adjust_model(net, gpu_setting)
 
         # train model using several optimization steps on Hessian data
         batch = next(iter(DataLoader(hessian_dataset, batch_size=len(hessian_dataset))))
@@ -454,9 +468,6 @@
 
         # save that trained parameter as a checkpoint
         checkpoint_name = "checkpoint-final.pt"
-        net_adjusted = (
-            _wrap_model_in_dataparallel(net) if use_gpu == "cuda_data_parallel" else net
-        )
         torch.save(net_adjusted.state_dict(), os.path.join(tmpdir, checkpoint_name))
 
     training_data = (
@@ -680,8 +691,8 @@ def _format_batch_into_tuple(
     return (inputs, targets)
 
 
-USE_GPU_LIST = (
-    [False, "cuda"]
+GPU_SETTING_LIST = (
+    ["", "cuda", "cuda_data_parallel"]
     if torch.cuda.is_available() and torch.cuda.device_count() != 0
-    else [False]
+    else [""]
 )
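
Reviewer note (illustration, not part of the patch): below is a minimal,
runnable sketch of the str-based `gpu_setting` contract this diff introduces
in tests/influence/_utils/common.py. The `adjust_model` function is an
illustrative stand-in for the patch's `_adjust_model` helper (the real one
delegates wrapping to `_wrap_model_in_dataparallel`); the `nn.Linear` toy
model and the printout are hypothetical.

    from typing import Optional

    import torch
    import torch.nn as nn


    def is_gpu(gpu_setting: Optional[str]) -> bool:
        # Replaces the old mixed bool/str `use_gpu` flag: only the two string
        # settings below place the model on GPU; None or "" means CPU.
        return gpu_setting in ("cuda", "cuda_data_parallel")


    def adjust_model(model: nn.Module, gpu_setting: Optional[str]) -> nn.Module:
        # Same dispatch as the `_adjust_model` helper added by this diff;
        # DataParallel wrapping is approximated with nn.DataParallel here.
        if gpu_setting == "cuda_data_parallel":
            return nn.DataParallel(model.cuda())
        if gpu_setting == "cuda":
            return model.cuda()
        return model


    # Mirror of GPU_SETTING_LIST: exercise every setting the host supports.
    gpu_setting_list = (
        ["", "cuda", "cuda_data_parallel"]
        if torch.cuda.is_available() and torch.cuda.device_count() != 0
        else [""]
    )
    for gpu_setting in gpu_setting_list:
        net = adjust_model(nn.Linear(5, 4), gpu_setting)
        # nn.DataParallel prefixes parameter names with "module.", which is
        # why the tests select ["module.linear1"] under "cuda_data_parallel".
        print(repr(gpu_setting), is_gpu(gpu_setting), type(net).__name__)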