diff --git a/tests/influence/_core/test_arnoldi_influence.py b/tests/influence/_core/test_arnoldi_influence.py
index 875c604ba..af31bb233 100644
--- a/tests/influence/_core/test_arnoldi_influence.py
+++ b/tests/influence/_core/test_arnoldi_influence.py
@@ -1,5 +1,5 @@
 import tempfile
-from typing import Callable, List, Tuple, Union
+from typing import Callable, List, Optional, Tuple
 
 import torch
 
@@ -27,8 +27,8 @@
     generate_assymetric_matrix_given_eigenvalues,
     generate_symmetric_matrix_given_eigenvalues,
     get_random_model_and_data,
+    is_gpu,
     UnpackDataset,
-    USE_GPU_LIST,
 )
 from torch import Tensor
 from torch.utils.data import DataLoader
@@ -229,6 +229,17 @@ def _param_matmul(params: Tuple[Tensor]):
             "max",
         )
 
+    # TODO: for some unknown reason, this test and the test below do not work
+    # on the `cuda_data_parallel` setting. We need to investigate why.
+    # Use a local version of the setting list for these two tests for now,
+    # since we have changed the default setting list to include all options.
+    # (This is also used in many other tests, which also need to be unified later.)
+    gpu_setting_list = (
+        ["", "cuda"]
+        if torch.cuda.is_available() and torch.cuda.device_count() != 0
+        else [""]
+    )
+
     @parameterized.expand(
         [
             (
@@ -237,9 +248,9 @@ def _param_matmul(params: Tuple[Tensor]):
                 delta,
                 mode,
                 unpack_inputs,
-                use_gpu,
+                gpu_setting,
             )
-            for use_gpu in USE_GPU_LIST
+            for gpu_setting in gpu_setting_list
             for (influence_constructor_1, influence_constructor_2, delta) in [
                 # compare implementations, when considering only 1 layer
                 (
@@ -247,7 +258,7 @@
                     NaiveInfluenceFunction,
                     layers=(
                         ["module.linear1"]
-                        if use_gpu == "cuda_dataparallel"
+                        if gpu_setting == "cuda_dataparallel"
                         else ["linear1"]
                     ),
                     projection_dim=5,
@@ -258,7 +269,7 @@
                     ArnoldiInfluenceFunction,
                     layers=(
                         ["module.linear1"]
-                        if use_gpu == "cuda_dataparallel"
+                        if gpu_setting == "cuda_dataparallel"
                        else ["linear1"]
                     ),
                     arnoldi_dim=50,
@@ -314,7 +325,7 @@ def test_compare_implementations_trained_NN_model_and_data(
         delta: float,
         mode: str,
         unpack_inputs: bool,
-        use_gpu: Union[bool, str],
+        gpu_setting: Optional[str],
     ) -> None:
         """
         this compares 2 influence implementations on a trained 2-layer NN model.
@@ -329,7 +340,7 @@ def test_compare_implementations_trained_NN_model_and_data(
             delta,
             mode,
             unpack_inputs,
-            use_gpu,
+            gpu_setting,
         )
 
     # this compares `ArnoldiInfluenceFunction` and `NaiveInfluenceFunction` on randomly
@@ -337,6 +348,7 @@
     # can also compare the intermediate quantities. we do not compare with
     # `NaiveInfluence` because on randomly generated data, it is not comparable,
     # conceptually, with the other implementations, due to numerical issues.
+
     @parameterized.expand(
         [
             (
@@ -345,16 +357,16 @@ def test_compare_implementations_trained_NN_model_and_data(
                 delta,
                 mode,
                 unpack_inputs,
-                use_gpu,
+                gpu_setting,
             )
-            for use_gpu in USE_GPU_LIST
+            for gpu_setting in gpu_setting_list
             for (influence_constructor_1, influence_constructor_2, delta) in [
                 (
                     DataInfluenceConstructor(
                         NaiveInfluenceFunction,
                         layers=(
                             ["module.linear1"]
-                            if use_gpu == "cuda_dataparallel"
+                            if gpu_setting == "cuda_dataparallel"
                             else ["linear1"]
                         ),
                         show_progress=False,
@@ -364,7 +376,7 @@
                         ArnoldiInfluenceFunction,
                         layers=(
                             ["module.linear1"]
-                            if use_gpu == "cuda_dataparallel"
+                            if gpu_setting == "cuda_dataparallel"
                             else ["linear1"]
                         ),
                         show_progress=False,
@@ -397,7 +409,7 @@ def test_compare_implementations_random_model_and_data(
         delta: float,
         mode: str,
         unpack_inputs: bool,
-        use_gpu: Union[bool, str],
+        gpu_setting: Optional[str],
     ) -> None:
         """
         this compares 2 influence implementations on a trained 2-layer NN model.
@@ -412,7 +424,7 @@ def test_compare_implementations_random_model_and_data(
             delta,
             mode,
             unpack_inputs,
-            use_gpu,
+            gpu_setting,
         )
 
     def _test_compare_implementations(
@@ -423,7 +435,7 @@ def _test_compare_implementations(
         delta: float,
         mode: str,
         unpack_inputs: bool,
-        use_gpu: Union[bool, str],
+        gpu_setting: Optional[str],
     ) -> None:
         """
         checks that 2 implementations of `InfluenceFunctionBase` return the same
@@ -444,13 +456,14 @@ def _test_compare_implementations(
                 tmpdir,
                 unpack_inputs,
                 return_test_data=True,
-                use_gpu=use_gpu,
+                gpu_setting=gpu_setting,
                 return_hessian_data=True,
                 model_type=model_type,
             )
 
             train_dataset = DataLoader(train_dataset, batch_size=5)
 
+            use_gpu = is_gpu(gpu_setting)
             hessian_dataset = (
                 ExplicitDataset(hessian_samples, hessian_labels, use_gpu)
                 if not unpack_inputs
diff --git a/tests/influence/_core/test_naive_influence.py b/tests/influence/_core/test_naive_influence.py
index b48a1ffaa..ddcb70994 100644
--- a/tests/influence/_core/test_naive_influence.py
+++ b/tests/influence/_core/test_naive_influence.py
@@ -1,5 +1,5 @@
 import tempfile
-from typing import Callable, List, Tuple, Union
+from typing import Callable, List, Optional, Tuple
 
 import torch
 
@@ -21,9 +21,9 @@
     DataInfluenceConstructor,
     ExplicitDataset,
     get_random_model_and_data,
+    is_gpu,
     Linear,
     UnpackDataset,
-    USE_GPU_LIST,
 )
 from torch.utils.data import DataLoader
 
@@ -51,6 +51,17 @@ def test_flatten_unflattener(self, param_shapes: List[Tuple[int, ...]]) -> None:
             mode="max",
         )
 
+    # TODO: for some unknown reason, this test does not work
+    # on the `cuda_data_parallel` setting. We need to investigate why.
+    # Use a local version of the setting list for this test for now,
+    # since we have changed the default setting list to include all options.
+    # (This is also used in many other tests, which also need to be unified later.)
+    gpu_setting_list = (
+        ["", "cuda"]
+        if torch.cuda.is_available() and torch.cuda.device_count() != 0
+        else [""]
+    )
+
     @parameterized.expand(
         [
             (
@@ -59,17 +70,17 @@ def test_flatten_unflattener(self, param_shapes: List[Tuple[int, ...]]) -> None:
                 delta,
                 mode,
                 unpack_inputs,
-                use_gpu,
+                gpu_setting,
             )
             for reduction in ["none", "sum", "mean"]
-            for use_gpu in USE_GPU_LIST
+            for gpu_setting in gpu_setting_list
             for (influence_constructor, delta) in [
                 (
                     DataInfluenceConstructor(
                         NaiveInfluenceFunction,
                         layers=(
                             ["module.linear"]
-                            if use_gpu == "cuda_dataparallel"
+                            if gpu_setting == "cuda_dataparallel"
                             else ["linear"]
                         ),
                         projection_dim=None,
@@ -109,7 +120,7 @@ def test_matches_linear_regression(
         delta: float,
         mode: str,
         unpack_inputs: bool,
-        use_gpu: Union[bool, str],
+        gpu_setting: Optional[str],
     ) -> None:
         """
         this tests that `NaiveInfluence`, the simplest implementation, agree with the
@@ -129,13 +140,14 @@ def test_matches_linear_regression(
                 tmpdir,
                 unpack_inputs,
                 return_test_data=True,
-                use_gpu=use_gpu,
+                gpu_setting=gpu_setting,
                 return_hessian_data=True,
                 model_type="trained_linear",
             )
 
             train_dataset = DataLoader(train_dataset, batch_size=5)
 
+            use_gpu = is_gpu(gpu_setting)
             hessian_dataset = (
                 ExplicitDataset(hessian_samples, hessian_labels, use_gpu)
                 if not unpack_inputs
diff --git a/tests/influence/_core/test_tracin_k_most_influential.py b/tests/influence/_core/test_tracin_k_most_influential.py
index 8d4b38c36..00a62314b 100644
--- a/tests/influence/_core/test_tracin_k_most_influential.py
+++ b/tests/influence/_core/test_tracin_k_most_influential.py
@@ -1,5 +1,5 @@
 import tempfile
-from typing import Callable, Union
+from typing import Callable, Optional
 
 import torch
 import torch.nn as nn
@@ -13,22 +13,17 @@
     build_test_name_func,
     DataInfluenceConstructor,
     get_random_model_and_data,
+    GPU_SETTING_LIST,
+    is_gpu,
 )
 
 
 class TestTracInGetKMostInfluential(BaseTest):
-
-    use_gpu_list = (
-        [False, "cuda", "cuda_data_parallel"]
-        if torch.cuda.is_available() and torch.cuda.device_count() != 0
-        else [False]
-    )
-
     param_list = []
     for batch_size, k in [(4, 7), (7, 4), (40, 5), (5, 40), (40, 45)]:
         for unpack_inputs in [True, False]:
             for proponents in [True, False]:
-                for use_gpu in use_gpu_list:
+                for gpu_setting in GPU_SETTING_LIST:
                     for reduction, constr, aggregate in [
                         (
                             "none",
@@ -51,7 +46,7 @@ class TestTracInGetKMostInfluential(BaseTest):
                                 name="linear2",
                                 layers=(
                                     ["module.linear2"]
-                                    if use_gpu == "cuda_data_parallel"
+                                    if gpu_setting == "cuda_data_parallel"
                                     else ["linear2"]
                                 ),
                             ),
@@ -61,7 +56,7 @@ class TestTracInGetKMostInfluential(BaseTest):
                         if not (
                             "sample_wise_grads_per_batch" in constr.kwargs
                             and constr.kwargs["sample_wise_grads_per_batch"]
-                            and use_gpu
+                            and is_gpu(gpu_setting)
                         ):
                             param_list.append(
                                 (
@@ -71,7 +66,7 @@ class TestTracInGetKMostInfluential(BaseTest):
                                     proponents,
                                     batch_size,
                                     k,
-                                    use_gpu,
+                                    gpu_setting,
                                     aggregate,
                                 )
                             )
@@ -88,7 +83,7 @@ def test_tracin_k_most_influential(
         proponents: bool,
         batch_size: int,
         k: int,
-        use_gpu: Union[bool, str],
+        gpu_setting: Optional[str],
         aggregate: bool,
     ) -> None:
         """
@@ -107,7 +102,7 @@ def test_tracin_k_most_influential(
                 tmpdir,
                 unpack_inputs,
                 True,
-                use_gpu,
+                gpu_setting,
             )
 
             self.assertTrue(isinstance(reduction, str))
diff --git a/tests/influence/_core/test_tracin_self_influence.py b/tests/influence/_core/test_tracin_self_influence.py
index 7af3a3d61..6ea79a071 100644
--- a/tests/influence/_core/test_tracin_self_influence.py
+++ b/tests/influence/_core/test_tracin_self_influence.py
@@ -1,5 +1,5 @@
 import tempfile
-from typing import Callable, Union
+from typing import Callable, Optional
 
 import torch
 import torch.nn as nn
@@ -15,6 +15,8 @@
     build_test_name_func,
     DataInfluenceConstructor,
     get_random_model_and_data,
+    GPU_SETTING_LIST,
+    is_gpu,
 )
 from torch.utils.data import DataLoader
 
@@ -27,14 +29,9 @@ class TestTracInSelfInfluence(BaseTest):
     # implementations separately, because the latter does not support `DataParallel`
 
     # add tests for `TracInCPBase` implementations
 
-    use_gpu_list = (
-        [False, "cuda", "cuda_data_parallel"]
-        if torch.cuda.is_available() and torch.cuda.device_count() != 0
-        else [False]
-    )
     for unpack_inputs in [True, False]:
-        for use_gpu in use_gpu_list:
+        for gpu_setting in GPU_SETTING_LIST:
             for reduction, constructor in [
                 (
                     "none",
@@ -47,7 +44,7 @@ class TestTracInSelfInfluence(BaseTest):
                         name="TracInCP_linear1",
                         layers=(
                             ["module.linear1"]
-                            if use_gpu == "cuda_data_parallel"
+                            if gpu_setting == "cuda_data_parallel"
                             else ["linear1"]
                         ),
                     ),
@@ -59,7 +56,7 @@
                         name="TracInCP_linear1_linear2",
                         layers=(
                             ["module.linear1", "module.linear2"]
-                            if use_gpu == "cuda_data_parallel"
+                            if gpu_setting == "cuda_data_parallel"
                             else ["linear1", "linear2"]
                         ),
                     ),
@@ -88,19 +85,21 @@ class TestTracInSelfInfluence(BaseTest):
                 if not (
                     "sample_wise_grads_per_batch" in constructor.kwargs
                     and constructor.kwargs["sample_wise_grads_per_batch"]
-                    and use_gpu
+                    and is_gpu(gpu_setting)
                 ):
-                    param_list.append((reduction, constructor, unpack_inputs, use_gpu))
+                    param_list.append(
+                        (reduction, constructor, unpack_inputs, gpu_setting)
+                    )
 
     # add tests for `InfluenceFunctionBase` implementations
-    use_gpu_list = (
-        [False, "cuda"]
+    gpu_setting_list = (
+        ["", "cuda"]
         if torch.cuda.is_available() and torch.cuda.device_count() != 0
-        else [False]
+        else [""]
     )
 
     for unpack_inputs in [True, False]:
-        for use_gpu in use_gpu_list:
+        for gpu_setting in gpu_setting_list:
             for reduction, constructor in [
                 (
                     "none",
@@ -115,7 +114,7 @@
                         name="NaiveInfluenceFunction_linear1",
                         layers=(
                             ["module.linear1"]
-                            if use_gpu == "cuda_data_parallel"
+                            if gpu_setting == "cuda_data_parallel"
                             else ["linear1"]
                         ),
                     ),
@@ -134,7 +133,7 @@
                        name="ArnoldiInfluenceFunction_linear1",
                        layers=(
                            ["module.linear1"]
-                            if use_gpu == "cuda_data_parallel"
+                            if gpu_setting == "cuda_data_parallel"
                            else ["linear1"]
                        ),
                    ),
@@ -143,9 +142,11 @@
             if not (
                 "sample_wise_grads_per_batch" in constructor.kwargs
                 and constructor.kwargs["sample_wise_grads_per_batch"]
-                and use_gpu
+                and is_gpu(gpu_setting)
             ):
-                param_list.append((reduction, constructor, unpack_inputs, use_gpu))
+                param_list.append(
+                    (reduction, constructor, unpack_inputs, gpu_setting)
+                )
 
     @parameterized.expand(
         param_list,
@@ -156,7 +157,7 @@ def test_tracin_self_influence(
         reduction: str,
         tracin_constructor: Callable,
         unpack_inputs: bool,
-        use_gpu: Union[bool, str],
+        gpu_setting: Optional[str],
     ) -> None:
         with tempfile.TemporaryDirectory() as tmpdir:
             (
@@ -166,7 +167,7 @@
                 tmpdir,
                 unpack_inputs,
                 False,
-                use_gpu,
+                gpu_setting,
             )
 
             # compute tracin_scores of training data on training data
diff --git a/tests/influence/_core/test_tracin_xor.py b/tests/influence/_core/test_tracin_xor.py
index 9f583245c..093639327 100644
--- a/tests/influence/_core/test_tracin_xor.py
+++ b/tests/influence/_core/test_tracin_xor.py
@@ -23,7 +23,7 @@ class TestTracInXOR(BaseTest):
 
     # TODO: Move test setup to use setUp and tearDown method overrides.
     def _test_tracin_xor_setup(self, tmpdir: str, use_gpu: bool = False):
-        net = BasicLinearNet(2, 2, 1)
+        net = BasicLinearNet(in_features=2, hidden_nodes=2, out_features=1)
 
         state = OrderedDict(
             [
diff --git a/tests/influence/_utils/common.py b/tests/influence/_utils/common.py
index e8c9d9c9a..0f825d79d 100644
--- a/tests/influence/_utils/common.py
+++ b/tests/influence/_utils/common.py
@@ -293,11 +293,41 @@ def get_random_data(
     return (train_dataset, hessian_dataset, test_dataset)
 
 
+def _adjust_model(model: Module, gpu_setting: Optional[str]) -> Module:
+    """
+    Given a model, returns the model on GPU based on the provided `gpu_setting`,
+    or the original model on CPU if no valid setting is provided.
+
+    Two valid settings are supported for now:
+    - `'cuda'`: the returned model is on GPU
+    - `'cuda_data_parallel'`: the returned model is a `DataParallel` model,
+      and on GPU
+
+    The need to differentiate between `'cuda'` and `'cuda_data_parallel'`
+    is that sometimes we may want to test a model that is on GPU, but is *not*
+    wrapped in `DataParallel`.
+    """
+    if gpu_setting == "cuda_data_parallel":
+        return _wrap_model_in_dataparallel(model)
+    elif gpu_setting == "cuda":
+        return model.cuda()
+    else:
+        return model
+
+
+def is_gpu(gpu_setting: Optional[str]) -> bool:
+    """
+    Returns whether the model should be on GPU based on the given `gpu_setting` str.
+    """
+    return gpu_setting == "cuda_data_parallel" or gpu_setting == "cuda"
+
+
 def get_random_model_and_data(
     tmpdir,
     unpack_inputs,
     return_test_data=True,
-    use_gpu=False,
+    gpu_setting: Optional[str] = None,
     return_hessian_data=False,
     model_type="random",
 ):
@@ -330,16 +360,12 @@ def get_random_model_and_data(
     `InfluenceFunctionBase` can be more easily compared, due to lack of numerical
     issues.
 
-    `use_gpu` can either be
-    - `False`: returned model is on cpu
-    - `'cuda'`: returned model is on gpu
-    - `'cuda_data_parallel``: returned model is a `DataParallel` model, and on cpu
-    The need to differentiate between `'cuda'` and `'cuda_data_parallel'`
-    is that sometimes we may want to test a model that is on cpu, but is *not*
-    wrapped in `DataParallel`.
+    `gpu_setting` specifies whether the model is on GPU and whether it is a
+    `DataParallel` model. See the `_adjust_model` API for more details.
     """
     in_features, hidden_nodes = 5, 4
     num_inputs = 2
+    use_gpu = is_gpu(gpu_setting)
 
     # generate data. regardless the model, the data is always generated the same way
     # the only exception is if the `model_type` is 'trained_linear', i.e. a simple
@@ -367,22 +393,18 @@ def get_random_model_and_data(
 
         num_checkpoints = 5
         for i in range(num_checkpoints):
-            net.linear1.weight.data = torch.normal(
+            net.linear1.weight.data = torch.normal(  # type: ignore
                 3, 4, (hidden_nodes, in_features)
             ).double()
-            net.linear2.weight.data = torch.normal(
+            net.linear2.weight.data = torch.normal(  # type: ignore
                 5, 6, (out_features, hidden_nodes)
             ).double()
             if unpack_inputs:
-                net.pre.weight.data = torch.normal(
+                net.pre.weight.data = torch.normal(  # type: ignore
                     3, 4, (in_features, in_features * num_inputs)
                 ).double()
             checkpoint_name = "-".join(["checkpoint-reg", str(i + 1) + ".pt"])
-            net_adjusted = (
-                _wrap_model_in_dataparallel(net)
-                if use_gpu == "cuda_data_parallel"
-                else (net.to(device="cuda") if use_gpu == "cuda" else net)
-            )
+            net_adjusted = _adjust_model(net, gpu_setting)
             torch.save(net_adjusted.state_dict(), os.path.join(tmpdir, checkpoint_name))
 
     elif model_type == "trained_linear":
@@ -418,12 +440,8 @@ def get_random_model_and_data(
 
         # save that trained parameter as a checkpoint
         checkpoint_name = "checkpoint-final.pt"
-        net.linear.weight.data = theta.contiguous()
-        net_adjusted = (
-            _wrap_model_in_dataparallel(net)
-            if use_gpu == "cuda_data_parallel"
-            else (net.to(device="cuda") if use_gpu == "cuda" else net)
-        )
+        net.linear.weight.data = theta.contiguous()  # type: ignore
+        net_adjusted = _adjust_model(net, gpu_setting)
         torch.save(net_adjusted.state_dict(), os.path.join(tmpdir, checkpoint_name))
 
     elif model_type == "trained_NN":
@@ -433,11 +451,7 @@ def get_random_model_and_data(
             else MultLinearNet(in_features, hidden_nodes, out_features, num_inputs)
         ).double()
 
-        net_adjusted = (
-            _wrap_model_in_dataparallel(net)
-            if use_gpu == "cuda_data_parallel"
-            else (net.to(device="cuda") if use_gpu == "cuda" else net)
-        )
+        net_adjusted = _adjust_model(net, gpu_setting)
 
         # train model using several optimization steps on Hessian data
         batch = next(iter(DataLoader(hessian_dataset, batch_size=len(hessian_dataset))))
@@ -454,9 +468,6 @@ def get_random_model_and_data(
 
         # save that trained parameter as a checkpoint
         checkpoint_name = "checkpoint-final.pt"
-        net_adjusted = (
-            _wrap_model_in_dataparallel(net) if use_gpu == "cuda_data_parallel" else net
-        )
         torch.save(net_adjusted.state_dict(), os.path.join(tmpdir, checkpoint_name))
 
         training_data = (
@@ -680,8 +691,8 @@ def _format_batch_into_tuple(
         return (inputs, targets)
 
 
-USE_GPU_LIST = (
-    [False, "cuda"]
+GPU_SETTING_LIST = (
+    ["", "cuda", "cuda_data_parallel"]
     if torch.cuda.is_available() and torch.cuda.device_count() != 0
-    else [False]
+    else [""]
 )
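
Reviewer note on the diff above: the refactor replaces the boolean-or-string `use_gpu` flag with a single `Optional[str]` `gpu_setting` (`""`/`None` for CPU, `"cuda"` for a plain GPU model, `"cuda_data_parallel"` for a `DataParallel`-wrapped GPU model). The following minimal, self-contained sketch illustrates that convention by mirroring the helpers added to `tests/influence/_utils/common.py`; the body of `_wrap_model_in_dataparallel` is not shown in the diff, so the one-liner below is an assumption about what that existing helper does.

```python
from typing import Optional

import torch
import torch.nn as nn


def _wrap_model_in_dataparallel(model: nn.Module) -> nn.Module:
    # Assumed stand-in for the existing helper in tests/influence/_utils/common.py:
    # move the model to GPU and wrap it in DataParallel.
    return nn.DataParallel(model.cuda())


def _adjust_model(model: nn.Module, gpu_setting: Optional[str]) -> nn.Module:
    # Mirrors the helper added in the diff: "cuda_data_parallel" -> DataParallel
    # on GPU, "cuda" -> plain GPU, anything else (None or "") -> CPU.
    if gpu_setting == "cuda_data_parallel":
        return _wrap_model_in_dataparallel(model)
    elif gpu_setting == "cuda":
        return model.cuda()
    return model


def is_gpu(gpu_setting: Optional[str]) -> bool:
    # Mirrors the helper added in the diff: both GPU settings imply a GPU model.
    return gpu_setting == "cuda_data_parallel" or gpu_setting == "cuda"


# Mirrors GPU_SETTING_LIST from the diff: enumerate all settings on a GPU
# machine, and only the CPU setting otherwise.
GPU_SETTING_LIST = (
    ["", "cuda", "cuda_data_parallel"]
    if torch.cuda.is_available() and torch.cuda.device_count() != 0
    else [""]
)

if __name__ == "__main__":
    # Usage as in the parameterized tests: sweep the settings, adjust a model,
    # and derive the boolean the datasets still need.
    for gpu_setting in GPU_SETTING_LIST:
        model = _adjust_model(nn.Linear(5, 4), gpu_setting)
        print(
            repr(gpu_setting),
            "->",
            "gpu" if is_gpu(gpu_setting) else "cpu",
            type(model).__name__,
        )
```

Collapsing the setting into one string lets `GPU_SETTING_LIST` enumerate every supported configuration in one place, while tests that cannot yet run under `DataParallel` (such as the two Arnoldi/Naive comparisons above) can substitute a local `["", "cuda"]` list or filter with `is_gpu`.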