diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index ece3dd8..a548c0f 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -98,3 +98,45 @@ jobs:
           pytest -v tests/test_mp_batch_norm.py
           pytest -v tests/test_optimizer_distribute.py
           pytest -v tests/test_model_distribute.py
+
+
+  tf-compatibility:
+    needs: build
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-20.04]
+        python-version: ["3.6"]
+        tf-version: [2.2.0, 2.3.0, 2.4.0, 2.5.0, 2.6.2]
+
+    steps:
+      - uses: actions/checkout@v1
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: pip install wheel setuptools flake8 pytest-cov
+
+      - name: Install tensorflow-datasets
+        run: |
+          pip install tensorflow==${{ matrix.tf-version }} "tensorflow-datasets<=4.8.2"
+          pip install "protobuf<=3.20" --force-reinstall
+
+      - name: Download artifact
+        uses: actions/download-artifact@master
+        with:
+          name: "Python wheel"
+
+      - name: Install wheel
+        run: pip install --find-links=${{github.workspace}} gradient_accumulator
+
+      - name: Debug pip deps
+        run: pip list
+
+      - name: Test library accessibility
+        run: python -c "from gradient_accumulator import GradientAccumulateModel, GradientAccumulateOptimizer"
+
+      - name: Run tests
+        run: pytest -v tests/test_model_expected_result.py
diff --git a/tests/test_adaptive_gradient_clipping.py b/tests/test_adaptive_gradient_clipping.py
index 9346b18..d68a8ea 100644
--- a/tests/test_adaptive_gradient_clipping.py
+++ b/tests/test_adaptive_gradient_clipping.py
@@ -1,16 +1,24 @@
+import os
+
 import tensorflow as tf
 import tensorflow_datasets as tfds
+from tensorflow.keras import mixed_precision
 from tensorflow.keras.models import load_model
+
 from gradient_accumulator import GradientAccumulateModel
 from gradient_accumulator import unitwise_norm
-from tensorflow.keras import mixed_precision
-import os
+
 from .utils import normalize_img
 
 
 def test_unitwise_norm():
     for i in range(7):
-        x = tf.zeros([1,] * i)
+        x = tf.zeros(
+            [
+                1,
+            ]
+            * i
+        )
         try:
             unitwise_norm(x)
         except ValueError as e:
@@ -22,8 +30,8 @@ def test_unitwise_norm():
 def test_train_mnist():
     # load dataset
     (ds_train, ds_test), ds_info = tfds.load(
-        'mnist',
-        split=['train', 'test'],
+        "mnist",
+        split=["train", "test"],
         shuffle_files=True,
         as_supervised=True,
         with_info=True,
@@ -35,7 +43,7 @@ def test_train_mnist():
     # build train pipeline
     ds_train = ds_train.map(normalize_img)
     ds_train = ds_train.cache()
-    ds_train = ds_train.shuffle(ds_info.splits['train'].num_examples)
+    ds_train = ds_train.shuffle(ds_info.splits["train"].num_examples)
     ds_train = ds_train.batch(100) # multiplum of 8
     ds_train = ds_train.prefetch(1)
 
@@ -46,14 +54,24 @@ def test_train_mnist():
     ds_test = ds_test.prefetch(1)
 
     # create model
-    model = tf.keras.models.Sequential([
-        tf.keras.layers.Flatten(input_shape=(28, 28)),
-        tf.keras.layers.Dense(32, activation='relu'), # 32 multiplum of 8
-        tf.keras.layers.Dense(10, dtype='float32') # output not numerically stable with float16
-    ])
+    model = tf.keras.models.Sequential(
+        [
+            tf.keras.layers.Flatten(input_shape=(28, 28)),
+            tf.keras.layers.Dense(32, activation="relu"),  # 32 is a multiple of 8
+            tf.keras.layers.Dense(
+                10, dtype="float32"
+            ),  # output not numerically stable with float16
+        ]
+    )
 
     # wrap model to use gradient accumulation
-    model = GradientAccumulateModel(accum_steps=4, mixed_precision=False, use_agc=True, inputs=model.input, outputs=model.output)
+    model = 
GradientAccumulateModel( + accum_steps=4, + mixed_precision=False, + use_agc=True, + inputs=model.input, + outputs=model.output, + ) # need to scale optimizer for mixed precision opt = tf.keras.optimizers.SGD(1e-2) diff --git a/tests/test_batch_norm.py b/tests/test_batch_norm.py index 3ed2ada..91dba0c 100644 --- a/tests/test_batch_norm.py +++ b/tests/test_batch_norm.py @@ -1,19 +1,25 @@ +import os +import random as python_random + +import numpy as np import tensorflow as tf import tensorflow_datasets as tfds from tensorflow.keras.models import load_model + from gradient_accumulator import GradientAccumulateModel from gradient_accumulator.layers import AccumBatchNormalization -import random as python_random -import numpy as np -import os -from .utils import reset, normalize_img + +from .utils import normalize_img +from .utils import reset -def run_experiment(custom_bn:bool = True, bs:int = 100, accum_steps:int = 1, epochs:int = 3): +def run_experiment( + custom_bn: bool = True, bs: int = 100, accum_steps: int = 1, epochs: int = 3 +): # load dataset (ds_train, ds_test), ds_info = tfds.load( - 'mnist', - split=['train', 'test'], + "mnist", + split=["train", "test"], shuffle_files=True, as_supervised=True, with_info=True, @@ -21,7 +27,7 @@ def run_experiment(custom_bn:bool = True, bs:int = 100, accum_steps:int = 1, epo # build train pipeline ds_train = ds_train.map(normalize_img) - ds_train = ds_train.shuffle(ds_info.splits['train'].num_examples) + ds_train = ds_train.shuffle(ds_info.splits["train"].num_examples) ds_train = ds_train.batch(bs) ds_train = ds_train.prefetch(1) @@ -39,17 +45,21 @@ def run_experiment(custom_bn:bool = True, bs:int = 100, accum_steps:int = 1, epo normalization_layer = tf.keras.layers.Activation("linear") # create model - model = tf.keras.models.Sequential([ - tf.keras.layers.Flatten(input_shape=(28, 28)), - tf.keras.layers.Dense(32), - normalization_layer, # @TODO: BN before or after ReLU? Leads to different performance - tf.keras.layers.Activation("relu"), - tf.keras.layers.Dense(10) - ]) + model = tf.keras.models.Sequential( + [ + tf.keras.layers.Flatten(input_shape=(28, 28)), + tf.keras.layers.Dense(32), + normalization_layer, # @TODO: BN before or after ReLU? 
Leads to different performance + tf.keras.layers.Activation("relu"), + tf.keras.layers.Dense(10), + ] + ) # wrap model to use gradient accumulation if accum_steps > 1: - model = GradientAccumulateModel(accum_steps=accum_steps, inputs=model.input, outputs=model.output) + model = GradientAccumulateModel( + accum_steps=accum_steps, inputs=model.input, outputs=model.output + ) # compile model model.compile( @@ -79,10 +89,10 @@ def run_experiment(custom_bn:bool = True, bs:int = 100, accum_steps:int = 1, epo def test_compare_bn_layers(): # set seed reset() - + # custom BN without accum result1 = run_experiment(custom_bn=True, accum_steps=1, epochs=3)[1] - + # reset before second run to get "identical" results reset() @@ -98,10 +108,10 @@ def test_compare_bn_layers(): def test_compare_accum_bn_expected_result(): # set seed reset() - + # custom BN without accum result1 = run_experiment(custom_bn=True, accum_steps=4, bs=25)[1] - + # reset before second run to get "identical" results reset() diff --git a/tests/test_bn_convnd.py b/tests/test_bn_convnd.py index e7279c4..fe256d7 100644 --- a/tests/test_bn_convnd.py +++ b/tests/test_bn_convnd.py @@ -1,11 +1,14 @@ +import numpy as np import tensorflow as tf from tensorflow.keras.models import load_model + from gradient_accumulator import GradientAccumulateModel from gradient_accumulator.layers import AccumBatchNormalization -import numpy as np -def test_bn_conv2d(custom_bn:bool = True, accum_steps:int = 1, epochs:int = 1): +def test_bn_conv2d( + custom_bn: bool = True, accum_steps: int = 1, epochs: int = 1 +): # make toy dataset data = np.random.randint(2, size=(16, 8, 8, 1)) gt = np.expand_dims(np.random.randint(2, size=16), axis=-1) @@ -19,20 +22,24 @@ def test_bn_conv2d(custom_bn:bool = True, accum_steps:int = 1, epochs:int = 1): normalization_layer = tf.keras.layers.Activation("linear") # create model - model = tf.keras.models.Sequential([ - tf.keras.layers.Conv2D(4, 3, input_shape=(8, 8, 1)), - normalization_layer, - tf.keras.layers.Activation("relu"), - tf.keras.layers.Flatten(), - tf.keras.layers.Dense(4), - normalization_layer, # @TODO: BN before or after ReLU? Leads to different performance - tf.keras.layers.Activation("relu"), - tf.keras.layers.Dense(1, activation="sigmoid"), - ]) + model = tf.keras.models.Sequential( + [ + tf.keras.layers.Conv2D(4, 3, input_shape=(8, 8, 1)), + normalization_layer, + tf.keras.layers.Activation("relu"), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(4), + normalization_layer, # @TODO: BN before or after ReLU? 
Leads to different performance + tf.keras.layers.Activation("relu"), + tf.keras.layers.Dense(1, activation="sigmoid"), + ] + ) # wrap model to use gradient accumulation if accum_steps > 1: - model = GradientAccumulateModel(accum_steps=accum_steps, inputs=model.input, outputs=model.output) + model = GradientAccumulateModel( + accum_steps=accum_steps, inputs=model.input, outputs=model.output + ) # compile model model.compile( @@ -60,7 +67,9 @@ def test_bn_conv2d(custom_bn:bool = True, accum_steps:int = 1, epochs:int = 1): return result -def test_bn_conv3d(custom_bn:bool = True, accum_steps:int = 1, epochs:int = 1): +def test_bn_conv3d( + custom_bn: bool = True, accum_steps: int = 1, epochs: int = 1 +): # make toy dataset data = np.random.randint(2, size=(16, 8, 8, 8, 1)) gt = np.expand_dims(np.random.randint(2, size=16), axis=-1) @@ -74,20 +83,24 @@ def test_bn_conv3d(custom_bn:bool = True, accum_steps:int = 1, epochs:int = 1): normalization_layer = tf.keras.layers.Activation("linear") # create model - model = tf.keras.models.Sequential([ - tf.keras.layers.Conv3D(4, 3, input_shape=(8, 8, 8, 1)), - normalization_layer, - tf.keras.layers.Activation("relu"), - tf.keras.layers.Flatten(), - tf.keras.layers.Dense(4), - normalization_layer, # @TODO: BN before or after ReLU? Leads to different performance - tf.keras.layers.Activation("relu"), - tf.keras.layers.Dense(1, activation="sigmoid"), - ]) + model = tf.keras.models.Sequential( + [ + tf.keras.layers.Conv3D(4, 3, input_shape=(8, 8, 8, 1)), + normalization_layer, + tf.keras.layers.Activation("relu"), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(4), + normalization_layer, # @TODO: BN before or after ReLU? Leads to different performance + tf.keras.layers.Activation("relu"), + tf.keras.layers.Dense(1, activation="sigmoid"), + ] + ) # wrap model to use gradient accumulation if accum_steps > 1: - model = GradientAccumulateModel(accum_steps=accum_steps, inputs=model.input, outputs=model.output) + model = GradientAccumulateModel( + accum_steps=accum_steps, inputs=model.input, outputs=model.output + ) # compile model model.compile( diff --git a/tests/test_expected_result.py b/tests/test_expected_result.py index e226f63..ddcf0de 100644 --- a/tests/test_expected_result.py +++ b/tests/test_expected_result.py @@ -1,12 +1,17 @@ +import os +import random as python_random + import numpy as np import tensorflow as tf -import random as python_random -import os -from .utils import get_opt, normalize_img, reset import tensorflow_datasets as tfds from tensorflow.keras.models import load_model -from gradient_accumulator import GradientAccumulateModel, GradientAccumulateOptimizer +from gradient_accumulator import GradientAccumulateModel +from gradient_accumulator import GradientAccumulateOptimizer + +from .utils import get_opt +from .utils import normalize_img +from .utils import reset # get current tf minor version tf_version = int(tf.version.VERSION.split(".")[1]) @@ -15,8 +20,8 @@ def run_experiment(bs=50, accum_steps=2, epochs=1, modeloropt="opt"): # load dataset (ds_train, ds_test), ds_info = tfds.load( - 'mnist', - split=['train', 'test'], + "mnist", + split=["train", "test"], shuffle_files=True, as_supervised=True, with_info=True, @@ -35,7 +40,7 @@ def run_experiment(bs=50, accum_steps=2, epochs=1, modeloropt="opt"): # create model input = tf.keras.layers.Input(shape=(28, 28)) x = tf.keras.layers.Flatten(input_shape=(28, 28))(input) - x = tf.keras.layers.Dense(128, activation='relu')(x) + x = tf.keras.layers.Dense(128, activation="relu")(x) output = 
tf.keras.layers.Dense(10)(x) opt = get_opt(opt_name="SGD", tf_version=tf_version) @@ -45,14 +50,16 @@ def run_experiment(bs=50, accum_steps=2, epochs=1, modeloropt="opt"): else: if modeloropt == "model": # wrap model to use gradient accumulation - model = GradientAccumulateModel(accum_steps=accum_steps, inputs=input, outputs=output) + model = GradientAccumulateModel( + accum_steps=accum_steps, inputs=input, outputs=output + ) else: # wrap optimizer to use gradient accumulation opt = GradientAccumulateOptimizer(opt, accum_steps=accum_steps) # compile model model = tf.keras.Model(inputs=input, outputs=output) - + # compile model model.compile( optimizer=opt, @@ -91,7 +98,7 @@ def test_expected_result(): # run again with different batch size and number of accumulations result2 = run_experiment(bs=50, accum_steps=2, epochs=2, modeloropt="opt") - + # reset again reset() diff --git a/tests/test_mixed_precision.py b/tests/test_mixed_precision.py index a39b745..d24d50a 100644 --- a/tests/test_mixed_precision.py +++ b/tests/test_mixed_precision.py @@ -2,24 +2,26 @@ def run_experiment(): + import os + import tensorflow as tf import tensorflow_datasets as tfds from tensorflow.keras import mixed_precision + from gradient_accumulator import GradientAccumulateModel - from .utils import normalize_img - import os + from .utils import normalize_img # disable GPU os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # set mixed global precision policy - mixed_precision.set_global_policy('mixed_float16') + mixed_precision.set_global_policy("mixed_float16") # load dataset (ds_train, ds_test), ds_info = tfds.load( - 'mnist', - split=['train', 'test'], + "mnist", + split=["train", "test"], shuffle_files=True, as_supervised=True, with_info=True, @@ -28,8 +30,10 @@ def run_experiment(): # build train pipeline ds_train = ds_train.map(normalize_img) ds_train = ds_train.cache() - ds_train = ds_train.shuffle(ds_info.splits['train'].num_examples) - ds_train = ds_train.batch(32) # multiplum of 8 on GPU to maximize performance + ds_train = ds_train.shuffle(ds_info.splits["train"].num_examples) + ds_train = ds_train.batch( + 32 + ) # multiplum of 8 on GPU to maximize performance ds_train = ds_train.prefetch(1) # build test pipeline @@ -39,14 +43,23 @@ def run_experiment(): ds_test = ds_test.prefetch(1) # create model - model = tf.keras.models.Sequential([ - tf.keras.layers.Flatten(input_shape=(28, 28)), - tf.keras.layers.Dense(32, activation='relu'), # 32 multiplum of 8 - tf.keras.layers.Dense(10, dtype='float32') # output not numerically stable with float16 - ]) + model = tf.keras.models.Sequential( + [ + tf.keras.layers.Flatten(input_shape=(28, 28)), + tf.keras.layers.Dense(32, activation="relu"), # 32 multiplum of 8 + tf.keras.layers.Dense( + 10, dtype="float32" + ), # output not numerically stable with float16 + ] + ) # wrap model to use gradient accumulation - model = GradientAccumulateModel(accum_steps=4, mixed_precision=True, inputs=model.input, outputs=model.output) + model = GradientAccumulateModel( + accum_steps=4, + mixed_precision=True, + inputs=model.input, + outputs=model.output, + ) # need to scale optimizer for mixed precision opt = tf.keras.optimizers.Adam(1e-3) @@ -65,7 +78,7 @@ def run_experiment(): epochs=1, validation_data=ds_test, ) - + # save model on disk model.save("./trained_model") @@ -86,9 +99,11 @@ def test_mixed_precision(): pass else: cleanup_on_sigterm() - + try: - mp.set_start_method('spawn', force=True) # set start method to 'spawn' BEFORE instantiating the queue and the event + mp.set_start_method( 
+ "spawn", force=True + ) # set start method to 'spawn' BEFORE instantiating the queue and the event except RuntimeError: pass diff --git a/tests/test_model_distribute.py b/tests/test_model_distribute.py index 9bd3e86..b47d75d 100644 --- a/tests/test_model_distribute.py +++ b/tests/test_model_distribute.py @@ -1,7 +1,9 @@ import tensorflow as tf import tensorflow_datasets as tfds from tensorflow.keras.models import load_model + from gradient_accumulator import GradientAccumulateModel + from .utils import get_opt @@ -10,15 +12,15 @@ def test_model_distribute(): # load dataset (ds_train, ds_test), ds_info = tfds.load( - 'mnist', - split=['train', 'test'], + "mnist", + split=["train", "test"], shuffle_files=True, as_supervised=True, with_info=True, ) # build train pipeline - ds_train = ds_train.shuffle(ds_info.splits['train'].num_examples) + ds_train = ds_train.shuffle(ds_info.splits["train"].num_examples) ds_train = ds_train.batch(100) ds_train = ds_train.prefetch(1) @@ -28,14 +30,18 @@ def test_model_distribute(): with strategy.scope(): # create model - model = tf.keras.models.Sequential([ - tf.keras.layers.Flatten(input_shape=(28, 28)), - tf.keras.layers.Dense(16, activation='relu'), - tf.keras.layers.Dense(10) - ]) + model = tf.keras.models.Sequential( + [ + tf.keras.layers.Flatten(input_shape=(28, 28)), + tf.keras.layers.Dense(16, activation="relu"), + tf.keras.layers.Dense(10), + ] + ) model = GradientAccumulateModel( - accum_steps=4, inputs=model.input, - outputs=model.output, experimental_distributed_support=True, + accum_steps=4, + inputs=model.input, + outputs=model.output, + experimental_distributed_support=True, ) # define optimizer - currently only SGD compatible with GAOptimizerWrapper @@ -44,17 +50,14 @@ def test_model_distribute(): # compile model model.compile( optimizer=opt, - loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), + loss=tf.keras.losses.SparseCategoricalCrossentropy( + from_logits=True + ), metrics=[tf.keras.metrics.SparseCategoricalAccuracy()], ) # train model - model.fit( - ds_train, - epochs=3, - validation_data=ds_test, - verbose=1 - ) + model.fit(ds_train, epochs=3, validation_data=ds_test, verbose=1) model.save("./trained_model") diff --git a/tests/test_model_expected_result.py b/tests/test_model_expected_result.py new file mode 100644 index 0000000..81b67dc --- /dev/null +++ b/tests/test_model_expected_result.py @@ -0,0 +1,37 @@ +import numpy as np +import tensorflow as tf +from tensorflow.keras.models import load_model + +from gradient_accumulator import GradientAccumulateModel +from gradient_accumulator import GradientAccumulateOptimizer + +from .utils import get_opt +from .utils import normalize_img +from .utils import reset +from .utils import run_experiment + +# get current tf minor version +tf_version = int(tf.version.VERSION.split(".")[1]) + + +def test_model_expected_result(): + # set seed + reset() + + # run once + result1 = run_experiment( + bs=100, accum_steps=1, epochs=2, modeloropt="model" + ) + + # reset before second run to get identical results + reset() + + # test with model wrapper instead + result2 = run_experiment(bs=50, accum_steps=2, epochs=2, modeloropt="model") + + # results should be identical (theoretically, even in practice on CPU) + if tf_version <= 10: + assert result1 == result2 + else: + # approximation worse for tf >= 2.11 + np.testing.assert_almost_equal(result1, result2, decimal=2) diff --git a/tests/test_mp_batch_norm.py b/tests/test_mp_batch_norm.py index 79e5df7..50bec5c 100644 --- 
a/tests/test_mp_batch_norm.py +++ b/tests/test_mp_batch_norm.py @@ -1,21 +1,31 @@ import multiprocessing as mp -def run_experiment(custom_bn:bool = True, bs:int = 100, accum_steps:int = 1, epochs:int = 3, queue=None, mixed_precision_flag=True): +def run_experiment( + custom_bn: bool = True, + bs: int = 100, + accum_steps: int = 1, + epochs: int = 3, + queue=None, + mixed_precision_flag=True, +): + import os + import random as python_random + + import numpy as np import tensorflow as tf import tensorflow_datasets as tfds from tensorflow.keras import mixed_precision from tensorflow.keras.models import load_model + from gradient_accumulator import GradientAccumulateModel from gradient_accumulator.layers import AccumBatchNormalization - import random as python_random - import numpy as np - import os - from .utils import normalize_img, get_opt + from .utils import get_opt + from .utils import normalize_img ## reset session and seed stuff before running experiment - os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" # disable GPU os.environ["CUDA_VISIBLE_DEVICES"] = "-1" @@ -39,12 +49,12 @@ def run_experiment(custom_bn:bool = True, bs:int = 100, accum_steps:int = 1, epo # set mixed global precision policy if mixed_precision_flag: - mixed_precision.set_global_policy('mixed_float16') + mixed_precision.set_global_policy("mixed_float16") # load dataset (ds_train, ds_test), ds_info = tfds.load( - 'mnist', - split=['train', 'test'], + "mnist", + split=["train", "test"], shuffle_files=True, as_supervised=True, with_info=True, @@ -52,7 +62,7 @@ def run_experiment(custom_bn:bool = True, bs:int = 100, accum_steps:int = 1, epo # build train pipeline ds_train = ds_train.map(normalize_img) - ds_train = ds_train.shuffle(ds_info.splits['train'].num_examples) + ds_train = ds_train.shuffle(ds_info.splits["train"].num_examples) ds_train = ds_train.batch(bs) ds_train = ds_train.prefetch(1) @@ -70,19 +80,23 @@ def run_experiment(custom_bn:bool = True, bs:int = 100, accum_steps:int = 1, epo normalization_layer = tf.keras.layers.Activation("linear") # create model - model = tf.keras.models.Sequential([ - tf.keras.layers.Flatten(input_shape=(28, 28)), - tf.keras.layers.Dense(10), - normalization_layer, # @TODO: BN before or after ReLU? Leads to different performance - tf.keras.layers.Activation("relu"), - tf.keras.layers.Dense(10, dtype=tf.float32) - ]) + model = tf.keras.models.Sequential( + [ + tf.keras.layers.Flatten(input_shape=(28, 28)), + tf.keras.layers.Dense(10), + normalization_layer, # @TODO: BN before or after ReLU? 
Leads to different performance + tf.keras.layers.Activation("relu"), + tf.keras.layers.Dense(10, dtype=tf.float32), + ] + ) # wrap model to use gradient accumulation if accum_steps > 1: model = GradientAccumulateModel( - accum_steps=accum_steps, mixed_precision=mixed_precision_flag, - inputs=model.input, outputs=model.output + accum_steps=accum_steps, + mixed_precision=mixed_precision_flag, + inputs=model.input, + outputs=model.output, ) # need to scale optimizer for mixed precision @@ -117,7 +131,9 @@ def run_experiment(custom_bn:bool = True, bs:int = 100, accum_steps:int = 1, epo queue.put(result) -def run_experiment_wrapper(custom_bn=True, bs=100, accum_steps=1, epochs=3, mixed_precision=True): +def run_experiment_wrapper( + custom_bn=True, bs=100, accum_steps=1, epochs=3, mixed_precision=True +): # launch experiment in separate process, as we are enabling mixed precision # which will impact other unit tests, unless we do this try: @@ -126,19 +142,29 @@ def run_experiment_wrapper(custom_bn=True, bs=100, accum_steps=1, epochs=3, mixe pass else: cleanup_on_sigterm() - + try: - mp.set_start_method('spawn', force=True) # set start method to 'spawn' BEFORE instantiating the queue and the event + mp.set_start_method( + "spawn", force=True + ) # set start method to 'spawn' BEFORE instantiating the queue and the event except RuntimeError: pass - + queue = mp.Queue() - p = mp.Process(target=run_experiment(custom_bn=custom_bn, bs=bs, accum_steps=accum_steps, epochs=epochs, queue=queue)) + p = mp.Process( + target=run_experiment( + custom_bn=custom_bn, + bs=bs, + accum_steps=accum_steps, + epochs=epochs, + queue=queue, + ) + ) try: p.start() finally: p.join() # necessary so that the Process exists before the test suite exits (thus coverage is collected) - + return queue.get() @@ -146,19 +172,26 @@ def test_mixed_precision(): import numpy as np # custom BN without accum - result1 = run_experiment_wrapper(custom_bn=True, accum_steps=4, bs=25, mixed_precision=False)[1] + result1 = run_experiment_wrapper( + custom_bn=True, accum_steps=4, bs=25, mixed_precision=False + )[1] # keras BN without accum - result2 = run_experiment_wrapper(custom_bn=True, accum_steps=1, bs=100, mixed_precision=False)[1] + result2 = run_experiment_wrapper( + custom_bn=True, accum_steps=1, bs=100, mixed_precision=False + )[1] # assert result1 == result2 np.testing.assert_almost_equal(result1, result2, decimal=2) - # custom BN with accum with mixed precision - result3 = run_experiment_wrapper(custom_bn=True, accum_steps=4, bs=25, mixed_precision=True)[1] + result3 = run_experiment_wrapper( + custom_bn=True, accum_steps=4, bs=25, mixed_precision=True + )[1] # keras BN without accum - result4 = run_experiment_wrapper(custom_bn=True, accum_steps=1, bs=100, mixed_precision=True)[1] + result4 = run_experiment_wrapper( + custom_bn=True, accum_steps=1, bs=100, mixed_precision=True + )[1] np.testing.assert_almost_equal(result3, result4, decimal=2) diff --git a/tests/test_multitask.py b/tests/test_multitask.py index 26a968f..311d3d1 100644 --- a/tests/test_multitask.py +++ b/tests/test_multitask.py @@ -1,13 +1,23 @@ +import os +import random as python_random + import numpy as np import tensorflow as tf -import random as python_random -import os import tensorflow_datasets as tfds -from tensorflow.keras.models import Model, load_model +from tensorflow.keras.layers import Activation +from tensorflow.keras.layers import Conv2D +from tensorflow.keras.layers import Dense +from tensorflow.keras.layers import Flatten +from 
tensorflow.keras.layers import Input +from tensorflow.keras.layers import MaxPooling2D +from tensorflow.keras.layers import UpSampling2D +from tensorflow.keras.models import Model +from tensorflow.keras.models import load_model + from gradient_accumulator import GradientAccumulateModel -from tensorflow.keras.layers import Input, Dense, Flatten, Conv2D, UpSampling2D,\ - MaxPooling2D, Activation -from .utils import normalize_img, reset + +from .utils import normalize_img +from .utils import reset def create_multi_input_output(image, label): @@ -17,8 +27,8 @@ def create_multi_input_output(image, label): def run_experiment(bs=16, accum_steps=4, epochs=1): # load dataset (ds_train, ds_test), ds_info = tfds.load( - 'mnist', - split=['train', 'test'], + "mnist", + split=["train", "test"], shuffle_files=True, as_supervised=True, with_info=True, @@ -58,13 +68,19 @@ def run_experiment(bs=16, accum_steps=4, epochs=1): # wrap model to use gradient accumulation if accum_steps > 1: - model = GradientAccumulateModel(accum_steps=accum_steps, inputs=model.input, outputs=model.output) + model = GradientAccumulateModel( + accum_steps=accum_steps, inputs=model.input, outputs=model.output + ) # compile model model.compile( optimizer=tf.keras.optimizers.SGD(1e-3), - loss={"classifier": tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), - "reconstructor": "mse"}, + loss={ + "classifier": tf.keras.losses.SparseCategoricalCrossentropy( + from_logits=True + ), + "reconstructor": "mse", + }, metrics={"classifier": tf.keras.metrics.SparseCategoricalAccuracy()}, ) diff --git a/tests/test_optimizer_distribute.py b/tests/test_optimizer_distribute.py index 4a5d9a3..1dd5441 100644 --- a/tests/test_optimizer_distribute.py +++ b/tests/test_optimizer_distribute.py @@ -1,16 +1,21 @@ +import numpy as np import tensorflow as tf import tensorflow_datasets as tfds from tensorflow.keras.models import load_model + from gradient_accumulator import GradientAccumulateOptimizer -import numpy as np -from .utils import reset, get_opt, normalize_img +from .utils import get_opt +from .utils import normalize_img +from .utils import reset # get current tf minor version tf_version = int(tf.version.VERSION.split(".")[1]) -def run_experiment(opt_name="adam", bs=100, accum_steps=1, epochs=1, strategy_name="multi"): +def run_experiment( + opt_name="adam", bs=100, accum_steps=1, epochs=1, strategy_name="multi" +): # setup single/multi-GPU strategy if strategy_name == "single": strategy = tf.distribute.get_strategy() # get default strategy @@ -21,8 +26,8 @@ def run_experiment(opt_name="adam", bs=100, accum_steps=1, epochs=1, strategy_na # load dataset (ds_train, ds_test), ds_info = tfds.load( - 'mnist', - split=['train', 'test'], + "mnist", + split=["train", "test"], shuffle_files=True, as_supervised=True, with_info=True, @@ -40,22 +45,28 @@ def run_experiment(opt_name="adam", bs=100, accum_steps=1, epochs=1, strategy_na with strategy.scope(): # create model - model = tf.keras.models.Sequential([ - tf.keras.layers.Flatten(input_shape=(28, 28)), - tf.keras.layers.Dense(128, activation='relu'), - tf.keras.layers.Dense(10) - ]) + model = tf.keras.models.Sequential( + [ + tf.keras.layers.Flatten(input_shape=(28, 28)), + tf.keras.layers.Dense(128, activation="relu"), + tf.keras.layers.Dense(10), + ] + ) # define optimizer - currently only SGD compatible with GAOptimizerWrapper opt = get_opt(opt_name=opt_name, tf_version=tf_version) # wrap optimizer to add gradient accumulation support - opt = GradientAccumulateOptimizer(optimizer=opt, 
accum_steps=accum_steps) + opt = GradientAccumulateOptimizer( + optimizer=opt, accum_steps=accum_steps + ) # compile model model.compile( optimizer=opt, - loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), + loss=tf.keras.losses.SparseCategoricalCrossentropy( + from_logits=True + ), metrics=[tf.keras.metrics.SparseCategoricalAccuracy()], ) @@ -65,7 +76,7 @@ def run_experiment(opt_name="adam", bs=100, accum_steps=1, epochs=1, strategy_na batch_size=bs, epochs=epochs, validation_data=ds_test, - verbose=1 + verbose=1, ) model.save("./trained_model") @@ -82,7 +93,7 @@ def run_experiment(opt_name="adam", bs=100, accum_steps=1, epochs=1, strategy_na def test_distributed_optimizer_invariance(): - # run experiment for different optimizers, to see if GA is consistent + # run experiment for different optimizers, to see if GA is consistent # within an optimizer. Note that it is expected for the results to # differ BETWEEN optimizers, as they behave differently. for strategy_name in ["single", "multi"]: @@ -92,13 +103,25 @@ def test_distributed_optimizer_invariance(): reset() # run once - result1 = run_experiment(opt_name=opt_name, bs=100, accum_steps=1, epochs=2, strategy_name=strategy_name) + result1 = run_experiment( + opt_name=opt_name, + bs=100, + accum_steps=1, + epochs=2, + strategy_name=strategy_name, + ) # reset before second run to get identical results reset() # run again with different batch size and number of accumulations - result2 = run_experiment(opt_name=opt_name, bs=50, accum_steps=2, epochs=2, strategy_name=strategy_name) + result2 = run_experiment( + opt_name=opt_name, + bs=50, + accum_steps=2, + epochs=2, + strategy_name=strategy_name, + ) # results should be "identical" (on CPU, can be different on GPU) np.testing.assert_almost_equal(result1, result2, decimal=2) diff --git a/tests/test_optimizer_invariance.py b/tests/test_optimizer_invariance.py index 947b2d9..cc990a6 100644 --- a/tests/test_optimizer_invariance.py +++ b/tests/test_optimizer_invariance.py @@ -1,22 +1,29 @@ +import os +import random as python_random + import numpy as np import tensorflow as tf -import random as python_random -import os -from .utils import get_opt, normalize_img, reset import tensorflow_datasets as tfds from tensorflow.keras.models import load_model -from gradient_accumulator import GradientAccumulateModel, GradientAccumulateOptimizer +from gradient_accumulator import GradientAccumulateModel +from gradient_accumulator import GradientAccumulateOptimizer + +from .utils import get_opt +from .utils import normalize_img +from .utils import reset # get current tf minor version tf_version = int(tf.version.VERSION.split(".")[1]) -def run_experiment(bs=100, accum_steps=1, epochs=1, opt_name="SGD", wrapper="model"): +def run_experiment( + bs=100, accum_steps=1, epochs=1, opt_name="SGD", wrapper="model" +): # load dataset (ds_train, ds_test), ds_info = tfds.load( - 'mnist', - split=['train', 'test'], + "mnist", + split=["train", "test"], shuffle_files=True, as_supervised=True, with_info=True, @@ -35,7 +42,7 @@ def run_experiment(bs=100, accum_steps=1, epochs=1, opt_name="SGD", wrapper="mod # create model input = tf.keras.layers.Input(shape=(28, 28)) x = tf.keras.layers.Flatten(input_shape=(28, 28))(input) - x = tf.keras.layers.Dense(128, activation='relu')(x) + x = tf.keras.layers.Dense(128, activation="relu")(x) output = tf.keras.layers.Dense(10)(x) model = tf.keras.models.Model(inputs=input, outputs=output) @@ -45,9 +52,13 @@ def run_experiment(bs=100, accum_steps=1, epochs=1, 
opt_name="SGD", wrapper="mod # wrap model to use gradient accumulation if accum_steps > 1: if wrapper == "model": - model = GradientAccumulateModel(accum_steps=accum_steps, inputs=input, outputs=output) + model = GradientAccumulateModel( + accum_steps=accum_steps, inputs=input, outputs=output + ) elif wrapper == "optimizer": - opt = GradientAccumulateOptimizer(optimizer=opt, accum_steps=accum_steps) + opt = GradientAccumulateOptimizer( + optimizer=opt, accum_steps=accum_steps + ) else: raise ValueError("Unknown wrapper was chosen:", wrapper) @@ -78,7 +89,7 @@ def run_experiment(bs=100, accum_steps=1, epochs=1, opt_name="SGD", wrapper="mod def test_optimizer_invariance(): - # run experiment for different optimizers, to see if GA is consistent + # run experiment for different optimizers, to see if GA is consistent # within an optimizer. Note that it is expected for the results to # differ BETWEEN optimizers, as they behave differently. for wrapper in ["model", "optimizer"]: @@ -88,13 +99,27 @@ def test_optimizer_invariance(): reset() # run once - result1 = run_experiment(bs=100, accum_steps=1, epochs=2, opt_name=opt_name, wrapper=wrapper) + result1 = run_experiment( + bs=100, + accum_steps=1, + epochs=2, + opt_name=opt_name, + wrapper=wrapper, + ) # reset before second run to get identical results reset() # run again with different batch size and number of accumulations - result2 = run_experiment(bs=50, accum_steps=2, epochs=2, opt_name=opt_name, wrapper=wrapper) + result2 = run_experiment( + bs=50, + accum_steps=2, + epochs=2, + opt_name=opt_name, + wrapper=wrapper, + ) # results should be "identical" (on CPU, can be different on GPU) - np.testing.assert_almost_equal(result1, result2, decimal=2) # decimals=3 OK for model wrapper but not optimizer + np.testing.assert_almost_equal( + result1, result2, decimal=2 + ) # decimals=3 OK for model wrapper but not optimizer diff --git a/tests/test_optimizer_wrapper.py b/tests/test_optimizer_wrapper.py index a09e829..22a538c 100644 --- a/tests/test_optimizer_wrapper.py +++ b/tests/test_optimizer_wrapper.py @@ -1,12 +1,16 @@ +import os +import random as python_random + import numpy as np import tensorflow as tf -import random as python_random -import os -from .utils import get_opt, reset, normalize_img import tensorflow_datasets as tfds from tensorflow.keras.models import load_model + from gradient_accumulator import GradientAccumulateOptimizer +from .utils import get_opt +from .utils import normalize_img +from .utils import reset tf_version = int(tf.version.VERSION.split(".")[1]) @@ -14,8 +18,8 @@ def run_experiment(bs=16, accum_steps=4, epochs=1): # load dataset (ds_train, ds_test), ds_info = tfds.load( - 'mnist', - split=['train', 'test'], + "mnist", + split=["train", "test"], shuffle_files=True, as_supervised=True, with_info=True, @@ -32,21 +36,19 @@ def run_experiment(bs=16, accum_steps=4, epochs=1): ds_test = ds_test.prefetch(1) # create model - model = tf.keras.models.Sequential([ - tf.keras.layers.Flatten(input_shape=(28, 28)), - tf.keras.layers.Dense(32, activation='relu'), - tf.keras.layers.Dense(10), - ]) + model = tf.keras.models.Sequential( + [ + tf.keras.layers.Flatten(input_shape=(28, 28)), + tf.keras.layers.Dense(32, activation="relu"), + tf.keras.layers.Dense(10), + ] + ) # wrap optimizer to add gradient accumulation support - # opt = tf.keras.optimizers.Adam(learning_rate=1e-3) - # need to dynamically handle which Optimizer class to use dependent on tf version - if tf_version > 10: - curr_opt = 
tf.keras.optimizers.legacy.SGD(learning_rate=1e-2) - else: - curr_opt = tf.keras.optimizers.SGD(learning_rate=1e-2) # IDENTICAL RESULTS WITH SGD!!! - - opt = GradientAccumulateOptimizer(optimizer=curr_opt, accum_steps=accum_steps, reduction="MEAN") # MEAN REDUCTION IMPORTANT!!! + opt = get_opt("SGD") + opt = GradientAccumulateOptimizer( + optimizer=opt, accum_steps=accum_steps, reduction="MEAN" + ) # MEAN REDUCTION IMPORTANT!!! # compile model model.compile( @@ -80,7 +82,9 @@ def test_expected_result(): reset() # run once - result1 = run_experiment(bs=500, accum_steps=1, epochs=3) # NOTE: AS TO BE DIVISIBLE BY TRAIN SET SIZE = 50000 (!) + result1 = run_experiment( + bs=500, accum_steps=1, epochs=3 + ) # NOTE: AS TO BE DIVISIBLE BY TRAIN SET SIZE = 50000 (!) # reset before second run to get identical results reset() @@ -101,8 +105,8 @@ def test_expected_result(): # result4 = run_experiment(bs=1, accum_steps=500, epochs=2) # results should be identical (theoretically, even in practice on CPU) - #assert result1 == result2 - #assert result1 == result3 + # assert result1 == result2 + # assert result1 == result3 # reduced constraint for temporarily np.testing.assert_almost_equal(result1, result2, decimal=2) diff --git a/tests/test_param_count.py b/tests/test_param_count.py index 9b0c91e..02ffd5d 100644 --- a/tests/test_param_count.py +++ b/tests/test_param_count.py @@ -1,14 +1,15 @@ import tensorflow as tf -from tensorflow.keras.models import Sequential from tensorflow.keras.layers import Dense +from tensorflow.keras.models import Sequential + from gradient_accumulator import GradientAccumulateModel def create_model(): input = tf.keras.layers.Input(shape=(10,)) - x = Dense(32, input_shape=(10,), activation='relu')(input) - x = Dense(16, activation='relu')(x) - output = Dense(1, activation='sigmoid')(x) + x = Dense(32, input_shape=(10,), activation="relu")(input) + x = Dense(16, activation="relu")(x) + output = Dense(1, activation="sigmoid")(x) return input, output @@ -19,23 +20,30 @@ def count_params(model): def test_param_count_with_wrapper(): # Create a model - input,output = create_model() + input, output = create_model() original_model = tf.keras.Model(inputs=input, outputs=output) # Count the parameters of the original model original_param_count = count_params(original_model) # Wrap the model with GradientAccumulateModel - wrapped_model = GradientAccumulateModel(accum_steps=2, inputs=input, outputs=output) + wrapped_model = GradientAccumulateModel( + accum_steps=2, inputs=input, outputs=output + ) # Count the parameters of the wrapped model wrapped_param_count = count_params(wrapped_model) # Compile both models - original_model.compile(optimizer=tf.keras.optimizers.Adam(), loss='binary_crossentropy') - wrapped_model.compile(optimizer=tf.keras.optimizers.Adam(), loss='binary_crossentropy') + original_model.compile( + optimizer=tf.keras.optimizers.Adam(), loss="binary_crossentropy" + ) + wrapped_model.compile( + optimizer=tf.keras.optimizers.Adam(), loss="binary_crossentropy" + ) # Check if the number of parameters in both models is the same - assert original_param_count == wrapped_param_count, \ - f"Parameter count mismatch: Original model has {original_param_count} parameters, " \ + assert original_param_count == wrapped_param_count, ( + f"Parameter count mismatch: Original model has {original_param_count} parameters, " f"wrapped model has {wrapped_param_count} parameters." 
+ ) diff --git a/tests/test_sparse_optimizer.py b/tests/test_sparse_optimizer.py index fe13ec1..087abcb 100644 --- a/tests/test_sparse_optimizer.py +++ b/tests/test_sparse_optimizer.py @@ -1,16 +1,20 @@ +import os +import random as python_random + import numpy as np import tensorflow as tf -from tensorflow.keras.preprocessing.text import one_hot -from tensorflow.keras.preprocessing.sequence import pad_sequences import tensorflow_datasets as tfds +from tensorflow.keras.layers import Dense +from tensorflow.keras.layers import Embedding +from tensorflow.keras.layers import Flatten from tensorflow.keras.models import Sequential -from tensorflow.keras.layers import Flatten, Embedding, Dense from tensorflow.keras.models import load_model +from tensorflow.keras.preprocessing.sequence import pad_sequences +from tensorflow.keras.preprocessing.text import one_hot + from gradient_accumulator import GradientAccumulateOptimizer -import os -import random as python_random -from .utils import reset +from .utils import reset # get current tf minor version tf_version = int(tf.version.VERSION.split(".")[1]) @@ -19,10 +23,16 @@ def preprocess_data(ds, vocab_size, max_length): def encode(x, y): x = tf.strings.substr(x, 0, max_length) - x = tf.strings.reduce_join(tf.strings.unicode_split(x, input_encoding="UTF-8"), separator=' ') + x = tf.strings.reduce_join( + tf.strings.unicode_split(x, input_encoding="UTF-8"), separator=" " + ) x = tf.strings.split(x) x_hashed = tf.strings.to_hash_bucket_fast(x, vocab_size) - x_padded = tf.pad(x_hashed, paddings=[[0, max_length - tf.shape(x_hashed)[-1]]], constant_values=0) + x_padded = tf.pad( + x_hashed, + paddings=[[0, max_length - tf.shape(x_hashed)[-1]]], + constant_values=0, + ) return x_padded, y ds = ds.map(encode) @@ -32,8 +42,8 @@ def encode(x, y): def run_experiment(bs=100, accum_steps=1, epochs=2): # Load the IMDb dataset (ds_train, ds_test), ds_info = tfds.load( - 'imdb_reviews', - split=['train', 'test'], + "imdb_reviews", + split=["train", "test"], shuffle_files=True, as_supervised=True, with_info=True, @@ -51,25 +61,27 @@ def run_experiment(bs=100, accum_steps=1, epochs=2): # define model model = Sequential() - model.add(Embedding(input_dim=vocab_size, output_dim=8, input_length=max_length)) + model.add( + Embedding(input_dim=vocab_size, output_dim=8, input_length=max_length) + ) model.add(Flatten()) - model.add(Dense(1, activation='sigmoid')) + model.add(Dense(1, activation="sigmoid")) # wrap optimizer to add gradient accumulation support # need to dynamically handle which Optimizer class to use dependent on tf version if tf_version > 10: opt = tf.keras.optimizers.legacy.SGD(learning_rate=1e-2) else: - opt = tf.keras.optimizers.SGD(learning_rate=1e-2) # IDENTICAL RESULTS WITH SGD!!! - + opt = tf.keras.optimizers.SGD( + learning_rate=1e-2 + ) # IDENTICAL RESULTS WITH SGD!!! 
+ if accum_steps > 1: - opt = GradientAccumulateOptimizer(optimizer=opt, accum_steps=accum_steps, reduction="MEAN") + opt = GradientAccumulateOptimizer( + optimizer=opt, accum_steps=accum_steps, reduction="MEAN" + ) - model.compile( - optimizer=opt, - loss='binary_crossentropy', - metrics=['acc'] - ) + model.compile(optimizer=opt, loss="binary_crossentropy", metrics=["acc"]) model.fit( ds_train, @@ -92,10 +104,10 @@ def run_experiment(bs=100, accum_steps=1, epochs=2): def test_sparse_expected_results(): # set seed - #reset() + # reset() # run once - #result1 = run_experiment(bs=100, accum_steps=1, epochs=2) + # result1 = run_experiment(bs=100, accum_steps=1, epochs=2) # reset before second run to get identical results reset() @@ -103,6 +115,5 @@ def test_sparse_expected_results(): # run again with different batch size and number of accumulations result2 = run_experiment(bs=50, accum_steps=2, epochs=2) - # results should be identical (theoretically, even in practice on CPU) - #assert result1 == result2 + # assert result1 == result2 diff --git a/tests/utils.py b/tests/utils.py index 64f73ac..0778068 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,12 +1,21 @@ +import os import random as python_random -import tensorflow as tf + import numpy as np -import os +import tensorflow as tf +import tensorflow_datasets as tfds +from tensorflow.keras.models import load_model + +from gradient_accumulator import GradientAccumulateModel +from gradient_accumulator import GradientAccumulateOptimizer + +# get current tf minor version +tf_version = int(tf.version.VERSION.split(".")[1]) def reset(seed=123): # set tf log level - os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" # disable GPU os.environ["CUDA_VISIBLE_DEVICES"] = "-1" @@ -28,14 +37,15 @@ def reset(seed=123): # in the TensorFlow backend have a well-defined initial state. # For further details, see: # https://www.tensorflow.org/api_docs/python/tf/random/set_seed - tf.random.set_seed(1234) # @TODO: Should this seed be different than for python and numpy? + # @TODO: Should this seed be different than for python and numpy? 
+ tf.random.set_seed(seed) # https://stackoverflow.com/a/71311207 try: tf.config.experimental.enable_op_determinism() # Exist only for TF > 2.7 except AttributeError as e: print(e) - + # force cpu threading determinism # https://stackoverflow.com/questions/36288235/how-to-get-stable-results-with-tensorflow-setting-random-seed tf.config.threading.set_inter_op_parallelism_threads(1) @@ -73,4 +83,73 @@ def get_opt(opt_name, tf_version=None): def normalize_img(image, label): """Normalizes images: `uint8` -> `float32`.""" - return tf.cast(image, tf.float32) / 255., label + return tf.cast(image, tf.float32) / 255.0, label + + +def run_experiment(bs=50, accum_steps=2, epochs=1, modeloropt="opt"): + # load dataset + (ds_train, ds_test), ds_info = tfds.load( + "mnist", + split=["train", "test"], + shuffle_files=True, + as_supervised=True, + with_info=True, + ) + + # build train pipeline + ds_train = ds_train.map(normalize_img) + ds_train = ds_train.batch(bs) + ds_train = ds_train.prefetch(1) + + # build test pipeline + ds_test = ds_test.map(normalize_img) + ds_test = ds_test.batch(bs) + ds_test = ds_test.prefetch(1) + + # create model + input = tf.keras.layers.Input(shape=(28, 28)) + x = tf.keras.layers.Flatten(input_shape=(28, 28))(input) + x = tf.keras.layers.Dense(128, activation="relu")(x) + output = tf.keras.layers.Dense(10)(x) + + opt = get_opt(opt_name="SGD", tf_version=tf_version) + + if accum_steps == 1: + model = tf.keras.Model(inputs=input, outputs=output) + else: + if modeloropt == "model": + # wrap model to use gradient accumulation + model = GradientAccumulateModel( + accum_steps=accum_steps, inputs=input, outputs=output + ) + else: + # wrap optimizer to use gradient accumulation + opt = GradientAccumulateOptimizer(opt, accum_steps=accum_steps) + + # compile model + model = tf.keras.Model(inputs=input, outputs=output) + + # compile model + model.compile( + optimizer=opt, + loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=[tf.keras.metrics.SparseCategoricalAccuracy()], + ) + + # train model + model.fit( + ds_train, + epochs=epochs, + validation_data=ds_test, + ) + + model.save("./trained_model") + + # load trained model and test + del model + trained_model = load_model("./trained_model", compile=True) + + result = trained_model.evaluate(ds_test, verbose=1) + print(result) + + return result[1]