diff --git a/docs/user/huggingface.rst b/docs/user/huggingface.rst
index 44a290cf2..d462305a0 100644
--- a/docs/user/huggingface.rst
+++ b/docs/user/huggingface.rst
@@ -69,39 +69,10 @@ To install accelerate_, run the following command inside your Python environment
 Caution when using a multi-GPU setup
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-There is a known issue when using accelerate in a multi-GPU setup *if copies of
-the net are created*. In particular, be aware that sklearn often creates copies
-under the hood, which may not immediately obvious to a user. Examples of
-functions and classes creating copies are:
+There were some issues with older accelerate versions; for best results, please
+use version 0.21 or above.
 
-- `GridSearchCV`, `RandomizedSearchCV` etc.
-- `cross_validate`, `cross_val_score` etc.
-- `VotingClassifier`, `CalibratedClassifierCV` and other meta estimators (but
-  not `Pipeline`).
-
-When using any of those in a multi-GPU setup with :class:`.AccelerateMixin`, you
-may encounter errors. A possible fix is to prevent the ``Accelerator`` instance
-from being copied (or, to be precise, deep-copied):
-
-.. code:: python
-
-    class AcceleratedNet(AccelerateMixin, NeuralNet):
-        pass
-
-    class MyAccelerator(Accelerator):
-        def __deepcopy__(self, memo):
-            return self
-
-    accelerator = MyAccelerator()
-    net = AcceleratedNet(..., accelerator=accelerator)
-    # now grid search et al. should work
-    gs = GridSearchCV(net, ...)
-    gs.fit(X, y)
-
-Note that this is a hacky solution, so monitor your results closely to ensure
-nothing strange is going on.
-
-There is also a problem with caching not working correctly in multi-GPU
+There is a problem with caching not working correctly in multi-GPU
 training. Therefore, if using a scoring callback (e.g.
 :class:`skorch.callbacks.EpochScoring`), turn caching off by passing
 ``use_caching=False``. Be aware that when using
diff --git a/examples/accelerate-multigpu/README.md b/examples/accelerate-multigpu/README.md
index 54dac5972..debd72e8f 100644
--- a/examples/accelerate-multigpu/README.md
+++ b/examples/accelerate-multigpu/README.md
@@ -7,7 +7,7 @@ There was an issue with using skorch in a multi-GPU setting with accelerate. Aft
 1. skorch did not call `accelerator.gather_for_metrics`, which resulted in `y_pred` not having the correct size. For more on this, consult the [accelerate docs](https://huggingface.co/docs/accelerate/quicktour#distributed-evaluation).
 2. accelerate has an issue with being deepcopied, which happens for instance when using `GridSearchCV`. The problem is that some references get messed up, causing the `GradientState` of the `accelerator` instance and of the `dataloader` to diverge. Therefore, the `accelerator` did not "know" when the last batch was encountered and was thus unable to remove the dummy samples added for multi-GPU inference.
 
-The fix for 1. is provided in the same PR as this was added. For 2., the scripts contain a custom `Accelerator` class that overrides `__deepcopy__` to just return `self`. I don't know enough about accelerate internals to determine if this is a safe solution or if it can cause more issues down the line, but it resolves the issue.
+The fix for 1. is provided in the same PR that added this example. For 2., the problem has been fixed [in accelerate](https://github.com/huggingface/accelerate/pull/1694) and is contained in the 0.21 release.
 
 This example contains two scripts, one involving skorch and one with skorch completely removed.
 The scripts reproduce the issue in a multi-GPU setup (tested on a GCP VM instance with two T4s). Unfortunately, the GitHub Actions runners don't offer such a setup, which is why no unit test is added for this bug.
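For orientation, the skorch variant boils down to roughly the following setup. This is a minimal sketch rather than a copy of `run-with-skorch.py`: it assumes accelerate >= 0.21, it omits the distributed-friendly history class the full script uses, and the `EpochScoring(use_caching=False)` callback reflects the `use_caching=False` advice from the documentation change above rather than the exact callbacks in the script.

```python
import numpy as np
import torch.nn as nn
from accelerate import Accelerator
from sklearn.datasets import make_classification
from skorch import NeuralNetClassifier
from skorch.callbacks import EpochScoring
from skorch.hf import AccelerateMixin


class MyModule(nn.Module):
    # a small classifier head; outputs raw logits for CrossEntropyLoss
    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(100, 2)

    def forward(self, X):
        return self.dense(X)


class AcceleratedNeuralNetClassifier(AccelerateMixin, NeuralNetClassifier):
    pass


def main():
    X, y = make_classification(10000, n_features=100, n_informative=50, random_state=0)
    X = X.astype(np.float32)

    net = AcceleratedNeuralNetClassifier(
        MyModule,
        criterion=nn.CrossEntropyLoss,
        # with the deepcopy fix upstream, a plain Accelerator can be passed
        # directly, no __deepcopy__ workaround needed
        accelerator=Accelerator(),
        max_epochs=3,
        lr=0.001,
        # caching does not work correctly in multi-GPU training, so turn it off
        callbacks=[EpochScoring(scoring="accuracy", use_caching=False)],
    )
    net.fit(X, y)


if __name__ == "__main__":
    main()
```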
diff --git a/examples/accelerate-multigpu/run-no-skorch.py b/examples/accelerate-multigpu/run-no-skorch.py
index e0f584e48..f6b26411f 100644
--- a/examples/accelerate-multigpu/run-no-skorch.py
+++ b/examples/accelerate-multigpu/run-no-skorch.py
@@ -69,17 +69,12 @@ def predict(self, X):
         return y_proba.argmax(1)
 
 
-class MyAccelerator(Accelerator):
-    def __deepcopy__(self, memo):
-        return self
-
-
 def main():
     X, y = make_classification(10000, n_features=100, n_informative=50, random_state=0)
     X = X.astype(np.float32)
 
     module = MyModule()
-    accelerator = MyAccelerator()
+    accelerator = Accelerator()
     net = Net(module, accelerator)
     # cross_validate creates a deepcopy of the accelerator attribute
     res = cross_validate(
diff --git a/examples/accelerate-multigpu/run-with-skorch.py b/examples/accelerate-multigpu/run-with-skorch.py
index f88dd6b16..fd14fefd1 100644
--- a/examples/accelerate-multigpu/run-with-skorch.py
+++ b/examples/accelerate-multigpu/run-with-skorch.py
@@ -28,17 +28,11 @@ class AcceleratedNeuralNetClassifier(AccelerateMixin, NeuralNetClassifier):
     pass
 
 
-# prevent the accelerator from being copied by sklearn
-class MyAccelerator(Accelerator):
-    def __deepcopy__(self, memo):
-        return self
-
-
 def main():
     X, y = make_classification(10000, n_features=100, n_informative=50, random_state=0)
     X = X.astype(np.float32)
 
-    accelerator = MyAccelerator()
+    accelerator = Accelerator()
 
     # use history class that works in distributed setting
     # see https://skorch.readthedocs.io/en/latest/user/history.html#distributed-history
@@ -52,6 +46,7 @@ def main():
     model = AcceleratedNeuralNetClassifier(
         MyModule,
+        criterion=nn.CrossEntropyLoss,
         accelerator=accelerator,
         max_epochs=3,
         lr=0.001,
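For reference, a multi-GPU run of the two example scripts would be launched through the accelerate CLI along these lines (the flags shown are an assumption for a two-GPU machine such as the T4 VM mentioned above; adjust `--num_processes` to your hardware):

```sh
accelerate launch --multi_gpu --num_processes 2 run-with-skorch.py
accelerate launch --multi_gpu --num_processes 2 run-no-skorch.py
```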