diff --git a/.gitignore b/.gitignore
index d20faa42..08d78b4e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@
 _build
 .idea
 **/__pycache__
+/docs/examples/**/*.diff
diff --git a/docs/Minimal_examples.rst b/docs/Minimal_examples.rst
index 67556a79..185a5057 100644
--- a/docs/Minimal_examples.rst
+++ b/docs/Minimal_examples.rst
@@ -1,6 +1,8 @@
-.. ***************************
+.. ****************
 .. Minimal Examples
-.. ***************************
+.. ****************
 
 
 .. include:: examples/frameworks/README.rst
+.. include:: examples/distributed/README.rst
+.. include:: examples/data/README.rst
diff --git a/docs/conf.py b/docs/conf.py
index df42510e..2d135804 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -3,7 +3,8 @@
 from __future__ import division, print_function, unicode_literals
 
 from datetime import datetime
-
+import subprocess
+from pathlib import Path
 import sphinx_theme
 
 extensions = [
@@ -90,5 +91,18 @@
 # Include CNAME file so GitHub Pages can set Custom Domain name
 html_extra_path = ['CNAME']
 
+
+# Generate the diffs that are shown in the examples.
+file_dir = Path(__file__).parent / "examples/generate_diffs.sh"
+try:
+    proc = subprocess.run(str(file_dir), shell=True, capture_output=True, check=True)
+except subprocess.CalledProcessError as err:
+    raise RuntimeError(
+        "Could not build the diff files for the examples:\n"
+        + str(err.output, encoding="utf-8")
+        + str(err.stderr, encoding="utf-8")
+    )
+
+
 def setup(app):
     app.add_css_file('custom.css')
diff --git a/docs/examples/data/README.rst b/docs/examples/data/README.rst
new file mode 100644
index 00000000..146429b9
--- /dev/null
+++ b/docs/examples/data/README.rst
@@ -0,0 +1,7 @@
+*****************************
+Data Handling during Training
+*****************************
+
+
+.. include:: examples/data/torchvision/README.rst
+.. include:: examples/data/hf/README.rst
diff --git a/docs/examples/data/hf/README.rst b/docs/examples/data/hf/README.rst
new file mode 100644
index 00000000..b907baf3
--- /dev/null
+++ b/docs/examples/data/hf/README.rst
@@ -0,0 +1,49 @@
+Hugging Face Dataset
+====================
+
+
+**Prerequisites**
+
+Make sure to read the following sections of the documentation before using this example:
+
+* :ref:`pytorch_setup`
+* :ref:`001 - Single GPU Job`
+
+The full source code for this example is available on `the mila-docs GitHub repository. <https://github.com/mila-iqia/mila-docs/tree/master/docs/examples/data/hf>`_
+
+
+**job.sh**
+
+.. literalinclude:: examples/data/hf/job.sh.diff
+   :language: diff
+
+
+**main.py**
+
+.. literalinclude:: examples/data/hf/main.py.diff
+   :language: diff
+
+
+**prepare_data.py**
+
+.. literalinclude:: examples/data/hf/prepare_data.py
+   :language: python
+
+
+**get_dataset_cache_dir.py**
+
+.. literalinclude:: examples/data/hf/get_dataset_cache_dir.py
+   :language: python
+
+
+**cp_data.sh**
+
+.. literalinclude:: examples/data/hf/cp_data.sh
+   :language: bash
+
+
+**Running this example**
+
+.. code-block:: bash
+
+   $ sbatch job.sh
diff --git a/docs/examples/data/hf/cp_data.sh b/docs/examples/data/hf/cp_data.sh
new file mode 100644
index 00000000..53d75a94
--- /dev/null
+++ b/docs/examples/data/hf/cp_data.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+set -o errexit
+
+_SRC=$1
+_DEST=$2
+_WORKERS=$3
+
+# Copy the dataset
+(cd "${_SRC}" && find -L * -type f) | while read f
+do
+	mkdir --parents "${_DEST}/$(dirname "$f")"
+	# echo source first so it is matched to the cp's '-T' argument
+	readlink --canonicalize "${_SRC}/$f"
+	# echo output last so cp understands it's the output file
+	echo "${_DEST}/$f"
+done | xargs -n2 -P${_WORKERS} cp --update -T
diff --git a/docs/examples/data/hf/get_dataset_cache_dir.py b/docs/examples/data/hf/get_dataset_cache_dir.py
new file mode 100644
index 00000000..9c5740d3
--- /dev/null
+++ b/docs/examples/data/hf/get_dataset_cache_dir.py
@@ -0,0 +1,17 @@
+"""Print to stdout the cache directory of the dataset"""
+import sys
+
+import datasets
+
+
+# Redirect outputs to stderr to avoid noise in stdout
+_stdout = sys.stdout
+sys.stdout = sys.stderr
+
+try:
+    _CACHE_DIR = sys.argv[1]
+except IndexError:
+    _CACHE_DIR = None
+
+builder = datasets.load_dataset_builder("the_pile", cache_dir=_CACHE_DIR, subsets=["all"], version="0.0.0")
+print(builder.cache_dir, file=_stdout)
diff --git a/docs/examples/data/hf/job.sh b/docs/examples/data/hf/job.sh
new file mode 100644
index 00000000..c734d921
--- /dev/null
+++ b/docs/examples/data/hf/job.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+#SBATCH --gpus-per-task=rtx8000:1
+#SBATCH --cpus-per-task=4
+#SBATCH --ntasks-per-node=1
+#SBATCH --mem=24G
+#SBATCH --time=02:00:00
+#SBATCH --tmp=1500G
+set -o errexit
+
+
+# Echo time and hostname into log
+echo "Date:     $(date)"
+echo "Hostname: $(hostname)"
+
+
+# Ensure only anaconda/3 module loaded.
+module purge
+# This example uses Conda to manage package dependencies.
+# See https://docs.mila.quebec/Userguide.html#conda for more information.
+module load anaconda/3
+
+
+# Creating the environment for the first time:
+# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
+#     pytorch-cuda=11.6 scipy -c pytorch -c nvidia
+# Other conda packages:
+# conda install -y -n pytorch -c conda-forge rich tqdm datasets
+
+# Activate pre-existing environment.
+conda activate pytorch
+
+
+# Prepare data for training
+mkdir -p "$SLURM_TMPDIR/data"
+
+if [[ -z "${HF_DATASETS_CACHE}" ]]
+then
+	# Store the huggingface datasets cache in $SCRATCH
+	export HF_DATASETS_CACHE=$SCRATCH/cache/huggingface/datasets
+fi
+if [[ -z "${_DATA_PREP_WORKERS}" ]]
+then
+	_DATA_PREP_WORKERS=${SLURM_CPUS_PER_TASK}  # plain integer; SLURM_JOB_CPUS_PER_NODE can be "4(x2)"
+fi
+if [[ -z "${_DATA_PREP_WORKERS}" ]]
+then
+	_DATA_PREP_WORKERS=16
+fi
+
+# Preprocess the dataset and cache the result such that the heavy work is done
+# only once *ever*
+# Required conda packages:
+# conda install -y -c conda-forge zstandard
+srun --ntasks=1 --ntasks-per-node=1 \
+	time -p python3 prepare_data.py "/network/datasets/pile" ${_DATA_PREP_WORKERS}
+
+# Copy the preprocessed dataset to $SLURM_TMPDIR so it is close to the GPUs for
+# faster training
+# Get the current dataset cache
+_DATASET_CACHE_DIR=$(python3 get_dataset_cache_dir.py)
+# Get the local dataset cache
+_LOCAL_DATASET_CACHE_DIR=$(python3 get_dataset_cache_dir.py "$SLURM_TMPDIR/data")
+srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \
+	time -p bash cp_data.sh "${_DATASET_CACHE_DIR}" "${_LOCAL_DATASET_CACHE_DIR}" ${_DATA_PREP_WORKERS}
+
+# Use the local copy of the preprocessed dataset
+export HF_DATASETS_CACHE="$SLURM_TMPDIR/data"
+
+
+# Execute Python script
+python main.py
diff --git a/docs/examples/data/hf/main.py b/docs/examples/data/hf/main.py
new file mode 100644
index 00000000..9edd35bf
--- /dev/null
+++ b/docs/examples/data/hf/main.py
@@ -0,0 +1,144 @@
+"""Hugging Face dataset training example."""
+import logging
+import os
+
+import datasets
+import rich.logging
+import torch
+from torch import Tensor, nn
+from torch.nn import functional as F
+from torch.utils.data import DataLoader
+from torchvision.models import resnet18
+from tqdm import tqdm
+
+
+def main():
+    training_epochs = 1
+    learning_rate = 5e-4
+    weight_decay = 1e-4
+    batch_size = 256
+
+    # Check that the GPU is available
+    assert torch.cuda.is_available() and torch.cuda.device_count() > 0
+    device = torch.device("cuda", 0)
+
+    # Setup logging (optional, but much better than using print statements)
+    logging.basicConfig(
+        level=logging.INFO,
+        handlers=[rich.logging.RichHandler(markup=True)],  # Very pretty, uses the `rich` package.
+    )
+
+    logger = logging.getLogger(__name__)
+
+    # Create a model and move it to the GPU.
+    model = resnet18()
+    model.to(device=device)
+
+    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
+
+    # Setup The Pile
+    num_workers = get_num_workers()
+    dataset_path = "the_pile"
+    train_dataset, valid_dataset, test_dataset = make_datasets(dataset_path)
+    train_dataloader = DataLoader(
+        train_dataset,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        shuffle=True,
+    )
+    valid_dataloader = DataLoader(
+        valid_dataset,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        shuffle=False,
+    )
+    test_dataloader = DataLoader(  # NOTE: Not used in this example.
+        test_dataset,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        shuffle=False,
+    )
+
+    # Checkout the "checkpointing and preemption" example for more info!
+    logger.debug("Starting training from scratch.")
+
+    for epoch in range(training_epochs):
+        logger.debug(f"Starting epoch {epoch}/{training_epochs}")
+
+        # Set the model in training mode (this is important for e.g. BatchNorm and Dropout layers)
+        model.train()
+
+        # NOTE: using a progress bar from tqdm because it's nicer than using `print`.
+        progress_bar = tqdm(
+            total=len(train_dataloader),
+            desc=f"Train epoch {epoch}",
+        )
+
+        # Training loop
+        for batch in train_dataloader:
+            # Move the batch to the GPU before we pass it to the model
+            batch = tuple(item.to(device) for item in batch)
+
+            # [Training of the model goes here]
+
+            # Advance the progress bar one step.
+            progress_bar.update(1)
+        progress_bar.close()
+
+        val_loss, val_accuracy = validation_loop(model, valid_dataloader, device)
+        logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}")
+
+    print("Done!")
+
+
+@torch.no_grad()
+def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device):
+    model.eval()
+
+    total_loss = 0.0
+    n_samples = 0
+    correct_predictions = 0
+
+    for batch in dataloader:
+        batch = tuple(item.to(device) for item in batch)
+        x, y = batch
+
+        logits: Tensor = model(x)
+        loss = F.cross_entropy(logits, y)
+
+        batch_n_samples = x.shape[0]
+        batch_correct_predictions = logits.argmax(-1).eq(y).sum()
+
+        total_loss += loss.item()
+        n_samples += batch_n_samples
+        correct_predictions += batch_correct_predictions
+
+    accuracy = correct_predictions / n_samples
+    return total_loss, accuracy
+
+
+def make_datasets(dataset_path: str):
+    """Returns the training, validation, and test splits for The Pile.
+
+    NOTE: We don't use transforms here for simplicity.
+    Having different transformations for train and validation would complicate things a bit.
+    Later examples will show how to do the train/val/test split properly when using transforms.
+    """
+    builder = datasets.load_dataset_builder(dataset_path, subsets=["all"], version="0.0.0")
+    train_dataset = builder.as_dataset(split="train").with_format("torch")
+    valid_dataset = builder.as_dataset(split="validation").with_format("torch")
+    test_dataset = builder.as_dataset(split="test").with_format("torch")
+    return train_dataset, valid_dataset, test_dataset
+
+
+def get_num_workers() -> int:
+    """Gets the optimal number of DataLoader workers to use in the current job."""
+    if "SLURM_CPUS_PER_TASK" in os.environ:
+        return int(os.environ["SLURM_CPUS_PER_TASK"])
+    if hasattr(os, "sched_getaffinity"):
+        return len(os.sched_getaffinity(0))
+    return torch.multiprocessing.cpu_count()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/examples/data/hf/prepare_data.py b/docs/examples/data/hf/prepare_data.py
new file mode 100644
index 00000000..595539e1
--- /dev/null
+++ b/docs/examples/data/hf/prepare_data.py
@@ -0,0 +1,52 @@
+"""Preprocess the dataset.
+In this example, HuggingFace is used and the resulting dataset will be stored in
+$HF_DATASETS_CACHE. It is preferable to set the datasets cache to a location in
+$SCRATCH"""
+import sys
+import time
+
+import datasets
+
+
+_LOCAL_DS = sys.argv[1]
+_LOCAL_DS_SPLITS = _LOCAL_DS.split("/")
+try:
+    _WORKERS = int(sys.argv[2])
+except IndexError:
+    _WORKERS = 16
+
+dl_config = datasets.DownloadConfig(cache_dir=_LOCAL_DS)
+
+# 'datasets' does not allow to use a local storage for the datasets' files using
+# it's exposed API. Mocking the download func to for the usage of the local file
+dl_man = datasets.DownloadManager(download_config=dl_config)
+def dl(url_or_urls, *args, **kwargs):
+    import glob
+    local_files = ["/".join(_f.split("/")[len(_LOCAL_DS_SPLITS):])
+                   for _f in glob.glob(f"{_LOCAL_DS}/**", recursive=True)]
+    local_files.sort()
+    if isinstance(url_or_urls, str):
+        url_or_urls = [url_or_urls]
+
+    # Replace all urls by local files if they can be found
+    for v in (url_or_urls.values() if isinstance(url_or_urls, dict) else {".":url_or_urls}):
+        for i, url in enumerate(v):
+            for lf in local_files:
+                if lf and url.endswith(lf):
+                    v[i] = f"{_LOCAL_DS}/{lf}"
+                    local_files.remove(lf)
+                    break
+
+    # Continue normal download process which should only checksum the local
+    # files instead of downloading them
+    return _download(url_or_urls, *args, **kwargs)
+
+_download = dl_man.download
+dl_man.download = dl
+builder = datasets.load_dataset_builder("the_pile", download_config=dl_config, subsets=["all"], version="0.0.0")
+
+t = -time.time()
+builder.download_and_prepare(dl_manager=dl_man, num_proc=_WORKERS)
+t += time.time()
+
+print(f"Prepared data in {t/60:.2f}m")
diff --git a/docs/examples/data/torchvision/README.rst b/docs/examples/data/torchvision/README.rst
new file mode 100644
index 00000000..77ab7445
--- /dev/null
+++ b/docs/examples/data/torchvision/README.rst
@@ -0,0 +1,43 @@
+Torchvision
+===========
+
+
+**Prerequisites**
+
+Make sure to read the following sections of the documentation before using this example:
+
+* :ref:`pytorch_setup`
+* :ref:`001 - Single GPU Job`
+
+The full source code for this example is available on `the mila-docs GitHub repository. <https://github.com/mila-iqia/mila-docs/tree/master/docs/examples/data/torchvision>`_
+
+
+**job.sh**
+
+.. literalinclude:: examples/data/torchvision/job.sh.diff
+   :language: diff
+
+
+**main.py**
+
+.. literalinclude:: examples/data/torchvision/main.py.diff
+   :language: diff
+
+
+**data.sh**
+
+.. literalinclude:: examples/data/torchvision/data.sh
+   :language: bash
+
+
+**data.py**
+
+.. literalinclude:: examples/data/torchvision/data.py
+   :language: python
+
+
+**Running this example**
+
+.. code-block:: bash
+
+   $ sbatch job.sh
diff --git a/docs/examples/data/torchvision/data.py b/docs/examples/data/torchvision/data.py
new file mode 100644
index 00000000..a43129c4
--- /dev/null
+++ b/docs/examples/data/torchvision/data.py
@@ -0,0 +1,12 @@
+"""Make sure the data is available"""
+import sys
+import time
+
+from torchvision.datasets import INaturalist
+
+
+t = -time.time()
+INaturalist(root=sys.argv[1], version="2021_train", download=True)
+INaturalist(root=sys.argv[1], version="2021_valid", download=True)
+t += time.time()
+print(f"Prepared data in {t/60:.2f}m")
diff --git a/docs/examples/data/torchvision/data.sh b/docs/examples/data/torchvision/data.sh
new file mode 100644
index 00000000..981a7f73
--- /dev/null
+++ b/docs/examples/data/torchvision/data.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+set -o errexit
+
+_SRC=$1
+_DEST=$2
+_WORKERS=$3
+
+# Clone the dataset structure locally and reorganise the raw files if needed
+(cd "${_SRC}" && find -L * -type f) | while read f
+do
+	mkdir --parents "${_DEST}/$(dirname "$f")"
+	# echo source first so it is matched to the ln's '-T' argument
+	readlink --canonicalize "${_SRC}/$f"
+	# echo output last so ln understands it's the output file
+	echo "${_DEST}/$f"
+done | xargs -n2 -P${_WORKERS} ln --symbolic --force -T
+
+(
+	cd "${_DEST}"
+	# Torchvision expects these names
+	mv train.tar.gz 2021_train.tgz
+	mv val.tar.gz 2021_valid.tgz
+)
+
+# Extract and prepare the data
+python3 data.py "${_DEST}"
diff --git a/docs/examples/data/torchvision/job.sh b/docs/examples/data/torchvision/job.sh
new file mode 100644
index 00000000..5423e372
--- /dev/null
+++ b/docs/examples/data/torchvision/job.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+#SBATCH --gpus-per-task=rtx8000:1
+#SBATCH --cpus-per-task=4
+#SBATCH --ntasks-per-node=1
+#SBATCH --mem=16G
+#SBATCH --time=01:30:00
+set -o errexit
+
+
+# Echo time and hostname into log
+echo "Date:     $(date)"
+echo "Hostname: $(hostname)"
+
+
+# Ensure only anaconda/3 module loaded.
+module purge
+# This example uses Conda to manage package dependencies.
+# See https://docs.mila.quebec/Userguide.html#conda for more information.
+module load anaconda/3
+
+
+# Creating the environment for the first time:
+# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
+#     pytorch-cuda=11.6 scipy -c pytorch -c nvidia
+# Other conda packages:
+# conda install -y -n pytorch -c conda-forge rich tqdm
+
+# Activate pre-existing environment.
+conda activate pytorch
+
+
+# Prepare data for training
+mkdir -p "$SLURM_TMPDIR/data"
+
+if [[ -z "${_DATA_PREP_WORKERS}" ]]
+then
+	_DATA_PREP_WORKERS=${SLURM_CPUS_PER_TASK}  # plain integer; SLURM_JOB_CPUS_PER_NODE can be "4(x2)"
+fi
+if [[ -z "${_DATA_PREP_WORKERS}" ]]
+then
+	_DATA_PREP_WORKERS=16
+fi
+
+# Copy the dataset to $SLURM_TMPDIR so it is close to the GPUs for
+# faster training
+srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \
+	time -p bash data.sh "/network/datasets/inat" "$SLURM_TMPDIR/data" ${_DATA_PREP_WORKERS}
+
+# Execute Python script
+python main.py
diff --git a/docs/examples/data/torchvision/main.py b/docs/examples/data/torchvision/main.py
new file mode 100644
index 00000000..015394e0
--- /dev/null
+++ b/docs/examples/data/torchvision/main.py
@@ -0,0 +1,187 @@
+"""Torchvision training example."""
+import logging
+import os
+
+import rich.logging
+import torch
+from torch import Tensor, nn
+from torch.nn import functional as F
+from torch.utils.data import DataLoader, random_split
+from torchvision import transforms
+from torchvision.datasets import INaturalist
+from torchvision.models import resnet18
+from tqdm import tqdm
+
+
+def main():
+    training_epochs = 1
+    learning_rate = 5e-4
+    weight_decay = 1e-4
+    batch_size = 256
+
+    # Check that the GPU is available
+    assert torch.cuda.is_available() and torch.cuda.device_count() > 0
+    device = torch.device("cuda", 0)
+
+    # Setup logging (optional, but much better than using print statements)
+    logging.basicConfig(
+        level=logging.INFO,
+        handlers=[rich.logging.RichHandler(markup=True)],  # Very pretty, uses the `rich` package.
+    )
+
+    logger = logging.getLogger(__name__)
+
+    # Create a model and move it to the GPU.
+    model = resnet18(num_classes=10000)
+    model.to(device=device)
+
+    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
+
+    # Setup iNaturalist
+    num_workers = get_num_workers()
+    try:
+        dataset_path = f"{os.environ['SLURM_TMPDIR']}/data"
+    except KeyError:
+        dataset_path = "../dataset"
+    train_dataset, valid_dataset, test_dataset = make_datasets(dataset_path)
+    train_dataloader = DataLoader(
+        train_dataset,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        shuffle=True,
+    )
+    valid_dataloader = DataLoader(
+        valid_dataset,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        shuffle=False,
+    )
+    test_dataloader = DataLoader(  # NOTE: Not used in this example.
+        test_dataset,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        shuffle=False,
+    )
+
+    # Checkout the "checkpointing and preemption" example for more info!
+    logger.debug("Starting training from scratch.")
+
+    for epoch in range(training_epochs):
+        logger.debug(f"Starting epoch {epoch}/{training_epochs}")
+
+        # Set the model in training mode (this is important for e.g. BatchNorm and Dropout layers)
+        model.train()
+
+        # NOTE: using a progress bar from tqdm because it's nicer than using `print`.
+        progress_bar = tqdm(
+            total=len(train_dataloader),
+            desc=f"Train epoch {epoch}",
+        )
+
+        # Training loop
+        for batch in train_dataloader:
+            # Move the batch to the GPU before we pass it to the model
+            batch = tuple(item.to(device) for item in batch)
+            x, y = batch
+
+            # Forward pass
+            logits: Tensor = model(x)
+
+            loss = F.cross_entropy(logits, y)
+
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+            # Calculate some metrics:
+            n_correct_predictions = logits.detach().argmax(-1).eq(y).sum()
+            n_samples = y.shape[0]
+            accuracy = n_correct_predictions / n_samples
+
+            logger.debug(f"Accuracy: {accuracy.item():.2%}")
+            logger.debug(f"Average Loss: {loss.item()}")
+
+            # Advance the progress bar one step, and update its "postfix" (nicer than just printing).
+            progress_bar.update(1)
+            progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item())
+        progress_bar.close()
+
+        val_loss, val_accuracy = validation_loop(model, valid_dataloader, device)
+        logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}")
+
+    print("Done!")
+
+
+@torch.no_grad()
+def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device):
+    model.eval()
+
+    total_loss = 0.0
+    n_samples = 0
+    correct_predictions = 0
+
+    for batch in dataloader:
+        batch = tuple(item.to(device) for item in batch)
+        x, y = batch
+
+        logits: Tensor = model(x)
+        loss = F.cross_entropy(logits, y)
+
+        batch_n_samples = x.shape[0]
+        batch_correct_predictions = logits.argmax(-1).eq(y).sum()
+
+        total_loss += loss.item()
+        n_samples += batch_n_samples
+        correct_predictions += batch_correct_predictions
+
+    accuracy = correct_predictions / n_samples
+    return total_loss, accuracy
+
+
+def make_datasets(
+    dataset_path: str,
+    val_split: float = 0.1,
+    val_split_seed: int = 42,
+):
+    """Returns the training, validation, and test splits for iNaturalist.
+
+    NOTE: We use the same image transforms for every split here for simplicity.
+    Having different transformations for train and validation would complicate things a bit.
+    Later examples will show how to do the train/val/test split properly when using transforms.
+    """
+    train_dataset = INaturalist(
+        root=dataset_path,
+        transform=transforms.Compose([
+            transforms.Resize(256),
+            transforms.CenterCrop(224),
+            transforms.ToTensor(),
+        ]),
+        version="2021_train"
+    )
+    test_dataset = INaturalist(
+        root=dataset_path,
+        transform=transforms.Compose([
+            transforms.Resize(256),
+            transforms.CenterCrop(224),
+            transforms.ToTensor(),
+        ]),
+        version="2021_valid"
+    )
+    # Split the training dataset into a training and validation set.
+    train_dataset, valid_dataset = random_split(
+        train_dataset, ((1 - val_split), val_split), torch.Generator().manual_seed(val_split_seed)
+    )
+    return train_dataset, valid_dataset, test_dataset
+
+
+def get_num_workers() -> int:
+    """Gets the optimal number of DataLoader workers to use in the current job."""
+    if "SLURM_CPUS_PER_TASK" in os.environ:
+        return int(os.environ["SLURM_CPUS_PER_TASK"])
+    if hasattr(os, "sched_getaffinity"):
+        return len(os.sched_getaffinity(0))
+    return torch.multiprocessing.cpu_count()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/examples/distributed/001_single_gpu/README.rst b/docs/examples/distributed/001_single_gpu/README.rst
new file mode 100644
index 00000000..5ddeafbb
--- /dev/null
+++ b/docs/examples/distributed/001_single_gpu/README.rst
@@ -0,0 +1,30 @@
+001 - Single GPU Job
+====================
+
+
+**Prerequisites**
+
+Make sure to read the following sections of the documentation before using this example:
+
+* :ref:`pytorch_setup`
+
+The full source code for this example is available on `the mila-docs GitHub repository. <https://github.com/mila-iqia/mila-docs/tree/master/docs/examples/distributed/001_single_gpu>`_
+
+**job.sh**
+
+.. literalinclude:: examples/distributed/001_single_gpu/job.sh
+    :language: bash
+
+
+**main.py**
+
+.. literalinclude:: examples/distributed/001_single_gpu/main.py
+    :language: python
+
+
+**Running this example**
+
+
+.. code-block:: bash
+
+    $ sbatch job.sh
diff --git a/docs/examples/distributed/001_single_gpu/job.sh b/docs/examples/distributed/001_single_gpu/job.sh
new file mode 100644
index 00000000..6dd819bb
--- /dev/null
+++ b/docs/examples/distributed/001_single_gpu/job.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+#SBATCH --gpus-per-task=rtx8000:1
+#SBATCH --cpus-per-task=4
+#SBATCH --ntasks-per-node=1
+#SBATCH --mem=16G
+#SBATCH --time=00:15:00
+
+
+# Echo time and hostname into log
+echo "Date:     $(date)"
+echo "Hostname: $(hostname)"
+
+
+# Ensure only anaconda/3 module loaded.
+module purge
+# This example uses Conda to manage package dependencies.
+# See https://docs.mila.quebec/Userguide.html#conda for more information.
+module load anaconda/3
+
+# Creating the environment for the first time:
+# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
+#     pytorch-cuda=11.6 -c pytorch -c nvidia
+# Other conda packages:
+# conda install -y -n pytorch -c conda-forge rich
+
+# Activate pre-existing environment.
+conda activate pytorch
+
+
+# Stage dataset into $SLURM_TMPDIR
+cp -a /network/datasets/cifar10.var/cifar10_torchvision $SLURM_TMPDIR
+
+
+# Execute Python script
+python main.py
diff --git a/docs/examples/distributed/001_single_gpu/main.py b/docs/examples/distributed/001_single_gpu/main.py
new file mode 100644
index 00000000..f859e9f8
--- /dev/null
+++ b/docs/examples/distributed/001_single_gpu/main.py
@@ -0,0 +1,172 @@
+"""Single-GPU training example."""
+import logging
+import os
+
+import rich.logging
+import torch
+from torch import Tensor, nn
+from torch.nn import functional as F
+from torch.utils.data import DataLoader, random_split
+from torchvision import transforms
+from torchvision.datasets import CIFAR10
+from torchvision.models import resnet18
+from tqdm import tqdm
+
+
+def main():
+    training_epochs = 10
+    learning_rate = 5e-4
+    weight_decay = 1e-4
+    batch_size = 128
+
+    # Check that the GPU is available
+    assert torch.cuda.is_available() and torch.cuda.device_count() > 0
+    device = torch.device("cuda", 0)
+
+    # Setup logging (optional, but much better than using print statements)
+    logging.basicConfig(
+        level=logging.INFO,
+        handlers=[rich.logging.RichHandler(markup=True)],  # Very pretty, uses the `rich` package.
+    )
+
+    logger = logging.getLogger(__name__)
+
+    # Create a model and move it to the GPU.
+    model = resnet18(num_classes=10)
+    model.to(device=device)
+
+    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
+
+    # Setup CIFAR10
+    num_workers = get_num_workers()
+    dataset_path = os.environ.get("SLURM_TMPDIR", "../dataset")
+    train_dataset, valid_dataset, test_dataset = make_datasets(dataset_path)
+    train_dataloader = DataLoader(
+        train_dataset,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        shuffle=True,
+    )
+    valid_dataloader = DataLoader(
+        valid_dataset,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        shuffle=False,
+    )
+    test_dataloader = DataLoader(  # NOTE: Not used in this example.
+        test_dataset,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        shuffle=False,
+    )
+
+    # Checkout the "checkpointing and preemption" example for more info!
+    logger.debug("Starting training from scratch.")
+
+    for epoch in range(training_epochs):
+        logger.debug(f"Starting epoch {epoch}/{training_epochs}")
+
+        # Set the model in training mode (this is important for e.g. BatchNorm and Dropout layers)
+        model.train()
+
+        # NOTE: using a progress bar from tqdm because it's nicer than using `print`.
+        progress_bar = tqdm(
+            total=len(train_dataloader),
+            desc=f"Train epoch {epoch}",
+        )
+
+        # Training loop
+        for batch in train_dataloader:
+            # Move the batch to the GPU before we pass it to the model
+            batch = tuple(item.to(device) for item in batch)
+            x, y = batch
+
+            # Forward pass
+            logits: Tensor = model(x)
+
+            loss = F.cross_entropy(logits, y)
+
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+            # Calculate some metrics:
+            n_correct_predictions = logits.detach().argmax(-1).eq(y).sum()
+            n_samples = y.shape[0]
+            accuracy = n_correct_predictions / n_samples
+
+            logger.debug(f"Accuracy: {accuracy.item():.2%}")
+            logger.debug(f"Average Loss: {loss.item()}")
+
+            # Advance the progress bar one step, and update its "postfix" (nicer than just printing).
+            progress_bar.update(1)
+            progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item())
+        progress_bar.close()
+
+        val_loss, val_accuracy = validation_loop(model, valid_dataloader, device)
+        logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}")
+
+    print("Done!")
+
+
+@torch.no_grad()
+def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device):
+    model.eval()
+
+    total_loss = 0.0
+    n_samples = 0
+    correct_predictions = 0
+
+    for batch in dataloader:
+        batch = tuple(item.to(device) for item in batch)
+        x, y = batch
+
+        logits: Tensor = model(x)
+        loss = F.cross_entropy(logits, y)
+
+        batch_n_samples = x.shape[0]
+        batch_correct_predictions = logits.argmax(-1).eq(y).sum()
+
+        total_loss += loss.item()
+        n_samples += batch_n_samples
+        correct_predictions += batch_correct_predictions
+
+    accuracy = correct_predictions / n_samples
+    return total_loss, accuracy
+
+
+def make_datasets(
+    dataset_path: str,
+    val_split: float = 0.1,
+    val_split_seed: int = 42,
+):
+    """Returns the training, validation, and test splits for CIFAR10.
+
+    NOTE: We don't use image transforms here for simplicity.
+    Having different transformations for train and validation would complicate things a bit.
+    Later examples will show how to do the train/val/test split properly when using transforms.
+    """
+    train_dataset = CIFAR10(
+        root=dataset_path, transform=transforms.ToTensor(), download=True, train=True
+    )
+    test_dataset = CIFAR10(
+        root=dataset_path, transform=transforms.ToTensor(), download=True, train=False
+    )
+    # Split the training dataset into a training and validation set.
+    train_dataset, valid_dataset = random_split(
+        train_dataset, ((1 - val_split), val_split), torch.Generator().manual_seed(val_split_seed)
+    )
+    return train_dataset, valid_dataset, test_dataset
+
+
+def get_num_workers() -> int:
+    """Gets the optimal number of DataLoader workers to use in the current job."""
+    if "SLURM_CPUS_PER_TASK" in os.environ:
+        return int(os.environ["SLURM_CPUS_PER_TASK"])
+    if hasattr(os, "sched_getaffinity"):
+        return len(os.sched_getaffinity(0))
+    return torch.multiprocessing.cpu_count()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/examples/distributed/README.rst b/docs/examples/distributed/README.rst
new file mode 100644
index 00000000..c6e5c7b4
--- /dev/null
+++ b/docs/examples/distributed/README.rst
@@ -0,0 +1,6 @@
+********************
+Distributed Training
+********************
+
+
+.. include:: /examples/distributed/001_single_gpu/README.rst
diff --git a/docs/examples/frameworks/pytorch_setup/README.rst b/docs/examples/frameworks/pytorch_setup/README.rst
index 3be1c08b..f69a1921 100644
--- a/docs/examples/frameworks/pytorch_setup/README.rst
+++ b/docs/examples/frameworks/pytorch_setup/README.rst
@@ -1,3 +1,5 @@
+.. _pytorch_setup:
+
 PyTorch Setup
 ===================
 
diff --git a/docs/examples/frameworks/pytorch_setup/job.sh b/docs/examples/frameworks/pytorch_setup/job.sh
index db126819..6f50e07d 100644
--- a/docs/examples/frameworks/pytorch_setup/job.sh
+++ b/docs/examples/frameworks/pytorch_setup/job.sh
@@ -17,6 +17,8 @@ module load anaconda/3
 # Creating the environment for the first time:
 # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
 #     pytorch-cuda=11.6 -c pytorch -c nvidia
+# Other conda packages:
+# conda install -y -n pytorch -c conda-forge rich
 
 # Activate the environment:
 conda activate pytorch
diff --git a/docs/examples/generate_diffs.sh b/docs/examples/generate_diffs.sh
new file mode 100755
index 00000000..24e3c767
--- /dev/null
+++ b/docs/examples/generate_diffs.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+# Use this to update the diffs based on the contents of the files.
+
+pushd `dirname "${BASH_SOURCE[0]}"` >/dev/null
+_SCRIPT_DIR=`pwd -P`
+popd >/dev/null
+
+set -e
+
+generate_diff() {
+    echo "Generating diff for docs/examples/$1 -> docs/examples/$2"
+    # NOTE: Paths are relative to the `docs/examples` folder (the script changes to it before calling this).
+
+    # Write a diff file to be shown in the documentation.
+    
+    echo "# $1 -> $2" > "$2.diff"
+    git diff --no-index -U9999 \
+        "$1" \
+        "$2" \
+        | grep -Ev "^--- |^\+\+\+ |^@@ |^index |^diff --git" \
+        >> "$2.diff"
+}
+
+pushd "${_SCRIPT_DIR}" >/dev/null
+
+# single_gpu -> huggingface
+generate_diff distributed/001_single_gpu/job.sh data/hf/job.sh
+generate_diff distributed/001_single_gpu/main.py data/hf/main.py
+
+# single_gpu -> torchvision
+generate_diff distributed/001_single_gpu/job.sh data/torchvision/job.sh
+generate_diff distributed/001_single_gpu/main.py data/torchvision/main.py
+
+popd >/dev/null