diff --git a/.gitignore b/.gitignore
index d20faa42..08d78b4e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@
 _build
 .idea
 **/__pycache__
+/docs/examples/**/*.diff
diff --git a/docs/Minimal_examples.rst b/docs/Minimal_examples.rst
index 67556a79..f0792ba0 100644
--- a/docs/Minimal_examples.rst
+++ b/docs/Minimal_examples.rst
@@ -1,6 +1,8 @@
-.. ***************************
+.. ****************
 .. Minimal Examples
-.. ***************************
+.. ****************
 
 
-.. include:: examples/frameworks/README.rst
+.. include:: examples/frameworks/index.rst
+.. include:: examples/distributed/index.rst
+.. include:: examples/data/index.rst
diff --git a/docs/conf.py b/docs/conf.py
index df42510e..e89f8769 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -3,7 +3,8 @@
 from __future__ import division, print_function, unicode_literals
 
 from datetime import datetime
-
+from pathlib import Path
+import subprocess
 import sphinx_theme
 
 extensions = [
@@ -90,5 +91,28 @@
 # Include CNAME file so GitHub Pages can set Custom Domain name
 html_extra_path = ['CNAME']
 
+# Pre-build step 1: regenerate the examples' diff files (argv list: no shell quoting issues).
+docs_root = Path(__file__).parent
+file_dir = docs_root / "examples/generate_diffs.sh"
+try:
+    _proc = subprocess.run([str(file_dir)], capture_output=True, check=True)
+except subprocess.CalledProcessError as err:
+    raise RuntimeError(
+        "Could not build the diff files for the examples:\n"
+        + str(err.output, encoding="utf-8")
+        + str(err.stderr, encoding="utf-8")
+    ) from err
+# Pre-build step 2: run examples/preprocess.py to generate the GitHub READMEs.
+pyfile = docs_root / "examples/preprocess.py"
+try:
+    _proc = subprocess.run(["python3", str(pyfile)], capture_output=True, check=True)
+except subprocess.CalledProcessError as err:
+    raise RuntimeError(
+        "Could not generate github README's:\n"
+        + str(err.output, encoding="utf-8")
+        + str(err.stderr, encoding="utf-8")
+    ) from err
+
+
 def setup(app):
     app.add_css_file('custom.css')
diff --git a/docs/examples/data/hf/README.rst b/docs/examples/data/hf/README.rst
new file mode 100644
index 00000000..2d59c729
--- /dev/null
+++ b/docs/examples/data/hf/README.rst
@@ -0,0 +1,420 @@
+Hugging Face Dataset
+====================
+
+
+**Prerequisites**
+
+Make sure to read the following sections of the documentation before using this example:
+
+* :ref:`pytorch_setup`
+* :ref:`001 - Single GPU Job`
+
+The full source code for this example is available on `the mila-docs GitHub repository. <https://github.com/mila-iqia/mila-docs/tree/master/docs/examples/data/hf>`_
+
+
+**job.sh**
+
+.. code:: diff
+
+    # distributed/001_single_gpu/job.sh -> data/hf/job.sh
+    #!/bin/bash
+    #SBATCH --gpus-per-task=rtx8000:1
+    #SBATCH --cpus-per-task=4
+    #SBATCH --ntasks-per-node=1
+   -#SBATCH --mem=16G
+   -#SBATCH --time=00:15:00
+   +#SBATCH --mem=24G
+   +#SBATCH --time=02:00:00
+   +#SBATCH --tmp=1500G
+   +set -o errexit
+   +
+   +function wrap_cmd {
+   +	for a in "$@"
+   +	do
+   +		echo -n \"$a\" ""
+   +	done
+   +}
+    
+    
+    # Echo time and hostname into log
+    echo "Date:     $(date)"
+    echo "Hostname: $(hostname)"
+    
+    
+    # Ensure only anaconda/3 module loaded.
+    module purge
+    # This example uses Conda to manage package dependencies.
+    # See https://docs.mila.quebec/Userguide.html#conda for more information.
+    module load anaconda/3
+    
+   +
+    # Creating the environment for the first time:
+    # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
+   -#     pytorch-cuda=11.6 -c pytorch -c nvidia
+   +#     pytorch-cuda=11.6 scipy -c pytorch -c nvidia
+    # Other conda packages:
+   -# conda install -y -n pytorch -c conda-forge rich
+   +# conda install -y -n pytorch -c conda-forge rich tqdm datasets
+    
+    # Activate pre-existing environment.
+    conda activate pytorch
+    
+    
+   -# Stage dataset into $SLURM_TMPDIR
+   -cp -a /network/datasets/cifar10.var/cifar10_torchvision $SLURM_TMPDIR
+   +# Prepare data for training
+   +mkdir -p "$SLURM_TMPDIR/data"
+   +
+   +if [[ -z "${HF_DATASETS_CACHE}" ]]
+   +then
+   +	# Store the huggingface datasets cache in $SCRATCH
+   +	export HF_DATASETS_CACHE=$SCRATCH/cache/huggingface/datasets
+   +fi
+   +if [[ -z "${_DATA_PREP_WORKERS}" ]]
+   +then
+   +	_DATA_PREP_WORKERS=${SLURM_JOB_CPUS_PER_NODE}
+   +fi
+   +if [[ -z "${_DATA_PREP_WORKERS}" ]]
+   +then
+   +	_DATA_PREP_WORKERS=16
+   +fi
+   +
+   +# Preprocess the dataset and cache the result such that the heavy work is done
+   +# only once *ever*
+   +# Required conda packages:
+   +# conda install -y -c conda-forge zstandard
+   +srun --ntasks=1 --ntasks-per-node=1 \
+   +	time -p python3 prepare_data.py "/network/datasets/pile" ${_DATA_PREP_WORKERS}
+   +
+   +# Copy the preprocessed dataset to $SLURM_TMPDIR so it is close to the GPUs for
+   +# faster training. This should be done once per compute node
+   +cmd=(
+   +	# Having 'bash' here allows the execution of a script file which might not
+   +	# have the execution flag on
+   +	bash
+   +	cp_data.sh
+   +	# Get the current dataset cache
+   +	"$(python3 get_dataset_cache_dir.py)"
+   +	# Get the local dataset cache
+   +	# Use '' to delay the execution of the command as $SLURM_TMPDIR needs to be
+   +	# expanded on the local compute node rather than the master node
+   +	'$(python3 get_dataset_cache_dir.py "$SLURM_TMPDIR/data")'
+   +	${_DATA_PREP_WORKERS}
+   +)
+   +# 'time' will objectively give a measure for the copy of the dataset. This can
+   +# be used to compare the timing of multiple attempts in optimizing code and make
+   +# sure any slow down doesn't come from the code itself
+   +srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \
+   +	time -p bash -c "$(wrap_cmd "${cmd[@]}")"
+   +
+   +# Use the local copy of the preprocessed dataset
+   +export HF_DATASETS_CACHE="$SLURM_TMPDIR/data"
+    
+    
+    # Execute Python script
+    python main.py
+   
+
+**main.py**
+
+.. code:: diff
+
+    # distributed/001_single_gpu/main.py -> data/hf/main.py
+   -"""Single-GPU training example."""
+   +"""Hugging Face dataset training example."""
+    import logging
+    import os
+    
+   +import datasets
+    import rich.logging
+    import torch
+    from torch import Tensor, nn
+    from torch.nn import functional as F
+   -from torch.utils.data import DataLoader, random_split
+   -from torchvision import transforms
+   -from torchvision.datasets import CIFAR10
+   +from torch.utils.data import DataLoader
+    from torchvision.models import resnet18
+    from tqdm import tqdm
+    
+    
+    def main():
+   -    training_epochs = 10
+   +    training_epochs = 1
+        learning_rate = 5e-4
+        weight_decay = 1e-4
+   -    batch_size = 128
+   +    batch_size = 256
+    
+        # Check that the GPU is available
+        assert torch.cuda.is_available() and torch.cuda.device_count() > 0
+        device = torch.device("cuda", 0)
+    
+        # Setup logging (optional, but much better than using print statements)
+        logging.basicConfig(
+            level=logging.INFO,
+            handlers=[rich.logging.RichHandler(markup=True)],  # Very pretty, uses the `rich` package.
+        )
+    
+        logger = logging.getLogger(__name__)
+    
+        # Create a model and move it to the GPU.
+   -    model = resnet18(num_classes=10)
+   +    model = resnet18()
+        model.to(device=device)
+    
+        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
+    
+   -    # Setup CIFAR10
+   +    # Setup the Pile dataset
+        num_workers = get_num_workers()
+   -    dataset_path = os.environ.get("SLURM_TMPDIR", "../dataset")
+   +    dataset_path = "the_pile"
+        train_dataset, valid_dataset, test_dataset = make_datasets(dataset_path)
+        train_dataloader = DataLoader(
+            train_dataset,
+            batch_size=batch_size,
+            num_workers=num_workers,
+            shuffle=True,
+        )
+        valid_dataloader = DataLoader(
+            valid_dataset,
+            batch_size=batch_size,
+            num_workers=num_workers,
+            shuffle=False,
+        )
+        test_dataloader = DataLoader(  # NOTE: Not used in this example.
+            test_dataset,
+            batch_size=batch_size,
+            num_workers=num_workers,
+            shuffle=False,
+        )
+    
+        # Checkout the "checkpointing and preemption" example for more info!
+        logger.debug("Starting training from scratch.")
+    
+        for epoch in range(training_epochs):
+            logger.debug(f"Starting epoch {epoch}/{training_epochs}")
+    
+            # Set the model in training mode (this is important for e.g. BatchNorm and Dropout layers)
+            model.train()
+    
+            # NOTE: using a progress bar from tqdm because it's nicer than using `print`.
+            progress_bar = tqdm(
+                total=len(train_dataloader),
+                desc=f"Train epoch {epoch}",
+            )
+    
+            # Training loop
+            for batch in train_dataloader:
+                # Move the batch to the GPU before we pass it to the model
+                batch = tuple(item.to(device) for item in batch)
+   -            x, y = batch
+    
+   -            # Forward pass
+   -            logits: Tensor = model(x)
+   -
+   -            loss = F.cross_entropy(logits, y)
+   -
+   -            optimizer.zero_grad()
+   -            loss.backward()
+   -            optimizer.step()
+   -
+   -            # Calculate some metrics:
+   -            n_correct_predictions = logits.detach().argmax(-1).eq(y).sum()
+   -            n_samples = y.shape[0]
+   -            accuracy = n_correct_predictions / n_samples
+   -
+   -            logger.debug(f"Accuracy: {accuracy.item():.2%}")
+   -            logger.debug(f"Average Loss: {loss.item()}")
+   +            # [Training of the model goes here]
+    
+                # Advance the progress bar one step, and update the "postfix" () the progress bar. (nicer than just)
+                progress_bar.update(1)
+   -            progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item())
+            progress_bar.close()
+    
+            val_loss, val_accuracy = validation_loop(model, valid_dataloader, device)
+            logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}")
+    
+        print("Done!")
+    
+    
+    @torch.no_grad()
+    def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device):
+        model.eval()
+    
+        total_loss = 0.0
+        n_samples = 0
+        correct_predictions = 0
+    
+        for batch in dataloader:
+            batch = tuple(item.to(device) for item in batch)
+            x, y = batch
+    
+            logits: Tensor = model(x)
+            loss = F.cross_entropy(logits, y)
+    
+            batch_n_samples = x.shape[0]
+            batch_correct_predictions = logits.argmax(-1).eq(y).sum()
+    
+            total_loss += loss.item()
+            n_samples += batch_n_samples
+            correct_predictions += batch_correct_predictions
+    
+        accuracy = correct_predictions / n_samples
+        return total_loss, accuracy
+    
+    
+   -def make_datasets(
+   -    dataset_path: str,
+   -    val_split: float = 0.1,
+   -    val_split_seed: int = 42,
+   -):
+   -    """Returns the training, validation, and test splits for CIFAR10.
+   +def make_datasets(dataset_path: str):
+   +    """Returns the training, validation, and test splits for the Pile.
+    
+   -    NOTE: We don't use image transforms here for simplicity.
+   +    NOTE: We don't use transforms here for simplicity.
+        Having different transformations for train and validation would complicate things a bit.
+        Later examples will show how to do the train/val/test split properly when using transforms.
+        """
+   -    train_dataset = CIFAR10(
+   -        root=dataset_path, transform=transforms.ToTensor(), download=True, train=True
+   -    )
+   -    test_dataset = CIFAR10(
+   -        root=dataset_path, transform=transforms.ToTensor(), download=True, train=False
+   -    )
+   -    # Split the training dataset into a training and validation set.
+   -    train_dataset, valid_dataset = random_split(
+   -        train_dataset, ((1 - val_split), val_split), torch.Generator().manual_seed(val_split_seed)
+   -    )
+   +    builder = datasets.load_dataset_builder(dataset_path, subsets=["all"], version="0.0.0")
+   +    train_dataset = builder.as_dataset(split="train").with_format("torch")
+   +    valid_dataset = builder.as_dataset(split="validation").with_format("torch")
+   +    test_dataset = builder.as_dataset(split="test").with_format("torch")
+        return train_dataset, valid_dataset, test_dataset
+    
+    
+    def get_num_workers() -> int:
+        """Gets the optimal number of DatLoader workers to use in the current job."""
+        if "SLURM_CPUS_PER_TASK" in os.environ:
+            return int(os.environ["SLURM_CPUS_PER_TASK"])
+        if hasattr(os, "sched_getaffinity"):
+            return len(os.sched_getaffinity(0))
+        return torch.multiprocessing.cpu_count()
+    
+    
+    if __name__ == "__main__":
+        main()
+   
+
+**prepare_data.py**
+
+.. code:: python
+
+   """Preprocess the dataset.
+   In this example, HuggingFace is used and the resulting dataset will be stored in
+   $HF_DATASETS_CACHE. It is preferable to set the datasets cache to a location in
+   $SCRATCH"""
+   import sys
+   import time
+   
+   import datasets
+   
+   
+   _LOCAL_DS = sys.argv[1]
+   _LOCAL_DS_SPLITS = _LOCAL_DS.split("/")
+   try:
+       _WORKERS = int(sys.argv[2])
+   except IndexError:
+       _WORKERS = 16
+   
+   dl_config = datasets.DownloadConfig(cache_dir=_LOCAL_DS)
+   
+   # 'datasets' does not allow to use a local storage for the datasets' files using
+   # its exposed API. Mocking the download func to force the usage of the local files
+   dl_man = datasets.DownloadManager(download_config=dl_config)
+   def dl(url_or_urls, *args, **kwargs):
+       """Download wrapper swapping urls for matching files found under _LOCAL_DS."""
+       import glob
+       local_files = sorted("/".join(_f.split("/")[len(_LOCAL_DS_SPLITS):])
+                            for _f in glob.glob(f"{_LOCAL_DS}/**", recursive=True))
+       if isinstance(url_or_urls, str):
+           url_or_urls = [url_or_urls]
+   
+       # Replace all urls by local files if they can be found; a bare list is handled as one group
+       for v in (url_or_urls.values() if isinstance(url_or_urls, dict) else [url_or_urls]):
+           for i, url in enumerate(v):
+               for lf in local_files:
+                   if lf and url.endswith(lf):
+                       v[i] = f"{_LOCAL_DS}/{lf}"
+                       local_files.remove(lf)
+                       break
+   
+       # Continue normal download process which should only checksum the local
+       # files instead of downloading them
+       return _download(url_or_urls, *args, **kwargs)
+   
+   _download = dl_man.download
+   dl_man.download = dl
+   builder = datasets.load_dataset_builder("the_pile", download_config=dl_config, subsets=["all"], version="0.0.0")
+   
+   t = -time.time()
+   builder.download_and_prepare(dl_manager=dl_man, num_proc=_WORKERS)
+   t += time.time()
+   
+   print(f"Prepared data in {t/60:.2f}m")
+   
+
+**get_dataset_cache_dir.py**
+
+.. code:: python
+
+   """Print to stdout the cache directory of the dataset"""
+   import sys
+   
+   import datasets
+   
+   
+   # Redirect outputs to stderr to avoid noise in stdout
+   _stdout = sys.stdout
+   sys.stdout = sys.stderr
+   
+   try:
+       _CACHE_DIR = sys.argv[1]
+   except IndexError:
+       _CACHE_DIR = None
+   
+   builder = datasets.load_dataset_builder("the_pile", cache_dir=_CACHE_DIR, subsets=["all"], version="0.0.0")
+   print(builder.cache_dir, file=_stdout)
+   
+
+**cp_data.sh**
+
+.. code:: bash
+
+   #!/bin/bash
+   set -o errexit
+   
+   _SRC=$1
+   _DEST=$2
+   _WORKERS=$3
+   
+   # Copy the dataset. Names are NUL-delimited end to end so that paths containing
+   # whitespace, quotes or backslashes reach cp unmodified
+   (cd "${_SRC}" && find -L * -type f -print0) | while IFS= read -r -d '' f
+   do
+   	mkdir --parents "${_DEST}/$(dirname "$f")"
+   	# source first, destination last: xargs hands each pair to 'cp -T SRC DEST'
+   	readlink --canonicalize --zero "${_SRC}/$f"
+   	printf '%s\0' "${_DEST}/$f"
+   done | xargs -0 -n2 -P${_WORKERS} cp --update -T
+   
+
+**Running this example**
+
+.. code-block:: bash
+
+   $ sbatch job.sh
diff --git a/docs/examples/data/hf/_index.rst b/docs/examples/data/hf/_index.rst
new file mode 100644
index 00000000..b907baf3
--- /dev/null
+++ b/docs/examples/data/hf/_index.rst
@@ -0,0 +1,49 @@
+Hugging Face Dataset
+====================
+
+
+**Prerequisites**
+
+Make sure to read the following sections of the documentation before using this example:
+
+* :ref:`pytorch_setup`
+* :ref:`001 - Single GPU Job`
+
+The full source code for this example is available on `the mila-docs GitHub repository. <https://github.com/mila-iqia/mila-docs/tree/master/docs/examples/data/hf>`_
+
+
+**job.sh**
+
+.. literalinclude:: examples/data/hf/job.sh.diff
+   :language: diff
+
+
+**main.py**
+
+.. literalinclude:: examples/data/hf/main.py.diff
+   :language: diff
+
+
+**prepare_data.py**
+
+.. literalinclude:: examples/data/hf/prepare_data.py
+   :language: python
+
+
+**get_dataset_cache_dir.py**
+
+.. literalinclude:: examples/data/hf/get_dataset_cache_dir.py
+   :language: python
+
+
+**cp_data.sh**
+
+.. literalinclude:: examples/data/hf/cp_data.sh
+   :language: bash
+
+
+**Running this example**
+
+.. code-block:: bash
+
+   $ sbatch job.sh
diff --git a/docs/examples/data/hf/cp_data.sh b/docs/examples/data/hf/cp_data.sh
new file mode 100644
index 00000000..53d75a94
--- /dev/null
+++ b/docs/examples/data/hf/cp_data.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+set -o errexit
+
+_SRC=$1
+_DEST=$2
+_WORKERS=$3
+
+# Copy the dataset. Names are NUL-delimited end to end so that paths containing
+# whitespace, quotes or backslashes reach cp unmodified
+(cd "${_SRC}" && find -L * -type f -print0) | while IFS= read -r -d '' f
+do
+	mkdir --parents "${_DEST}/$(dirname "$f")"
+	# source first, destination last: xargs hands each pair to 'cp -T SRC DEST'
+	readlink --canonicalize --zero "${_SRC}/$f"
+	printf '%s\0' "${_DEST}/$f"
+done | xargs -0 -n2 -P${_WORKERS} cp --update -T
diff --git a/docs/examples/data/hf/get_dataset_cache_dir.py b/docs/examples/data/hf/get_dataset_cache_dir.py
new file mode 100644
index 00000000..9c5740d3
--- /dev/null
+++ b/docs/examples/data/hf/get_dataset_cache_dir.py
@@ -0,0 +1,17 @@
+"""Print to stdout the cache directory of the dataset"""
+import sys
+
+import datasets
+
+
+# Redirect outputs to stderr to avoid noise in stdout
+_stdout = sys.stdout
+sys.stdout = sys.stderr
+
+try:
+    _CACHE_DIR = sys.argv[1]
+except IndexError:
+    _CACHE_DIR = None
+
+builder = datasets.load_dataset_builder("the_pile", cache_dir=_CACHE_DIR, subsets=["all"], version="0.0.0")
+print(builder.cache_dir, file=_stdout)
diff --git a/docs/examples/data/hf/job.sh b/docs/examples/data/hf/job.sh
new file mode 100644
index 00000000..383b4b7a
--- /dev/null
+++ b/docs/examples/data/hf/job.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+#SBATCH --gpus-per-task=rtx8000:1
+#SBATCH --cpus-per-task=4
+#SBATCH --ntasks-per-node=1
+#SBATCH --mem=24G
+#SBATCH --time=02:00:00
+#SBATCH --tmp=1500G
+set -o errexit
+
+function wrap_cmd {
+	for a in "$@"
+	do
+		echo -n \"$a\" ""
+	done
+}
+
+
+# Echo time and hostname into log
+echo "Date:     $(date)"
+echo "Hostname: $(hostname)"
+
+
+# Ensure only anaconda/3 module loaded.
+module purge
+# This example uses Conda to manage package dependencies.
+# See https://docs.mila.quebec/Userguide.html#conda for more information.
+module load anaconda/3
+
+
+# Creating the environment for the first time:
+# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
+#     pytorch-cuda=11.6 scipy -c pytorch -c nvidia
+# Other conda packages:
+# conda install -y -n pytorch -c conda-forge rich tqdm datasets
+
+# Activate pre-existing environment.
+conda activate pytorch
+
+
+# Prepare data for training
+mkdir -p "$SLURM_TMPDIR/data"
+
+if [[ -z "${HF_DATASETS_CACHE}" ]]
+then
+	# Store the huggingface datasets cache in $SCRATCH
+	export HF_DATASETS_CACHE=$SCRATCH/cache/huggingface/datasets
+fi
+if [[ -z "${_DATA_PREP_WORKERS}" ]]
+then
+	_DATA_PREP_WORKERS=${SLURM_JOB_CPUS_PER_NODE}
+fi
+if [[ -z "${_DATA_PREP_WORKERS}" ]]
+then
+	_DATA_PREP_WORKERS=16
+fi
+
+# Preprocess the dataset and cache the result such that the heavy work is done
+# only once *ever*
+# Required conda packages:
+# conda install -y -c conda-forge zstandard
+srun --ntasks=1 --ntasks-per-node=1 \
+	time -p python3 prepare_data.py "/network/datasets/pile" ${_DATA_PREP_WORKERS}
+
+# Copy the preprocessed dataset to $SLURM_TMPDIR so it is close to the GPUs for
+# faster training. This should be done once per compute node
+cmd=(
+	# Having 'bash' here allows the execution of a script file which might not
+	# have the execution flag on
+	bash
+	cp_data.sh
+	# Get the current dataset cache
+	"$(python3 get_dataset_cache_dir.py)"
+	# Get the local dataset cache
+	# Use '' to delay the execution of the command as $SLURM_TMPDIR needs to be
+	# expanded on the local compute node rather than the master node
+	'$(python3 get_dataset_cache_dir.py "$SLURM_TMPDIR/data")'
+	${_DATA_PREP_WORKERS}
+)
+# 'time' will objectively give a measure for the copy of the dataset. This can
+# be used to compare the timing of multiple attempts in optimizing code and make
+# sure any slow down doesn't come from the code itself
+srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \
+	time -p bash -c "$(wrap_cmd "${cmd[@]}")"
+
+# Use the local copy of the preprocessed dataset
+export HF_DATASETS_CACHE="$SLURM_TMPDIR/data"
+
+
+# Execute Python script
+python main.py
diff --git a/docs/examples/data/hf/main.py b/docs/examples/data/hf/main.py
new file mode 100644
index 00000000..9edd35bf
--- /dev/null
+++ b/docs/examples/data/hf/main.py
@@ -0,0 +1,144 @@
+"""Hugging Face dataset training example."""
+import logging
+import os
+
+import datasets
+import rich.logging
+import torch
+from torch import Tensor, nn
+from torch.nn import functional as F
+from torch.utils.data import DataLoader
+from torchvision.models import resnet18
+from tqdm import tqdm
+
+
+def main():
+    training_epochs = 1
+    learning_rate = 5e-4
+    weight_decay = 1e-4
+    batch_size = 256
+
+    # Check that the GPU is available
+    assert torch.cuda.is_available() and torch.cuda.device_count() > 0
+    device = torch.device("cuda", 0)
+
+    # Setup logging (optional, but much better than using print statements)
+    logging.basicConfig(
+        level=logging.INFO,
+        handlers=[rich.logging.RichHandler(markup=True)],  # Very pretty, uses the `rich` package.
+    )
+
+    logger = logging.getLogger(__name__)
+
+    # Create a model and move it to the GPU.
+    model = resnet18()
+    model.to(device=device)
+
+    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
+
+    # Setup the Pile dataset
+    num_workers = get_num_workers()
+    dataset_path = "the_pile"
+    train_dataset, valid_dataset, test_dataset = make_datasets(dataset_path)
+    train_dataloader = DataLoader(
+        train_dataset,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        shuffle=True,
+    )
+    valid_dataloader = DataLoader(
+        valid_dataset,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        shuffle=False,
+    )
+    test_dataloader = DataLoader(  # NOTE: Not used in this example.
+        test_dataset,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        shuffle=False,
+    )
+
+    # Checkout the "checkpointing and preemption" example for more info!
+    logger.debug("Starting training from scratch.")
+
+    for epoch in range(training_epochs):
+        logger.debug(f"Starting epoch {epoch}/{training_epochs}")
+
+        # Set the model in training mode (this is important for e.g. BatchNorm and Dropout layers)
+        model.train()
+
+        # NOTE: using a progress bar from tqdm because it's nicer than using `print`.
+        progress_bar = tqdm(
+            total=len(train_dataloader),
+            desc=f"Train epoch {epoch}",
+        )
+
+        # Training loop
+        for batch in train_dataloader:
+            # Move the batch to the GPU before we pass it to the model
+            batch = tuple(item.to(device) for item in batch)
+
+            # [Training of the model goes here]
+
+            # Advance the progress bar one step.
+            progress_bar.update(1)
+        progress_bar.close()
+
+        val_loss, val_accuracy = validation_loop(model, valid_dataloader, device)
+        logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}")
+
+    print("Done!")
+
+
+@torch.no_grad()
+def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device):
+    model.eval()
+
+    total_loss = 0.0
+    n_samples = 0
+    correct_predictions = 0
+
+    for batch in dataloader:
+        batch = tuple(item.to(device) for item in batch)
+        x, y = batch
+
+        logits: Tensor = model(x)
+        loss = F.cross_entropy(logits, y)
+
+        batch_n_samples = x.shape[0]
+        batch_correct_predictions = logits.argmax(-1).eq(y).sum()
+
+        total_loss += loss.item()
+        n_samples += batch_n_samples
+        correct_predictions += batch_correct_predictions
+
+    accuracy = correct_predictions / n_samples
+    return total_loss, accuracy
+
+
+def make_datasets(dataset_path: str):
+    """Returns the training, validation, and test splits for the Pile.
+
+    NOTE: We don't use transforms here for simplicity.
+    Having different transformations for train and validation would complicate things a bit.
+    Later examples will show how to do the train/val/test split properly when using transforms.
+    """
+    builder = datasets.load_dataset_builder(dataset_path, subsets=["all"], version="0.0.0")
+    train_dataset = builder.as_dataset(split="train").with_format("torch")
+    valid_dataset = builder.as_dataset(split="validation").with_format("torch")
+    test_dataset = builder.as_dataset(split="test").with_format("torch")
+    return train_dataset, valid_dataset, test_dataset
+
+
+def get_num_workers() -> int:
+    """Gets the optimal number of DataLoader workers to use in the current job."""
+    if "SLURM_CPUS_PER_TASK" in os.environ:
+        return int(os.environ["SLURM_CPUS_PER_TASK"])
+    if hasattr(os, "sched_getaffinity"):
+        return len(os.sched_getaffinity(0))
+    return torch.multiprocessing.cpu_count()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/examples/data/hf/prepare_data.py b/docs/examples/data/hf/prepare_data.py
new file mode 100644
index 00000000..595539e1
--- /dev/null
+++ b/docs/examples/data/hf/prepare_data.py
@@ -0,0 +1,52 @@
+"""Preprocess the dataset.
+In this example, HuggingFace is used and the resulting dataset will be stored in
+$HF_DATASETS_CACHE. It is preferable to set the datasets cache to a location in
+$SCRATCH"""
+import sys
+import time
+
+import datasets
+
+
+_LOCAL_DS = sys.argv[1]
+_LOCAL_DS_SPLITS = _LOCAL_DS.split("/")
+try:
+    _WORKERS = int(sys.argv[2])
+except IndexError:
+    _WORKERS = 16
+
+dl_config = datasets.DownloadConfig(cache_dir=_LOCAL_DS)
+
+# 'datasets' does not allow to use a local storage for the datasets' files using
+# its exposed API. Mocking the download func to force the usage of the local files
+dl_man = datasets.DownloadManager(download_config=dl_config)
+def dl(url_or_urls, *args, **kwargs):
+    """Download wrapper swapping urls for matching files found under _LOCAL_DS."""
+    import glob
+    local_files = sorted("/".join(_f.split("/")[len(_LOCAL_DS_SPLITS):])
+                         for _f in glob.glob(f"{_LOCAL_DS}/**", recursive=True))
+    if isinstance(url_or_urls, str):
+        url_or_urls = [url_or_urls]
+
+    # Replace all urls by local files if they can be found; a bare list is handled as one group
+    for v in (url_or_urls.values() if isinstance(url_or_urls, dict) else [url_or_urls]):
+        for i, url in enumerate(v):
+            for lf in local_files:
+                if lf and url.endswith(lf):
+                    v[i] = f"{_LOCAL_DS}/{lf}"
+                    local_files.remove(lf)
+                    break
+
+    # Continue normal download process which should only checksum the local
+    # files instead of downloading them
+    return _download(url_or_urls, *args, **kwargs)
+
+_download = dl_man.download
+dl_man.download = dl
+builder = datasets.load_dataset_builder("the_pile", download_config=dl_config, subsets=["all"], version="0.0.0")
+
+t = -time.time()
+builder.download_and_prepare(dl_manager=dl_man, num_proc=_WORKERS)
+t += time.time()
+
+print(f"Prepared data in {t/60:.2f}m")
diff --git a/docs/examples/data/index.rst b/docs/examples/data/index.rst
new file mode 100644
index 00000000..bd8e2691
--- /dev/null
+++ b/docs/examples/data/index.rst
@@ -0,0 +1,7 @@
+*****************************
+Data Handling during Training
+*****************************
+
+
+.. include:: examples/data/torchvision/_index.rst
+.. include:: examples/data/hf/_index.rst
diff --git a/docs/examples/data/torchvision/README.rst b/docs/examples/data/torchvision/README.rst
new file mode 100644
index 00000000..2d515fb0
--- /dev/null
+++ b/docs/examples/data/torchvision/README.rst
@@ -0,0 +1,337 @@
+Torchvision
+===========
+
+
+**Prerequisites**
+
+Make sure to read the following sections of the documentation before using this example:
+
+* :ref:`pytorch_setup`
+* :ref:`001 - Single GPU Job`
+
+The full source code for this example is available on `the mila-docs GitHub repository. <https://github.com/mila-iqia/mila-docs/tree/master/docs/examples/data/torchvision>`_
+
+
+**job.sh**
+
+.. code:: diff
+
+    # distributed/001_single_gpu/job.sh -> data/torchvision/job.sh
+    #!/bin/bash
+    #SBATCH --gpus-per-task=rtx8000:1
+    #SBATCH --cpus-per-task=4
+    #SBATCH --ntasks-per-node=1
+    #SBATCH --mem=16G
+   -#SBATCH --time=00:15:00
+   +#SBATCH --time=01:30:00
+   +set -o errexit
+    
+    
+    # Echo time and hostname into log
+    echo "Date:     $(date)"
+    echo "Hostname: $(hostname)"
+    
+    
+    # Ensure only anaconda/3 module loaded.
+    module purge
+    # This example uses Conda to manage package dependencies.
+    # See https://docs.mila.quebec/Userguide.html#conda for more information.
+    module load anaconda/3
+    
+   +
+    # Creating the environment for the first time:
+    # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
+   -#     pytorch-cuda=11.6 -c pytorch -c nvidia
+   +#     pytorch-cuda=11.6 scipy -c pytorch -c nvidia
+    # Other conda packages:
+   -# conda install -y -n pytorch -c conda-forge rich
+   +# conda install -y -n pytorch -c conda-forge rich tqdm
+    
+    # Activate pre-existing environment.
+    conda activate pytorch
+    
+    
+   -# Stage dataset into $SLURM_TMPDIR
+   -cp -a /network/datasets/cifar10.var/cifar10_torchvision $SLURM_TMPDIR
+   +# Prepare data for training
+   +mkdir -p "$SLURM_TMPDIR/data"
+   +
+   +if [[ -z "${_DATA_PREP_WORKERS}" ]]
+   +then
+   +	_DATA_PREP_WORKERS=${SLURM_JOB_CPUS_PER_NODE}
+   +fi
+   +if [[ -z "${_DATA_PREP_WORKERS}" ]]
+   +then
+   +	_DATA_PREP_WORKERS=16
+   +fi
+    
+   +# Copy the dataset to $SLURM_TMPDIR so it is close to the GPUs for
+   +# faster training
+   +srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \
+   +	time -p bash data.sh "/network/datasets/inat" "$SLURM_TMPDIR/data" ${_DATA_PREP_WORKERS}
+    
+    # Execute Python script
+    python main.py
+   
+
+**main.py**
+
+.. code:: diff
+
+    # distributed/001_single_gpu/main.py -> data/torchvision/main.py
+   -"""Single-GPU training example."""
+   +"""Torchvision training example."""
+    import logging
+    import os
+    
+    import rich.logging
+    import torch
+    from torch import Tensor, nn
+    from torch.nn import functional as F
+    from torch.utils.data import DataLoader, random_split
+    from torchvision import transforms
+   -from torchvision.datasets import CIFAR10
+   +from torchvision.datasets import INaturalist
+    from torchvision.models import resnet18
+    from tqdm import tqdm
+    
+    
+    def main():
+   -    training_epochs = 10
+   +    training_epochs = 1
+        learning_rate = 5e-4
+        weight_decay = 1e-4
+   -    batch_size = 128
+   +    batch_size = 256
+    
+        # Check that the GPU is available
+        assert torch.cuda.is_available() and torch.cuda.device_count() > 0
+        device = torch.device("cuda", 0)
+    
+        # Setup logging (optional, but much better than using print statements)
+        logging.basicConfig(
+            level=logging.INFO,
+            handlers=[rich.logging.RichHandler(markup=True)],  # Very pretty, uses the `rich` package.
+        )
+    
+        logger = logging.getLogger(__name__)
+    
+        # Create a model and move it to the GPU.
+   -    model = resnet18(num_classes=10)
+   +    model = resnet18(num_classes=10000)
+        model.to(device=device)
+    
+        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
+    
+   -    # Setup CIFAR10
+   +    # Setup iNaturalist
+        num_workers = get_num_workers()
+   -    dataset_path = os.environ.get("SLURM_TMPDIR", "../dataset")
+   +    try:
+   +        dataset_path = f"{os.environ['SLURM_TMPDIR']}/data"
+   +    except KeyError:
+   +        dataset_path = "../dataset"
+        train_dataset, valid_dataset, test_dataset = make_datasets(dataset_path)
+        train_dataloader = DataLoader(
+            train_dataset,
+            batch_size=batch_size,
+            num_workers=num_workers,
+            shuffle=True,
+        )
+        valid_dataloader = DataLoader(
+            valid_dataset,
+            batch_size=batch_size,
+            num_workers=num_workers,
+            shuffle=False,
+        )
+        test_dataloader = DataLoader(  # NOTE: Not used in this example.
+            test_dataset,
+            batch_size=batch_size,
+            num_workers=num_workers,
+            shuffle=False,
+        )
+    
+        # Checkout the "checkpointing and preemption" example for more info!
+        logger.debug("Starting training from scratch.")
+    
+        for epoch in range(training_epochs):
+            logger.debug(f"Starting epoch {epoch}/{training_epochs}")
+    
+            # Set the model in training mode (this is important for e.g. BatchNorm and Dropout layers)
+            model.train()
+    
+            # NOTE: using a progress bar from tqdm because it's nicer than using `print`.
+            progress_bar = tqdm(
+                total=len(train_dataloader),
+                desc=f"Train epoch {epoch}",
+            )
+    
+            # Training loop
+            for batch in train_dataloader:
+                # Move the batch to the GPU before we pass it to the model
+                batch = tuple(item.to(device) for item in batch)
+                x, y = batch
+    
+                # Forward pass
+                logits: Tensor = model(x)
+    
+                loss = F.cross_entropy(logits, y)
+    
+                optimizer.zero_grad()
+                loss.backward()
+                optimizer.step()
+    
+                # Calculate some metrics:
+                n_correct_predictions = logits.detach().argmax(-1).eq(y).sum()
+                n_samples = y.shape[0]
+                accuracy = n_correct_predictions / n_samples
+    
+                logger.debug(f"Accuracy: {accuracy.item():.2%}")
+                logger.debug(f"Average Loss: {loss.item()}")
+    
+                # Advance the progress bar one step, and update the "postfix" () the progress bar. (nicer than just)
+                progress_bar.update(1)
+                progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item())
+            progress_bar.close()
+    
+            val_loss, val_accuracy = validation_loop(model, valid_dataloader, device)
+            logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}")
+    
+        print("Done!")
+    
+    
+    @torch.no_grad()
+    def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device):
+        model.eval()
+    
+        total_loss = 0.0
+        n_samples = 0
+        correct_predictions = 0
+    
+        for batch in dataloader:
+            batch = tuple(item.to(device) for item in batch)
+            x, y = batch
+    
+            logits: Tensor = model(x)
+            loss = F.cross_entropy(logits, y)
+    
+            batch_n_samples = x.shape[0]
+            batch_correct_predictions = logits.argmax(-1).eq(y).sum()
+    
+            total_loss += loss.item()
+            n_samples += batch_n_samples
+            correct_predictions += batch_correct_predictions
+    
+        accuracy = correct_predictions / n_samples
+        return total_loss, accuracy
+    
+    
+    def make_datasets(
+        dataset_path: str,
+        val_split: float = 0.1,
+        val_split_seed: int = 42,
+    ):
+   -    """Returns the training, validation, and test splits for CIFAR10.
+   +    """Returns the training, validation, and test splits for iNaturalist.
+    
+        NOTE: We don't use image transforms here for simplicity.
+        Having different transformations for train and validation would complicate things a bit.
+        Later examples will show how to do the train/val/test split properly when using transforms.
+        """
+   -    train_dataset = CIFAR10(
+   -        root=dataset_path, transform=transforms.ToTensor(), download=True, train=True
+   +    train_dataset = INaturalist(
+   +        root=dataset_path,
+   +        transform=transforms.Compose([
+   +            transforms.Resize(256),
+   +            transforms.CenterCrop(224),
+   +            transforms.ToTensor(),
+   +        ]),
+   +        version="2021_train"
+        )
+   -    test_dataset = CIFAR10(
+   -        root=dataset_path, transform=transforms.ToTensor(), download=True, train=False
+   +    test_dataset = INaturalist(
+   +        root=dataset_path,
+   +        transform=transforms.Compose([
+   +            transforms.Resize(256),
+   +            transforms.CenterCrop(224),
+   +            transforms.ToTensor(),
+   +        ]),
+   +        version="2021_valid"
+        )
+        # Split the training dataset into a training and validation set.
+        train_dataset, valid_dataset = random_split(
+            train_dataset, ((1 - val_split), val_split), torch.Generator().manual_seed(val_split_seed)
+        )
+        return train_dataset, valid_dataset, test_dataset
+    
+    
+    def get_num_workers() -> int:
+        """Gets the optimal number of DatLoader workers to use in the current job."""
+        if "SLURM_CPUS_PER_TASK" in os.environ:
+            return int(os.environ["SLURM_CPUS_PER_TASK"])
+        if hasattr(os, "sched_getaffinity"):
+            return len(os.sched_getaffinity(0))
+        return torch.multiprocessing.cpu_count()
+    
+    
+    if __name__ == "__main__":
+        main()
+   
+
+**data.sh**
+
+.. code:: bash
+
+   #!/bin/bash
+   set -o errexit
+   
+   _SRC=$1
+   _DEST=$2
+   _WORKERS=$3
+   
+   # Clone the dataset structure locally and reorganise the raw files if needed
+   (cd "${_SRC}" && find -L * -type f) | while read f
+   do
+   	mkdir --parents "${_DEST}/$(dirname "$f")"
+   	# echo source first so it is matched to ln's TARGET argument (-T form)
+   	readlink --canonicalize "${_SRC}/$f"
+   	# echo output last so ln understands it's the output file
+   	echo "${_DEST}/$f"
+   done | xargs -n2 -P${_WORKERS} ln --symbolic --force -T
+   
+   (
+   	cd "${_DEST}"
+   	# Torchvision expects these names
+   	mv train.tar.gz 2021_train.tgz
+   	mv val.tar.gz 2021_valid.tgz
+   )
+   
+   # Extract and prepare the data
+   python3 data.py "${_DEST}"
+   
+
+**data.py**
+
+.. code:: python
+
+   """Make sure the data is available"""
+   import sys
+   import time
+   
+   from torchvision.datasets import INaturalist
+   
+   
+   t = -time.time()
+   INaturalist(root=sys.argv[1], version="2021_train", download=True)
+   INaturalist(root=sys.argv[1], version="2021_valid", download=True)
+   t += time.time()
+   print(f"Prepared data in {t/60:.2f}m")
+   
+
+**Running this example**
+
+.. code-block:: bash
+
+   $ sbatch job.sh
diff --git a/docs/examples/data/torchvision/_index.rst b/docs/examples/data/torchvision/_index.rst
new file mode 100644
index 00000000..77ab7445
--- /dev/null
+++ b/docs/examples/data/torchvision/_index.rst
@@ -0,0 +1,43 @@
+Torchvision
+===========
+
+
+**Prerequisites**
+
+Make sure to read the following sections of the documentation before using this example:
+
+* :ref:`pytorch_setup`
+* :ref:`001 - Single GPU Job`
+
+The full source code for this example is available on `the mila-docs GitHub repository. <https://github.com/mila-iqia/mila-docs/tree/master/docs/examples/data/torchvision>`_
+
+
+**job.sh**
+
+.. literalinclude:: examples/data/torchvision/job.sh.diff
+   :language: diff
+
+
+**main.py**
+
+.. literalinclude:: examples/data/torchvision/main.py.diff
+   :language: diff
+
+
+**data.sh**
+
+.. literalinclude:: examples/data/torchvision/data.sh
+   :language: bash
+
+
+**data.py**
+
+.. literalinclude:: examples/data/torchvision/data.py
+   :language: python
+
+
+**Running this example**
+
+.. code-block:: bash
+
+   $ sbatch job.sh
diff --git a/docs/examples/data/torchvision/data.py b/docs/examples/data/torchvision/data.py
new file mode 100644
index 00000000..a43129c4
--- /dev/null
+++ b/docs/examples/data/torchvision/data.py
@@ -0,0 +1,12 @@
+"""Make sure the data is available"""
+import sys
+import time
+
+from torchvision.datasets import INaturalist
+
+
+t = -time.time()
+INaturalist(root=sys.argv[1], version="2021_train", download=True)
+INaturalist(root=sys.argv[1], version="2021_valid", download=True)
+t += time.time()
+print(f"Prepared data in {t/60:.2f}m")
diff --git a/docs/examples/data/torchvision/data.sh b/docs/examples/data/torchvision/data.sh
new file mode 100644
index 00000000..981a7f73
--- /dev/null
+++ b/docs/examples/data/torchvision/data.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+set -o errexit
+
+_SRC=$1
+_DEST=$2
+_WORKERS=$3
+
+# Clone the dataset structure locally and reorganise the raw files if needed
+(cd "${_SRC}" && find -L * -type f) | while read f
+do
+	mkdir --parents "${_DEST}/$(dirname "$f")"
+	# echo source first so it is matched to ln's TARGET argument (-T form)
+	readlink --canonicalize "${_SRC}/$f"
+	# echo output last so ln understands it's the output file
+	echo "${_DEST}/$f"
+done | xargs -n2 -P${_WORKERS} ln --symbolic --force -T
+
+(
+	cd "${_DEST}"
+	# Torchvision expects these names
+	mv train.tar.gz 2021_train.tgz
+	mv val.tar.gz 2021_valid.tgz
+)
+
+# Extract and prepare the data
+python3 data.py "${_DEST}"
diff --git a/docs/examples/data/torchvision/job.sh b/docs/examples/data/torchvision/job.sh
new file mode 100644
index 00000000..5423e372
--- /dev/null
+++ b/docs/examples/data/torchvision/job.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+#SBATCH --gpus-per-task=rtx8000:1
+#SBATCH --cpus-per-task=4
+#SBATCH --ntasks-per-node=1
+#SBATCH --mem=16G
+#SBATCH --time=01:30:00
+set -o errexit
+
+
+# Echo time and hostname into log
+echo "Date:     $(date)"
+echo "Hostname: $(hostname)"
+
+
+# Ensure only anaconda/3 module loaded.
+module purge
+# This example uses Conda to manage package dependencies.
+# See https://docs.mila.quebec/Userguide.html#conda for more information.
+module load anaconda/3
+
+
+# Creating the environment for the first time:
+# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
+#     pytorch-cuda=11.6 scipy -c pytorch -c nvidia
+# Other conda packages:
+# conda install -y -n pytorch -c conda-forge rich tqdm
+
+# Activate pre-existing environment.
+conda activate pytorch
+
+
+# Prepare data for training
+mkdir -p "$SLURM_TMPDIR/data"
+
+if [[ -z "${_DATA_PREP_WORKERS}" ]]
+then
+	_DATA_PREP_WORKERS=${SLURM_JOB_CPUS_PER_NODE}
+fi
+if [[ -z "${_DATA_PREP_WORKERS}" ]]
+then
+	_DATA_PREP_WORKERS=16
+fi
+
+# Copy the dataset to $SLURM_TMPDIR so it is close to the GPUs for
+# faster training
+srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \
+	time -p bash data.sh "/network/datasets/inat" "$SLURM_TMPDIR/data" ${_DATA_PREP_WORKERS}
+
+# Execute Python script
+python main.py
diff --git a/docs/examples/data/torchvision/main.py b/docs/examples/data/torchvision/main.py
new file mode 100644
index 00000000..015394e0
--- /dev/null
+++ b/docs/examples/data/torchvision/main.py
@@ -0,0 +1,187 @@
+"""Torchvision training example."""
+import logging
+import os
+
+import rich.logging
+import torch
+from torch import Tensor, nn
+from torch.nn import functional as F
+from torch.utils.data import DataLoader, random_split
+from torchvision import transforms
+from torchvision.datasets import INaturalist
+from torchvision.models import resnet18
+from tqdm import tqdm
+
+
+def main():
+    training_epochs = 1
+    learning_rate = 5e-4
+    weight_decay = 1e-4
+    batch_size = 256
+
+    # Check that the GPU is available
+    assert torch.cuda.is_available() and torch.cuda.device_count() > 0
+    device = torch.device("cuda", 0)
+
+    # Setup logging (optional, but much better than using print statements)
+    logging.basicConfig(
+        level=logging.INFO,
+        handlers=[rich.logging.RichHandler(markup=True)],  # Very pretty, uses the `rich` package.
+    )
+
+    logger = logging.getLogger(__name__)
+
+    # Create a model and move it to the GPU.
+    model = resnet18(num_classes=10000)
+    model.to(device=device)
+
+    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
+
+    # Setup iNaturalist
+    num_workers = get_num_workers()
+    try:
+        dataset_path = f"{os.environ['SLURM_TMPDIR']}/data"
+    except KeyError:
+        dataset_path = "../dataset"
+    train_dataset, valid_dataset, test_dataset = make_datasets(dataset_path)
+    train_dataloader = DataLoader(
+        train_dataset,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        shuffle=True,
+    )
+    valid_dataloader = DataLoader(
+        valid_dataset,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        shuffle=False,
+    )
+    test_dataloader = DataLoader(  # NOTE: Not used in this example.
+        test_dataset,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        shuffle=False,
+    )
+
+    # Checkout the "checkpointing and preemption" example for more info!
+    logger.debug("Starting training from scratch.")
+
+    for epoch in range(training_epochs):
+        logger.debug(f"Starting epoch {epoch}/{training_epochs}")
+
+        # Set the model in training mode (this is important for e.g. BatchNorm and Dropout layers)
+        model.train()
+
+        # NOTE: using a progress bar from tqdm because it's nicer than using `print`.
+        progress_bar = tqdm(
+            total=len(train_dataloader),
+            desc=f"Train epoch {epoch}",
+        )
+
+        # Training loop
+        for batch in train_dataloader:
+            # Move the batch to the GPU before we pass it to the model
+            batch = tuple(item.to(device) for item in batch)
+            x, y = batch
+
+            # Forward pass
+            logits: Tensor = model(x)
+
+            loss = F.cross_entropy(logits, y)
+
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+            # Calculate some metrics:
+            n_correct_predictions = logits.detach().argmax(-1).eq(y).sum()
+            n_samples = y.shape[0]
+            accuracy = n_correct_predictions / n_samples
+
+            logger.debug(f"Accuracy: {accuracy.item():.2%}")
+            logger.debug(f"Average Loss: {loss.item()}")
+
+            # Advance the progress bar one step, and update the "postfix" () the progress bar. (nicer than just)
+            progress_bar.update(1)
+            progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item())
+        progress_bar.close()
+
+        val_loss, val_accuracy = validation_loop(model, valid_dataloader, device)
+        logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}")
+
+    print("Done!")
+
+
+@torch.no_grad()
+def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device):
+    model.eval()
+
+    total_loss = 0.0
+    n_samples = 0
+    correct_predictions = 0
+
+    for batch in dataloader:
+        batch = tuple(item.to(device) for item in batch)
+        x, y = batch
+
+        logits: Tensor = model(x)
+        loss = F.cross_entropy(logits, y)
+
+        batch_n_samples = x.shape[0]
+        batch_correct_predictions = logits.argmax(-1).eq(y).sum()
+
+        total_loss += loss.item()
+        n_samples += batch_n_samples
+        correct_predictions += batch_correct_predictions
+
+    accuracy = correct_predictions / n_samples
+    return total_loss, accuracy
+
+
+def make_datasets(
+    dataset_path: str,
+    val_split: float = 0.1,
+    val_split_seed: int = 42,
+):
+    """Returns the training, validation, and test splits for iNaturalist.
+
+    NOTE: We don't use image transforms here for simplicity.
+    Having different transformations for train and validation would complicate things a bit.
+    Later examples will show how to do the train/val/test split properly when using transforms.
+    """
+    train_dataset = INaturalist(
+        root=dataset_path,
+        transform=transforms.Compose([
+            transforms.Resize(256),
+            transforms.CenterCrop(224),
+            transforms.ToTensor(),
+        ]),
+        version="2021_train"
+    )
+    test_dataset = INaturalist(
+        root=dataset_path,
+        transform=transforms.Compose([
+            transforms.Resize(256),
+            transforms.CenterCrop(224),
+            transforms.ToTensor(),
+        ]),
+        version="2021_valid"
+    )
+    # Split the training dataset into a training and validation set.
+    train_dataset, valid_dataset = random_split(
+        train_dataset, ((1 - val_split), val_split), torch.Generator().manual_seed(val_split_seed)
+    )
+    return train_dataset, valid_dataset, test_dataset
+
+
+def get_num_workers() -> int:
+    """Gets the optimal number of DatLoader workers to use in the current job."""
+    if "SLURM_CPUS_PER_TASK" in os.environ:
+        return int(os.environ["SLURM_CPUS_PER_TASK"])
+    if hasattr(os, "sched_getaffinity"):
+        return len(os.sched_getaffinity(0))
+    return torch.multiprocessing.cpu_count()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/examples/distributed/001_single_gpu/README.rst b/docs/examples/distributed/001_single_gpu/README.rst
new file mode 100644
index 00000000..3bfb1bda
--- /dev/null
+++ b/docs/examples/distributed/001_single_gpu/README.rst
@@ -0,0 +1,236 @@
+001 - Single GPU Job
+====================
+
+
+**Prerequisites**
+Make sure to read the following sections of the documentation before using this example:
+
+* :ref:`pytorch_setup`
+
+The full source code for this example is available on `the mila-docs GitHub repository. <https://github.com/mila-iqia/mila-docs/tree/master/docs/examples/distributed/001_single_gpu>`_
+
+**job.sh**
+
+.. code:: bash
+
+   #!/bin/bash
+   #SBATCH --gpus-per-task=rtx8000:1
+   #SBATCH --cpus-per-task=4
+   #SBATCH --ntasks-per-node=1
+   #SBATCH --mem=16G
+   #SBATCH --time=00:15:00
+   
+   
+   # Echo time and hostname into log
+   echo "Date:     $(date)"
+   echo "Hostname: $(hostname)"
+   
+   
+   # Ensure only anaconda/3 module loaded.
+   module purge
+   # This example uses Conda to manage package dependencies.
+   # See https://docs.mila.quebec/Userguide.html#conda for more information.
+   module load anaconda/3
+   
+   # Creating the environment for the first time:
+   # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
+   #     pytorch-cuda=11.6 -c pytorch -c nvidia
+   # Other conda packages:
+   # conda install -y -n pytorch -c conda-forge rich
+   
+   # Activate pre-existing environment.
+   conda activate pytorch
+   
+   
+   # Stage dataset into $SLURM_TMPDIR
+   cp -a /network/datasets/cifar10.var/cifar10_torchvision $SLURM_TMPDIR
+   
+   
+   # Execute Python script
+   python main.py
+   
+
+**main.py**
+
+.. code:: python
+
+   """Single-GPU training example."""
+   import logging
+   import os
+   
+   import rich.logging
+   import torch
+   from torch import Tensor, nn
+   from torch.nn import functional as F
+   from torch.utils.data import DataLoader, random_split
+   from torchvision import transforms
+   from torchvision.datasets import CIFAR10
+   from torchvision.models import resnet18
+   from tqdm import tqdm
+   
+   
+   def main():
+       training_epochs = 10
+       learning_rate = 5e-4
+       weight_decay = 1e-4
+       batch_size = 128
+   
+       # Check that the GPU is available
+       assert torch.cuda.is_available() and torch.cuda.device_count() > 0
+       device = torch.device("cuda", 0)
+   
+       # Setup logging (optional, but much better than using print statements)
+       logging.basicConfig(
+           level=logging.INFO,
+           handlers=[rich.logging.RichHandler(markup=True)],  # Very pretty, uses the `rich` package.
+       )
+   
+       logger = logging.getLogger(__name__)
+   
+       # Create a model and move it to the GPU.
+       model = resnet18(num_classes=10)
+       model.to(device=device)
+   
+       optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
+   
+       # Setup CIFAR10
+       num_workers = get_num_workers()
+       dataset_path = os.environ.get("SLURM_TMPDIR", "../dataset")
+       train_dataset, valid_dataset, test_dataset = make_datasets(dataset_path)
+       train_dataloader = DataLoader(
+           train_dataset,
+           batch_size=batch_size,
+           num_workers=num_workers,
+           shuffle=True,
+       )
+       valid_dataloader = DataLoader(
+           valid_dataset,
+           batch_size=batch_size,
+           num_workers=num_workers,
+           shuffle=False,
+       )
+       test_dataloader = DataLoader(  # NOTE: Not used in this example.
+           test_dataset,
+           batch_size=batch_size,
+           num_workers=num_workers,
+           shuffle=False,
+       )
+   
+       # Checkout the "checkpointing and preemption" example for more info!
+       logger.debug("Starting training from scratch.")
+   
+       for epoch in range(training_epochs):
+           logger.debug(f"Starting epoch {epoch}/{training_epochs}")
+   
+           # Set the model in training mode (this is important for e.g. BatchNorm and Dropout layers)
+           model.train()
+   
+           # NOTE: using a progress bar from tqdm because it's nicer than using `print`.
+           progress_bar = tqdm(
+               total=len(train_dataloader),
+               desc=f"Train epoch {epoch}",
+           )
+   
+           # Training loop
+           for batch in train_dataloader:
+               # Move the batch to the GPU before we pass it to the model
+               batch = tuple(item.to(device) for item in batch)
+               x, y = batch
+   
+               # Forward pass
+               logits: Tensor = model(x)
+   
+               loss = F.cross_entropy(logits, y)
+   
+               optimizer.zero_grad()
+               loss.backward()
+               optimizer.step()
+   
+               # Calculate some metrics:
+               n_correct_predictions = logits.detach().argmax(-1).eq(y).sum()
+               n_samples = y.shape[0]
+               accuracy = n_correct_predictions / n_samples
+   
+               logger.debug(f"Accuracy: {accuracy.item():.2%}")
+               logger.debug(f"Average Loss: {loss.item()}")
+   
+               # Advance the progress bar one step, and update the "postfix" () the progress bar. (nicer than just)
+               progress_bar.update(1)
+               progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item())
+           progress_bar.close()
+   
+           val_loss, val_accuracy = validation_loop(model, valid_dataloader, device)
+           logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}")
+   
+       print("Done!")
+   
+   
+   @torch.no_grad()
+   def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device):
+       model.eval()
+   
+       total_loss = 0.0
+       n_samples = 0
+       correct_predictions = 0
+   
+       for batch in dataloader:
+           batch = tuple(item.to(device) for item in batch)
+           x, y = batch
+   
+           logits: Tensor = model(x)
+           loss = F.cross_entropy(logits, y)
+   
+           batch_n_samples = x.shape[0]
+           batch_correct_predictions = logits.argmax(-1).eq(y).sum()
+   
+           total_loss += loss.item()
+           n_samples += batch_n_samples
+           correct_predictions += batch_correct_predictions
+   
+       accuracy = correct_predictions / n_samples
+       return total_loss, accuracy
+   
+   
+   def make_datasets(
+       dataset_path: str,
+       val_split: float = 0.1,
+       val_split_seed: int = 42,
+   ):
+       """Returns the training, validation, and test splits for CIFAR10.
+   
+       NOTE: We don't use image transforms here for simplicity.
+       Having different transformations for train and validation would complicate things a bit.
+       Later examples will show how to do the train/val/test split properly when using transforms.
+       """
+       train_dataset = CIFAR10(
+           root=dataset_path, transform=transforms.ToTensor(), download=True, train=True
+       )
+       test_dataset = CIFAR10(
+           root=dataset_path, transform=transforms.ToTensor(), download=True, train=False
+       )
+       # Split the training dataset into a training and validation set.
+       train_dataset, valid_dataset = random_split(
+           train_dataset, ((1 - val_split), val_split), torch.Generator().manual_seed(val_split_seed)
+       )
+       return train_dataset, valid_dataset, test_dataset
+   
+   
+   def get_num_workers() -> int:
+       """Gets the optimal number of DatLoader workers to use in the current job."""
+       if "SLURM_CPUS_PER_TASK" in os.environ:
+           return int(os.environ["SLURM_CPUS_PER_TASK"])
+       if hasattr(os, "sched_getaffinity"):
+           return len(os.sched_getaffinity(0))
+       return torch.multiprocessing.cpu_count()
+   
+   
+   if __name__ == "__main__":
+       main()
+   
+
+**Running this example**
+
+
+.. code-block:: bash
+
+    $ sbatch job.sh
diff --git a/docs/examples/distributed/001_single_gpu/_index.rst b/docs/examples/distributed/001_single_gpu/_index.rst
new file mode 100644
index 00000000..5ddeafbb
--- /dev/null
+++ b/docs/examples/distributed/001_single_gpu/_index.rst
@@ -0,0 +1,29 @@
+001 - Single GPU Job
+====================
+
+
+**Prerequisites**
+Make sure to read the following sections of the documentation before using this example:
+
+* :ref:`pytorch_setup`
+
+The full source code for this example is available on `the mila-docs GitHub repository. <https://github.com/mila-iqia/mila-docs/tree/master/docs/examples/distributed/001_single_gpu>`_
+
+**job.sh**
+
+.. literalinclude:: examples/distributed/001_single_gpu/job.sh
+    :language: bash
+
+
+**main.py**
+
+.. literalinclude:: examples/distributed/001_single_gpu/main.py
+    :language: python
+
+
+**Running this example**
+
+
+.. code-block:: bash
+
+    $ sbatch job.sh
diff --git a/docs/examples/distributed/001_single_gpu/job.sh b/docs/examples/distributed/001_single_gpu/job.sh
new file mode 100644
index 00000000..6dd819bb
--- /dev/null
+++ b/docs/examples/distributed/001_single_gpu/job.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+#SBATCH --gpus-per-task=rtx8000:1
+#SBATCH --cpus-per-task=4
+#SBATCH --ntasks-per-node=1
+#SBATCH --mem=16G
+#SBATCH --time=00:15:00
+
+
+# Echo time and hostname into log
+echo "Date:     $(date)"
+echo "Hostname: $(hostname)"
+
+
+# Ensure only anaconda/3 module loaded.
+module purge
+# This example uses Conda to manage package dependencies.
+# See https://docs.mila.quebec/Userguide.html#conda for more information.
+module load anaconda/3
+
+# Creating the environment for the first time:
+# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
+#     pytorch-cuda=11.6 -c pytorch -c nvidia
+# Other conda packages:
+# conda install -y -n pytorch -c conda-forge rich
+
+# Activate pre-existing environment.
+conda activate pytorch
+
+
+# Stage dataset into $SLURM_TMPDIR
+cp -a /network/datasets/cifar10.var/cifar10_torchvision $SLURM_TMPDIR
+
+
+# Execute Python script
+python main.py
diff --git a/docs/examples/distributed/001_single_gpu/main.py b/docs/examples/distributed/001_single_gpu/main.py
new file mode 100644
index 00000000..f859e9f8
--- /dev/null
+++ b/docs/examples/distributed/001_single_gpu/main.py
@@ -0,0 +1,172 @@
+"""Single-GPU training example."""
+import logging
+import os
+
+import rich.logging
+import torch
+from torch import Tensor, nn
+from torch.nn import functional as F
+from torch.utils.data import DataLoader, random_split
+from torchvision import transforms
+from torchvision.datasets import CIFAR10
+from torchvision.models import resnet18
+from tqdm import tqdm
+
+
+def main():
+    training_epochs = 10
+    learning_rate = 5e-4
+    weight_decay = 1e-4
+    batch_size = 128
+
+    # Check that the GPU is available
+    assert torch.cuda.is_available() and torch.cuda.device_count() > 0
+    device = torch.device("cuda", 0)
+
+    # Setup logging (optional, but much better than using print statements)
+    logging.basicConfig(
+        level=logging.INFO,
+        handlers=[rich.logging.RichHandler(markup=True)],  # Very pretty, uses the `rich` package.
+    )
+
+    logger = logging.getLogger(__name__)
+
+    # Create a model and move it to the GPU.
+    model = resnet18(num_classes=10)
+    model.to(device=device)
+
+    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
+
+    # Setup CIFAR10
+    num_workers = get_num_workers()
+    dataset_path = os.environ.get("SLURM_TMPDIR", "../dataset")
+    train_dataset, valid_dataset, test_dataset = make_datasets(dataset_path)
+    train_dataloader = DataLoader(
+        train_dataset,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        shuffle=True,
+    )
+    valid_dataloader = DataLoader(
+        valid_dataset,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        shuffle=False,
+    )
+    test_dataloader = DataLoader(  # NOTE: Not used in this example.
+        test_dataset,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        shuffle=False,
+    )
+
+    # Check out the "checkpointing and preemption" example for more info!
+    logger.debug("Starting training from scratch.")
+
+    for epoch in range(training_epochs):
+        logger.debug(f"Starting epoch {epoch}/{training_epochs}")
+
+        # Set the model in training mode (this is important for e.g. BatchNorm and Dropout layers)
+        model.train()
+
+        # NOTE: using a progress bar from tqdm because it's nicer than using `print`.
+        progress_bar = tqdm(
+            total=len(train_dataloader),
+            desc=f"Train epoch {epoch}",
+        )
+
+        # Training loop
+        for batch in train_dataloader:
+            # Move the batch to the GPU before we pass it to the model
+            batch = tuple(item.to(device) for item in batch)
+            x, y = batch
+
+            # Forward pass
+            logits: Tensor = model(x)
+
+            loss = F.cross_entropy(logits, y)
+
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+            # Calculate some metrics:
+            n_correct_predictions = logits.detach().argmax(-1).eq(y).sum()
+            n_samples = y.shape[0]
+            accuracy = n_correct_predictions / n_samples
+
+            logger.debug(f"Accuracy: {accuracy.item():.2%}")
+            logger.debug(f"Average Loss: {loss.item()}")
+
+            # Advance the progress bar one step and show the latest loss/accuracy as its "postfix".
+            progress_bar.update(1)
+            progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item())
+        progress_bar.close()
+
+        val_loss, val_accuracy = validation_loop(model, valid_dataloader, device)
+        logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}")
+
+    print("Done!")
+
+
+@torch.no_grad()
+def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device):
+    """Evaluate `model` on `dataloader`; return (sum of the per-batch losses, accuracy)."""
+    model.eval()
+    total_loss = 0.0
+    n_samples = 0
+    correct_predictions = 0
+
+    for batch in dataloader:
+        batch = tuple(item.to(device) for item in batch)
+        x, y = batch
+
+        logits: Tensor = model(x)
+        loss = F.cross_entropy(logits, y)
+
+        batch_n_samples = x.shape[0]
+        batch_correct_predictions = logits.argmax(-1).eq(y).sum()
+
+        total_loss += loss.item()
+        n_samples += batch_n_samples
+        correct_predictions += batch_correct_predictions
+
+    accuracy = correct_predictions / n_samples
+    return total_loss, accuracy
+
+
+def make_datasets(
+    dataset_path: str,
+    val_split: float = 0.1,
+    val_split_seed: int = 42,
+):
+    """Returns the training, validation, and test splits for CIFAR10.
+
+    NOTE: For simplicity, the only image transform used here is `ToTensor` (no data augmentation).
+    Having different transformations for train and validation would complicate things a bit.
+    Later examples will show how to do the train/val/test split properly when using transforms.
+    """
+    train_dataset = CIFAR10(
+        root=dataset_path, transform=transforms.ToTensor(), download=True, train=True
+    )
+    test_dataset = CIFAR10(
+        root=dataset_path, transform=transforms.ToTensor(), download=True, train=False
+    )
+    # Split the training dataset into training and validation sets, using a seeded generator.
+    train_dataset, valid_dataset = random_split(
+        train_dataset, ((1 - val_split), val_split), torch.Generator().manual_seed(val_split_seed)
+    )
+    return train_dataset, valid_dataset, test_dataset
+
+
+def get_num_workers() -> int:
+    """Gets the optimal number of DataLoader workers to use in the current job."""
+    if "SLURM_CPUS_PER_TASK" in os.environ:
+        return int(os.environ["SLURM_CPUS_PER_TASK"])
+    if hasattr(os, "sched_getaffinity"):
+        return len(os.sched_getaffinity(0))
+    return torch.multiprocessing.cpu_count()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/examples/distributed/index.rst b/docs/examples/distributed/index.rst
new file mode 100644
index 00000000..4e24c1d6
--- /dev/null
+++ b/docs/examples/distributed/index.rst
@@ -0,0 +1,6 @@
+********************
+Distributed Training
+********************
+
+
+.. include:: /examples/distributed/001_single_gpu/_index.rst
diff --git a/docs/examples/frameworks/README.rst b/docs/examples/frameworks/index.rst
similarity index 51%
rename from docs/examples/frameworks/README.rst
rename to docs/examples/frameworks/index.rst
index 3764c0f2..a3961cb0 100644
--- a/docs/examples/frameworks/README.rst
+++ b/docs/examples/frameworks/index.rst
@@ -3,4 +3,4 @@ Software Frameworks
 *******************
 
 
-.. include:: examples/frameworks/pytorch_setup/README.rst
+.. include:: examples/frameworks/pytorch_setup/_index.rst
diff --git a/docs/examples/frameworks/pytorch_setup/README.rst b/docs/examples/frameworks/pytorch_setup/README.rst
index 3be1c08b..4048a222 100644
--- a/docs/examples/frameworks/pytorch_setup/README.rst
+++ b/docs/examples/frameworks/pytorch_setup/README.rst
@@ -1,3 +1,5 @@
+.. _pytorch_setup:
+
 PyTorch Setup
 ===================
 
@@ -14,16 +16,63 @@ PyTorch Setup
 **job.sh**
 
 
-.. literalinclude:: /examples/frameworks/pytorch_setup/job.sh
-    :language: bash
-
+.. code:: bash
+
+   #!/bin/bash
+   #SBATCH --gres=gpu:1
+   #SBATCH --cpus-per-task=1
+   #SBATCH --mem=16G
+   #SBATCH --time=00:15:00
+   #SBATCH --partition=unkillable
+   
+   set -e  # exit on error.
+   echo "Date:     $(date)"
+   echo "Hostname: $(hostname)"
+   
+   module purge
+   # This example uses Conda to manage package dependencies.
+   # See https://docs.mila.quebec/Userguide.html#conda for more information.
+   module load anaconda/3
+   
+   # Creating the environment for the first time:
+   # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
+   #     pytorch-cuda=11.6 -c pytorch -c nvidia
+   # Other conda packages:
+   # conda install -y -n pytorch -c conda-forge rich
+   
+   # Activate the environment:
+   conda activate pytorch
+   
+   python main.py
+   
 
 **main.py**
 
 
-.. literalinclude:: /examples/frameworks/pytorch_setup/main.py
-    :language: python
-
+.. code:: python
+
+   import torch
+   import torch.backends.cuda
+   
+   
+   def main():
+       cuda_built = torch.backends.cuda.is_built()
+       cuda_avail = torch.cuda.is_available()
+       device_count = torch.cuda.device_count()
+   
+       print(f"PyTorch built with CUDA:         {cuda_built}")
+       print(f"PyTorch detects CUDA available:  {cuda_avail}")
+       print(f"PyTorch-detected #GPUs:          {device_count}")
+       if device_count == 0:
+           print("    No GPU detected, not printing devices' names.")
+       else:
+           for i in range(device_count):
+               print(f"    GPU {i}:      {torch.cuda.get_device_name(i)}")
+   
+   
+   if __name__ == "__main__":
+       main()
+   
 
 **Running this example**
 
diff --git a/docs/examples/frameworks/pytorch_setup/_index.rst b/docs/examples/frameworks/pytorch_setup/_index.rst
new file mode 100644
index 00000000..9d9dfa05
--- /dev/null
+++ b/docs/examples/frameworks/pytorch_setup/_index.rst
@@ -0,0 +1,35 @@
+.. _pytorch_setup:
+
+PyTorch Setup
+===================
+
+.. IDEA: Add a link to all the sections of the documentation that absolutely
+.. must be read before starting this tutorial.
+
+**Prerequisites**: (Make sure to read the following before using this example!)
+
+* :ref:`Quick Start`
+* :ref:`Running your code`
+* :ref:`Conda`
+
+
+**job.sh**
+
+
+.. literalinclude:: examples/frameworks/pytorch_setup/job.sh
+    :language: bash
+
+
+**main.py**
+
+
+.. literalinclude:: examples/frameworks/pytorch_setup/main.py
+    :language: python
+
+
+**Running this example**
+
+
+.. code-block:: bash
+
+    $ sbatch job.sh
diff --git a/docs/examples/frameworks/pytorch_setup/job.sh b/docs/examples/frameworks/pytorch_setup/job.sh
index db126819..6f50e07d 100644
--- a/docs/examples/frameworks/pytorch_setup/job.sh
+++ b/docs/examples/frameworks/pytorch_setup/job.sh
@@ -17,6 +17,8 @@ module load anaconda/3
 # Creating the environment for the first time:
 # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
 #     pytorch-cuda=11.6 -c pytorch -c nvidia
+# Other conda packages:
+# conda install -y -n pytorch -c conda-forge rich
 
 # Activate the environment:
 conda activate pytorch
diff --git a/docs/examples/generate_diffs.sh b/docs/examples/generate_diffs.sh
new file mode 100755
index 00000000..106cc32b
--- /dev/null
+++ b/docs/examples/generate_diffs.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+# Use this to update the diffs based on the contents of the files.
+
+pushd `dirname "${BASH_SOURCE[0]}"` >/dev/null
+_SCRIPT_DIR=`pwd -P`
+popd >/dev/null
+
+set -e
+
+generate_diff() {
+    echo "Generating diff for docs/examples/$1 -> docs/examples/$2"
+    # NOTE: $1 and $2 are relative to this script's directory, which we `pushd` into below.
+
+    # Write a diff file to be shown in the documentation.
+    
+    echo " # $1 -> $2" > "$2.diff"
+    git diff --no-index -U9999 \
+        "$1" \
+        "$2" \
+        | grep -Ev "^--- |^\+\+\+ |^@@ |^index |^diff --git" \
+        >> "$2.diff"
+}
+
+pushd "${_SCRIPT_DIR}" >/dev/null
+
+# single_gpu -> huggingface
+generate_diff distributed/001_single_gpu/job.sh data/hf/job.sh
+generate_diff distributed/001_single_gpu/main.py data/hf/main.py
+
+# single_gpu -> torchvision
+generate_diff distributed/001_single_gpu/job.sh data/torchvision/job.sh
+generate_diff distributed/001_single_gpu/main.py data/torchvision/main.py
+
+popd >/dev/null
diff --git a/docs/examples/preprocess.py b/docs/examples/preprocess.py
new file mode 100644
index 00000000..21f023dd
--- /dev/null
+++ b/docs/examples/preprocess.py
@@ -0,0 +1,44 @@
+"""Generate GitHub README's from _index.rst files
+GitHub doesn't support include of other files, even of the same type and
+location, so this file generates a README.rst with files content embedded
+"""
+from glob import glob
+from pathlib import Path
+import shutil
+
+
+def preprocess():
+    examples_root = Path(__file__).parent.parent
+    for _f in glob(str(examples_root / "examples/**/_index.rst"), recursive=True):
+        _f = Path(_f)
+        shutil.copyfile(str(_f), str(_f.with_name("README.rst")))
+        _f = _f.with_name("README.rst")
+        content = _f.read_text().split("\n")
+        i = 0
+        end = len(content)
+        while i < end:
+            line = content[i]
+            if line.startswith(".. literalinclude:: "):
+                path = line[len(".. literalinclude:: "):].strip(" ")
+                lang = ""
+                for j, _l in enumerate(content[i+1:]):
+                    _l = _l.strip(" ")
+                    if _l.startswith(":language:"):
+                        lang = _l[len(":language:"):].strip(" ")
+                    elif _l.startswith(".. literalinclude:: ") or not _l:
+                        break
+                del content[i:i+1+j]
+                insert = (
+                    [f".. code:: {lang}", ""] +
+                    [f"   {_l}" for _l in (examples_root / path).read_text().split("\n")]
+                )
+                content = content[:i] + insert + content[i+1:]
+                i += len(insert)
+                end = len(content)
+            else:
+                i += 1
+        _f.write_text("\n".join(content))
+
+
+if __name__ == "__main__":
+    preprocess()