diff --git a/.gitignore b/.gitignore index d20faa42..08d78b4e 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ _build .idea **/__pycache__ +/docs/examples/**/*.diff diff --git a/docs/Minimal_examples.rst b/docs/Minimal_examples.rst index 67556a79..f0792ba0 100644 --- a/docs/Minimal_examples.rst +++ b/docs/Minimal_examples.rst @@ -1,6 +1,8 @@ -.. *************************** +.. **************** .. Minimal Examples -.. *************************** +.. **************** -.. include:: examples/frameworks/README.rst +.. include:: examples/frameworks/index.rst +.. include:: examples/distributed/index.rst +.. include:: examples/data/index.rst diff --git a/docs/conf.py b/docs/conf.py index df42510e..e89f8769 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -3,7 +3,8 @@ from __future__ import division, print_function, unicode_literals from datetime import datetime - +from pathlib import Path +import subprocess import sphinx_theme extensions = [ @@ -90,5 +91,28 @@ # Include CNAME file so GitHub Pages can set Custom Domain name html_extra_path = ['CNAME'] + +docs_root = Path(__file__).parent +file_dir = docs_root / "examples/generate_diffs.sh" +try: + _proc = subprocess.run(str(file_dir), shell=True, capture_output=True, check=True) +except subprocess.CalledProcessError as err: + raise RuntimeError( + "Could not build the diff files for the examples:\n" + + str(err.output, encoding="utf-8") + + str(err.stderr, encoding="utf-8") + ) from err # keep the CalledProcessError as the explicit cause + +pyfile = docs_root / "examples/preprocess.py" +try: + _proc = subprocess.run(["python3", str(pyfile)], capture_output=True, check=True) +except subprocess.CalledProcessError as err: + raise RuntimeError( + "Could not generate github README's:\n" + + str(err.output, encoding="utf-8") + + str(err.stderr, encoding="utf-8") + ) from err # keep the CalledProcessError as the explicit cause + + def setup(app): app.add_css_file('custom.css') diff --git a/docs/examples/data/hf/README.rst b/docs/examples/data/hf/README.rst new file mode 100644 index 00000000..2d59c729 --- /dev/null +++ 
b/docs/examples/data/hf/README.rst @@ -0,0 +1,420 @@ +Hugging Face Dataset +==================== + + +**Prerequisites** + +Make sure to read the following sections of the documentation before using this example: + +* :ref:`pytorch_setup` +* :ref:`001 - Single GPU Job` + +The full source code for this example is available on `the mila-docs GitHub repository. `_ + + +**job.sh** + +.. code:: diff + + # distributed/001_single_gpu/job.sh -> data/hf/job.sh + #!/bin/bash + #SBATCH --gpus-per-task=rtx8000:1 + #SBATCH --cpus-per-task=4 + #SBATCH --ntasks-per-node=1 + -#SBATCH --mem=16G + -#SBATCH --time=00:15:00 + +#SBATCH --mem=24G + +#SBATCH --time=02:00:00 + +#SBATCH --tmp=1500G + +set -o errexit + + + +function wrap_cmd { + + for a in "$@" + + do + + echo -n \"$a\" "" + + done + +} + + + # Echo time and hostname into log + echo "Date: $(date)" + echo "Hostname: $(hostname)" + + + # Ensure only anaconda/3 module loaded. + module purge + # This example uses Conda to manage package dependencies. + # See https://docs.mila.quebec/Userguide.html#conda for more information. + module load anaconda/3 + + + + # Creating the environment for the first time: + # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ + -# pytorch-cuda=11.6 -c pytorch -c nvidia + +# pytorch-cuda=11.6 scipy -c pytorch -c nvidia + # Other conda packages: + -# conda install -y -n pytorch -c conda-forge rich + +# conda install -y -n pytorch -c conda-forge rich tqdm datasets + + # Activate pre-existing environment. 
+ conda activate pytorch + + + -# Stage dataset into $SLURM_TMPDIR + -cp -a /network/datasets/cifar10.var/cifar10_torchvision $SLURM_TMPDIR + +# Prepare data for training + +mkdir -p "$SLURM_TMPDIR/data" + + + +if [[ -z "${HF_DATASETS_CACHE}" ]] + +then + + # Store the huggingface datasets cache in $SCRATCH + + export HF_DATASETS_CACHE=$SCRATCH/cache/huggingface/datasets + +fi + +if [[ -z "${_DATA_PREP_WORKERS}" ]] + +then + + _DATA_PREP_WORKERS=${SLURM_JOB_CPUS_PER_NODE} + +fi + +if [[ -z "${_DATA_PREP_WORKERS}" ]] + +then + + _DATA_PREP_WORKERS=16 + +fi + + + +# Preprocess the dataset and cache the result such that the heavy work is done + +# only once *ever* + +# Required conda packages: + +# conda install -y -c conda-forge zstandard + +srun --ntasks=1 --ntasks-per-node=1 \ + + time -p python3 prepare_data.py "/network/datasets/pile" ${_DATA_PREP_WORKERS} + + + +# Copy the preprocessed dataset to $SLURM_TMPDIR so it is close to the GPUs for + +# faster training. This should be done once per compute node + +cmd=( + + # Having 'bash' here allows the execution of a script file which might not + + # have the execution flag on + + bash + + cp_data.sh + + # Get the current dataset cache + + "$(python3 get_dataset_cache_dir.py)" + + # Get the local dataset cache + + # Use '' to delay the execution of the command as $SLURM_TMPDIR needs to be + + # expanded on the local compute node rather than the master node + + '$(python3 get_dataset_cache_dir.py "$SLURM_TMPDIR/data")' + + ${_DATA_PREP_WORKERS} + +) + +# 'time' will objectively give a measure for the copy of the dataset. 
This can + +# be used to compare the timing of multiple attempts in optimizing code and make + +# sure any slow down doesn't come from the code itself + +srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \ + + time -p bash -c "$(wrap_cmd "${cmd[@]}")" + + + +# Use the local copy of the preprocessed dataset + +export HF_DATASETS_CACHE="$SLURM_TMPDIR/data" + + + # Execute Python script + python main.py + + +**main.py** + +.. code:: diff + + # distributed/001_single_gpu/main.py -> data/hf/main.py + -"""Single-GPU training example.""" + +"""Torchvision training example.""" + import logging + import os + + +import datasets + import rich.logging + import torch + from torch import Tensor, nn + from torch.nn import functional as F + -from torch.utils.data import DataLoader, random_split + -from torchvision import transforms + -from torchvision.datasets import CIFAR10 + +from torch.utils.data import DataLoader + from torchvision.models import resnet18 + from tqdm import tqdm + + + def main(): + - training_epochs = 10 + + training_epochs = 1 + learning_rate = 5e-4 + weight_decay = 1e-4 + - batch_size = 128 + + batch_size = 256 + + # Check that the GPU is available + assert torch.cuda.is_available() and torch.cuda.device_count() > 0 + device = torch.device("cuda", 0) + + # Setup logging (optional, but much better than using print statements) + logging.basicConfig( + level=logging.INFO, + handlers=[rich.logging.RichHandler(markup=True)], # Very pretty, uses the `rich` package. + ) + + logger = logging.getLogger(__name__) + + # Create a model and move it to the GPU. 
+ - model = resnet18(num_classes=10) + + model = resnet18() + model.to(device=device) + + optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) + + - # Setup CIFAR10 + + # Setup ImageNet + num_workers = get_num_workers() + - dataset_path = os.environ.get("SLURM_TMPDIR", "../dataset") + + dataset_path = "the_pile" + train_dataset, valid_dataset, test_dataset = make_datasets(dataset_path) + train_dataloader = DataLoader( + train_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=True, + ) + valid_dataloader = DataLoader( + valid_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + ) + test_dataloader = DataLoader( # NOTE: Not used in this example. + test_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + ) + + # Checkout the "checkpointing and preemption" example for more info! + logger.debug("Starting training from scratch.") + + for epoch in range(training_epochs): + logger.debug(f"Starting epoch {epoch}/{training_epochs}") + + # Set the model in training mode (this is important for e.g. BatchNorm and Dropout layers) + model.train() + + # NOTE: using a progress bar from tqdm because it's nicer than using `print`. 
+ progress_bar = tqdm( + total=len(train_dataloader), + desc=f"Train epoch {epoch}", + ) + + # Training loop + for batch in train_dataloader: + # Move the batch to the GPU before we pass it to the model + batch = tuple(item.to(device) for item in batch) + - x, y = batch + + - # Forward pass + - logits: Tensor = model(x) + - + - loss = F.cross_entropy(logits, y) + - + - optimizer.zero_grad() + - loss.backward() + - optimizer.step() + - + - # Calculate some metrics: + - n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() + - n_samples = y.shape[0] + - accuracy = n_correct_predictions / n_samples + - + - logger.debug(f"Accuracy: {accuracy.item():.2%}") + - logger.debug(f"Average Loss: {loss.item()}") + + # [Training of the model goes here] + + # Advance the progress bar one step, and update the "postfix" () the progress bar. (nicer than just) + progress_bar.update(1) + - progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) + progress_bar.close() + + val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) + logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") + + print("Done!") + + + @torch.no_grad() + def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device): + model.eval() + + total_loss = 0.0 + n_samples = 0 + correct_predictions = 0 + + for batch in dataloader: + batch = tuple(item.to(device) for item in batch) + x, y = batch + + logits: Tensor = model(x) + loss = F.cross_entropy(logits, y) + + batch_n_samples = x.shape[0] + batch_correct_predictions = logits.argmax(-1).eq(y).sum() + + total_loss += loss.item() + n_samples += batch_n_samples + correct_predictions += batch_correct_predictions + + accuracy = correct_predictions / n_samples + return total_loss, accuracy + + + -def make_datasets( + - dataset_path: str, + - val_split: float = 0.1, + - val_split_seed: int = 42, + -): + - """Returns the training, validation, and test splits for CIFAR10. 
+ +def make_datasets(dataset_path: str): + + """Returns the training, validation, and test splits for ImageNet. + + - NOTE: We don't use image transforms here for simplicity. + + NOTE: We don't use transforms here for simplicity. + Having different transformations for train and validation would complicate things a bit. + Later examples will show how to do the train/val/test split properly when using transforms. + """ + - train_dataset = CIFAR10( + - root=dataset_path, transform=transforms.ToTensor(), download=True, train=True + - ) + - test_dataset = CIFAR10( + - root=dataset_path, transform=transforms.ToTensor(), download=True, train=False + - ) + - # Split the training dataset into a training and validation set. + - train_dataset, valid_dataset = random_split( + - train_dataset, ((1 - val_split), val_split), torch.Generator().manual_seed(val_split_seed) + - ) + + builder = datasets.load_dataset_builder(dataset_path, subsets=["all"], version="0.0.0") + + train_dataset = builder.as_dataset(split="train").with_format("torch") + + valid_dataset = builder.as_dataset(split="validation").with_format("torch") + + test_dataset = builder.as_dataset(split="test").with_format("torch") + return train_dataset, valid_dataset, test_dataset + + + def get_num_workers() -> int: + """Gets the optimal number of DatLoader workers to use in the current job.""" + if "SLURM_CPUS_PER_TASK" in os.environ: + return int(os.environ["SLURM_CPUS_PER_TASK"]) + if hasattr(os, "sched_getaffinity"): + return len(os.sched_getaffinity(0)) + return torch.multiprocessing.cpu_count() + + + if __name__ == "__main__": + main() + + +**prepare_data.py** + +.. code:: python + + """Preprocess the dataset. + In this example, HuggingFace is used and the resulting dataset will be stored in + $HF_DATASETS_CACHE. 
It is preferable to set the datasets cache to a location in + $SCRATCH""" + import sys + import time + + import datasets + + + _LOCAL_DS = sys.argv[1] + _LOCAL_DS_SPLITS = _LOCAL_DS.split("/") + try: + _WORKERS = int(sys.argv[2]) + except IndexError: + _WORKERS = 16 + + dl_config = datasets.DownloadConfig(cache_dir=_LOCAL_DS) + + # 'datasets' does not allow to use a local storage for the datasets' files using + # it's exposed API. Mocking the download func to for the usage of the local file + dl_man = datasets.DownloadManager(download_config=dl_config) + def dl(url_or_urls, *args, **kwargs): + import glob + local_files = ["/".join(_f.split("/")[len(_LOCAL_DS_SPLITS):]) + for _f in glob.glob(f"{_LOCAL_DS}/**", recursive=True)] + local_files.sort() + if isinstance(url_or_urls, str): + url_or_urls = [url_or_urls] + + # Replace all urls by local files if they can be found + for v in (url_or_urls.values() if isinstance(url_or_urls, dict) else {".":url_or_urls}): + for i, url in enumerate(v): + for lf in local_files: + if lf and url.endswith(lf): + v[i] = f"{_LOCAL_DS}/{lf}" + local_files.remove(lf) + break + + # Continue normal download process which should only checksum the local + # files instead of downloading them + return _download(url_or_urls, *args, **kwargs) + + _download = dl_man.download + dl_man.download = dl + builder = datasets.load_dataset_builder("the_pile", download_config=dl_config, subsets=["all"], version="0.0.0") + + t = -time.time() + builder.download_and_prepare(dl_manager=dl_man, num_proc=_WORKERS) + t += time.time() + + print(f"Prepared data in {t/60:.2f}m") + + +**get_dataset_cache_dir.py** + +.. 
code:: python + + """List to stdout the files of the dataset""" + import sys + + import datasets + + + # Redirect outputs to stderr to avoid noize in stdout + _stdout = sys.stdout + sys.stdout = sys.stderr + + try: + _CACHE_DIR = sys.argv[1] + except IndexError: + _CACHE_DIR = None + + builder = datasets.load_dataset_builder("the_pile", cache_dir=_CACHE_DIR, subsets=["all"], version="0.0.0") + print(builder.cache_dir, file=_stdout) + + +**cp_data.sh** + +.. code:: bash + + #!/bin/bash + set -o errexit + + _SRC=$1 + _DEST=$2 + _WORKERS=$3 + + # Copy the dataset + (cd "${_SRC}" && find -L * -type f) | while read f + do + mkdir --parents "${_DEST}/$(dirname "$f")" + # echo source first so it is matched to the cp's '-T' argument + readlink --canonicalize "${_SRC}/$f" + # echo output last so cp understands it's the output file + echo "${_DEST}/$f" + done | xargs -n2 -P${_WORKERS} cp --update -T + + +**Running this example** + +.. code-block:: bash + + $ sbatch job.sh diff --git a/docs/examples/data/hf/_index.rst b/docs/examples/data/hf/_index.rst new file mode 100644 index 00000000..b907baf3 --- /dev/null +++ b/docs/examples/data/hf/_index.rst @@ -0,0 +1,49 @@ +Hugging Face Dataset +==================== + + +**Prerequisites** + +Make sure to read the following sections of the documentation before using this example: + +* :ref:`pytorch_setup` +* :ref:`001 - Single GPU Job` + +The full source code for this example is available on `the mila-docs GitHub repository. `_ + + +**job.sh** + +.. literalinclude:: examples/data/hf/job.sh.diff + :language: diff + + +**main.py** + +.. literalinclude:: examples/data/hf/main.py.diff + :language: diff + + +**prepare_data.py** + +.. literalinclude:: examples/data/hf/prepare_data.py + :language: python + + +**get_dataset_cache_dir.py** + +.. literalinclude:: examples/data/hf/get_dataset_cache_dir.py + :language: python + + +**cp_data.sh** + +.. 
literalinclude:: examples/data/hf/cp_data.sh + :language: bash + + +**Running this example** + +.. code-block:: bash + + $ sbatch job.sh diff --git a/docs/examples/data/hf/cp_data.sh b/docs/examples/data/hf/cp_data.sh new file mode 100644 index 00000000..53d75a94 --- /dev/null +++ b/docs/examples/data/hf/cp_data.sh @@ -0,0 +1,16 @@ +#!/bin/bash +set -o errexit + +_SRC=$1 +_DEST=$2 +_WORKERS=$3 + +# Copy the dataset +(cd "${_SRC}" && find -L * -type f) | while read f +do + mkdir --parents "${_DEST}/$(dirname "$f")" + # echo source first so it is matched to the cp's '-T' argument + readlink --canonicalize "${_SRC}/$f" + # echo output last so cp understands it's the output file + echo "${_DEST}/$f" +done | xargs -n2 -P${_WORKERS} cp --update -T diff --git a/docs/examples/data/hf/get_dataset_cache_dir.py b/docs/examples/data/hf/get_dataset_cache_dir.py new file mode 100644 index 00000000..9c5740d3 --- /dev/null +++ b/docs/examples/data/hf/get_dataset_cache_dir.py @@ -0,0 +1,17 @@ +"""List to stdout the files of the dataset""" +import sys + +import datasets + + +# Redirect outputs to stderr to avoid noize in stdout +_stdout = sys.stdout +sys.stdout = sys.stderr + +try: + _CACHE_DIR = sys.argv[1] +except IndexError: + _CACHE_DIR = None + +builder = datasets.load_dataset_builder("the_pile", cache_dir=_CACHE_DIR, subsets=["all"], version="0.0.0") +print(builder.cache_dir, file=_stdout) diff --git a/docs/examples/data/hf/job.sh b/docs/examples/data/hf/job.sh new file mode 100644 index 00000000..383b4b7a --- /dev/null +++ b/docs/examples/data/hf/job.sh @@ -0,0 +1,90 @@ +#!/bin/bash +#SBATCH --gpus-per-task=rtx8000:1 +#SBATCH --cpus-per-task=4 +#SBATCH --ntasks-per-node=1 +#SBATCH --mem=24G +#SBATCH --time=02:00:00 +#SBATCH --tmp=1500G +set -o errexit + +function wrap_cmd { + for a in "$@" + do + echo -n \"$a\" "" + done +} + + +# Echo time and hostname into log +echo "Date: $(date)" +echo "Hostname: $(hostname)" + + +# Ensure only anaconda/3 module loaded. 
+module purge +# This example uses Conda to manage package dependencies. +# See https://docs.mila.quebec/Userguide.html#conda for more information. +module load anaconda/3 + + +# Creating the environment for the first time: +# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ +# pytorch-cuda=11.6 scipy -c pytorch -c nvidia +# Other conda packages: +# conda install -y -n pytorch -c conda-forge rich tqdm datasets + +# Activate pre-existing environment. +conda activate pytorch + + +# Prepare data for training +mkdir -p "$SLURM_TMPDIR/data" + +if [[ -z "${HF_DATASETS_CACHE}" ]] +then + # Store the huggingface datasets cache in $SCRATCH + export HF_DATASETS_CACHE=$SCRATCH/cache/huggingface/datasets +fi +if [[ -z "${_DATA_PREP_WORKERS}" ]] +then + _DATA_PREP_WORKERS=${SLURM_JOB_CPUS_PER_NODE} +fi +if [[ -z "${_DATA_PREP_WORKERS}" ]] +then + _DATA_PREP_WORKERS=16 +fi + +# Preprocess the dataset and cache the result such that the heavy work is done +# only once *ever* +# Required conda packages: +# conda install -y -c conda-forge zstandard +srun --ntasks=1 --ntasks-per-node=1 \ + time -p python3 prepare_data.py "/network/datasets/pile" ${_DATA_PREP_WORKERS} + +# Copy the preprocessed dataset to $SLURM_TMPDIR so it is close to the GPUs for +# faster training. This should be done once per compute node +cmd=( + # Having 'bash' here allows the execution of a script file which might not + # have the execution flag on + bash + cp_data.sh + # Get the current dataset cache + "$(python3 get_dataset_cache_dir.py)" + # Get the local dataset cache + # Use '' to delay the execution of the command as $SLURM_TMPDIR needs to be + # expanded on the local compute node rather than the master node + '$(python3 get_dataset_cache_dir.py "$SLURM_TMPDIR/data")' + ${_DATA_PREP_WORKERS} +) +# 'time' will objectively give a measure for the copy of the dataset. 
This can +# be used to compare the timing of multiple attempts in optimizing code and make +# sure any slow down doesn't come from the code itself +srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \ + time -p bash -c "$(wrap_cmd "${cmd[@]}")" + +# Use the local copy of the preprocessed dataset +export HF_DATASETS_CACHE="$SLURM_TMPDIR/data" + + +# Execute Python script +python main.py diff --git a/docs/examples/data/hf/main.py b/docs/examples/data/hf/main.py new file mode 100644 index 00000000..9edd35bf --- /dev/null +++ b/docs/examples/data/hf/main.py @@ -0,0 +1,144 @@ +"""Torchvision training example.""" +import logging +import os + +import datasets +import rich.logging +import torch +from torch import Tensor, nn +from torch.nn import functional as F +from torch.utils.data import DataLoader +from torchvision.models import resnet18 +from tqdm import tqdm + + +def main(): + training_epochs = 1 + learning_rate = 5e-4 + weight_decay = 1e-4 + batch_size = 256 + + # Check that the GPU is available + assert torch.cuda.is_available() and torch.cuda.device_count() > 0 + device = torch.device("cuda", 0) + + # Setup logging (optional, but much better than using print statements) + logging.basicConfig( + level=logging.INFO, + handlers=[rich.logging.RichHandler(markup=True)], # Very pretty, uses the `rich` package. + ) + + logger = logging.getLogger(__name__) + + # Create a model and move it to the GPU. 
+ model = resnet18() + model.to(device=device) + + optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) + + # Setup ImageNet + num_workers = get_num_workers() + dataset_path = "the_pile" + train_dataset, valid_dataset, test_dataset = make_datasets(dataset_path) + train_dataloader = DataLoader( + train_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=True, + ) + valid_dataloader = DataLoader( + valid_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + ) + test_dataloader = DataLoader( # NOTE: Not used in this example. + test_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + ) + + # Checkout the "checkpointing and preemption" example for more info! + logger.debug("Starting training from scratch.") + + for epoch in range(training_epochs): + logger.debug(f"Starting epoch {epoch}/{training_epochs}") + + # Set the model in training mode (this is important for e.g. BatchNorm and Dropout layers) + model.train() + + # NOTE: using a progress bar from tqdm because it's nicer than using `print`. + progress_bar = tqdm( + total=len(train_dataloader), + desc=f"Train epoch {epoch}", + ) + + # Training loop + for batch in train_dataloader: + # Move the batch to the GPU before we pass it to the model + batch = tuple(item.to(device) for item in batch) + + # [Training of the model goes here] + + # Advance the progress bar one step, and update the "postfix" () the progress bar. 
(nicer than just) + progress_bar.update(1) + progress_bar.close() + + val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) + logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") + + print("Done!") + + +@torch.no_grad() +def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device): + model.eval() + + total_loss = 0.0 + n_samples = 0 + correct_predictions = 0 + + for batch in dataloader: + batch = tuple(item.to(device) for item in batch) + x, y = batch + + logits: Tensor = model(x) + loss = F.cross_entropy(logits, y) + + batch_n_samples = x.shape[0] + batch_correct_predictions = logits.argmax(-1).eq(y).sum() + + total_loss += loss.item() + n_samples += batch_n_samples + correct_predictions += batch_correct_predictions + + accuracy = correct_predictions / n_samples + return total_loss, accuracy + + +def make_datasets(dataset_path: str): + """Returns the training, validation, and test splits for ImageNet. + + NOTE: We don't use transforms here for simplicity. + Having different transformations for train and validation would complicate things a bit. + Later examples will show how to do the train/val/test split properly when using transforms. 
+ """ + builder = datasets.load_dataset_builder(dataset_path, subsets=["all"], version="0.0.0") + train_dataset = builder.as_dataset(split="train").with_format("torch") + valid_dataset = builder.as_dataset(split="validation").with_format("torch") + test_dataset = builder.as_dataset(split="test").with_format("torch") + return train_dataset, valid_dataset, test_dataset + + +def get_num_workers() -> int: + """Gets the optimal number of DatLoader workers to use in the current job.""" + if "SLURM_CPUS_PER_TASK" in os.environ: + return int(os.environ["SLURM_CPUS_PER_TASK"]) + if hasattr(os, "sched_getaffinity"): + return len(os.sched_getaffinity(0)) + return torch.multiprocessing.cpu_count() + + +if __name__ == "__main__": + main() diff --git a/docs/examples/data/hf/prepare_data.py b/docs/examples/data/hf/prepare_data.py new file mode 100644 index 00000000..595539e1 --- /dev/null +++ b/docs/examples/data/hf/prepare_data.py @@ -0,0 +1,52 @@ +"""Preprocess the dataset. +In this example, HuggingFace is used and the resulting dataset will be stored in +$HF_DATASETS_CACHE. It is preferable to set the datasets cache to a location in +$SCRATCH""" +import sys +import time + +import datasets + + +_LOCAL_DS = sys.argv[1] +_LOCAL_DS_SPLITS = _LOCAL_DS.split("/") +try: + _WORKERS = int(sys.argv[2]) +except IndexError: + _WORKERS = 16 + +dl_config = datasets.DownloadConfig(cache_dir=_LOCAL_DS) + +# 'datasets' does not allow to use a local storage for the datasets' files using +# it's exposed API. 
Mocking the download func to force the usage of the local file +dl_man = datasets.DownloadManager(download_config=dl_config) +def dl(url_or_urls, *args, **kwargs): + import glob + local_files = ["/".join(_f.split("/")[len(_LOCAL_DS_SPLITS):]) + for _f in glob.glob(f"{_LOCAL_DS}/**", recursive=True)] + local_files.sort() + if isinstance(url_or_urls, str): + url_or_urls = [url_or_urls] + + # Replace all urls by local files if they can be found (a plain list is wrapped so it is visited like a dict value) + for v in (url_or_urls.values() if isinstance(url_or_urls, dict) else [url_or_urls]): + for i, url in enumerate(v): + for lf in local_files: + if lf and url.endswith(lf): + v[i] = f"{_LOCAL_DS}/{lf}" + local_files.remove(lf) + break + + # Continue normal download process which should only checksum the local + # files instead of downloading them + return _download(url_or_urls, *args, **kwargs) + +_download = dl_man.download +dl_man.download = dl +builder = datasets.load_dataset_builder("the_pile", download_config=dl_config, subsets=["all"], version="0.0.0") + +t = -time.time() +builder.download_and_prepare(dl_manager=dl_man, num_proc=_WORKERS) +t += time.time() + +print(f"Prepared data in {t/60:.2f}m") diff --git a/docs/examples/data/index.rst b/docs/examples/data/index.rst new file mode 100644 index 00000000..bd8e2691 --- /dev/null +++ b/docs/examples/data/index.rst @@ -0,0 +1,7 @@ +***************************** +Data Handling during Training +***************************** + + +.. include:: examples/data/torchvision/_index.rst +.. 
include:: examples/data/hf/_index.rst diff --git a/docs/examples/data/torchvision/README.rst b/docs/examples/data/torchvision/README.rst new file mode 100644 index 00000000..2d515fb0 --- /dev/null +++ b/docs/examples/data/torchvision/README.rst @@ -0,0 +1,337 @@ +Torchvision +=========== + + +**Prerequisites** + +Make sure to read the following sections of the documentation before using this example: + +* :ref:`pytorch_setup` +* :ref:`001 - Single GPU Job` + +The full source code for this example is available on `the mila-docs GitHub repository. `_ + + +**job.sh** + +.. code:: diff + + # distributed/001_single_gpu/job.sh -> data/torchvision/job.sh + #!/bin/bash + #SBATCH --gpus-per-task=rtx8000:1 + #SBATCH --cpus-per-task=4 + #SBATCH --ntasks-per-node=1 + #SBATCH --mem=16G + -#SBATCH --time=00:15:00 + +#SBATCH --time=01:30:00 + +set -o errexit + + + # Echo time and hostname into log + echo "Date: $(date)" + echo "Hostname: $(hostname)" + + + # Ensure only anaconda/3 module loaded. + module purge + # This example uses Conda to manage package dependencies. + # See https://docs.mila.quebec/Userguide.html#conda for more information. + module load anaconda/3 + + + + # Creating the environment for the first time: + # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ + -# pytorch-cuda=11.6 -c pytorch -c nvidia + +# pytorch-cuda=11.6 scipy -c pytorch -c nvidia + # Other conda packages: + -# conda install -y -n pytorch -c conda-forge rich + +# conda install -y -n pytorch -c conda-forge rich tqdm + + # Activate pre-existing environment. 
+ conda activate pytorch + + + -# Stage dataset into $SLURM_TMPDIR + -cp -a /network/datasets/cifar10.var/cifar10_torchvision $SLURM_TMPDIR + +# Prepare data for training + +mkdir -p "$SLURM_TMPDIR/data" + + + +if [[ -z "${_DATA_PREP_WORKERS}" ]] + +then + + _DATA_PREP_WORKERS=${SLURM_JOB_CPUS_PER_NODE} + +fi + +if [[ -z "${_DATA_PREP_WORKERS}" ]] + +then + + _DATA_PREP_WORKERS=16 + +fi + + +# Copy the dataset to $SLURM_TMPDIR so it is close to the GPUs for + +# faster training + +srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \ + + time -p bash data.sh "/network/datasets/inat" "$SLURM_TMPDIR/data" ${_DATA_PREP_WORKERS} + + # Execute Python script + python main.py + + +**main.py** + +.. code:: diff + + # distributed/001_single_gpu/main.py -> data/torchvision/main.py + -"""Single-GPU training example.""" + +"""Torchvision training example.""" + import logging + import os + + import rich.logging + import torch + from torch import Tensor, nn + from torch.nn import functional as F + from torch.utils.data import DataLoader, random_split + from torchvision import transforms + -from torchvision.datasets import CIFAR10 + +from torchvision.datasets import INaturalist + from torchvision.models import resnet18 + from tqdm import tqdm + + + def main(): + - training_epochs = 10 + + training_epochs = 1 + learning_rate = 5e-4 + weight_decay = 1e-4 + - batch_size = 128 + + batch_size = 256 + + # Check that the GPU is available + assert torch.cuda.is_available() and torch.cuda.device_count() > 0 + device = torch.device("cuda", 0) + + # Setup logging (optional, but much better than using print statements) + logging.basicConfig( + level=logging.INFO, + handlers=[rich.logging.RichHandler(markup=True)], # Very pretty, uses the `rich` package. + ) + + logger = logging.getLogger(__name__) + + # Create a model and move it to the GPU. 
+ - model = resnet18(num_classes=10) + + model = resnet18(num_classes=10000) + model.to(device=device) + + optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) + + - # Setup CIFAR10 + + # Setup ImageNet + num_workers = get_num_workers() + - dataset_path = os.environ.get("SLURM_TMPDIR", "../dataset") + + try: + + dataset_path = f"{os.environ['SLURM_TMPDIR']}/data" + + except KeyError: + + dataset_path = "../dataset" + train_dataset, valid_dataset, test_dataset = make_datasets(dataset_path) + train_dataloader = DataLoader( + train_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=True, + ) + valid_dataloader = DataLoader( + valid_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + ) + test_dataloader = DataLoader( # NOTE: Not used in this example. + test_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + ) + + # Checkout the "checkpointing and preemption" example for more info! + logger.debug("Starting training from scratch.") + + for epoch in range(training_epochs): + logger.debug(f"Starting epoch {epoch}/{training_epochs}") + + # Set the model in training mode (this is important for e.g. BatchNorm and Dropout layers) + model.train() + + # NOTE: using a progress bar from tqdm because it's nicer than using `print`. 
+ progress_bar = tqdm( + total=len(train_dataloader), + desc=f"Train epoch {epoch}", + ) + + # Training loop + for batch in train_dataloader: + # Move the batch to the GPU before we pass it to the model + batch = tuple(item.to(device) for item in batch) + x, y = batch + + # Forward pass + logits: Tensor = model(x) + + loss = F.cross_entropy(logits, y) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # Calculate some metrics: + n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() + n_samples = y.shape[0] + accuracy = n_correct_predictions / n_samples + + logger.debug(f"Accuracy: {accuracy.item():.2%}") + logger.debug(f"Average Loss: {loss.item()}") + + # Advance the progress bar one step, and update the "postfix" () the progress bar. (nicer than just) + progress_bar.update(1) + progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) + progress_bar.close() + + val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) + logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") + + print("Done!") + + + @torch.no_grad() + def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device): + model.eval() + + total_loss = 0.0 + n_samples = 0 + correct_predictions = 0 + + for batch in dataloader: + batch = tuple(item.to(device) for item in batch) + x, y = batch + + logits: Tensor = model(x) + loss = F.cross_entropy(logits, y) + + batch_n_samples = x.shape[0] + batch_correct_predictions = logits.argmax(-1).eq(y).sum() + + total_loss += loss.item() + n_samples += batch_n_samples + correct_predictions += batch_correct_predictions + + accuracy = correct_predictions / n_samples + return total_loss, accuracy + + + def make_datasets( + dataset_path: str, + val_split: float = 0.1, + val_split_seed: int = 42, + ): + - """Returns the training, validation, and test splits for CIFAR10. + + """Returns the training, validation, and test splits for ImageNet. 
+ + NOTE: We don't use image transforms here for simplicity. + Having different transformations for train and validation would complicate things a bit. + Later examples will show how to do the train/val/test split properly when using transforms. + """ + - train_dataset = CIFAR10( + - root=dataset_path, transform=transforms.ToTensor(), download=True, train=True + + train_dataset = INaturalist( + + root=dataset_path, + + transform=transforms.Compose([ + + transforms.Resize(256), + + transforms.CenterCrop(224), + + transforms.ToTensor(), + + ]), + + version="2021_train" + ) + - test_dataset = CIFAR10( + - root=dataset_path, transform=transforms.ToTensor(), download=True, train=False + + test_dataset = INaturalist( + + root=dataset_path, + + transform=transforms.Compose([ + + transforms.Resize(256), + + transforms.CenterCrop(224), + + transforms.ToTensor(), + + ]), + + version="2021_valid" + ) + # Split the training dataset into a training and validation set. + train_dataset, valid_dataset = random_split( + train_dataset, ((1 - val_split), val_split), torch.Generator().manual_seed(val_split_seed) + ) + return train_dataset, valid_dataset, test_dataset + + + def get_num_workers() -> int: + """Gets the optimal number of DatLoader workers to use in the current job.""" + if "SLURM_CPUS_PER_TASK" in os.environ: + return int(os.environ["SLURM_CPUS_PER_TASK"]) + if hasattr(os, "sched_getaffinity"): + return len(os.sched_getaffinity(0)) + return torch.multiprocessing.cpu_count() + + + if __name__ == "__main__": + main() + + +**data.sh** + +.. 
"""Make sure the data is available"""
import sys
import time

from torchvision.datasets import INaturalist


def main(argv=None):
    """Prepare both iNaturalist 2021 splits under the directory given in argv.

    `argv` defaults to ``sys.argv``; its second element is the dataset root.
    Instantiating ``INaturalist`` with ``download=True`` is what makes the
    data available; the dataset objects themselves are discarded.

    Returns the process exit status: 0 on success, 2 when the root directory
    argument is missing.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        # Fail with a usage message rather than a bare IndexError.
        print(f"Usage: {argv[0] if argv else 'data.py'} DATASET_ROOT", file=sys.stderr)
        return 2
    root = argv[1]

    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # wall-clock adjustments during a long extraction.
    start = time.perf_counter()
    INaturalist(root=root, version="2021_train", download=True)
    INaturalist(root=root, version="2021_valid", download=True)
    t = time.perf_counter() - start
    print(f"Prepared data in {t/60:.2f}m")
    return 0


if __name__ == "__main__":
    sys.exit(main())
#!/bin/bash
# Mirror a dataset directory into a local directory as a tree of symlinks,
# rename the archives to the names torchvision expects, then run data.py on
# the result.
#
# Usage: data.sh SRC DEST [WORKERS]
set -o errexit
# Without pipefail a failing `cd`/`find` below would be masked by xargs' status.
set -o pipefail

if [[ $# -lt 2 ]]
then
    echo "Usage: $0 SRC DEST [WORKERS]" >&2
    exit 1
fi

_SRC=$1
_DEST=$2
# Default to one worker so an empty value never reaches xargs' -P option.
_WORKERS=${3:-1}

# Clone the dataset structure locally and reorganise the raw files if needed.
# Paths are passed NUL-delimited so names containing spaces, quotes or
# backslashes survive both `read` and `xargs`.
(cd "${_SRC}" && find -L . -type f -print0) | while IFS= read -r -d '' f
do
    mkdir --parents "${_DEST}/$(dirname "$f")"
    # emit source first so it is matched to the ln's '-T' argument
    printf '%s\0' "$(readlink --canonicalize "${_SRC}/$f")"
    # emit output last so ln understands it's the output file
    printf '%s\0' "${_DEST}/$f"
done | xargs -0 -n2 -P"${_WORKERS}" ln --symbolic --force -T

(
    cd "${_DEST}"
    # Torchvision expects these names
    mv train.tar.gz 2021_train.tgz
    mv val.tar.gz 2021_valid.tgz
)

# Extract and prepare the data
python3 data.py "${_DEST}"
"Date: $(date)" +echo "Hostname: $(hostname)" + + +# Ensure only anaconda/3 module loaded. +module purge +# This example uses Conda to manage package dependencies. +# See https://docs.mila.quebec/Userguide.html#conda for more information. +module load anaconda/3 + + +# Creating the environment for the first time: +# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ +# pytorch-cuda=11.6 scipy -c pytorch -c nvidia +# Other conda packages: +# conda install -y -n pytorch -c conda-forge rich tqdm + +# Activate pre-existing environment. +conda activate pytorch + + +# Prepare data for training +mkdir -p "$SLURM_TMPDIR/data" + +if [[ -z "${_DATA_PREP_WORKERS}" ]] +then + _DATA_PREP_WORKERS=${SLURM_JOB_CPUS_PER_NODE} +fi +if [[ -z "${_DATA_PREP_WORKERS}" ]] +then + _DATA_PREP_WORKERS=16 +fi + +# Copy the dataset to $SLURM_TMPDIR so it is close to the GPUs for +# faster training +srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \ + time -p bash data.sh "/network/datasets/inat" "$SLURM_TMPDIR/data" ${_DATA_PREP_WORKERS} + +# Execute Python script +python main.py diff --git a/docs/examples/data/torchvision/main.py b/docs/examples/data/torchvision/main.py new file mode 100644 index 00000000..015394e0 --- /dev/null +++ b/docs/examples/data/torchvision/main.py @@ -0,0 +1,187 @@ +"""Torchvision training example.""" +import logging +import os + +import rich.logging +import torch +from torch import Tensor, nn +from torch.nn import functional as F +from torch.utils.data import DataLoader, random_split +from torchvision import transforms +from torchvision.datasets import INaturalist +from torchvision.models import resnet18 +from tqdm import tqdm + + +def main(): + training_epochs = 1 + learning_rate = 5e-4 + weight_decay = 1e-4 + batch_size = 256 + + # Check that the GPU is available + assert torch.cuda.is_available() and torch.cuda.device_count() > 0 + device = torch.device("cuda", 0) + + # Setup logging (optional, but much better than using print 
statements) + logging.basicConfig( + level=logging.INFO, + handlers=[rich.logging.RichHandler(markup=True)], # Very pretty, uses the `rich` package. + ) + + logger = logging.getLogger(__name__) + + # Create a model and move it to the GPU. + model = resnet18(num_classes=10000) + model.to(device=device) + + optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) + + # Setup ImageNet + num_workers = get_num_workers() + try: + dataset_path = f"{os.environ['SLURM_TMPDIR']}/data" + except KeyError: + dataset_path = "../dataset" + train_dataset, valid_dataset, test_dataset = make_datasets(dataset_path) + train_dataloader = DataLoader( + train_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=True, + ) + valid_dataloader = DataLoader( + valid_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + ) + test_dataloader = DataLoader( # NOTE: Not used in this example. + test_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + ) + + # Checkout the "checkpointing and preemption" example for more info! + logger.debug("Starting training from scratch.") + + for epoch in range(training_epochs): + logger.debug(f"Starting epoch {epoch}/{training_epochs}") + + # Set the model in training mode (this is important for e.g. BatchNorm and Dropout layers) + model.train() + + # NOTE: using a progress bar from tqdm because it's nicer than using `print`. 
def make_datasets(
    dataset_path: str,
    val_split: float = 0.1,
    val_split_seed: int = 42,
):
    """Return the training, validation, and test splits for iNaturalist 2021.

    The ``2021_train`` version is split into training and validation subsets;
    the ``2021_valid`` version is returned as the test set. No ``download``
    argument is passed here, so the data must already be prepared under
    `dataset_path`.

    Args:
        dataset_path: Root directory handed to ``INaturalist``.
        val_split: Fraction of ``2021_train`` held out for validation.
        val_split_seed: Seed of the generator that drives the random split.

    NOTE: Every split uses the same resize / center-crop / to-tensor pipeline.
    Having different transformations for train and validation would complicate things a bit.
    Later examples will show how to do the train/val/test split properly when using transforms.
    """
    # One shared pipeline instead of two identical copies.
    image_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
    ])
    train_dataset = INaturalist(
        root=dataset_path,
        transform=image_transform,
        version="2021_train",
    )
    test_dataset = INaturalist(
        root=dataset_path,
        transform=image_transform,
        version="2021_valid",
    )
    # Split the training dataset into a training and validation set.
    train_dataset, valid_dataset = random_split(
        train_dataset, ((1 - val_split), val_split), torch.Generator().manual_seed(val_split_seed)
    )
    return train_dataset, valid_dataset, test_dataset
code:: bash + + #!/bin/bash + #SBATCH --gpus-per-task=rtx8000:1 + #SBATCH --cpus-per-task=4 + #SBATCH --ntasks-per-node=1 + #SBATCH --mem=16G + #SBATCH --time=00:15:00 + + + # Echo time and hostname into log + echo "Date: $(date)" + echo "Hostname: $(hostname)" + + + # Ensure only anaconda/3 module loaded. + module purge + # This example uses Conda to manage package dependencies. + # See https://docs.mila.quebec/Userguide.html#conda for more information. + module load anaconda/3 + + # Creating the environment for the first time: + # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ + # pytorch-cuda=11.6 -c pytorch -c nvidia + # Other conda packages: + # conda install -y -n pytorch -c conda-forge rich + + # Activate pre-existing environment. + conda activate pytorch + + + # Stage dataset into $SLURM_TMPDIR + cp -a /network/datasets/cifar10.var/cifar10_torchvision $SLURM_TMPDIR + + + # Execute Python script + python main.py + + +**main.py** + +.. code:: python + + """Single-GPU training example.""" + import logging + import os + + import rich.logging + import torch + from torch import Tensor, nn + from torch.nn import functional as F + from torch.utils.data import DataLoader, random_split + from torchvision import transforms + from torchvision.datasets import CIFAR10 + from torchvision.models import resnet18 + from tqdm import tqdm + + + def main(): + training_epochs = 10 + learning_rate = 5e-4 + weight_decay = 1e-4 + batch_size = 128 + + # Check that the GPU is available + assert torch.cuda.is_available() and torch.cuda.device_count() > 0 + device = torch.device("cuda", 0) + + # Setup logging (optional, but much better than using print statements) + logging.basicConfig( + level=logging.INFO, + handlers=[rich.logging.RichHandler(markup=True)], # Very pretty, uses the `rich` package. + ) + + logger = logging.getLogger(__name__) + + # Create a model and move it to the GPU. 
+ model = resnet18(num_classes=10) + model.to(device=device) + + optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) + + # Setup CIFAR10 + num_workers = get_num_workers() + dataset_path = os.environ.get("SLURM_TMPDIR", "../dataset") + train_dataset, valid_dataset, test_dataset = make_datasets(dataset_path) + train_dataloader = DataLoader( + train_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=True, + ) + valid_dataloader = DataLoader( + valid_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + ) + test_dataloader = DataLoader( # NOTE: Not used in this example. + test_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + ) + + # Checkout the "checkpointing and preemption" example for more info! + logger.debug("Starting training from scratch.") + + for epoch in range(training_epochs): + logger.debug(f"Starting epoch {epoch}/{training_epochs}") + + # Set the model in training mode (this is important for e.g. BatchNorm and Dropout layers) + model.train() + + # NOTE: using a progress bar from tqdm because it's nicer than using `print`. + progress_bar = tqdm( + total=len(train_dataloader), + desc=f"Train epoch {epoch}", + ) + + # Training loop + for batch in train_dataloader: + # Move the batch to the GPU before we pass it to the model + batch = tuple(item.to(device) for item in batch) + x, y = batch + + # Forward pass + logits: Tensor = model(x) + + loss = F.cross_entropy(logits, y) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # Calculate some metrics: + n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() + n_samples = y.shape[0] + accuracy = n_correct_predictions / n_samples + + logger.debug(f"Accuracy: {accuracy.item():.2%}") + logger.debug(f"Average Loss: {loss.item()}") + + # Advance the progress bar one step, and update the "postfix" () the progress bar. 
def make_datasets(
    dataset_path: str,
    val_split: float = 0.1,
    val_split_seed: int = 42,
):
    """Build the CIFAR10 training, validation, and test datasets.

    Both CIFAR10 splits are created with ``download=True`` under
    `dataset_path`; a seeded random split then holds out `val_split` of the
    training set for validation.

    NOTE: Only ``ToTensor`` is applied, for simplicity.
    Having different transformations for train and validation would complicate things a bit.
    Later examples will show how to do the train/val/test split properly when using transforms.
    """
    # Train split first, then test split, both with the same construction arguments.
    full_train, test_dataset = (
        CIFAR10(root=dataset_path, transform=transforms.ToTensor(), download=True, train=is_train)
        for is_train in (True, False)
    )
    # Split the training dataset into a training and validation set.
    split_rng = torch.Generator().manual_seed(val_split_seed)
    fractions = ((1 - val_split), val_split)
    train_dataset, valid_dataset = random_split(full_train, fractions, split_rng)
    return train_dataset, valid_dataset, test_dataset
def main() -> None:
    """Train a ResNet-18 on CIFAR10 on a single GPU and log validation metrics.

    Hyper-parameters are fixed at the top of the function. The dataset root is
    read from the ``SLURM_TMPDIR`` environment variable, falling back to
    ``../dataset``. After every epoch the validation loss and accuracy are
    logged at INFO level.
    """
    training_epochs = 10
    learning_rate = 5e-4
    weight_decay = 1e-4
    batch_size = 128

    # Check that the GPU is available
    assert torch.cuda.is_available() and torch.cuda.device_count() > 0
    device = torch.device("cuda", 0)

    # Setup logging (optional, but much better than using print statements)
    # NOTE: the level is INFO, so the `logger.debug` calls below stay silent
    # unless the level is lowered.
    logging.basicConfig(
        level=logging.INFO,
        handlers=[rich.logging.RichHandler(markup=True)],  # Very pretty, uses the `rich` package.
    )

    logger = logging.getLogger(__name__)

    # Create a model and move it to the GPU.
    model = resnet18(num_classes=10)
    model.to(device=device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # Setup CIFAR10
    num_workers = get_num_workers()
    dataset_path = os.environ.get("SLURM_TMPDIR", "../dataset")
    train_dataset, valid_dataset, test_dataset = make_datasets(dataset_path)
    # Only the training loader shuffles.
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=True,
    )
    valid_dataloader = DataLoader(
        valid_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
    )
    test_dataloader = DataLoader(  # NOTE: Not used in this example.
        test_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
    )

    # Checkout the "checkpointing and preemption" example for more info!
    logger.debug("Starting training from scratch.")

    for epoch in range(training_epochs):
        logger.debug(f"Starting epoch {epoch}/{training_epochs}")

        # Set the model in training mode (this is important for e.g. BatchNorm and Dropout layers)
        model.train()

        # NOTE: using a progress bar from tqdm because it's nicer than using `print`.
        progress_bar = tqdm(
            total=len(train_dataloader),
            desc=f"Train epoch {epoch}",
        )

        # Training loop
        for batch in train_dataloader:
            # Move the batch to the GPU before we pass it to the model
            batch = tuple(item.to(device) for item in batch)
            x, y = batch

            # Forward pass
            logits: Tensor = model(x)

            loss = F.cross_entropy(logits, y)

            # Clear stale gradients, backpropagate, then apply the update.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Calculate some metrics:
            # `detach()` keeps the accuracy computation out of the autograd graph.
            n_correct_predictions = logits.detach().argmax(-1).eq(y).sum()
            n_samples = y.shape[0]
            accuracy = n_correct_predictions / n_samples

            logger.debug(f"Accuracy: {accuracy.item():.2%}")
            logger.debug(f"Average Loss: {loss.item()}")

            # Advance the progress bar one step and refresh its postfix with
            # this batch's loss and accuracy.
            progress_bar.update(1)
            progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item())
        progress_bar.close()

        # `validation_loop` also switches the model to eval mode; the next
        # epoch re-enables training mode at the top of the loop.
        val_loss, val_accuracy = validation_loop(model, valid_dataloader, device)
        logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}")

    print("Done!")
def get_num_workers() -> int:
    """Return how many DataLoader worker processes the current job should use.

    The value comes from the first source that is available: the
    ``SLURM_CPUS_PER_TASK`` environment variable, the size of this process's
    CPU affinity set, or the CPU count reported by ``torch.multiprocessing``.
    """
    slurm_cpus = os.environ.get("SLURM_CPUS_PER_TASK")
    if slurm_cpus is not None:
        return int(slurm_cpus)

    try:
        # Not every platform provides `os.sched_getaffinity`.
        affinity_of = os.sched_getaffinity
    except AttributeError:
        return torch.multiprocessing.cpu_count()
    return len(affinity_of(0))
def main():
    """Print PyTorch's CUDA build/availability status and the name of each detected GPU."""
    built = torch.backends.cuda.is_built()
    available = torch.cuda.is_available()
    n_gpus = torch.cuda.device_count()

    summary = (
        f"PyTorch built with CUDA: {built}",
        f"PyTorch detects CUDA available: {available}",
        f"PyTorch-detected #GPUs: {n_gpus}",
    )
    for line in summary:
        print(line)

    # Nothing to enumerate when no device is visible.
    if n_gpus == 0:
        print(" No GPU detected, not printing devices' names.")
        return
    for index in range(n_gpus):
        print(f" GPU {index}: {torch.cuda.get_device_name(index)}")
+ +* :ref:`Quick Start` +* :ref:`Running your code` +* :ref:`Conda` + + +**job.sh** + + +.. literalinclude:: examples/frameworks/pytorch_setup/job.sh + :language: bash + + +**main.py** + + +.. literalinclude:: examples/frameworks/pytorch_setup/main.py + :language: python + + +**Running this example** + + +.. code-block:: bash + + $ sbatch job.sh diff --git a/docs/examples/frameworks/pytorch_setup/job.sh b/docs/examples/frameworks/pytorch_setup/job.sh index db126819..6f50e07d 100644 --- a/docs/examples/frameworks/pytorch_setup/job.sh +++ b/docs/examples/frameworks/pytorch_setup/job.sh @@ -17,6 +17,8 @@ module load anaconda/3 # Creating the environment for the first time: # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ # pytorch-cuda=11.6 -c pytorch -c nvidia +# Other conda packages: +# conda install -y -n pytorch -c conda-forge rich # Activate the environment: conda activate pytorch diff --git a/docs/examples/generate_diffs.sh b/docs/examples/generate_diffs.sh new file mode 100755 index 00000000..106cc32b --- /dev/null +++ b/docs/examples/generate_diffs.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Use this to update the diffs based on the contents of the files. + +pushd `dirname "${BASH_SOURCE[0]}"` >/dev/null +_SCRIPT_DIR=`pwd -P` +popd >/dev/null + +set -e + +generate_diff() { + echo "Generating diff for docs/examples/$1 -> docs/examples/$2" + # NOTE: Assuming that this gets run from the `docs` folder (as is the case when building the docs). + + # Write a diff file to be shown in the documentation. 
def _expand_literalincludes(lines, docs_root):
    """Return a copy of `lines` with every ``literalinclude`` inlined as a ``code`` block."""
    directive = ".. literalinclude:: "
    language_option = ":language:"
    expanded = []
    i = 0
    while i < len(lines):
        line = lines[i]
        if not line.startswith(directive):
            expanded.append(line)
            i += 1
            continue

        rel_path = line[len(directive):].strip(" ")
        lang = ""
        i += 1
        # Consume the directive's option lines. Only `:language:` is kept;
        # any other option is dropped because it has no `code` equivalent here.
        while i < len(lines):
            option = lines[i].strip(" ")
            if not option:
                # The blank line ending the directive is swallowed as well.
                i += 1
                break
            if option.startswith(directive):
                # Leave a directly following directive for the outer loop
                # instead of discarding it.
                break
            if option.startswith(language_option):
                lang = option[len(language_option):].strip(" ")
            i += 1

        # A leading "/" means "relative to the documentation root" in Sphinx;
        # strip it so pathlib does not treat the path as filesystem-absolute.
        included = (docs_root / rel_path.lstrip("/")).read_text(encoding="utf-8")
        expanded.append(f".. code:: {lang}")
        expanded.append("")
        expanded.extend(f" {text}" for text in included.split("\n"))
    return expanded


def preprocess(docs_root=None):
    """Generate a ``README.rst`` next to every ``examples/**/_index.rst``.

    Each README is the ``_index.rst`` content with its ``literalinclude``
    directives replaced by ``code`` blocks that embed the referenced file.

    Args:
        docs_root: Directory that contains ``examples/`` and against which
            include paths are resolved. Defaults to the parent of the
            directory holding this script.

    Raises:
        OSError: If a referenced file cannot be read or a README cannot be
            written.
    """
    root = Path(__file__).parent.parent if docs_root is None else Path(docs_root)
    for index_file in glob(str(root / "examples/**/_index.rst"), recursive=True):
        index_path = Path(index_file)
        lines = index_path.read_text(encoding="utf-8").split("\n")
        readme = index_path.with_name("README.rst")
        readme.write_text("\n".join(_expand_literalincludes(lines, root)), encoding="utf-8")


if __name__ == "__main__":
    preprocess()