Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DOC-35] Add tests that run the examples on the cluster #209

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 3 additions & 7 deletions docs/examples/distributed/multi_gpu/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ Click here to see `the code for this example
#SBATCH --gpus-per-task=rtx8000:1
#SBATCH --cpus-per-task=4
-#SBATCH --ntasks-per-node=1
+#SBATCH --ntasks-per-node=4
+#SBATCH --ntasks-per-node=2
#SBATCH --mem=16G
#SBATCH --time=00:15:00

Expand All @@ -45,13 +45,9 @@ Click here to see `the code for this example
module load anaconda/3
module load cuda/11.7

# Creating the environment for the first time:
# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
# pytorch-cuda=11.7 -c pytorch -c nvidia
# Other conda packages:
# conda install -y -n pytorch -c conda-forge rich tqdm

+
# Activate pre-existing environment.
# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
conda activate pytorch


Expand Down
8 changes: 2 additions & 6 deletions docs/examples/distributed/multi_gpu/job.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash
#SBATCH --gpus-per-task=rtx8000:1
#SBATCH --cpus-per-task=4
#SBATCH --ntasks-per-node=4
#SBATCH --ntasks-per-node=2
#SBATCH --mem=16G
#SBATCH --time=00:15:00

Expand All @@ -18,13 +18,9 @@ module --quiet purge
module load anaconda/3
module load cuda/11.7

# Creating the environment for the first time:
# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
# pytorch-cuda=11.7 -c pytorch -c nvidia
# Other conda packages:
# conda install -y -n pytorch -c conda-forge rich tqdm

# Activate pre-existing environment.
# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
conda activate pytorch


Expand Down
10 changes: 3 additions & 7 deletions docs/examples/distributed/multi_node/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ Click here to see `the source code for this example
#!/bin/bash
#SBATCH --gpus-per-task=rtx8000:1
#SBATCH --cpus-per-task=4
#SBATCH --ntasks-per-node=4
#SBATCH --ntasks-per-node=2
+#SBATCH --nodes=2
#SBATCH --mem=16G
#SBATCH --time=00:15:00
Expand All @@ -47,13 +47,9 @@ Click here to see `the source code for this example
module load anaconda/3
module load cuda/11.7

# Creating the environment for the first time:
# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
# pytorch-cuda=11.7 -c pytorch -c nvidia
# Other conda packages:
# conda install -y -n pytorch -c conda-forge rich tqdm

-
# Activate pre-existing environment.
# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
conda activate pytorch

-
Expand Down
9 changes: 2 additions & 7 deletions docs/examples/distributed/multi_node/job.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash
#SBATCH --gpus-per-task=rtx8000:1
#SBATCH --cpus-per-task=4
#SBATCH --ntasks-per-node=4
#SBATCH --ntasks-per-node=2
#SBATCH --nodes=2
#SBATCH --mem=16G
#SBATCH --time=00:15:00
Expand All @@ -19,13 +19,8 @@ module --quiet purge
module load anaconda/3
module load cuda/11.7

# Creating the environment for the first time:
# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
# pytorch-cuda=11.7 -c pytorch -c nvidia
# Other conda packages:
# conda install -y -n pytorch -c conda-forge rich tqdm

# Activate pre-existing environment.
# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
conda activate pytorch

# Stage dataset into $SLURM_TMPDIR (only on the first worker of each node)
Expand Down
7 changes: 1 addition & 6 deletions docs/examples/distributed/single_gpu/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,8 @@ repository.
module load anaconda/3
module load cuda/11.7

# Creating the environment for the first time:
# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
# pytorch-cuda=11.7 -c pytorch -c nvidia
# Other conda packages:
# conda install -y -n pytorch -c conda-forge rich tqdm

# Activate pre-existing environment.
# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
conda activate pytorch


Expand Down
7 changes: 1 addition & 6 deletions docs/examples/distributed/single_gpu/job.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,8 @@ module --quiet purge
module load anaconda/3
module load cuda/11.7

# Creating the environment for the first time:
# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
# pytorch-cuda=11.7 -c pytorch -c nvidia
# Other conda packages:
# conda install -y -n pytorch -c conda-forge rich tqdm

# Activate pre-existing environment.
# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
conda activate pytorch


Expand Down
15 changes: 7 additions & 8 deletions docs/examples/frameworks/jax/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ repository.
.. code:: diff

# distributed/single_gpu/job.sh -> frameworks/jax/job.sh
old mode 100755
new mode 100644
#!/bin/bash
#SBATCH --gpus-per-task=rtx8000:1
#SBATCH --cpus-per-task=4
Expand All @@ -45,11 +47,10 @@ repository.
module load anaconda/3
-module load cuda/11.7

# Creating the environment for the first time:
-# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-# pytorch-cuda=11.7 -c pytorch -c nvidia
-# Other conda packages:
-# conda install -y -n pytorch -c conda-forge rich tqdm
-# Activate pre-existing environment.
-# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
-conda activate pytorch
+# Creating the environment for the first time:
+# conda create -y -n jax_ex -c "nvidia/label/cuda-11.8.0" cuda python=3.9 virtualenv pip
+# conda activate jax_ex
+# Install Jax using `pip`
Expand All @@ -59,9 +60,7 @@ repository.
+# -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
+# Other pip packages:
+# pip install pillow optax rich torch torchvision flax tqdm

-# Activate pre-existing environment.
-conda activate pytorch
+
+# Activate the environment:
+conda activate jax_ex

Expand Down
11 changes: 5 additions & 6 deletions docs/examples/frameworks/jax_setup/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ repository.
.. code:: diff

# frameworks/pytorch_setup/job.sh -> frameworks/jax_setup/job.sh
old mode 100755
new mode 100644
#!/bin/bash
#SBATCH --gres=gpu:1
#SBATCH --cpus-per-task=1
Expand All @@ -39,20 +41,17 @@ repository.
# See https://docs.mila.quebec/Userguide.html#conda for more information.
module load anaconda/3

# Creating the environment for the first time:
-# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-# pytorch-cuda=11.6 -c pytorch -c nvidia
-# Other conda packages:
-# conda install -y -n pytorch -c conda-forge rich
+# Creating the environment for the first time:
+# conda create -y -n jax_ex -c "nvidia/label/cuda-11.8.0" cuda python=3.9 virtualenv pip
+# conda activate jax_ex
+# Install Jax using `pip`
+# *Please note* that as soon as you install packages from `pip install`, you
+# should not install any more packages using `conda install`
+# pip install --upgrade "jax[cuda11_pip]" \
+# -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html

+
# Activate the environment:
-# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
-conda activate pytorch
+conda activate jax_ex

Expand Down
55 changes: 29 additions & 26 deletions docs/examples/frameworks/pytorch_setup/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,8 @@ repository.
# See https://docs.mila.quebec/Userguide.html#conda for more information.
module load anaconda/3

# Creating the environment for the first time:
# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
# pytorch-cuda=11.6 -c pytorch -c nvidia
# Other conda packages:
# conda install -y -n pytorch -c conda-forge rich

# Activate the environment:
# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
conda activate pytorch

# Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0
Expand Down Expand Up @@ -92,27 +87,35 @@ Note that we are requesting a GPU for this job, even though we're only going to
install packages. This is because we want PyTorch to be installed with GPU
support, and to have all the required libraries.

.. code-block:: bash
.. code:: bash

#!/bin/bash
#SBATCH --gres=gpu:1
#SBATCH --cpus-per-task=1
#SBATCH --mem=16G
#SBATCH --time=00:30:00

$ salloc --gres=gpu:1 --cpus-per-task=4 --mem=16G --time=00:30:00
salloc: --------------------------------------------------------------------------------------------------
salloc: # Using default long partition
salloc: --------------------------------------------------------------------------------------------------
salloc: Pending job allocation 2959785
salloc: job 2959785 queued and waiting for resources
salloc: job 2959785 has been allocated resources
salloc: Granted job allocation 2959785
salloc: Waiting for resource configuration
salloc: Nodes cn-g022 are ready for job
$ # Load anaconda
$ module load anaconda/3
$ # Create the environment (see the example):
$ conda create -n pytorch python=3.9 pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia
(...)
$ # Press 'y' to accept if everything looks good.
(...)
$ # Activate the environment:
$ conda activate pytorch
# NOTE: Run this either with `sbatch make_env.sh` or within an interactive job with `salloc`:
# salloc --gres=gpu:1 --cpus-per-task=1 --mem=16G --time=00:30:00

# Exit on error
set -e

module --quiet purge
module load anaconda/3
module load cuda/11.7

ENV_NAME=pytorch

## Create the environment (see the example):
conda create --yes --name $ENV_NAME python=3.9 pytorch torchvision torchaudio pytorch-cuda=11.7 --channel pytorch --channel nvidia
# Install as many packages as possible with Conda:
conda install --yes --name $ENV_NAME tqdm --channel conda-forge
# Activate the environment:
conda activate $ENV_NAME
# Install the rest of the packages with pip:
pip install rich
conda env export --no-builds --from-history --file environment.yaml

Exit the interactive job once the environment has been created. Then, the
example can be launched to confirm that everything works:
Expand Down
23 changes: 2 additions & 21 deletions docs/examples/frameworks/pytorch_setup/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,27 +38,8 @@ Note that we are requesting a GPU for this job, even though we're only going to
install packages. This is because we want PyTorch to be installed with GPU
support, and to have all the required libraries.

.. code-block:: bash

$ salloc --gres=gpu:1 --cpus-per-task=4 --mem=16G --time=00:30:00
salloc: --------------------------------------------------------------------------------------------------
salloc: # Using default long partition
salloc: --------------------------------------------------------------------------------------------------
salloc: Pending job allocation 2959785
salloc: job 2959785 queued and waiting for resources
salloc: job 2959785 has been allocated resources
salloc: Granted job allocation 2959785
salloc: Waiting for resource configuration
salloc: Nodes cn-g022 are ready for job
$ # Load anaconda
$ module load anaconda/3
$ # Create the environment (see the example):
$ conda create -n pytorch python=3.9 pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia
(...)
$ # Press 'y' to accept if everything looks good.
(...)
$ # Activate the environment:
$ conda activate pytorch
.. literalinclude:: examples/frameworks/pytorch_setup/make_env.sh
:language: bash

Exit the interactive job once the environment has been created. Then, the
example can be launched to confirm that everything works:
Expand Down
7 changes: 1 addition & 6 deletions docs/examples/frameworks/pytorch_setup/job.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,8 @@ module --quiet purge
# See https://docs.mila.quebec/Userguide.html#conda for more information.
module load anaconda/3

# Creating the environment for the first time:
# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
# pytorch-cuda=11.6 -c pytorch -c nvidia
# Other conda packages:
# conda install -y -n pytorch -c conda-forge rich

# Activate the environment:
# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
conda activate pytorch

# Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0
Expand Down
27 changes: 27 additions & 0 deletions docs/examples/frameworks/pytorch_setup/make_env.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/bash
#SBATCH --gres=gpu:1
#SBATCH --cpus-per-task=1
#SBATCH --mem=16G
#SBATCH --time=00:30:00

# Creates the `pytorch` conda environment that the example job scripts
# activate, then writes the environment spec to `environment.yaml`.
#
# NOTE: Run this either with `sbatch make_env.sh` or within an interactive job with `salloc`:
# salloc --gres=gpu:1 --cpus-per-task=1 --mem=16G --time=00:30:00

# Exit on error
set -e

# Start from a clean module state, then load the tools used below.
module --quiet purge
module load anaconda/3
module load cuda/11.7

# Name of the conda environment created and activated by this script.
ENV_NAME=pytorch

## Create the environment (see the example):
conda create --yes --name "$ENV_NAME" \
  python=3.9 pytorch torchvision torchaudio pytorch-cuda=11.7 \
  --channel pytorch --channel nvidia
# Install as many packages as possible with Conda:
conda install --yes --name "$ENV_NAME" tqdm --channel conda-forge
# Activate the environment:
conda activate "$ENV_NAME"
# Install the rest of the packages with pip:
pip install rich
# Save the environment spec (built from the explicitly requested packages,
# without build strings) so that the environment can be recreated later.
conda env export --no-builds --from-history --file environment.yaml
10 changes: 1 addition & 9 deletions docs/examples/good_practices/checkpointing/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ repository.
.. code:: diff

# distributed/single_gpu/job.sh -> good_practices/checkpointing/job.sh
old mode 100644
new mode 100755
#!/bin/bash
-#SBATCH --gpus-per-task=rtx8000:1
+#SBATCH --gpus-per-task=1
Expand Down Expand Up @@ -55,14 +53,8 @@ repository.
module load cuda/11.7

+
# Creating the environment for the first time:
# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-# pytorch-cuda=11.7 -c pytorch -c nvidia
+# pytorch-cuda=11.7 scipy -c pytorch -c nvidia
# Other conda packages:
# conda install -y -n pytorch -c conda-forge rich tqdm

# Activate pre-existing environment.
# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
conda activate pytorch


Expand Down
7 changes: 1 addition & 6 deletions docs/examples/good_practices/checkpointing/job.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,8 @@ module load anaconda/3
module load cuda/11.7


# Creating the environment for the first time:
# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
# pytorch-cuda=11.7 scipy -c pytorch -c nvidia
# Other conda packages:
# conda install -y -n pytorch -c conda-forge rich tqdm

# Activate pre-existing environment.
# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
conda activate pytorch


Expand Down
Loading
Loading