Add tests for the examples
Signed-off-by: Fabrice Normandin <[email protected]>

Make all job script executable

Signed-off-by: Fabrice Normandin <[email protected]>

Move common stuff to a `run_example` function

Signed-off-by: Fabrice Normandin <[email protected]>

Add regex substitutions before comparing outputs

Signed-off-by: Fabrice Normandin <[email protected]>
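The regex substitutions mentioned in this commit replace run-specific details (job IDs, hostnames, timings) with stable placeholders, so that logs from different runs can be compared verbatim against a stored reference. A minimal sketch of the idea — the patterns and the `normalize_output` name are illustrative assumptions, not the actual code from `test_examples.py`:

```python
import re

# Hypothetical normalization pass applied to a job's output before it is
# compared against a stored regression file. Each pattern targets a detail
# that legitimately changes between runs.
def normalize_output(text: str) -> str:
    substitutions = [
        (r"job \d+", "job <JOB_ID>"),    # Slurm job IDs
        (r"cn-[a-z]\d+", "<HOSTNAME>"),  # compute-node hostnames
        (r"\d+\.\d+s", "<TIME>s"),       # wall-clock timings
    ]
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
```

With this in place, `normalize_output("job 2959785 on cn-g022 took 12.3s")` and the corresponding line from any other run reduce to the same placeholder string.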

Make the Pytorch-based examples reproducible

Signed-off-by: Fabrice Normandin <[email protected]>
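Making the examples reproducible means seeding every random number generator they touch, so repeated runs emit identical output that the tests can diff. The sketch below illustrates the principle with the standard library only; the actual PyTorch examples would additionally need `torch.manual_seed` and deterministic cuDNN settings (assumed here, not shown in this diff):

```python
import random

def pseudo_training_losses(seed: int, steps: int = 5) -> list[float]:
    # Stand-in for a training loop whose results depend on an RNG;
    # a dedicated, seeded generator makes the run deterministic.
    rng = random.Random(seed)
    return [round(rng.random(), 6) for _ in range(steps)]

# Two runs with the same seed are bit-for-bit identical,
# which is what lets a regression test compare their outputs.
assert pseudo_training_losses(42) == pseudo_training_losses(42)
assert pseudo_training_losses(42) != pseudo_training_losses(0)
```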

Reduce the number of GPUs per node from 4 to 2

Signed-off-by: Fabrice Normandin <[email protected]>

Unified test for pytorch-based examples

Signed-off-by: Fabrice Normandin <[email protected]>

Add a `make_env.sh` sbatch script in pytorch setup

Signed-off-by: Fabrice Normandin <[email protected]>

Simplify the `test_examples.py` file

Signed-off-by: Fabrice Normandin <[email protected]>

Update the regression files for the examples

Signed-off-by: Fabrice Normandin <[email protected]>
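The regression files updated here store a known-good output per example; a test then compares fresh (normalized) output against the stored file. A plausible sketch of that check — `check_against_regression_file` is an assumed name, not necessarily what the test suite defines:

```python
from pathlib import Path

def check_against_regression_file(output: str, regression_file: Path) -> None:
    # First run: no reference exists yet, so store the output as the baseline.
    if not regression_file.exists():
        regression_file.parent.mkdir(parents=True, exist_ok=True)
        regression_file.write_text(output)
        return
    # Later runs: the normalized output must match the stored baseline exactly.
    expected = regression_file.read_text()
    if output != expected:
        raise AssertionError(f"Output differs from {regression_file}")
```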

Add regression file for multi-node example

Signed-off-by: Fabrice Normandin <[email protected]>

Add the `pip install orion` line to Orion example

Signed-off-by: Fabrice Normandin <[email protected]>

Add a test for the checkpointing example

Signed-off-by: Fabrice Normandin <[email protected]>

Add the regression files for checkpointing example

Signed-off-by: Fabrice Normandin <[email protected]>

Fix regression file for the ckpt example test

Signed-off-by: Fabrice Normandin <[email protected]>

Split test code into testutils and test file

Signed-off-by: Fabrice Normandin <[email protected]>

Start to add test for "HPO with Orion" example

Signed-off-by: Fabrice Normandin <[email protected]>

Remove potentially buggy asserts

Signed-off-by: Fabrice Normandin <[email protected]>

Make a conda env for the Orion example

Signed-off-by: Fabrice Normandin <[email protected]>
lebrice committed Aug 7, 2023
1 parent 6c4797b commit 9a05d48
Showing 29 changed files with 1,306 additions and 131 deletions.
10 changes: 3 additions & 7 deletions docs/examples/distributed/multi_gpu/README.rst
@@ -28,7 +28,7 @@ Click here to see `the code for this example
#SBATCH --gpus-per-task=rtx8000:1
#SBATCH --cpus-per-task=4
-#SBATCH --ntasks-per-node=1
-+#SBATCH --ntasks-per-node=4
++#SBATCH --ntasks-per-node=2
#SBATCH --mem=16G
#SBATCH --time=00:15:00
@@ -45,13 +45,9 @@ Click here to see `the code for this example
module load anaconda/3
module load cuda/11.7
-# Creating the environment for the first time:
-# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-# pytorch-cuda=11.7 -c pytorch -c nvidia
-# Other conda packages:
-# conda install -y -n pytorch -c conda-forge rich tqdm
+
 # Activate pre-existing environment.
+# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
 conda activate pytorch
8 changes: 2 additions & 6 deletions docs/examples/distributed/multi_gpu/job.sh
100644 → 100755
@@ -1,7 +1,7 @@
#!/bin/bash
#SBATCH --gpus-per-task=rtx8000:1
#SBATCH --cpus-per-task=4
-#SBATCH --ntasks-per-node=4
+#SBATCH --ntasks-per-node=2
#SBATCH --mem=16G
#SBATCH --time=00:15:00

@@ -18,13 +18,9 @@ module --quiet purge
module load anaconda/3
module load cuda/11.7

-# Creating the environment for the first time:
-# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-# pytorch-cuda=11.7 -c pytorch -c nvidia
-# Other conda packages:
-# conda install -y -n pytorch -c conda-forge rich tqdm
 
 # Activate pre-existing environment.
+# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
 conda activate pytorch


10 changes: 3 additions & 7 deletions docs/examples/distributed/multi_node/README.rst
@@ -29,7 +29,7 @@ Click here to see `the source code for this example
#!/bin/bash
#SBATCH --gpus-per-task=rtx8000:1
#SBATCH --cpus-per-task=4
-#SBATCH --ntasks-per-node=4
+#SBATCH --ntasks-per-node=2
+#SBATCH --nodes=2
#SBATCH --mem=16G
#SBATCH --time=00:15:00
@@ -47,13 +47,9 @@ Click here to see `the source code for this example
module load anaconda/3
module load cuda/11.7
-# Creating the environment for the first time:
-# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-# pytorch-cuda=11.7 -c pytorch -c nvidia
-# Other conda packages:
-# conda install -y -n pytorch -c conda-forge rich tqdm
-
 # Activate pre-existing environment.
+# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
 conda activate pytorch
-
9 changes: 2 additions & 7 deletions docs/examples/distributed/multi_node/job.sh
100644 → 100755
@@ -1,7 +1,7 @@
#!/bin/bash
#SBATCH --gpus-per-task=rtx8000:1
#SBATCH --cpus-per-task=4
-#SBATCH --ntasks-per-node=4
+#SBATCH --ntasks-per-node=2
#SBATCH --nodes=2
#SBATCH --mem=16G
#SBATCH --time=00:15:00
@@ -19,13 +19,8 @@ module --quiet purge
module load anaconda/3
module load cuda/11.7

-# Creating the environment for the first time:
-# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-# pytorch-cuda=11.7 -c pytorch -c nvidia
-# Other conda packages:
-# conda install -y -n pytorch -c conda-forge rich tqdm
-
 # Activate pre-existing environment.
+# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
 conda activate pytorch

# Stage dataset into $SLURM_TMPDIR (only on the first worker of each node)
7 changes: 1 addition & 6 deletions docs/examples/distributed/single_gpu/README.rst
@@ -42,13 +42,8 @@ repository.
module load anaconda/3
module load cuda/11.7
-# Creating the environment for the first time:
-# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-# pytorch-cuda=11.7 -c pytorch -c nvidia
-# Other conda packages:
-# conda install -y -n pytorch -c conda-forge rich tqdm
 # Activate pre-existing environment.
+# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
 conda activate pytorch
7 changes: 1 addition & 6 deletions docs/examples/distributed/single_gpu/job.sh
100644 → 100755
@@ -18,13 +18,8 @@ module --quiet purge
module load anaconda/3
module load cuda/11.7

-# Creating the environment for the first time:
-# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-# pytorch-cuda=11.7 -c pytorch -c nvidia
-# Other conda packages:
-# conda install -y -n pytorch -c conda-forge rich tqdm
-
 # Activate pre-existing environment.
+# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
 conda activate pytorch


59 changes: 32 additions & 27 deletions docs/examples/frameworks/pytorch_setup/README.rst
@@ -41,13 +41,8 @@ repository.
# See https://docs.mila.quebec/Userguide.html#conda for more information.
module load anaconda/3
-# Creating the environment for the first time:
-# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-# pytorch-cuda=11.6 -c pytorch -c nvidia
-# Other conda packages:
-# conda install -y -n pytorch -c conda-forge rich
 # Activate the environment:
+# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
 conda activate pytorch
# Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0
@@ -92,28 +87,38 @@ Note that we are requesting a GPU for this job, even though we're only going to
install packages. This is because we want PyTorch to be installed with GPU
support, and to have all the required libraries.

-.. code-block:: bash
+.. code:: bash
+#!/bin/bash
+#SBATCH --gres=gpu:1
+#SBATCH --cpus-per-task=1
+#SBATCH --mem=16G
+#SBATCH --time=00:30:00
+# NOTE: Run this either with `sbatch make_env.sh` or within an interactive job with `salloc`:
+# salloc --gres=gpu:1 --cpus-per-task=1 --mem=16G --time=00:30:00
+# Exit on error
+set -e
+module --quiet purge
+module load anaconda/3
+module load cuda/11.7
+ENV_NAME=pytorch
+## Create the environment (see the example):
+conda create --yes --name $ENV_NAME python=3.9 pytorch torchvision torchaudio pytorch-cuda=11.7 --channel pytorch --channel nvidia
+# Install as many packages as possible with Conda:
+conda install --yes --name $ENV_NAME tqdm --channel conda-forge
+# Activate the environment:
+conda activate $ENV_NAME
+# Install the rest of the packages with pip:
+pip install rich
+conda env export --no-builds --from-history --file environment.yaml
-$ salloc --gres=gpu:1 --cpus-per-task=4 --mem=16G --time=00:30:00
-salloc: --------------------------------------------------------------------------------------------------
-salloc: # Using default long partition
-salloc: --------------------------------------------------------------------------------------------------
-salloc: Pending job allocation 2959785
-salloc: job 2959785 queued and waiting for resources
-salloc: job 2959785 has been allocated resources
-salloc: Granted job allocation 2959785
-salloc: Waiting for resource configuration
-salloc: Nodes cn-g022 are ready for job
-$ # Create the environment (see the example):
-$ conda create -n pytorch python=3.9 pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia
-(...)
-$ # Press 'y' to accept if everything looks good.
-(...)
-$ # Activate the environment:
-$ conda activate pytorch
-Exit the interactive job once the environment has been created. Then, the
-example can be launched to confirm that everything works:
+Once the environment has been created, the example can be launched to confirm that everything works:

.. code-block:: bash
25 changes: 4 additions & 21 deletions docs/examples/frameworks/pytorch_setup/index.rst
@@ -38,28 +38,11 @@ Note that we are requesting a GPU for this job, even though we're only going to
install packages. This is because we want PyTorch to be installed with GPU
support, and to have all the required libraries.

-.. code-block:: bash
+.. literalinclude:: examples/frameworks/pytorch_setup/make_env.sh
+   :language: bash


-$ salloc --gres=gpu:1 --cpus-per-task=4 --mem=16G --time=00:30:00
-salloc: --------------------------------------------------------------------------------------------------
-salloc: # Using default long partition
-salloc: --------------------------------------------------------------------------------------------------
-salloc: Pending job allocation 2959785
-salloc: job 2959785 queued and waiting for resources
-salloc: job 2959785 has been allocated resources
-salloc: Granted job allocation 2959785
-salloc: Waiting for resource configuration
-salloc: Nodes cn-g022 are ready for job
-$ # Create the environment (see the example):
-$ conda create -n pytorch python=3.9 pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia
-(...)
-$ # Press 'y' to accept if everything looks good.
-(...)
-$ # Activate the environment:
-$ conda activate pytorch
-Exit the interactive job once the environment has been created. Then, the
-example can be launched to confirm that everything works:
+Once the environment has been created, the example can be launched to confirm that everything works:

.. code-block:: bash
7 changes: 1 addition & 6 deletions docs/examples/frameworks/pytorch_setup/job.sh
100644 → 100755
@@ -14,13 +14,8 @@ module purge
# See https://docs.mila.quebec/Userguide.html#conda for more information.
module load anaconda/3

-# Creating the environment for the first time:
-# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-# pytorch-cuda=11.6 -c pytorch -c nvidia
-# Other conda packages:
-# conda install -y -n pytorch -c conda-forge rich
-
 # Activate the environment:
+# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
 conda activate pytorch

# Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0
27 changes: 27 additions & 0 deletions docs/examples/frameworks/pytorch_setup/make_env.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+#SBATCH --gres=gpu:1
+#SBATCH --cpus-per-task=1
+#SBATCH --mem=16G
+#SBATCH --time=00:30:00
+
+# NOTE: Run this either with `sbatch make_env.sh` or within an interactive job with `salloc`:
+# salloc --gres=gpu:1 --cpus-per-task=1 --mem=16G --time=00:30:00
+
+# Exit on error
+set -e
+
+module --quiet purge
+module load anaconda/3
+module load cuda/11.7
+
+ENV_NAME=pytorch
+
+## Create the environment (see the example):
+conda create --yes --name $ENV_NAME python=3.9 pytorch torchvision torchaudio pytorch-cuda=11.7 --channel pytorch --channel nvidia
+# Install as many packages as possible with Conda:
+conda install --yes --name $ENV_NAME tqdm --channel conda-forge
+# Activate the environment:
+conda activate $ENV_NAME
+# Install the rest of the packages with pip:
+pip install rich
+conda env export --no-builds --from-history --file environment.yaml
12 changes: 2 additions & 10 deletions docs/examples/good_practices/checkpointing/README.rst
@@ -23,9 +23,7 @@ repository.

.. code:: diff
+# distributed/single_gpu/job.sh -> good_practices/checkpointing/job.sh
-old mode 100644
-new mode 100755
-# distributed/001_single_gpu/job.sh -> good_practices/checkpointing/job.sh
#!/bin/bash
-#SBATCH --gpus-per-task=rtx8000:1
+#SBATCH --gpus-per-task=1
@@ -55,14 +53,8 @@ repository.
module load cuda/11.7
+
-# Creating the environment for the first time:
-# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
--# pytorch-cuda=11.7 -c pytorch -c nvidia
-+# pytorch-cuda=11.7 scipy -c pytorch -c nvidia
-# Other conda packages:
-# conda install -y -n pytorch -c conda-forge rich tqdm
 # Activate pre-existing environment.
+# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
 conda activate pytorch
7 changes: 1 addition & 6 deletions docs/examples/good_practices/checkpointing/job.sh
@@ -25,13 +25,8 @@ module load anaconda/3
module load cuda/11.7


-# Creating the environment for the first time:
-# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-# pytorch-cuda=11.7 scipy -c pytorch -c nvidia
-# Other conda packages:
-# conda install -y -n pytorch -c conda-forge rich tqdm
-
 # Activate pre-existing environment.
+# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
conda activate pytorch


22 changes: 11 additions & 11 deletions docs/examples/good_practices/hpo_with_orion/README.rst
@@ -36,6 +36,8 @@ The full source code for this example is available on `the mila-docs GitHub repo
#SBATCH --mem=16G
#SBATCH --time=00:15:00
+# Exit on error
+set -e
# Echo time and hostname into log
echo "Date: $(date)"
@@ -49,21 +51,19 @@ The full source code for this example is available on `the mila-docs GitHub repo
module load anaconda/3
module load cuda/11.7
-# Creating the environment for the first time:
-# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-# pytorch-cuda=11.7 -c pytorch -c nvidia
-# Other conda packages:
-# conda install -y -n pytorch -c conda-forge rich tqdm
+# Orion package:
+# pip install orion
 
 # Activate pre-existing environment.
+# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
-conda activate pytorch
+ENV_PATH="$SCRATCH/conda/pytorch_orion"
+conda activate $ENV_PATH
+# Install the Orion package:
+# pip install orion
# Stage dataset into $SLURM_TMPDIR
mkdir -p $SLURM_TMPDIR/data
-cp /network/datasets/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/
+cp --update /network/datasets/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/
# General-purpose alternatives combining copy and unpack:
# unzip /network/datasets/some/file.zip -d $SLURM_TMPDIR/data/
# tar -xf /network/datasets/some/file.tar -C $SLURM_TMPDIR/data/
@@ -86,7 +86,7 @@ The full source code for this example is available on `the mila-docs GitHub repo
+# Then you can specify a search space for each `main.py`'s script parameter
+# you want to optimize. Here we optimize only the learning rate.
+
-+orion hunt -n orion-example --exp-max-trials 10 python main.py --learning-rate~'loguniform(1e-5, 1.0)'
++orion --verbose hunt -n orion-example --exp-max-trials 10 python main.py --learning-rate~'loguniform(1e-5, 1.0)'
**main.py**