diff --git a/docs/examples/distributed/multi_gpu/README.rst b/docs/examples/distributed/multi_gpu/README.rst
index 29516161..a24969d2 100644
--- a/docs/examples/distributed/multi_gpu/README.rst
+++ b/docs/examples/distributed/multi_gpu/README.rst
@@ -28,7 +28,7 @@ Click here to see `the code for this example
    #SBATCH --gpus-per-task=rtx8000:1
    #SBATCH --cpus-per-task=4
    -#SBATCH --ntasks-per-node=1
-   +#SBATCH --ntasks-per-node=4
+   +#SBATCH --ntasks-per-node=2
    #SBATCH --mem=16G
    #SBATCH --time=00:15:00
@@ -45,13 +45,9 @@ Click here to see `the code for this example
    module load anaconda/3
    module load cuda/11.7
 
-   # Creating the environment for the first time:
-   # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-   #     pytorch-cuda=11.7 -c pytorch -c nvidia
-   # Other conda packages:
-   # conda install -y -n pytorch -c conda-forge rich tqdm
-
+
    # Activate pre-existing environment.
+   # NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
    conda activate pytorch
diff --git a/docs/examples/distributed/multi_gpu/job.sh b/docs/examples/distributed/multi_gpu/job.sh
old mode 100644
new mode 100755
index 949ce544..d8d305d3
--- a/docs/examples/distributed/multi_gpu/job.sh
+++ b/docs/examples/distributed/multi_gpu/job.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 #SBATCH --gpus-per-task=rtx8000:1
 #SBATCH --cpus-per-task=4
-#SBATCH --ntasks-per-node=4
+#SBATCH --ntasks-per-node=2
 #SBATCH --mem=16G
 #SBATCH --time=00:15:00
@@ -18,13 +18,9 @@ module --quiet purge
 module load anaconda/3
 module load cuda/11.7
 
-# Creating the environment for the first time:
-# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-#     pytorch-cuda=11.7 -c pytorch -c nvidia
-# Other conda packages:
-# conda install -y -n pytorch -c conda-forge rich tqdm
-
 # Activate pre-existing environment.
+# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
 conda activate pytorch
diff --git a/docs/examples/distributed/multi_node/README.rst b/docs/examples/distributed/multi_node/README.rst
index d5be58e2..c7e84376 100644
--- a/docs/examples/distributed/multi_node/README.rst
+++ b/docs/examples/distributed/multi_node/README.rst
@@ -29,7 +29,7 @@ Click here to see `the source code for this example
    #!/bin/bash
    #SBATCH --gpus-per-task=rtx8000:1
    #SBATCH --cpus-per-task=4
-   #SBATCH --ntasks-per-node=4
+   #SBATCH --ntasks-per-node=2
    +#SBATCH --nodes=2
    #SBATCH --mem=16G
    #SBATCH --time=00:15:00
@@ -47,13 +47,9 @@ Click here to see `the source code for this example
    module load anaconda/3
    module load cuda/11.7
 
-   # Creating the environment for the first time:
-   # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-   #     pytorch-cuda=11.7 -c pytorch -c nvidia
-   # Other conda packages:
-   # conda install -y -n pytorch -c conda-forge rich tqdm
-
+
    # Activate pre-existing environment.
+   # NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
    conda activate pytorch
-
diff --git a/docs/examples/distributed/multi_node/job.sh b/docs/examples/distributed/multi_node/job.sh
old mode 100644
new mode 100755
index d1b8a3ce..b43afc9e
--- a/docs/examples/distributed/multi_node/job.sh
+++ b/docs/examples/distributed/multi_node/job.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 #SBATCH --gpus-per-task=rtx8000:1
 #SBATCH --cpus-per-task=4
-#SBATCH --ntasks-per-node=4
+#SBATCH --ntasks-per-node=2
 #SBATCH --nodes=2
 #SBATCH --mem=16G
 #SBATCH --time=00:15:00
@@ -19,13 +19,8 @@ module --quiet purge
 module load anaconda/3
 module load cuda/11.7
 
-# Creating the environment for the first time:
-# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-#     pytorch-cuda=11.7 -c pytorch -c nvidia
-# Other conda packages:
-# conda install -y -n pytorch -c conda-forge rich tqdm
-
 # Activate pre-existing environment.
+# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
 conda activate pytorch
 
 # Stage dataset into $SLURM_TMPDIR (only on the first worker of each node)
diff --git a/docs/examples/distributed/single_gpu/README.rst b/docs/examples/distributed/single_gpu/README.rst
index f65e5bc6..13499a65 100644
--- a/docs/examples/distributed/single_gpu/README.rst
+++ b/docs/examples/distributed/single_gpu/README.rst
@@ -42,13 +42,8 @@ repository.
    module load anaconda/3
    module load cuda/11.7
 
-   # Creating the environment for the first time:
-   # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-   #     pytorch-cuda=11.7 -c pytorch -c nvidia
-   # Other conda packages:
-   # conda install -y -n pytorch -c conda-forge rich tqdm
-
    # Activate pre-existing environment.
+   # NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
    conda activate pytorch
diff --git a/docs/examples/distributed/single_gpu/job.sh b/docs/examples/distributed/single_gpu/job.sh
old mode 100644
new mode 100755
index 6f542f39..05c7a59d
--- a/docs/examples/distributed/single_gpu/job.sh
+++ b/docs/examples/distributed/single_gpu/job.sh
@@ -18,13 +18,8 @@ module --quiet purge
 module load anaconda/3
 module load cuda/11.7
 
-# Creating the environment for the first time:
-# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-#     pytorch-cuda=11.7 -c pytorch -c nvidia
-# Other conda packages:
-# conda install -y -n pytorch -c conda-forge rich tqdm
-
 # Activate pre-existing environment.
+# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
 conda activate pytorch
diff --git a/docs/examples/frameworks/jax/README.rst b/docs/examples/frameworks/jax/README.rst
index f7fe8716..e2ffb6ff 100644
--- a/docs/examples/frameworks/jax/README.rst
+++ b/docs/examples/frameworks/jax/README.rst
@@ -25,6 +25,8 @@ repository.
 .. code:: diff
 
     # distributed/single_gpu/job.sh -> frameworks/jax/job.sh
+    old mode 100755
+    new mode 100644
     #!/bin/bash
     #SBATCH --gpus-per-task=rtx8000:1
     #SBATCH --cpus-per-task=4
@@ -45,11 +47,10 @@ repository.
     module load anaconda/3
 
    -module load cuda/11.7
-    # Creating the environment for the first time:
-   -# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-   -#     pytorch-cuda=11.7 -c pytorch -c nvidia
-   -# Other conda packages:
-   -# conda install -y -n pytorch -c conda-forge rich tqdm
+   -# Activate pre-existing environment.
+   -# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
+   -conda activate pytorch
+   +# Creating the environment for the first time:
    +# conda create -y -n jax_ex -c "nvidia/label/cuda-11.8.0" cuda python=3.9 virtualenv pip
    +# conda activate jax_ex
    +# Install Jax using `pip`
@@ -59,9 +60,7 @@ repository.
    +#     -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
    +# Other pip packages:
    +# pip install pillow optax rich torch torchvision flax tqdm
-
-   -# Activate pre-existing environment.
-   -conda activate pytorch
+
+   +# Activate the environment:
    +conda activate jax_ex
diff --git a/docs/examples/frameworks/jax_setup/README.rst b/docs/examples/frameworks/jax_setup/README.rst
index 9b1c5bf0..ac74c163 100644
--- a/docs/examples/frameworks/jax_setup/README.rst
+++ b/docs/examples/frameworks/jax_setup/README.rst
@@ -23,6 +23,8 @@ repository.
 .. code:: diff
 
     # frameworks/pytorch_setup/job.sh -> frameworks/jax_setup/job.sh
+    old mode 100755
+    new mode 100644
     #!/bin/bash
     #SBATCH --gres=gpu:1
     #SBATCH --cpus-per-task=1
@@ -39,11 +41,7 @@ repository.
     # See https://docs.mila.quebec/Userguide.html#conda for more information.
     module load anaconda/3
 
-    # Creating the environment for the first time:
-   -# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-   -#     pytorch-cuda=11.6 -c pytorch -c nvidia
-   -# Other conda packages:
-   -# conda install -y -n pytorch -c conda-forge rich
+   +# Creating the environment for the first time:
    +# conda create -y -n jax_ex -c "nvidia/label/cuda-11.8.0" cuda python=3.9 virtualenv pip
    +# conda activate jax_ex
    +# Install Jax using `pip`
@@ -51,8 +49,9 @@ repository.
    +# should not install any more packages using `conda install`
    +# pip install --upgrade "jax[cuda11_pip]" \
    +#     -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
-
+
     # Activate the environment:
+   -# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
    -conda activate pytorch
    +conda activate jax_ex
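The freshly created JAX environment can be sanity-checked before running the examples. A minimal sketch (assuming the `jax_ex` env name used in these READMEs; run it inside a GPU job so that a device is actually visible):

.. code:: bash

   module load anaconda/3
   # List the accelerators JAX can see; on a GPU node this should include a GPU device.
   conda run --name jax_ex python -c 'import jax; print(jax.devices())'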
diff --git a/docs/examples/frameworks/pytorch_setup/README.rst b/docs/examples/frameworks/pytorch_setup/README.rst
index c9f8cff4..3624da9b 100644
--- a/docs/examples/frameworks/pytorch_setup/README.rst
+++ b/docs/examples/frameworks/pytorch_setup/README.rst
@@ -41,13 +41,8 @@ repository.
    # See https://docs.mila.quebec/Userguide.html#conda for more information.
    module load anaconda/3
 
-   # Creating the environment for the first time:
-   # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-   #     pytorch-cuda=11.6 -c pytorch -c nvidia
-   # Other conda packages:
-   # conda install -y -n pytorch -c conda-forge rich
-
    # Activate the environment:
+   # NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
    conda activate pytorch
 
    # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0
@@ -92,27 +87,35 @@
 Note that we are requesting a GPU for this job, even though we're only going
 to install packages. This is because we want PyTorch to be installed with GPU
 support, and to have all the required libraries.
 
-.. code-block:: bash
+.. code:: bash
+
+   #!/bin/bash
+   #SBATCH --gres=gpu:1
+   #SBATCH --cpus-per-task=1
+   #SBATCH --mem=16G
+   #SBATCH --time=00:30:00
 
-   $ salloc --gres=gpu:1 --cpus-per-task=4 --mem=16G --time=00:30:00
-   salloc: --------------------------------------------------------------------------------------------------
-   salloc: # Using default long partition
-   salloc: --------------------------------------------------------------------------------------------------
-   salloc: Pending job allocation 2959785
-   salloc: job 2959785 queued and waiting for resources
-   salloc: job 2959785 has been allocated resources
-   salloc: Granted job allocation 2959785
-   salloc: Waiting for resource configuration
-   salloc: Nodes cn-g022 are ready for job
-   $ # Load anaconda
-   $ module load anaconda/3
-   $ # Create the environment (see the example):
-   $ conda create -n pytorch python=3.9 pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia
-   (...)
-   $ # Press 'y' to accept if everything looks good.
-   (...)
-   $ # Activate the environment:
-   $ conda activate pytorch
+   # NOTE: Run this either with `sbatch make_env.sh` or within an interactive job with `salloc`:
+   #    salloc --gres=gpu:1 --cpus-per-task=1 --mem=16G --time=00:30:00
+
+   # Exit on error
+   set -e
+
+   module --quiet purge
+   module load anaconda/3
+   module load cuda/11.7
+
+   ENV_NAME=pytorch
+
+   ## Create the environment (see the example):
+   conda create --yes --name $ENV_NAME python=3.9 pytorch torchvision torchaudio pytorch-cuda=11.7 --channel pytorch --channel nvidia
+   # Install as many packages as possible with Conda:
+   conda install --yes --name $ENV_NAME tqdm --channel conda-forge
+   # Activate the environment:
+   conda activate $ENV_NAME
+   # Install the rest of the packages with pip:
+   pip install rich
+   conda env export --no-builds --from-history --file environment.yaml
 
 Exit the interactive job once the environment has been created. Then, the
 example can be launched to confirm that everything works:
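Before launching `job.sh`, the environment created by `make_env.sh` can be verified directly. A minimal sketch (assuming the `pytorch` env name used above; `torch.cuda.is_available()` only reports `True` from within a GPU job):

.. code:: bash

   module load anaconda/3
   # Print the installed PyTorch version and whether a CUDA device is visible:
   conda run --name pytorch python -c 'import torch; print(torch.__version__, torch.cuda.is_available())'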
diff --git a/docs/examples/frameworks/pytorch_setup/index.rst b/docs/examples/frameworks/pytorch_setup/index.rst
index 28c6e5ac..ec6a772a 100644
--- a/docs/examples/frameworks/pytorch_setup/index.rst
+++ b/docs/examples/frameworks/pytorch_setup/index.rst
@@ -38,27 +38,8 @@
 Note that we are requesting a GPU for this job, even though we're only going
 to install packages. This is because we want PyTorch to be installed with GPU
 support, and to have all the required libraries.
 
-.. code-block:: bash
-
-   $ salloc --gres=gpu:1 --cpus-per-task=4 --mem=16G --time=00:30:00
-   salloc: --------------------------------------------------------------------------------------------------
-   salloc: # Using default long partition
-   salloc: --------------------------------------------------------------------------------------------------
-   salloc: Pending job allocation 2959785
-   salloc: job 2959785 queued and waiting for resources
-   salloc: job 2959785 has been allocated resources
-   salloc: Granted job allocation 2959785
-   salloc: Waiting for resource configuration
-   salloc: Nodes cn-g022 are ready for job
-   $ # Load anaconda
-   $ module load anaconda/3
-   $ # Create the environment (see the example):
-   $ conda create -n pytorch python=3.9 pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia
-   (...)
-   $ # Press 'y' to accept if everything looks good.
-   (...)
-   $ # Activate the environment:
-   $ conda activate pytorch
+.. literalinclude:: examples/frameworks/pytorch_setup/make_env.sh
+   :language: bash
 
 Exit the interactive job once the environment has been created. Then, the
 example can be launched to confirm that everything works:
diff --git a/docs/examples/frameworks/pytorch_setup/job.sh b/docs/examples/frameworks/pytorch_setup/job.sh
old mode 100644
new mode 100755
index fb35daa7..e51ed13e
--- a/docs/examples/frameworks/pytorch_setup/job.sh
+++ b/docs/examples/frameworks/pytorch_setup/job.sh
@@ -14,13 +14,8 @@ module --quiet purge
 # See https://docs.mila.quebec/Userguide.html#conda for more information.
 module load anaconda/3
 
-# Creating the environment for the first time:
-# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-#     pytorch-cuda=11.6 -c pytorch -c nvidia
-# Other conda packages:
-# conda install -y -n pytorch -c conda-forge rich
-
 # Activate the environment:
+# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
 conda activate pytorch
 
 # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0
diff --git a/docs/examples/frameworks/pytorch_setup/make_env.sh b/docs/examples/frameworks/pytorch_setup/make_env.sh
new file mode 100755
index 00000000..fd77f1b6
--- /dev/null
+++ b/docs/examples/frameworks/pytorch_setup/make_env.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+#SBATCH --gres=gpu:1
+#SBATCH --cpus-per-task=1
+#SBATCH --mem=16G
+#SBATCH --time=00:30:00
+
+# NOTE: Run this either with `sbatch make_env.sh` or within an interactive job with `salloc`:
+#    salloc --gres=gpu:1 --cpus-per-task=1 --mem=16G --time=00:30:00
+
+# Exit on error
+set -e
+
+module --quiet purge
+module load anaconda/3
+module load cuda/11.7
+
+ENV_NAME=pytorch
+
+## Create the environment (see the example):
+conda create --yes --name $ENV_NAME python=3.9 pytorch torchvision torchaudio pytorch-cuda=11.7 --channel pytorch --channel nvidia
+# Install as many packages as possible with Conda:
+conda install --yes --name $ENV_NAME tqdm --channel conda-forge
+# Activate the environment:
+conda activate $ENV_NAME
+# Install the rest of the packages with pip:
+pip install rich
+conda env export --no-builds --from-history --file environment.yaml
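The final `conda env export` line above writes a minimal record of what was requested at install time. A sketch of how that file could be used later to recreate the environment (assuming the exported `environment.yaml` was kept):

.. code:: bash

   # Recreate the environment from the exported specification:
   conda env create --name pytorch --file environment.yaml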
diff --git a/docs/examples/good_practices/checkpointing/README.rst b/docs/examples/good_practices/checkpointing/README.rst
index b227e0dc..24cf22c3 100644
--- a/docs/examples/good_practices/checkpointing/README.rst
+++ b/docs/examples/good_practices/checkpointing/README.rst
@@ -24,8 +24,6 @@ repository.
 .. code:: diff
 
     # distributed/single_gpu/job.sh -> good_practices/checkpointing/job.sh
-    old mode 100644
-    new mode 100755
     #!/bin/bash
    -#SBATCH --gpus-per-task=rtx8000:1
    +#SBATCH --gpus-per-task=1
@@ -55,14 +53,8 @@ repository.
    module load cuda/11.7
+
-   # Creating the environment for the first time:
-   # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-   -#     pytorch-cuda=11.7 -c pytorch -c nvidia
-   +#     pytorch-cuda=11.7 scipy -c pytorch -c nvidia
-   # Other conda packages:
-   # conda install -y -n pytorch -c conda-forge rich tqdm
-
    # Activate pre-existing environment.
+   # NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
    conda activate pytorch
diff --git a/docs/examples/good_practices/checkpointing/job.sh b/docs/examples/good_practices/checkpointing/job.sh
index 3ccebf4b..471e6814 100755
--- a/docs/examples/good_practices/checkpointing/job.sh
+++ b/docs/examples/good_practices/checkpointing/job.sh
@@ -25,13 +25,8 @@
 module load anaconda/3
 module load cuda/11.7
 
-# Creating the environment for the first time:
-# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-#     pytorch-cuda=11.7 scipy -c pytorch -c nvidia
-# Other conda packages:
-# conda install -y -n pytorch -c conda-forge rich tqdm
-
 # Activate pre-existing environment.
+# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
 conda activate pytorch
diff --git a/docs/examples/good_practices/hpo_with_orion/README.rst b/docs/examples/good_practices/hpo_with_orion/README.rst
index 2ca9a29e..b07592d2 100644
--- a/docs/examples/good_practices/hpo_with_orion/README.rst
+++ b/docs/examples/good_practices/hpo_with_orion/README.rst
@@ -38,6 +38,8 @@ The full source code for this example is available on `the mila-docs GitHub repo
    #SBATCH --mem=16G
    #SBATCH --time=00:15:00
 
+   +# Exit on error
+   +set -e
 
    # Echo time and hostname into log
    echo "Date:     $(date)"
@@ -51,21 +53,19 @@ The full source code for this example is available on `the mila-docs GitHub repo
    module load anaconda/3
    module load cuda/11.7
 
-   # Creating the environment for the first time:
-   # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-   #     pytorch-cuda=11.7 -c pytorch -c nvidia
-   # Other conda packages:
-   # conda install -y -n pytorch -c conda-forge rich tqdm
-   +# Orion package:
-   +# pip install orion
-
    # Activate pre-existing environment.
-   conda activate pytorch
+   # NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
+   -conda activate pytorch
+   +ENV_PATH="$SCRATCH/conda/pytorch_orion"
+   +conda activate $ENV_PATH
+
+   # Install the Orion package:
+   # pip install orion
 
    # Stage dataset into $SLURM_TMPDIR
    mkdir -p $SLURM_TMPDIR/data
-   cp /network/datasets/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/
+   -cp /network/datasets/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/
+   +cp --update /network/datasets/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/
    # General-purpose alternatives combining copy and unpack:
    #     unzip /network/datasets/some/file.zip -d $SLURM_TMPDIR/data/
    #     tar -xf /network/datasets/some/file.tar -C $SLURM_TMPDIR/data/
@@ -88,7 +88,7 @@ The full source code for this example is available on `the mila-docs GitHub repo
    +# Then you can specify a search space for each `main.py`'s script parameter
    +# you want to optimize. Here we optimize only the learning rate.
+
-   +orion hunt -n orion-example --exp-max-trials 10 python main.py --learning-rate~'loguniform(1e-5, 1.0)'
+   +orion --verbose hunt -n orion-example --exp-max-trials 10 python main.py --learning-rate~'loguniform(1e-5, 1.0)'
 
 **main.py**
diff --git a/docs/examples/good_practices/hpo_with_orion/job.sh b/docs/examples/good_practices/hpo_with_orion/job.sh
old mode 100644
new mode 100755
index 9f8155af..693f2bfd
--- a/docs/examples/good_practices/hpo_with_orion/job.sh
+++ b/docs/examples/good_practices/hpo_with_orion/job.sh
@@ -5,6 +5,8 @@
 #SBATCH --mem=16G
 #SBATCH --time=00:15:00
 
+# Exit on error
+set -e
 
 # Echo time and hostname into log
 echo "Date:     $(date)"
@@ -18,21 +20,17 @@ module --quiet purge
 module load anaconda/3
 module load cuda/11.7
 
-# Creating the environment for the first time:
-# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-#     pytorch-cuda=11.7 -c pytorch -c nvidia
-# Other conda packages:
-# conda install -y -n pytorch -c conda-forge rich tqdm
-# Orion package:
-# pip install orion
-
 # Activate pre-existing environment.
-conda activate pytorch
+# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
+ENV_PATH="$SCRATCH/conda/pytorch_orion"
+conda activate $ENV_PATH
+# Install the Orion package:
+# pip install orion
 
 # Stage dataset into $SLURM_TMPDIR
 mkdir -p $SLURM_TMPDIR/data
-cp /network/datasets/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/
+cp --update /network/datasets/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/
 # General-purpose alternatives combining copy and unpack:
 #     unzip /network/datasets/some/file.zip -d $SLURM_TMPDIR/data/
 #     tar -xf /network/datasets/some/file.tar -C $SLURM_TMPDIR/data/
@@ -53,4 +51,4 @@
 # Then you can specify a search space for each `main.py`'s script parameter
 # you want to optimize. Here we optimize only the learning rate.
-orion hunt -n orion-example --exp-max-trials 10 python main.py --learning-rate~'loguniform(1e-5, 1.0)'
+orion --verbose hunt -n orion-example --exp-max-trials 10 python main.py --learning-rate~'loguniform(1e-5, 1.0)'
diff --git a/docs/examples/good_practices/hpo_with_orion/make_env.sh b/docs/examples/good_practices/hpo_with_orion/make_env.sh
new file mode 100755
index 00000000..9b069460
--- /dev/null
+++ b/docs/examples/good_practices/hpo_with_orion/make_env.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+#SBATCH --gres=gpu:1
+#SBATCH --cpus-per-task=1
+#SBATCH --mem=16G
+#SBATCH --time=00:30:00
+#SBATCH --partition=main
+
+# NOTE: Run this either with `sbatch make_env.sh` or within an interactive job with `salloc`:
+#    salloc --gres=gpu:1 --cpus-per-task=1 --mem=16G --time=00:30:00
+
+# Exit on error
+set -e
+
+module --quiet purge
+module load anaconda/3
+module load cuda/11.7
+
+ENV_PATH="$SCRATCH/conda/pytorch_orion"
+
+# Create the environment:
+conda create --yes --prefix $ENV_PATH python=3.9 pytorch torchvision torchaudio pytorch-cuda=11.7 --channel pytorch --channel nvidia
+# Install as many packages as possible with Conda:
+conda install --yes --prefix $ENV_PATH tqdm rich --channel conda-forge
+# conda install --yes --prefix $ENV_PATH orion --channel epistimio  # NOTE: Unfortunately this doesn't work atm: https://github.com/Epistimio/orion/issues/1111
+# Activate the environment:
+conda activate $ENV_PATH
+# Install the rest of the dependencies with pip:
+pip install orion
+conda env export --no-builds --from-history --file environment.yaml
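The `~'loguniform(1e-5, 1.0)'` suffix on the `orion hunt` command above is Orion's command-line search-space syntax, and it extends naturally to several parameters at once. A hypothetical variant (the `--weight-decay` prior is illustrative, not part of this example):

.. code:: bash

   # Optimize two of main.py's parameters instead of one:
   orion --verbose hunt -n orion-example --exp-max-trials 10 \
       python main.py \
       --learning-rate~'loguniform(1e-5, 1.0)' \
       --weight-decay~'loguniform(1e-6, 1e-3)'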
diff --git a/docs/examples/good_practices/launch_many_jobs/README.rst b/docs/examples/good_practices/launch_many_jobs/README.rst
index 27c58eb5..43129069 100644
--- a/docs/examples/good_practices/launch_many_jobs/README.rst
+++ b/docs/examples/good_practices/launch_many_jobs/README.rst
@@ -38,6 +38,8 @@ repository.
 .. code:: diff
 
     # distributed/single_gpu/job.sh -> good_practices/launch_many_jobs/job.sh
+    old mode 100755
+    new mode 100644
     #!/bin/bash
     #SBATCH --gpus-per-task=rtx8000:1
     #SBATCH --cpus-per-task=4
@@ -58,13 +60,14 @@ repository.
    module load anaconda/3
    module load cuda/11.7
 
-   # Creating the environment for the first time:
-   # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-   #     pytorch-cuda=11.7 -c pytorch -c nvidia
-   # Other conda packages:
-   # conda install -y -n pytorch -c conda-forge rich tqdm
-
+   +# Creating the environment for the first time:
+   +# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
+   +#     pytorch-cuda=11.7 -c pytorch -c nvidia
+   +# Other conda packages:
+   +# conda install -y -n pytorch -c conda-forge rich tqdm
+
    # Activate pre-existing environment.
+   -# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
    conda activate pytorch
diff --git a/docs/examples/good_practices/many_tasks_per_gpu/README.rst b/docs/examples/good_practices/many_tasks_per_gpu/README.rst
index 387ffc1b..f78de2a2 100644
--- a/docs/examples/good_practices/many_tasks_per_gpu/README.rst
+++ b/docs/examples/good_practices/many_tasks_per_gpu/README.rst
@@ -33,8 +33,6 @@ repository.
 .. code:: diff
 
     # distributed/single_gpu/job.sh -> good_practices/many_tasks_per_gpu/job.sh
-    old mode 100644
-    new mode 100755
     #!/bin/bash
    -#SBATCH --gpus-per-task=rtx8000:1
    +#SBATCH --gres=gpu:rtx8000:1
@@ -57,13 +55,14 @@ repository.
    module load anaconda/3
    module load cuda/11.7
 
-   # Creating the environment for the first time:
-   # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-   #     pytorch-cuda=11.7 -c pytorch -c nvidia
-   # Other conda packages:
-   # conda install -y -n pytorch -c conda-forge rich tqdm
-
+   +# Creating the environment for the first time:
+   +# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
+   +#     pytorch-cuda=11.7 -c pytorch -c nvidia
+   +# Other conda packages:
+   +# conda install -y -n pytorch -c conda-forge rich tqdm
+
    # Activate pre-existing environment.
+   -# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
    conda activate pytorch
diff --git a/docs/examples/good_practices/wandb_setup/README.rst b/docs/examples/good_practices/wandb_setup/README.rst
index 3936dca0..474525af 100644
--- a/docs/examples/good_practices/wandb_setup/README.rst
+++ b/docs/examples/good_practices/wandb_setup/README.rst
@@ -28,6 +28,8 @@ Click here to see `the source code for this example
 .. code:: diff
 
     # distributed/single_gpu/job.sh -> good_practices/wandb_setup/job.sh
+    old mode 100755
+    new mode 100644
     #!/bin/bash
     #SBATCH --gpus-per-task=rtx8000:1
     #SBATCH --cpus-per-task=4
@@ -48,14 +50,14 @@ Click here to see `the source code for this example
    module load anaconda/3
    module load cuda/11.7
 
-   # Creating the environment for the first time:
-   # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-   #     pytorch-cuda=11.7 -c pytorch -c nvidia
-   # Other conda packages:
-   -# conda install -y -n pytorch -c conda-forge rich tqdm
+   +# Creating the environment for the first time:
+   +# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
+   +#     pytorch-cuda=11.7 -c pytorch -c nvidia
+   +# Other conda packages:
    +# conda install -y -n pytorch -c conda-forge rich tqdm wandb
-
+
    # Activate pre-existing environment.
+   -# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
    conda activate pytorch
diff --git a/tests/.gitignore b/tests/.gitignore
new file mode 100644
index 00000000..668bc7a3
--- /dev/null
+++ b/tests/.gitignore
@@ -0,0 +1 @@
+.submitit
\ No newline at end of file
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/environment.yaml b/tests/environment.yaml
new file mode 100644
index 00000000..a3fc46d5
--- /dev/null
+++ b/tests/environment.yaml
@@ -0,0 +1,130 @@
+#
+# Generated 2023-01-24
+#
+# conda create -n py38torch113 python=3.8 pytorch torchvision torchaudio \
+#     pytorch-cuda=11.6 -c pytorch -c nvidia
+#
+name: py38torch113
+channels:
+  - pytorch
+  - nvidia
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=5.1=1_gnu
+  - blas=1.0=mkl
+  - brotlipy=0.7.0=py38h27cfd23_1003
+  - bzip2=1.0.8=h7b6447c_0
+  - ca-certificates=2023.01.10=h06a4308_0
+  - certifi=2022.12.7=py38h06a4308_0
+  - cffi=1.15.1=py38h5eee18b_3
+  - charset-normalizer=2.0.4=pyhd3eb1b0_0
+  - cryptography=38.0.4=py38h9ce1e76_0
+  - cuda=11.6.1=0
+  - cuda-cccl=11.6.55=hf6102b2_0
+  - cuda-command-line-tools=11.6.2=0
+  - cuda-compiler=11.6.2=0
+  - cuda-cudart=11.6.55=he381448_0
+  - cuda-cudart-dev=11.6.55=h42ad0f4_0
+  - cuda-cuobjdump=11.6.124=h2eeebcb_0
+  - cuda-cupti=11.6.124=h86345e5_0
+  - cuda-cuxxfilt=11.6.124=hecbf4f6_0
+  - cuda-driver-dev=11.6.55=0
+  - cuda-gdb=12.0.90=0
+  - cuda-libraries=11.6.1=0
+  - cuda-libraries-dev=11.6.1=0
+  - cuda-memcheck=11.8.86=0
+  - cuda-nsight=12.0.78=0
+  - cuda-nsight-compute=12.0.0=0
+  - cuda-nvcc=11.6.124=hbba6d2d_0
+  - cuda-nvdisasm=12.0.76=0
+  - cuda-nvml-dev=11.6.55=haa9ef22_0
+  - cuda-nvprof=12.0.90=0
+  - cuda-nvprune=11.6.124=he22ec0a_0
+  - cuda-nvrtc=11.6.124=h020bade_0
+  - cuda-nvrtc-dev=11.6.124=h249d397_0
+  - cuda-nvtx=11.6.124=h0630a44_0
+  - cuda-nvvp=12.0.90=0
+  - cuda-runtime=11.6.1=0
+  - cuda-samples=11.6.101=h8efea70_0
+  - cuda-sanitizer-api=12.0.90=0
+  - cuda-toolkit=11.6.1=0
+  - cuda-tools=11.6.1=0
+  - cuda-visual-tools=11.6.1=0
+  - ffmpeg=4.3=hf484d3e_0
+  - flit-core=3.6.0=pyhd3eb1b0_0
+  - freetype=2.12.1=h4a9f257_0
+  - gds-tools=1.5.0.59=0
+  - giflib=5.2.1=h7b6447c_0
+  - gmp=6.2.1=h295c915_3
+  - gnutls=3.6.15=he1e5248_0
+  - idna=3.4=py38h06a4308_0
+  - intel-openmp=2021.4.0=h06a4308_3561
+  - jpeg=9e=h7f8727e_0
+  - lame=3.100=h7b6447c_0
+  - lcms2=2.12=h3be6417_0
+  - ld_impl_linux-64=2.38=h1181459_1
+  - lerc=3.0=h295c915_0
+  - libcublas=11.9.2.110=h5e84587_0
+  - libcublas-dev=11.9.2.110=h5c901ab_0
+  - libcufft=10.7.1.112=hf425ae0_0
+  - libcufft-dev=10.7.1.112=ha5ce4c0_0
+  - libcufile=1.5.0.59=0
+  - libcufile-dev=1.5.0.59=0
+  - libcurand=10.3.1.50=0
+  - libcurand-dev=10.3.1.50=0
+  - libcusolver=11.3.4.124=h33c3c4e_0
+  - libcusparse=11.7.2.124=h7538f96_0
+  - libcusparse-dev=11.7.2.124=hbbe9722_0
+  - libdeflate=1.8=h7f8727e_5
+  - libffi=3.4.2=h6a678d5_6
+  - libgcc-ng=11.2.0=h1234567_1
+  - libgomp=11.2.0=h1234567_1
+  - libiconv=1.16=h7f8727e_2
+  - libidn2=2.3.2=h7f8727e_0
+  - libnpp=11.6.3.124=hd2722f0_0
+  - libnpp-dev=11.6.3.124=h3c42840_0
+  - libnvjpeg=11.6.2.124=hd473ad6_0
+  - libnvjpeg-dev=11.6.2.124=hb5906b9_0
+  - libpng=1.6.37=hbc83047_0
+  - libstdcxx-ng=11.2.0=h1234567_1
+  - libtasn1=4.16.0=h27cfd23_0
+  - libtiff=4.5.0=hecacb30_0
+  - libunistring=0.9.10=h27cfd23_0
+  - libwebp=1.2.4=h11a3e52_0
+  - libwebp-base=1.2.4=h5eee18b_0
+  - lz4-c=1.9.4=h6a678d5_0
+  - mkl=2021.4.0=h06a4308_640
+  - mkl-service=2.4.0=py38h7f8727e_0
+  - mkl_fft=1.3.1=py38hd3c417c_0
+  - mkl_random=1.2.2=py38h51133e4_0
+  - ncurses=6.3=h5eee18b_3
+  - nettle=3.7.3=hbbd107a_1
+  - nsight-compute=2022.4.0.15=0
+  - numpy=1.23.5=py38h14f4228_0
+  - numpy-base=1.23.5=py38h31eccc5_0
+  - openh264=2.1.1=h4ff587b_0
+  - openssl=1.1.1s=h7f8727e_0
+  - pillow=9.3.0=py38hace64e9_1
+  - pip=22.3.1=py38h06a4308_0
+  - pycparser=2.21=pyhd3eb1b0_0
+  - pyopenssl=22.0.0=pyhd3eb1b0_0
+  - pysocks=1.7.1=py38h06a4308_0
+  - python=3.8.16=h7a1cb2a_2
+  - pytorch=1.13.1=py3.8_cuda11.6_cudnn8.3.2_0
+  - pytorch-cuda=11.6=h867d48c_1
+  - pytorch-mutex=1.0=cuda
+  - readline=8.2=h5eee18b_0
+  - requests=2.28.1=py38h06a4308_0
+  - setuptools=65.6.3=py38h06a4308_0
+  - six=1.16.0=pyhd3eb1b0_1
+  - sqlite=3.40.1=h5082296_0
+  - tk=8.6.12=h1ccaba5_0
+  - torchaudio=0.13.1=py38_cu116
+  - torchvision=0.14.1=py38_cu116
+  - typing_extensions=4.4.0=py38h06a4308_0
+  - urllib3=1.26.14=py38h06a4308_0
+  - wheel=0.37.1=pyhd3eb1b0_0
+  - xz=5.2.10=h5eee18b_1
+  - zlib=1.2.13=h5eee18b_0
+  - zstd=1.5.2=ha4553b6_0
diff --git a/tests/test_examples.py b/tests/test_examples.py
new file mode 100644
index 00000000..c1381c11
--- /dev/null
+++ b/tests/test_examples.py
@@ -0,0 +1,491 @@
+"""Tests that launch the examples as jobs on the Mila cluster and check that they work correctly."""
+from __future__ import annotations
+
+import enum
+import functools
+import logging
+import os
+import re
+import runpy
+import shlex
+import subprocess
+import time
+from logging import getLogger as get_logger
+from pathlib import Path
+from typing import Any, NamedTuple
+
+import pytest
+import rich.console
+import rich.logging
+import rich.traceback
+from pytest_regressions.file_regression import FileRegressionFixture
+
+from .testutils import (
+    DEFAULT_SBATCH_PARAMETER_OVERRIDES,
+    EXAMPLES_DIR,
+    SUBMITIT_DIR,
+    TEST_JOB_NAME,
+    copy_example_files_to_test_dir,
+    filter_job_output_before_regression_check,
+    run_example,
+    run_pytorch_example,
+)
+
+logger = get_logger(__name__)
+SCRATCH = Path(os.environ["SCRATCH"])
+
+
+class GpuModel(enum.Enum):
+    # a100_10gb = "1g.10gb"
+    a100_20gb = "2g.20gb"
+    a100_3g40gb = "3g.40gb"
+    a100_4g40gb = "4g.40gb"
+    a100 = "a100"
+    a100l = "a100l"
+    a6000 = "a6000"
+    rtx8000 = "rtx8000"
+    v100 = "v100"
+
+
+gpu_memory_gb = {
+    "1g.10gb": 10,
+    "2g.20gb": 20,
+    "3g.40gb": 40,
+    "a100": 40,
+    "a100l": 80,
+    "a6000": 48,
+    "rtx8000": 48,
+    "v100": 16,
+}
+
+
+class AvailTotal(NamedTuple):
+    avail: int
+    total: int
+
+
+@functools.cache
+def savail() -> dict[str, AvailTotal]:
+    """Gets the output of the `savail` command as a Python dictionary.
+
+    ```
+    GPU       Avail / Total
+    ===============================
+    1g.10gb      38 / 40
+    2g.20gb      59 / 60
+    3g.40gb      39 / 40
+    a100          0 / 16
+    a100l         3 / 56
+    a6000         1 / 8
+    rtx8000     156 / 384
+    v100         10 / 50
+    ```
+    """
+    savail_output = subprocess.check_output(["savail"]).decode("utf-8")
+    lines = [line.strip() for line in savail_output.splitlines()[2:]]
+    return {
+        gpu_type: AvailTotal(int(avail), int(total))
+        for gpu_type, avail, _, total in [line.split() for line in lines]
+    }
+
+
+gpu_types = [v.value for v in GpuModel]
+
+
+@pytest.fixture(scope="session", autouse=True)
+def setup_logging():
+    """Setup logging (using a recipe from @JesseFarebro)."""
+    # Get the current logging level:
+    # NOTE: Pytest already sets the logging level with --log-level, so we don't do it here.
+    level = logger.getEffectiveLevel()
+    console = rich.console.Console()
+
+    _TRACEBACKS_EXCLUDES = [
+        runpy,
+        "absl",
+        "click",
+        "tyro",
+        "simple_parsing",
+        "fiddle",
+    ]
+
+    rich.traceback.install(console=console, suppress=_TRACEBACKS_EXCLUDES, show_locals=False)
+    logging.basicConfig(
+        level=level,
+        format="%(message)s",
+        datefmt="[%X]",
+        force=True,
+        handlers=[
+            rich.logging.RichHandler(
+                console=console,
+                rich_tracebacks=True,
+                tracebacks_show_locals=False,
+                tracebacks_suppress=_TRACEBACKS_EXCLUDES,
+            )
+        ],
+    )
+
+
+def make_conda_env_for_test(
+    make_env_sh_file: Path,
+    env_name_in_script: str,
+    env_path: Path,
+):
+    job_script = make_env_sh_file
+    example_dir = make_env_sh_file.parent
+
+    # Copy all the python and .sh files from the example dir to the test example dir.
+    # (NOTE: This is so we can potentially modify the contents before running them in tests.)
+    test_example_dir = SUBMITIT_DIR / "_".join(example_dir.relative_to(EXAMPLES_DIR).parts)
+    copy_example_files_to_test_dir(example_dir, test_example_dir)
+
+    outputs = run_example(
+        job_script=test_example_dir / job_script.name,
+        conda_env_name_in_script=env_name_in_script,
+        conda_env=env_path,
+        sbatch_parameter_overrides=DEFAULT_SBATCH_PARAMETER_OVERRIDES,
+    )
+    assert len(outputs) == 1
+    output = outputs[0]
+    assert not output or output.isspace(), output
+    return env_path
+
+
+@pytest.fixture(scope="session")
+def pytorch_conda_env() -> Path:
+    """A fixture that launches a job to create the PyTorch conda env."""
+    env_name_in_script = "pytorch"  # Name in the example
+    env_name = "pytorch_test"  # Name used in the tests
+    env_path = SCRATCH / "conda" / env_name
+    make_env_sh_file = EXAMPLES_DIR / "frameworks" / "pytorch_setup" / "make_env.sh"
+    command_to_test_that_env_is_working = (
+        f"conda run --prefix {env_path} python -c 'import torch, tqdm, rich'"
+    )
+
+    try:
+        subprocess.check_call(shlex.split(command_to_test_that_env_is_working))
+    except subprocess.CalledProcessError:
+        logger.info(f"The env has not already been created at {env_path}.")
+    else:
+        logger.info(
+            f"The env has already been created with all required packages at {env_path}."
+        )
+        return env_path
+
+    make_conda_env_for_test(
+        env_path=env_path,
+        make_env_sh_file=make_env_sh_file,
+        env_name_in_script=env_name_in_script,
+    )
+    return env_path
+
+
+@pytest.fixture(autouse=True, scope="session")
+def scancel_jobs_after_tests():
+    yield
+    username = os.environ["USER"]
+    subprocess.check_call(["scancel", "-u", username, "--name", TEST_JOB_NAME])
+
+
+def _test_id(arg: Path | bool | dict) -> str:
+    if isinstance(arg, Path):
+        path = arg
+        return str(path.relative_to(EXAMPLES_DIR))
+    if isinstance(arg, bool):
+        return str(arg)
+    assert isinstance(arg, dict)
+    return "-".join(f"{k}={v}" for k, v in arg.items())
+
+
+@pytest.fixture(params=gpu_types)
+def sbatch_gpu_override(request: pytest.FixtureRequest) -> dict[str, str]:
+    gpu_type: str = request.param
+    gpu_availability = savail()
+
+    assert gpu_type in gpu_availability, f"{gpu_type} doesn't show up in the savail output!"
+    avail, total = gpu_availability[gpu_type]
+    if avail == 0:
+        pytest.skip(reason="Isn't available on the cluster at the moment.")
+
+    return {"gres": f"gpu:{gpu_type}:1"}
+
+
+@pytest.mark.parametrize(
+    ("example_dir", "make_reproducible", "sbatch_overrides"),
+    [
+        (EXAMPLES_DIR / "frameworks" / "pytorch_setup", False, {}),
+        (EXAMPLES_DIR / "distributed" / "single_gpu", True, {}),
+        (EXAMPLES_DIR / "distributed" / "multi_gpu", True, {}),
+        pytest.param(
+            EXAMPLES_DIR / "distributed" / "multi_node",
+            True,
+            {"partition": "long"},
+            marks=[
+                # pytest.mark.timeout(300),
+                # pytest.mark.xfail(raises=)
+            ],
+        ),
+    ],
+    ids=_test_id,
+)
+def test_pytorch_example_on_all_gpus(
+    example_dir: Path,
+    make_reproducible: bool,
+    sbatch_overrides: dict[str, Any] | None,
+    pytorch_conda_env: Path,
+    file_regression: FileRegressionFixture,
+):
+    """Launches a pytorch-based example as a slurm job and checks that the output is as expected.
+
+    Some of the examples are modified so their outputs are reproducible.
+    """
+    filtered_job_outputs = run_pytorch_example(
+        example_dir=example_dir,
+        pytorch_conda_env_location=pytorch_conda_env,
+        sbatch_parameter_overrides=sbatch_overrides,
+        examples_dir=EXAMPLES_DIR,
+        make_reproducible=make_reproducible,
+    )
+    if len(filtered_job_outputs) == 1:
+        # Only one task.
+        file_regression.check(filtered_job_outputs[0])
+    else:
+        file_regression.check(
+            "\n".join(
+                [
+                    f"Task {i} output:\n" + task_i_output
+                    for i, task_i_output in enumerate(filtered_job_outputs)
+                ]
+            )
+        )
+
+
+@pytest.mark.timeout(10 * 60)
+def test_checkpointing_example(pytorch_conda_env: Path, file_regression: FileRegressionFixture):
+    """Tests the checkpointing example.
+
+    This test is quite nice. Here's what it does:
+    - Launch the job, let it run till completion.
+    - Launch the job again, and then do `scontrol requeue <job_id>` to force it
+      to be requeued once it has created a checkpoint (reached Epoch 1)
+    - Check that the exact same result is reached whether it is requeued or not.
+    """
+    example_dir = EXAMPLES_DIR / "good_practices" / "checkpointing"
+    test_example_dir = SUBMITIT_DIR / "_".join(example_dir.relative_to(EXAMPLES_DIR).parts)
+
+    uninterrupted_job_outputs = run_pytorch_example(
+        example_dir=example_dir,
+        pytorch_conda_env_location=pytorch_conda_env,
+        # Need to specify a GPU so the results are reproducible.
+        sbatch_parameter_overrides={"gpus_per_task": "rtx8000:1"},
+        test_example_dir=test_example_dir,
+        examples_dir=EXAMPLES_DIR,
+        make_reproducible=True,
+    )
+    assert len(uninterrupted_job_outputs) == 1
+    uninterrupted_job_output = uninterrupted_job_outputs[0]
+    file_regression.check(uninterrupted_job_output)
+
+    # NOTE: Reusing the exact same job.sh and main.py scripts as were used above:
+    job_script = test_example_dir / "job.sh"
+    job = run_example(
+        job_script,
+        conda_env=pytorch_conda_env,
+        conda_env_name_in_script="pytorch",
+        sbatch_parameter_overrides={"gpus_per_task": "rtx8000:1"},
+        wait_for_results=False,
+    )
+    interval_seconds = 5
+
+    while job.state in ["UNKNOWN", "PENDING"]:
+        logger.debug(f"Waiting for job {job.job_id} to start running. ({job.state=!r})")
+        time.sleep(interval_seconds)
+    assert job.state == "RUNNING"
+
+    output_file = job.paths.stdout
+    while not output_file.exists() or "Train epoch 1:" not in output_file.read_text():
+        output_path = output_file.relative_to(Path.cwd())
+        logger.debug(
+            f"Waiting for job {job.job_id} to reach the second epoch of training. {output_path=}"
+        )
+        time.sleep(interval_seconds)
+
+    requeue_command = f"scontrol requeue {job.job_id}"
+    logger.info(f"Requeueing the job using {requeue_command=!r}")
+    subprocess.check_call(shlex.split(requeue_command))
+
+    # todo: double-check that there aren't other intermediate states I might miss because of the
+    # low time-resolution.
+    while job.state == "RUNNING":
+        logger.debug(f"Waiting for job {job.job_id} to get requeued. ({job.state=!r})")
+        time.sleep(interval_seconds)
+
+    # assert job.state == "REQUEUED"
+    logger.debug(f"Job {job.job_id} is being requeued.")
+    while job.state == "REQUEUED":
+        logger.debug(f"Waiting for job {job.job_id} to become pending. ({job.state=!r})")
+        time.sleep(interval_seconds)
+
+    # assert job.state == "PENDING"
+    logger.debug(f"Job {job.job_id} is now pending.")
+    # NOTE: The state doesn't get updated back to `RUNNING` after doing REQUEUED -> PENDING!
+    # (Either that, or there's some sort of caching mechanism that would take too long to get
+    # invalidated.) Therefore manually trigger a "cache" update here.
+    while job.watcher.get_state(job.job_id, mode="force") == "PENDING":
+        logger.debug(f"Waiting for job {job.job_id} to start running again. ({job.state=!r})")
+        time.sleep(interval_seconds)
+
+    assert job.state in ["RUNNING", "COMPLETED"]
+    logger.info(f"Job {job.job_id} is now running again after having been requeued.")
+    # Wait for the job to finish (again):
+    requeued_job_output = job.result()
+    # Filter out lines that may change between executions:
+    filtered_requeued_job_output = filter_job_output_before_regression_check(requeued_job_output)
+    # TODO: Here it *might* be a bad idea for this requeued output to be checked using the
+    # file_regression fixture, because it could happen that we resume from a different epoch,
+    # depending on a few things:
+    # - how fast the output file can actually show us that the job has reached the second epoch
+    # - how long the job takes to actually stop and get requeued
+    # - how fast an epoch takes to run (if this were to become << the interval at which we check
+    #   the output, then we might miss the second epoch)
+    # ALSO: not sure if it's because we're not using `exec`, but it seems like it's taking longer
+    # for the job to stop running once we ask it to requeue.
+    file_regression.check(filtered_requeued_job_output, extension="_requeued.txt")
+
+    # todo: Compare the output of the requeued job to the output of the non-requeued job in a way
+    # that isn't too hard-coded for that specific example.
+    # For example, we could extract the accuracies at each epoch and check that they line up.
+    uninterrupted_values = get_val_loss_and_accuracy_at_each_epoch(uninterrupted_job_output)
+    interrupted_values = get_val_loss_and_accuracy_at_each_epoch(filtered_requeued_job_output)
+
+    resumed_epoch = min(interrupted_values.keys())
+    final_epoch = max(interrupted_values.keys())
+    assert set(uninterrupted_values.keys()) > set(interrupted_values.keys())
+    for epoch in range(resumed_epoch, final_epoch + 1):
+        # Compare the values at each epoch, they should match:
+        assert uninterrupted_values[epoch] == interrupted_values[epoch]
+
+
+def get_val_loss_and_accuracy_at_each_epoch(
+    filtered_job_output: str,
+) -> dict[int, tuple[float, float]]:
+    # [(date) (time)] INFO Epoch 3: Val loss: 37.565 accuracy: 67.58%
+    # [(date) (time)] INFO Epoch 4: Val loss: 37.429 accuracy: 68.14%
+    # [(date) (time)] INFO Epoch 5: Val loss: 40.469 accuracy: 66.78%
+    # [(date) (time)] INFO Epoch 6: Val loss: 48.439 accuracy: 63.78%
+    # [(date) (time)] INFO Epoch 7: Val loss: 38.182 accuracy: 71.46%
+    # [(date) (time)] INFO Epoch 8: Val loss: 40.733 accuracy: 70.60%
+    # [(date) (time)] INFO Epoch 9: Val loss: 44.822 accuracy: 69.96%
+    val_losses_and_accuracies: dict[int, tuple[float, float]] = {}
+    for line in filtered_job_output.splitlines():
+        match_epoch = re.search(r"Epoch (\d+):", line)
+        match_val_loss = re.search(r"Val loss: (\d+\.\d+)", line)
+        match_val_accuracy = re.search(r"accuracy: (\d+\.\d+)%", line)
+        if (
+            match_epoch is not None
+            and match_val_loss is not None
+            and match_val_accuracy is not None
+        ):
+            epoch = int(match_epoch.group(1))
+            val_loss = float(match_val_loss.group(1))
+            val_accuracy = float(match_val_accuracy.group(1))
+            val_losses_and_accuracies[epoch] = (val_loss, val_accuracy)
+    if not val_losses_and_accuracies:
+        raise RuntimeError(
+            "Unable to extract the val loss and accuracy! Perhaps the regexes here are wrong?"
+        )
+    return val_losses_and_accuracies
+
+
+@pytest.fixture(scope="session")
+def pytorch_orion_conda_env() -> Path:
+    """A fixture that launches a job to create the PyTorch + Orion conda env."""
+    env_name_in_script = "pytorch_orion"  # Name in the example
+    env_name = "pytorch_orion_test"  # Name used in the tests
+    env_path = SCRATCH / "conda" / env_name
+    make_env_sh_file = EXAMPLES_DIR / "good_practices" / "hpo_with_orion" / "make_env.sh"
+    command_to_test_that_env_is_working = (
+        f"conda run --prefix {env_path} python -c 'import torch, tqdm, rich, orion'"
+    )
+
+    try:
+        logger.debug(
+            f"Checking if the environment is already created at {env_path} by running "
+            f"{command_to_test_that_env_is_working!r}"
+        )
+        subprocess.check_call(shlex.split(command_to_test_that_env_is_working))
+    except subprocess.CalledProcessError as err:
+        logger.info(f"The {env_path} env has not already been created: {err}")
+    else:
+        logger.info(f"The {env_path} env has already been created with all required packages.")
+        return env_path
+
+    make_conda_env_for_test(
+        env_path=env_path,
+        make_env_sh_file=make_env_sh_file,
+        env_name_in_script=env_name_in_script,
+    )
+    return env_path
+
+
+# TODO: Make this run faster. Times out with 10 minutes, but seems to be reaching the end,
+# which is quite strange. Perhaps we could reduce the number of trials?
+@pytest.mark.timeout(20 * 60)
+def test_orion_example(pytorch_orion_conda_env: Path, file_regression: FileRegressionFixture):
+    """Tests the "HPO with Orion" example.
+
+    TODO: This should probably use a different conda environment, instead of adding a
+    `pip install orion` to the same pytorch env.
+    """
+    example_dir = EXAMPLES_DIR / "good_practices" / "hpo_with_orion"
+    sbatch_overrides = None
+
+    def modify_job_script_before_running(job_script_path: Path) -> None:
+        job_script_lines = job_script_path.read_text().splitlines()
+        # TODO: Make this use a database in $SLURM_TMPDIR or something, so each run is independent.
+        last_line = job_script_lines[-1]
+        assert "hunt" in last_line
+
+        example_dir = job_script_path.parent
+        # TODO: Create an Orion config so that we can pass the path to the database to use.
+        # Otherwise it uses a config in ~/.local/share/orion.core/orion/orion_db.pkl
+        # TODO: Make the Orion suggestions reproducible by passing a seed to the algorithm.
+        import yaml
+
+        orion_config_path = example_dir / "orion_config.yaml"
+        with open(orion_config_path, "w+") as f:
+            yaml.dump(
+                {
+                    "storage": {
+                        "type": "legacy",
+                        "database": {
+                            "type": "pickleddb",
+                            "host": str(example_dir / "database.pkl"),
+                        },
+                    },
+                },
+                f,
+            )
+
+        last_line = last_line.replace("--exp-max-trials 10", "--exp-max-trials 3")
+        last_line = last_line.replace("hunt", f"hunt --config={orion_config_path}")
+
+        job_script_lines[-1] = last_line
+        job_script_path.write_text("\n".join(job_script_lines))
+
+    filtered_job_outputs = run_pytorch_example(
+        example_dir=example_dir,
+        pytorch_conda_env_location=pytorch_orion_conda_env,
+        sbatch_parameter_overrides=sbatch_overrides,
+        make_reproducible=True,
+        examples_dir=EXAMPLES_DIR,
+        submitit_dir=SUBMITIT_DIR,
+        modify_job_script_before_running=modify_job_script_before_running,
+        conda_env_name_in_script="pytorch_orion",
+    )
+
+    assert len(filtered_job_outputs) == 1
+    file_regression.check(filtered_job_outputs[0])
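For context, the suite above is meant to be run from a login node on the cluster. A minimal sketch of an invocation (the `-k` filter is illustrative; `--log-level` is the pytest flag referenced in `setup_logging` above):

.. code:: bash

   # Run only the pytorch_setup example test, with INFO-level logs:
   python -m pytest tests/ --log-level=INFO -k pytorch_setup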
+ """ + example_dir = EXAMPLES_DIR / "good_practices" / "hpo_with_orion" + sbatch_overrides = None + + def modify_job_script_before_running(job_script_path: Path) -> None: + job_script_lines = job_script_path.read_text().splitlines() + # TODO: Make this use a database in $SLURM_TMPDIR or something, so each run is independent. + + last_line = job_script_lines[-1] + assert "hunt" in last_line + + example_dir = job_script_path.parent + # TODO: Create an Orion config so that we can pass the path to the database to use. + # Otherwise it uses a config in ~/.local/shapre/orion.core/orion/orion_db.pkl + # TODO: Make the Orion suggestions reproducible by passing a seed to the algorithm. + import yaml + + orion_config_path = example_dir / "orion_config.yaml" + with open(orion_config_path, "w+") as f: + yaml.dump( + { + "storage": { + "type": "legacy", + "database": { + "type": "pickleddb", + "host": str(example_dir / "database.pkl"), + }, + }, + }, + f, + ) + + last_line = last_line.replace("--exp-max-trials 10", "--exp-max-trials 3") + last_line = last_line.replace("hunt", f"hunt --config={orion_config_path}") + + job_script_lines[-1] = last_line + job_script_path.write_text("\n".join(job_script_lines)) + + filtered_job_outputs = run_pytorch_example( + example_dir=example_dir, + pytorch_conda_env_location=pytorch_orion_conda_env, + sbatch_parameter_overrides=sbatch_overrides, + make_reproducible=True, + examples_dir=EXAMPLES_DIR, + submitit_dir=SUBMITIT_DIR, + modify_job_script_before_running=modify_job_script_before_running, + conda_env_name_in_script="pytorch_orion", + ) + + assert len(filtered_job_outputs) == 1 + file_regression.check(filtered_job_outputs[0]) diff --git a/tests/test_examples/test_checkpointing_example.txt b/tests/test_examples/test_checkpointing_example.txt new file mode 100644 index 00000000..1957b51c --- /dev/null +++ b/tests/test_examples/test_checkpointing_example.txt @@ -0,0 +1,15 @@ +[(date) (time)] INFO No checkpoints found in /$SCRATCH/checkpointing_example/$SLURM_JOB_ID/checkpoints. Training from scratch. main.py:117 +Using downloaded and verified file: $SLURM_TMPDIR/data/cifar-10-python.tar.gz +Extracting $SLURM_TMPDIR/data/cifar-10-python.tar.gz to $SLURM_TMPDIR/data +Files already downloaded and verified +[(date) (time)] INFO Epoch 0: Val loss: 50.314 accuracy: 54.26% main.py:204 +[(date) (time)] INFO Epoch 1: Val loss: 46.534 accuracy: 59.04% main.py:204 +[(date) (time)] INFO Epoch 2: Val loss: 42.161 accuracy: 62.84% main.py:204 +[(date) (time)] INFO Epoch 3: Val loss: 37.565 accuracy: 67.58% main.py:204 +[(date) (time)] INFO Epoch 4: Val loss: 37.429 accuracy: 68.14% main.py:204 +[(date) (time)] INFO Epoch 5: Val loss: 40.469 accuracy: 66.78% main.py:204 +[(date) (time)] INFO Epoch 6: Val loss: 48.439 accuracy: 63.78% main.py:204 +[(date) (time)] INFO Epoch 7: Val loss: 38.182 accuracy: 71.46% main.py:204 +[(date) (time)] INFO Epoch 8: Val loss: 40.733 accuracy: 70.60% main.py:204 +[(date) (time)] INFO Epoch 9: Val loss: 44.822 accuracy: 69.96% main.py:204 +Done! \ No newline at end of file diff --git a/tests/test_examples/test_checkpointing_example_requeued.txt b/tests/test_examples/test_checkpointing_example_requeued.txt new file mode 100644 index 00000000..f2dbec3a --- /dev/null +++ b/tests/test_examples/test_checkpointing_example_requeued.txt @@ -0,0 +1,12 @@ +[(date) (time)] INFO NOTE: This job has been restarted 1 times by SLURM. main.py:293 + INFO Resuming training at epoch 4 (best_acc=67.58%). 
main.py:115 +Using downloaded and verified file: $SLURM_TMPDIR/data/cifar-10-python.tar.gz +Extracting $SLURM_TMPDIR/data/cifar-10-python.tar.gz to $SLURM_TMPDIR/data +Files already downloaded and verified +[(date) (time)] INFO Epoch 4: Val loss: 37.429 accuracy: 68.14% main.py:204 +[(date) (time)] INFO Epoch 5: Val loss: 40.469 accuracy: 66.78% main.py:204 +[(date) (time)] INFO Epoch 6: Val loss: 48.439 accuracy: 63.78% main.py:204 +[(date) (time)] INFO Epoch 7: Val loss: 38.182 accuracy: 71.46% main.py:204 +[(date) (time)] INFO Epoch 8: Val loss: 40.733 accuracy: 70.60% main.py:204 +[(date) (time)] INFO Epoch 9: Val loss: 44.822 accuracy: 69.96% main.py:204 +Done! \ No newline at end of file diff --git a/tests/test_examples/test_orion_example.txt b/tests/test_examples/test_orion_example.txt new file mode 100644 index 00000000..4c64cc98 --- /dev/null +++ b/tests/test_examples/test_orion_example.txt @@ -0,0 +1,83 @@ +[(date) (time)] INFO INFO:__main__:Args: { main.py:61 + "epochs": 10, + "learning_rate": 0.0009614, + "weight_decay": 0.0001, + "batch_size": 128 + } +Using downloaded and verified file: $SLURM_TMPDIR/data/cifar-10-python.tar.gz +Extracting $SLURM_TMPDIR/data/cifar-10-python.tar.gz to $SLURM_TMPDIR/data +Files already downloaded and verified +[(date) (time)] INFO INFO:__main__:Epoch 0: Val loss: 56.395 accuracy: 50.04% main.py:136 +[(date) (time)] INFO INFO:__main__:Epoch 1: Val loss: 39.939 accuracy: 64.00% main.py:136 +[(date) (time)] INFO INFO:__main__:Epoch 2: Val loss: 43.116 accuracy: 61.02% main.py:136 +[(date) (time)] INFO INFO:__main__:Epoch 3: Val loss: 41.245 accuracy: 66.02% main.py:136 +[(date) (time)] INFO INFO:__main__:Epoch 4: Val loss: 35.337 accuracy: 69.38% main.py:136 +[(date) (time)] INFO INFO:__main__:Epoch 5: Val loss: 35.258 accuracy: 70.92% main.py:136 +[(date) (time)] INFO INFO:__main__:Epoch 6: Val loss: 33.818 accuracy: 72.62% main.py:136 +[(date) (time)] INFO INFO:__main__:Epoch 7: Val loss: 33.997 accuracy: 73.18% main.py:136 +[(date) (time)] INFO INFO:__main__:Epoch 8: Val loss: 39.907 accuracy: 70.62% main.py:136 +[(date) (time)] INFO INFO:__main__:Epoch 9: Val loss: 42.275 accuracy: 71.44% main.py:136 +Done! +[(date) (time)] INFO INFO:__main__:Args: { main.py:61 + "epochs": 10, + "learning_rate": 0.01945, + "weight_decay": 0.0001, + "batch_size": 128 + } +Files already downloaded and verified +Files already downloaded and verified +[(date) (time)] INFO INFO:__main__:Epoch 0: Val loss: 61.882 accuracy: 43.92% main.py:136 +[(date) (time)] INFO INFO:__main__:Epoch 1: Val loss: 52.464 accuracy: 52.86% main.py:136 +[(date) (time)] INFO INFO:__main__:Epoch 2: Val loss: 44.925 accuracy: 59.68% main.py:136 +[(date) (time)] INFO INFO:__main__:Epoch 3: Val loss: 48.011 accuracy: 57.98% main.py:136 +[(date) (time)] INFO INFO:__main__:Epoch 4: Val loss: 44.668 accuracy: 61.34% main.py:136 +[(date) (time)] INFO INFO:__main__:Epoch 5: Val loss: 37.669 accuracy: 67.84% main.py:136 +[(date) (time)] INFO INFO:__main__:Epoch 6: Val loss: 49.468 accuracy: 60.66% main.py:136 +[(date) (time)] INFO INFO:__main__:Epoch 7: Val loss: 36.664 accuracy: 69.90% main.py:136 +[(date) (time)] INFO INFO:__main__:Epoch 8: Val loss: 36.523 accuracy: 70.32% main.py:136 +[(date) (time)] INFO INFO:__main__:Epoch 9: Val loss: 39.187 accuracy: 69.94% main.py:136 +Done! 
+[(date) (time)] INFO INFO:__main__:Args: { main.py:61 + "epochs": 10, + "learning_rate": 0.001854, + "weight_decay": 0.0001, + "batch_size": 128 + } +Files already downloaded and verified +Files already downloaded and verified +[(date) (time)] INFO INFO:__main__:Epoch 0: Val loss: 71.891 accuracy: 39.80% main.py:136 +[(date) (time)] INFO INFO:__main__:Epoch 1: Val loss: 42.319 accuracy: 62.22% main.py:136 +[(date) (time)] INFO INFO:__main__:Epoch 2: Val loss: 41.534 accuracy: 63.50% main.py:136 +[(date) (time)] INFO INFO:__main__:Epoch 3: Val loss: 35.034 accuracy: 68.82% main.py:136 +[(date) (time)] INFO INFO:__main__:Epoch 4: Val loss: 40.732 accuracy: 65.76% main.py:136 +[(date) (time)] INFO INFO:__main__:Epoch 5: Val loss: 36.258 accuracy: 71.42% main.py:136 +[(date) (time)] INFO INFO:__main__:Epoch 6: Val loss: 37.599 accuracy: 69.62% main.py:136 +[(date) (time)] INFO INFO:__main__:Epoch 7: Val loss: 33.993 accuracy: 72.62% main.py:136 +[(date) (time)] INFO INFO:__main__:Epoch 8: Val loss: 45.572 accuracy: 68.40% main.py:136 +[(date) (time)] INFO INFO:__main__:Epoch 9: Val loss: 39.671 accuracy: 72.72% main.py:136 +Done! +Search finished successfully + +Stats +===== +completed: True +trials completed: 3 +best trial: + id: aaa501d452bf588b3b06443386a1518d + evaluation: 0.2728000283241272 + params: + /learning-rate: 0.001854 +start time: 2023-08-07 20:30:48.685649 +finish time: 2023-08-07 20:36:53.205726 +elapsed_time: 0:06:04.153296 + + +Hints +===== + +Info +---- + +To get more information on the experiment, run the command + +orion info --name orion-example --version 1 \ No newline at end of file diff --git a/tests/test_examples/test_pytorch_example_distributed_001_single_gpu_True__.txt b/tests/test_examples/test_pytorch_example_distributed_001_single_gpu_True__.txt new file mode 100644 index 00000000..25b636a1 --- /dev/null +++ b/tests/test_examples/test_pytorch_example_distributed_001_single_gpu_True__.txt @@ -0,0 +1,24 @@ +Using downloaded and verified file: $SLURM_TMPDIR/data/cifar-10-python.tar.gz +Extracting $SLURM_TMPDIR/data/cifar-10-python.tar.gz to $SLURM_TMPDIR/data +Files already downloaded and verified +[(date) (time)] INFO INFO:__main__:Epoch 0: Val loss: 50.314 main.py:121 + accuracy: 54.26% +[(date) (time)] INFO INFO:__main__:Epoch 1: Val loss: 46.534 main.py:121 + accuracy: 59.04% +[(date) (time)] INFO INFO:__main__:Epoch 2: Val loss: 42.161 main.py:121 + accuracy: 62.84% +[(date) (time)] INFO INFO:__main__:Epoch 3: Val loss: 37.565 main.py:121 + accuracy: 67.58% +[(date) (time)] INFO INFO:__main__:Epoch 4: Val loss: 37.429 main.py:121 + accuracy: 68.14% +[(date) (time)] INFO INFO:__main__:Epoch 5: Val loss: 40.469 main.py:121 + accuracy: 66.78% +[(date) (time)] INFO INFO:__main__:Epoch 6: Val loss: 48.439 main.py:121 + accuracy: 63.78% +[(date) (time)] INFO INFO:__main__:Epoch 7: Val loss: 38.182 main.py:121 + accuracy: 71.46% +[(date) (time)] INFO INFO:__main__:Epoch 8: Val loss: 40.733 main.py:121 + accuracy: 70.60% +[(date) (time)] INFO INFO:__main__:Epoch 9: Val loss: 44.822 main.py:121 + accuracy: 69.96% +Done! \ No newline at end of file diff --git a/tests/test_examples/test_pytorch_example_distributed_002_multi_gpu_True__.txt b/tests/test_examples/test_pytorch_example_distributed_002_multi_gpu_True__.txt new file mode 100644 index 00000000..3d893328 --- /dev/null +++ b/tests/test_examples/test_pytorch_example_distributed_002_multi_gpu_True__.txt @@ -0,0 +1,51 @@ +Task 0 output: +PyTorch Distributed available. 
+ Backends: + Gloo: True + NCCL: True + MPI: False +[(date) (time)] INFO [0/2] __main__ - World size: 2, global main.py:53 + rank: 0 +Using downloaded and verified file: $SLURM_TMPDIR/data/cifar-10-python.tar.gz +Extracting $SLURM_TMPDIR/data/cifar-10-python.tar.gz to $SLURM_TMPDIR/data +Files already downloaded and verified +[(date) (time)] INFO [0/2] __main__ - Effective batch size: main.py:81 + 256 +[(date) (time)] INFO [0/2] distributed.py:1140 + torch.nn.parallel.distributed - + Reducer buckets have been + rebuilt in this iteration. +[(date) (time)] INFO [0/2] __main__ - Epoch 0: Val loss: main.py:175 + 64.641 accuracy: 43.60% +[(date) (time)] INFO [0/2] __main__ - Epoch 1: Val loss: main.py:175 + 59.205 accuracy: 49.18% +[(date) (time)] INFO [0/2] __main__ - Epoch 2: Val loss: main.py:175 + 40.863 accuracy: 63.34% +[(date) (time)] INFO [0/2] __main__ - Epoch 3: Val loss: main.py:175 + 41.587 accuracy: 65.02% +[(date) (time)] INFO [0/2] __main__ - Epoch 4: Val loss: main.py:175 + 41.128 accuracy: 65.16% +[(date) (time)] INFO [0/2] __main__ - Epoch 5: Val loss: main.py:175 + 40.960 accuracy: 66.24% +[(date) (time)] INFO [0/2] __main__ - Epoch 6: Val loss: main.py:175 + 45.061 accuracy: 65.86% +[(date) (time)] INFO [0/2] __main__ - Epoch 7: Val loss: main.py:175 + 59.227 accuracy: 60.28% +[(date) (time)] INFO [0/2] __main__ - Epoch 8: Val loss: main.py:175 + 50.601 accuracy: 66.42% +[(date) (time)] INFO [0/2] __main__ - Epoch 9: Val loss: main.py:175 + 53.997 accuracy: 66.60% +Done! +Task 1 output: +PyTorch Distributed available. + Backends: + Gloo: True + NCCL: True + MPI: False +[(date) (time)] INFO [1/2] __main__ - World size: 2, global main.py:53 + rank: 1 +[(date) (time)] INFO [1/2] distributed.py:1140 + torch.nn.parallel.distributed - + Reducer buckets have been + rebuilt in this iteration. +Done! \ No newline at end of file diff --git a/tests/test_examples/test_pytorch_example_distributed_003_multi_node_True_partition_long_.txt b/tests/test_examples/test_pytorch_example_distributed_003_multi_node_True_partition_long_.txt new file mode 100644 index 00000000..74cea4e6 --- /dev/null +++ b/tests/test_examples/test_pytorch_example_distributed_003_multi_node_True_partition_long_.txt @@ -0,0 +1,80 @@ +Task 0 output: +PyTorch Distributed available. + Backends: + Gloo: True + NCCL: True + MPI: False +[(date) (time)] INFO [0/4] __main__ - World size: 4, global main.py:55 + rank: 0, local rank: 0 +Using downloaded and verified file: $SLURM_TMPDIR/data/cifar-10-python.tar.gz +Extracting $SLURM_TMPDIR/data/cifar-10-python.tar.gz to $SLURM_TMPDIR/data +Files already downloaded and verified +[(date) (time)] INFO [0/4] __main__ - Effective batch size: main.py:86 + 512 +[(date) (time)] INFO [0/4] distributed.py:1140 + torch.nn.parallel.distributed - + Reducer buckets have been + rebuilt in this iteration. 
+[(date) (time)] INFO [0/4] __main__ - Epoch 0: Val loss: main.py:180 + 55.032 accuracy: 50.08% +[(date) (time)] INFO [0/4] __main__ - Epoch 1: Val loss: main.py:180 + 48.975 accuracy: 56.76% +[(date) (time)] INFO [0/4] __main__ - Epoch 2: Val loss: main.py:180 + 53.192 accuracy: 55.32% +[(date) (time)] INFO [0/4] __main__ - Epoch 3: Val loss: main.py:180 + 47.434 accuracy: 59.68% +[(date) (time)] INFO [0/4] __main__ - Epoch 4: Val loss: main.py:180 + 44.753 accuracy: 63.28% +[(date) (time)] INFO [0/4] __main__ - Epoch 5: Val loss: main.py:180 + 56.168 accuracy: 59.26% +[(date) (time)] INFO [0/4] __main__ - Epoch 6: Val loss: main.py:180 + 54.097 accuracy: 63.38% +[(date) (time)] INFO [0/4] __main__ - Epoch 7: Val loss: main.py:180 + 54.764 accuracy: 63.02% +[(date) (time)] INFO [0/4] __main__ - Epoch 8: Val loss: main.py:180 + 64.655 accuracy: 61.20% +[(date) (time)] INFO [0/4] __main__ - Epoch 9: Val loss: main.py:180 + 61.904 accuracy: 63.20% +Done! +Task 1 output: +PyTorch Distributed available. + Backends: + Gloo: True + NCCL: True + MPI: False +[(date) (time)] INFO [1/4] __main__ - World size: 4, global main.py:55 + rank: 1, local rank: 1 +[(date) (time)] INFO [1/4] distributed.py:1140 + torch.nn.parallel.distributed - + Reducer buckets have been + rebuilt in this iteration. +Done! +Task 2 output: +PyTorch Distributed available. + Backends: + Gloo: True + NCCL: True + MPI: False +[(date) (time)] INFO [2/4] __main__ - World size: 4, global main.py:55 + rank: 2, local rank: 0 +Using downloaded and verified file: $SLURM_TMPDIR/data/cifar-10-python.tar.gz +Extracting $SLURM_TMPDIR/data/cifar-10-python.tar.gz to $SLURM_TMPDIR/data +Files already downloaded and verified +[(date) (time)] INFO [2/4] distributed.py:1140 + torch.nn.parallel.distributed - + Reducer buckets have been + rebuilt in this iteration. +Done! +Task 3 output: +PyTorch Distributed available. + Backends: + Gloo: True + NCCL: True + MPI: False +[(date) (time)] INFO [3/4] __main__ - World size: 4, global main.py:55 + rank: 3, local rank: 1 +[(date) (time)] INFO [3/4] distributed.py:1140 + torch.nn.parallel.distributed - + Reducer buckets have been + rebuilt in this iteration. +Done! 
\ No newline at end of file diff --git a/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_1g_10gb_1_.txt b/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_1g_10gb_1_.txt new file mode 100644 index 00000000..4103bf37 --- /dev/null +++ b/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_1g_10gb_1_.txt @@ -0,0 +1,4 @@ +PyTorch built with CUDA: True +PyTorch detects CUDA available: True +PyTorch-detected #GPUs: 1 + GPU 0: NVIDIA A100-SXM4-80GB MIG 1g.10gb \ No newline at end of file diff --git a/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_2g_20gb_1_.txt b/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_2g_20gb_1_.txt new file mode 100644 index 00000000..7a4327f7 --- /dev/null +++ b/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_2g_20gb_1_.txt @@ -0,0 +1,4 @@ +PyTorch built with CUDA: True +PyTorch detects CUDA available: True +PyTorch-detected #GPUs: 1 + GPU 0: NVIDIA A100-SXM4-80GB MIG 2g.20gb \ No newline at end of file diff --git a/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_3g_40gb_1_.txt b/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_3g_40gb_1_.txt new file mode 100644 index 00000000..6e017995 --- /dev/null +++ b/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_3g_40gb_1_.txt @@ -0,0 +1,4 @@ +PyTorch built with CUDA: True +PyTorch detects CUDA available: True +PyTorch-detected #GPUs: 1 + GPU 0: NVIDIA A100-SXM4-80GB MIG 3g.40gb \ No newline at end of file diff --git a/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_rtx8000_1_.txt b/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_rtx8000_1_.txt new file mode 100644 index 00000000..41daacfe --- /dev/null +++ b/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_rtx8000_1_.txt @@ -0,0 +1,4 @@ +PyTorch built with CUDA: True +PyTorch detects CUDA available: True +PyTorch-detected #GPUs: 1 + GPU 0: Quadro RTX 8000 \ No newline at end of file diff --git a/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_v100_1_.txt b/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_v100_1_.txt new file mode 100644 index 00000000..f0b99729 --- /dev/null +++ b/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_v100_1_.txt @@ -0,0 +1,4 @@ +PyTorch built with CUDA: True +PyTorch detects CUDA available: True +PyTorch-detected #GPUs: 1 + GPU 0: Tesla V100-SXM2-32GB-LS \ No newline at end of file diff --git a/tests/testutils.py b/tests/testutils.py new file mode 100644 index 00000000..4f419a8f --- /dev/null +++ b/tests/testutils.py @@ -0,0 +1,435 @@ +""" Idea: Use `submitit` to test that the setup works for this repo on the current cluster. 
+""" +from __future__ import annotations + +import itertools +import json +import re +import shlex +import shutil +import warnings +from logging import getLogger as get_logger +from pathlib import Path +from typing import Any, Callable, Literal, Sequence, TypeVar, overload +import submitit + +logger = get_logger(__name__) + +TEST_JOB_NAME = "example_tests" +ROOT_DIR = Path(__file__).parent.parent +EXAMPLES_DIR = ROOT_DIR / "docs" / "examples" +TESTS_DIR = Path(__file__).parent +SUBMITIT_DIR = TESTS_DIR / ".submitit" + +DEFAULT_SBATCH_PARAMETER_OVERRIDES = dict( + partition="main", + job_name=TEST_JOB_NAME, + stderr_to_stdout=True, +) + + +REPRODUCIBLE_BLOCK_PYTHON = """\ +### NOTE: This block is added to make the example reproducible during unit tests +import random +import numpy + +seed = 123 +random.seed(seed) +numpy.random.seed(seed) +torch.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.use_deterministic_algorithms(True) +### +""" + +REPRODUCIBLE_BLOCK_BATCH_SCRIPT = """\ +## +# Adding this line makes it possible to set `torch.use_deterministic_algorithms(True)` +export CUBLAS_WORKSPACE_CONFIG=:4096:8 +## +""" + + +@overload +def run_example( + job_script: Path, + conda_env: Path, + conda_env_name_in_script: str, + sbatch_parameter_overrides: dict[str, Any] + | None = None, # Actually defaults to `default_overrides` + wait_for_results: Literal[True] = True, +) -> list[str]: + ... + + +@overload +def run_example( + job_script: Path, + conda_env: Path, + conda_env_name_in_script: str, + sbatch_parameter_overrides: dict[str, Any] + | None = None, # Actually defaults to `default_overrides` + wait_for_results: Literal[False] = False, +) -> submitit.Job[str]: + ... + + +@overload +def run_example( + job_script: Path, + conda_env: Path, + conda_env_name_in_script: str, + sbatch_parameter_overrides: dict[str, Any] + | None = None, # Actually defaults to `default_overrides` + wait_for_results: bool = True, +) -> list[str] | submitit.Job[str]: + ... + + +def run_example( + job_script: Path, + conda_env: Path, + conda_env_name_in_script: str, + sbatch_parameter_overrides: dict[str, Any] + | None = None, # Actually defaults to `default_overrides` + wait_for_results: bool = True, +) -> list[str] | submitit.Job[str]: + """Submits the `job.sh` script of an example as a slurm job and returns the output. + + NOTE: The backslashes in the docstring here are there so that the IDE (VsCode) shows the full + text when hovering over an argument. + + Parameters: + job_script: The path to the `job.sh` script of the example to run. + conda_env: The path to the conda environment to use in the example. + conda_env_name_in_script: The name of the conda environment as it appears in the `job.sh` \ + of the example. This is replaced with `conda_env` before the job is submitted. + sbatch_parameter_overrides: SBATCH parameters to override (in python form, e.g. \ + `--ntasks-per-node` becomes "ntasks_per_node"). + wait_for_results: Whether to wait for the job to finish and return results, or just submit \ + the job and return it. + """ + assert job_script.exists() and job_script.is_file() and job_script.suffix == ".sh" + sbatch_parameter_overrides = sbatch_parameter_overrides or DEFAULT_SBATCH_PARAMETER_OVERRIDES + example_dir = job_script.parent + # Adds the --chdir parameter as a SBATCH flag, so the paths work and the outputs are produced in + # the right folder. 
+    sbatch_parameter_overrides.setdefault("additional_parameters", {})["chdir"] = str(example_dir)
+
+    job_script_content = job_script.read_text()
+    job_script_content = change_conda_env_used_in_job_script(
+        job_script_content,
+        conda_env_path=conda_env,
+        conda_env_name_in_script=conda_env_name_in_script,
+    )
+    # TODO: Should we modify the job script file in-place here? Not doing so would keep things
+    # "clean", but it would also make it harder to debug, since we couldn't just do `srun job.sh`
+    # from inside `tests/.submitit/`.
+    job_script.write_text(job_script_content)
+
+    example_lines_after_sbatch = [
+        stripped_line
+        for line in job_script_content.splitlines(keepends=False)
+        if (stripped_line := line.strip()) and not stripped_line.startswith("#SBATCH")
+    ]
+    last_non_empty_line_index = -1
+    job_setup = example_lines_after_sbatch[:last_non_empty_line_index]
+    job_command_in_example = example_lines_after_sbatch[last_non_empty_line_index]
+
+    # NOTE: Could be nice to use the new match-case statement for this, but it requires Python 3.10:
+    # match job_command.split():
+    #     case "python main.py":
+    srun_args: list[str] = sbatch_parameter_overrides.get("srun_args", [])
+    _old_srun_args = srun_args.copy()
+    # NOTE: If there's an `srun` in the job command, this is going to cause an issue, because
+    # submitit will create a last line that goes
+    # `srun (...) submitit.load_and_run_ish "srun job.sh"` and the job will hang!
+    # Therefore, we tweak the last line of the example into something that will work with submitit.
+    submitit_job_command, srun_args = _get_submitit_job_command_and_srun_args(
+        job_command_in_example, srun_args=srun_args
+    )
+    sbatch_parameter_overrides["srun_args"] = srun_args
+    if submitit_job_command != job_command_in_example:
+        logger.debug(f"{job_command_in_example=!r}")
+        logger.debug(f"srun args before: {_old_srun_args!r}")
+        logger.debug(f"{submitit_job_command=!r}")
+        logger.debug(f"srun args after: {srun_args!r}")
+
+    logger.info(f"Command that will be run by submitit: {submitit_job_command!r}")
+    logger.info(f"Additional args to be passed to `srun`: {srun_args!r}")
+
+    job_setup = (
+        ["set -e"]  # Make the job crash if one of the commands fails.
+        + job_setup
+        + ([f"# NOTE: Command that will be run by submitit: {submitit_job_command!r}"])
+    )
+
+    executor = submitit.SlurmExecutor(folder=example_dir)
+    job_script_params = get_params_from_job_script(job_script)
+    executor.update_parameters(
+        setup=job_setup,
+        **_recursive_dict_union(job_script_params, sbatch_parameter_overrides),
+    )
+    logger.debug(f"Using the following sbatch params: {json.dumps(executor.parameters, indent=4)}")
+
+    assert "srun" not in submitit_job_command
+    function = submitit.helpers.CommandFunction(
+        shlex.split(submitit_job_command),
+        cwd=example_dir,
+    )
+    job = executor.submit(function)
+    if wait_for_results:
+        job_outputs = job.results()
+        return job_outputs
+    return job
+
+
+def run_pytorch_example(
+    example_dir: str | Path,
+    pytorch_conda_env_location: Path,
+    sbatch_parameter_overrides: dict[str, Any] | None = None,
+    test_example_dir: Path | None = None,
+    examples_dir: Path = EXAMPLES_DIR,
+    make_reproducible: bool = True,
+    submitit_dir: Path = SUBMITIT_DIR,
+    conda_env_name_in_script: str = "pytorch",
+    modify_job_script_before_running: Callable[[Path], None] | None = None,
+) -> list[str]:
+    """Runs a PyTorch-based example with a main.py and job.sh file.
+ + Compared with `run_example`, this also: + - Copies the files into a `test_example_dir` directory so they can be modified before being run + - Optionally makes it reproducible by adding a block of code to the main.py and job.sh files + - Filters out the job output to remove lines that may change between executions + """ + example_dir = Path(example_dir) + assert example_dir.is_dir() + assert (example_dir / "job.sh").is_file() + assert (example_dir / "main.py").is_file() + assert example_dir.is_relative_to(examples_dir) + assert pytorch_conda_env_location.is_dir() + if test_example_dir is None: + test_example_dir = submitit_dir / "_".join(example_dir.relative_to(examples_dir).parts) + copy_example_files_to_test_dir(example_dir, test_example_dir) + + if make_reproducible: + logger.info( + f"Making a variant of the main.py and job.sh files from {example_dir} to make them " + f"~100% reproducible." + ) + make_reproducible_version_of_example(example_dir, test_example_dir) + + job_script = test_example_dir / "job.sh" + if modify_job_script_before_running: + modify_job_script_before_running(job_script) + + job_outputs = run_example( + job_script, + conda_env=pytorch_conda_env_location, + sbatch_parameter_overrides=sbatch_parameter_overrides or {}, + conda_env_name_in_script=conda_env_name_in_script, + wait_for_results=True, + ) + # Filter out lines that may change between executions: + return [filter_job_output_before_regression_check(job_output) for job_output in job_outputs] + + +def copy_example_files_to_test_dir( + example_dir: Path, test_example_dir: Path, include_patterns: Sequence[str] = ("*.py", "*.sh") +) -> None: + test_example_dir.mkdir(exist_ok=True, parents=True) + for file in itertools.chain(*[example_dir.glob(pattern) for pattern in include_patterns]): + dest = test_example_dir / file.name + if dest.exists(): + dest.unlink() + shutil.copy2(file, dest) + + +def make_logging_use_wider_console(python_script_content: str) -> str: + """Make the example use a wider console for logging. + + This is done so we can more easily match or substitute some of the contents of the job outputs + before they are checked using the file regression fixture. + """ + old = "rich.logging.RichHandler(markup=True)" + new = "rich.logging.RichHandler(markup=True, console=rich.console.Console(width=255))" + assert old in python_script_content + return python_script_content.replace(old, new) + + +def _get_submitit_job_command_and_srun_args( + job_command_in_example: str, srun_args: list[str] +) -> tuple[str, list[str]]: + """Adapts the last line of the job script so it can be run using a CommandFunction of submitit. + + TODO: This needs to be customized for each example, unfortunately. + """ + srun_args = srun_args.copy() + + if job_command_in_example == "python main.py": + return job_command_in_example, srun_args + + if job_command_in_example == "srun python main.py": + # submitit already does `srun (...) run_this_command_ish "python main.py"` as the last line, + # so we just remove the srun prefix here. + return "python main.py", srun_args + + if job_command_in_example == "exec python main.py": + # BUG: Getting a FileNotFoundError("exec") here if we leave the `exec python main.py` in! + return "python main.py", srun_args + + if "srun" in job_command_in_example: + # TODO: We need to do something different if we have an `srun` in the last line! + # Make the last line (the job command) just python main.py (...) and move all the srun args + # into the `srun_args` list of submitit. 
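+        # For instance (illustrative values): a last line such as
+        #   srun --ntasks=2 python main.py --epochs 10
+        # would become the job command `python main.py --epochs 10`, with `--ntasks=2` moved
+        # into the `srun_args` that submitit passes to `srun`.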
+        # TODO: Need to remove the `srun` and move the srun params into the srun_args, so that
+        # submitit can do it with a CommandFunction that uses the right conda env!
+        # job_command = "python main.py"
+        # raise NotImplementedError(job_command)
+        srun_part, python_cmd, python_args = job_command_in_example.partition("python")
+        srun_args_str = srun_part.removeprefix("srun")
+        # Split the srun options into individual arguments (this also drops the surrounding
+        # whitespace left over from the partition above).
+        srun_args.extend(shlex.split(srun_args_str))
+        job_command = python_cmd + python_args
+        return job_command, srun_args
+
+    warnings.warn(
+        RuntimeWarning(
+            f"Don't yet know how to adapt the {job_command_in_example=!r} to work "
+            f"with submitit. Will try to run it as-is."
+        )
+    )
+    return job_command_in_example, srun_args
+
+
+def change_conda_env_used_in_job_script(
+    job_script_content: str, conda_env_path: Path, conda_env_name_in_script: str
+) -> str:
+    """Modify some lines of the source job script before it is run in the unit test."""
+
+    job_script_content = (
+        job_script_content.replace(
+            f"ENV_PATH=${{ENV_PATH:-{conda_env_name_in_script}}}",
+            f"ENV_PATH={conda_env_name_in_script}",
+        )
+        .replace(f"conda activate {conda_env_name_in_script}", f"conda activate {conda_env_path}")
+        .replace(f"-n {conda_env_name_in_script}", f"--prefix {conda_env_path}")
+        .replace(f"--name {conda_env_name_in_script}", f"--prefix {conda_env_path}")
+        .replace(f"-p {conda_env_name_in_script}", f"--prefix {conda_env_path}")
+        .replace(f"--prefix {conda_env_name_in_script}", f"--prefix {conda_env_path}")
+    )
+    return "\n".join(
+        line if "ENV_PATH=" not in line else f"ENV_PATH={conda_env_path}"
+        for line in job_script_content.splitlines()
+    )
+
+
+def make_reproducible_version_of_example(example_dir: Path, test_example_dir: Path) -> None:
+    """Create a reproducible version of the examples by inserting some code blocks in the files.
+
+    This modifies the job.sh and main.py scripts in the test example directory.
+    """
+    directory_with_modified_files_for_test = test_example_dir
+    assert directory_with_modified_files_for_test.is_dir()
+
+    # Read the example's scripts and write ~100% reproducible copies to the test directory:
+    python_script = example_dir / "main.py"
+    job_script = example_dir / "job.sh"
+
+    modified_job_script = directory_with_modified_files_for_test / job_script.name
+    modified_python_script = directory_with_modified_files_for_test / python_script.name
+
+    python_script_content = python_script.read_text()
+    python_script_content = make_logging_use_wider_console(python_script_content)
+    python_script_lines = python_script_content.splitlines(keepends=False)
+    # TODO: Where do we add the block? Before the def main()? Inside main?
+
+    insertion_index = python_script_lines.index("def main():") - 1
+    python_script_lines = (
+        python_script_lines[:insertion_index]
+        + [""]
+        + REPRODUCIBLE_BLOCK_PYTHON.splitlines()
+        + [""]
+        + python_script_lines[insertion_index:]
+    )
+    job_script_lines = job_script.read_text().splitlines(keepends=False)
+    insertion_index = -2
+    # Somewhere before the end of the script (assuming the last line has the main command).
+    job_script_lines = (
+        job_script_lines[:insertion_index]
+        + REPRODUCIBLE_BLOCK_BATCH_SCRIPT.splitlines()
+        + job_script_lines[insertion_index:]
+    )
+
+    modified_python_script.write_text("\n".join(python_script_lines))
+    modified_job_script.write_text("\n".join(job_script_lines))
+
+
+def filter_job_output_before_regression_check(
+    job_output: str,
+    prefix_of_lines_to_remove: str | tuple[str, ...] = ("Date:", "Hostname:", "INFO:__main__:"),
= ("Date:", "Hostname:", "INFO:__main__:"), + regex_substitutions: dict[str, str] = { + "/Tmp/slurm.[0-9]+.0/": "$SLURM_TMPDIR/", + r"\[\d{2}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}\]": "[(date) (time)]", + r"network/scratch/[a-z]{1}/[a-z]*/": "$SCRATCH/", + r"/checkpointing_example/[\d]*/checkpoints": ( + "/checkpointing_example/$SLURM_JOB_ID/checkpoints" + ), + }, + # line_matches: Sequence[str] = ("INFO:__main__:Epoch", "accuracy:"), + line_matches: Sequence[str] = (), +) -> str: + outputs = "\n".join( + line for line in job_output.splitlines() if not line.startswith(prefix_of_lines_to_remove) + ) + for regex, replacement in regex_substitutions.items(): + outputs = re.sub(regex, replacement, outputs) + + return "\n".join( + line + for line in outputs.splitlines() + if not any(pattern in line for pattern in line_matches) + ) + + +def get_params_from_job_script(job_script: Path) -> dict[str, Any]: + lines = job_script.read_text().splitlines() + sbatch_lines = [ + line.strip().removeprefix("#SBATCH").split("#", 1)[0].strip() + for line in lines + if line.strip().startswith("#SBATCH") + ] + params: dict[str, Any] = {} + for sbatch_arg_string in sbatch_lines: + value: Any + if "=" not in sbatch_arg_string: + flag = sbatch_arg_string + value = True + else: + flag, _, value = sbatch_arg_string.partition("=") + value = value.strip() + if value.isnumeric(): + value = int(value) + new_key = flag.strip().lstrip("-").replace("-", "_") + params[new_key] = value + for key in ["signal", "requeue"]: + if key in params: + params.setdefault("additional_parameters", {})[key] = params.pop(key) + return params + + +K = TypeVar("K") +V = TypeVar("V") + + +def _recursive_dict_union(*dicts: dict[K, V]) -> dict[K, V]: + """Recursively merge two dictionaries.""" + result: dict[K, V] = {} + for key in set(dicts[0]).union(*dicts[1:]): + values = [d[key] for d in dicts if key in d] + if any(isinstance(value, dict) for value in values): + result[key] = _recursive_dict_union( + *[value for value in values if isinstance(value, dict)] + ) + else: + result[key] = values[-1] + return result