diff --git a/docs/examples/distributed/multi_gpu/README.rst b/docs/examples/distributed/multi_gpu/README.rst index a7b486b4..408d3845 100644 --- a/docs/examples/distributed/multi_gpu/README.rst +++ b/docs/examples/distributed/multi_gpu/README.rst @@ -28,7 +28,7 @@ Click here to see `the code for this example #SBATCH --gpus-per-task=rtx8000:1 #SBATCH --cpus-per-task=4 -#SBATCH --ntasks-per-node=1 - +#SBATCH --ntasks-per-node=4 + +#SBATCH --ntasks-per-node=2 #SBATCH --mem=16G #SBATCH --time=00:15:00 @@ -45,13 +45,9 @@ Click here to see `the code for this example module load anaconda/3 module load cuda/11.7 - # Creating the environment for the first time: - # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ - # pytorch-cuda=11.7 -c pytorch -c nvidia - # Other conda packages: - # conda install -y -n pytorch -c conda-forge rich tqdm - + + # Activate pre-existing environment. + # NOTE: Use the `make_env.sh` script to create the environment if you haven't already. conda activate pytorch diff --git a/docs/examples/distributed/multi_gpu/job.sh b/docs/examples/distributed/multi_gpu/job.sh old mode 100644 new mode 100755 index 949ce544..d8d305d3 --- a/docs/examples/distributed/multi_gpu/job.sh +++ b/docs/examples/distributed/multi_gpu/job.sh @@ -1,7 +1,7 @@ #!/bin/bash #SBATCH --gpus-per-task=rtx8000:1 #SBATCH --cpus-per-task=4 -#SBATCH --ntasks-per-node=4 +#SBATCH --ntasks-per-node=2 #SBATCH --mem=16G #SBATCH --time=00:15:00 @@ -18,13 +18,9 @@ module --quiet purge module load anaconda/3 module load cuda/11.7 -# Creating the environment for the first time: -# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ -# pytorch-cuda=11.7 -c pytorch -c nvidia -# Other conda packages: -# conda install -y -n pytorch -c conda-forge rich tqdm # Activate pre-existing environment. +# NOTE: Use the `make_env.sh` script to create the environment if you haven't already. conda activate pytorch diff --git a/docs/examples/distributed/multi_node/README.rst b/docs/examples/distributed/multi_node/README.rst index cdbc7aeb..04dbe508 100644 --- a/docs/examples/distributed/multi_node/README.rst +++ b/docs/examples/distributed/multi_node/README.rst @@ -29,7 +29,7 @@ Click here to see `the source code for this example #!/bin/bash #SBATCH --gpus-per-task=rtx8000:1 #SBATCH --cpus-per-task=4 - #SBATCH --ntasks-per-node=4 + #SBATCH --ntasks-per-node=2 +#SBATCH --nodes=2 #SBATCH --mem=16G #SBATCH --time=00:15:00 @@ -47,13 +47,9 @@ Click here to see `the source code for this example module load anaconda/3 module load cuda/11.7 - # Creating the environment for the first time: - # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ - # pytorch-cuda=11.7 -c pytorch -c nvidia - # Other conda packages: - # conda install -y -n pytorch -c conda-forge rich tqdm - + - # Activate pre-existing environment. + # NOTE: Use the `make_env.sh` script to create the environment if you haven't already. 
conda activate pytorch - diff --git a/docs/examples/distributed/multi_node/job.sh b/docs/examples/distributed/multi_node/job.sh old mode 100644 new mode 100755 index d1b8a3ce..b43afc9e --- a/docs/examples/distributed/multi_node/job.sh +++ b/docs/examples/distributed/multi_node/job.sh @@ -1,7 +1,7 @@ #!/bin/bash #SBATCH --gpus-per-task=rtx8000:1 #SBATCH --cpus-per-task=4 -#SBATCH --ntasks-per-node=4 +#SBATCH --ntasks-per-node=2 #SBATCH --nodes=2 #SBATCH --mem=16G #SBATCH --time=00:15:00 @@ -19,13 +19,8 @@ module --quiet purge module load anaconda/3 module load cuda/11.7 -# Creating the environment for the first time: -# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ -# pytorch-cuda=11.7 -c pytorch -c nvidia -# Other conda packages: -# conda install -y -n pytorch -c conda-forge rich tqdm - # Activate pre-existing environment. +# NOTE: Use the `make_env.sh` script to create the environment if you haven't already. conda activate pytorch # Stage dataset into $SLURM_TMPDIR (only on the first worker of each node) diff --git a/docs/examples/distributed/single_gpu/README.rst b/docs/examples/distributed/single_gpu/README.rst index e10da925..76ca4bfb 100644 --- a/docs/examples/distributed/single_gpu/README.rst +++ b/docs/examples/distributed/single_gpu/README.rst @@ -42,13 +42,8 @@ repository. module load anaconda/3 module load cuda/11.7 - # Creating the environment for the first time: - # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ - # pytorch-cuda=11.7 -c pytorch -c nvidia - # Other conda packages: - # conda install -y -n pytorch -c conda-forge rich tqdm - # Activate pre-existing environment. + # NOTE: Use the `make_env.sh` script to create the environment if you haven't already. conda activate pytorch diff --git a/docs/examples/distributed/single_gpu/job.sh b/docs/examples/distributed/single_gpu/job.sh old mode 100644 new mode 100755 index 6f542f39..05c7a59d --- a/docs/examples/distributed/single_gpu/job.sh +++ b/docs/examples/distributed/single_gpu/job.sh @@ -18,13 +18,8 @@ module --quiet purge module load anaconda/3 module load cuda/11.7 -# Creating the environment for the first time: -# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ -# pytorch-cuda=11.7 -c pytorch -c nvidia -# Other conda packages: -# conda install -y -n pytorch -c conda-forge rich tqdm - # Activate pre-existing environment. +# NOTE: Use the `make_env.sh` script to create the environment if you haven't already. conda activate pytorch diff --git a/docs/examples/frameworks/pytorch_setup/README.rst b/docs/examples/frameworks/pytorch_setup/README.rst index 99c2220c..20040708 100644 --- a/docs/examples/frameworks/pytorch_setup/README.rst +++ b/docs/examples/frameworks/pytorch_setup/README.rst @@ -41,13 +41,8 @@ repository. # See https://docs.mila.quebec/Userguide.html#conda for more information. module load anaconda/3 - # Creating the environment for the first time: - # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ - # pytorch-cuda=11.6 -c pytorch -c nvidia - # Other conda packages: - # conda install -y -n pytorch -c conda-forge rich - # Activate the environment: + # NOTE: Use the `make_env.sh` script to create the environment if you haven't already. conda activate pytorch # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 @@ -92,28 +87,38 @@ Note that we are requesting a GPU for this job, even though we're only going to install packages. 
This is because we want PyTorch to be installed with GPU support, and to have all the required libraries. -.. code-block:: bash +.. code:: bash + + #!/bin/bash + #SBATCH --gres=gpu:1 + #SBATCH --cpus-per-task=1 + #SBATCH --mem=16G + #SBATCH --time=00:30:00 + + # NOTE: Run this either with `sbatch make_env.sh` or within an interactive job with `salloc`: + # salloc --gres=gpu:1 --cpus-per-task=1 --mem=16G --time=00:30:00 + + # Exit on error + set -e + + module --quiet purge + module load anaconda/3 + module load cuda/11.7 + + ENV_NAME=pytorch + + ## Create the environment (see the example): + conda create --yes --name $ENV_NAME python=3.9 pytorch torchvision torchaudio pytorch-cuda=11.7 --channel pytorch --channel nvidia + # Install as many packages as possible with Conda: + conda install --yes --name $ENV_NAME tqdm --channel conda-forge + # Activate the environment: + conda activate $ENV_NAME + # Install the rest of the packages with pip: + pip install rich + conda env export --no-builds --from-history --file environment.yaml + - $ salloc --gres=gpu:1 --cpus-per-task=4 --mem=16G --time=00:30:00 - salloc: -------------------------------------------------------------------------------------------------- - salloc: # Using default long partition - salloc: -------------------------------------------------------------------------------------------------- - salloc: Pending job allocation 2959785 - salloc: job 2959785 queued and waiting for resources - salloc: job 2959785 has been allocated resources - salloc: Granted job allocation 2959785 - salloc: Waiting for resource configuration - salloc: Nodes cn-g022 are ready for job - $ # Create the environment (see the example): - $ conda create -n pytorch python=3.9 pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia - (...) - $ # Press 'y' to accept if everything looks good. - (...) - $ # Activate the environment: - $ conda activate pytorch - -Exit the interactive job once the environment has been created. Then, the -example can be launched to confirm that everything works: +Once the environment has been created, the example can be launched to confirm that everything works: .. code-block:: bash diff --git a/docs/examples/frameworks/pytorch_setup/index.rst b/docs/examples/frameworks/pytorch_setup/index.rst index 9871d763..3cf0d2cc 100644 --- a/docs/examples/frameworks/pytorch_setup/index.rst +++ b/docs/examples/frameworks/pytorch_setup/index.rst @@ -38,28 +38,11 @@ Note that we are requesting a GPU for this job, even though we're only going to install packages. This is because we want PyTorch to be installed with GPU support, and to have all the required libraries. -.. code-block:: bash +.. literalinclude:: examples/frameworks/pytorch_setup/make_env.sh + :language: bash + - $ salloc --gres=gpu:1 --cpus-per-task=4 --mem=16G --time=00:30:00 - salloc: -------------------------------------------------------------------------------------------------- - salloc: # Using default long partition - salloc: -------------------------------------------------------------------------------------------------- - salloc: Pending job allocation 2959785 - salloc: job 2959785 queued and waiting for resources - salloc: job 2959785 has been allocated resources - salloc: Granted job allocation 2959785 - salloc: Waiting for resource configuration - salloc: Nodes cn-g022 are ready for job - $ # Create the environment (see the example): - $ conda create -n pytorch python=3.9 pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia - (...) 
- $ # Press 'y' to accept if everything looks good. - (...) - $ # Activate the environment: - $ conda activate pytorch - -Exit the interactive job once the environment has been created. Then, the -example can be launched to confirm that everything works: +Once the environment has been created, the example can be launched to confirm that everything works: .. code-block:: bash diff --git a/docs/examples/frameworks/pytorch_setup/job.sh b/docs/examples/frameworks/pytorch_setup/job.sh old mode 100644 new mode 100755 index 7c5acec7..52c72ef0 --- a/docs/examples/frameworks/pytorch_setup/job.sh +++ b/docs/examples/frameworks/pytorch_setup/job.sh @@ -14,13 +14,8 @@ module purge # See https://docs.mila.quebec/Userguide.html#conda for more information. module load anaconda/3 -# Creating the environment for the first time: -# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ -# pytorch-cuda=11.6 -c pytorch -c nvidia -# Other conda packages: -# conda install -y -n pytorch -c conda-forge rich - # Activate the environment: +# NOTE: Use the `make_env.sh` script to create the environment if you haven't already. conda activate pytorch # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 diff --git a/docs/examples/frameworks/pytorch_setup/make_env.sh b/docs/examples/frameworks/pytorch_setup/make_env.sh new file mode 100755 index 00000000..fd77f1b6 --- /dev/null +++ b/docs/examples/frameworks/pytorch_setup/make_env.sh @@ -0,0 +1,27 @@ +#!/bin/bash +#SBATCH --gres=gpu:1 +#SBATCH --cpus-per-task=1 +#SBATCH --mem=16G +#SBATCH --time=00:30:00 + +# NOTE: Run this either with `sbatch make_env.sh` or within an interactive job with `salloc`: +# salloc --gres=gpu:1 --cpus-per-task=1 --mem=16G --time=00:30:00 + +# Exit on error +set -e + +module --quiet purge +module load anaconda/3 +module load cuda/11.7 + +ENV_NAME=pytorch + +## Create the environment (see the example): +conda create --yes --name $ENV_NAME python=3.9 pytorch torchvision torchaudio pytorch-cuda=11.7 --channel pytorch --channel nvidia +# Install as many packages as possible with Conda: +conda install --yes --name $ENV_NAME tqdm --channel conda-forge +# Activate the environment: +conda activate $ENV_NAME +# Install the rest of the packages with pip: +pip install rich +conda env export --no-builds --from-history --file environment.yaml diff --git a/docs/examples/good_practices/checkpointing/README.rst b/docs/examples/good_practices/checkpointing/README.rst index 5a2ecfc2..0ef1c3be 100644 --- a/docs/examples/good_practices/checkpointing/README.rst +++ b/docs/examples/good_practices/checkpointing/README.rst @@ -23,9 +23,7 @@ repository. .. code:: diff - # distributed/single_gpu/job.sh -> good_practices/checkpointing/job.sh - old mode 100644 - new mode 100755 + # distributed/001_single_gpu/job.sh -> good_practices/checkpointing/job.sh #!/bin/bash -#SBATCH --gpus-per-task=rtx8000:1 +#SBATCH --gpus-per-task=1 @@ -55,14 +53,8 @@ repository. module load cuda/11.7 + - # Creating the environment for the first time: - # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ - -# pytorch-cuda=11.7 -c pytorch -c nvidia - +# pytorch-cuda=11.7 scipy -c pytorch -c nvidia - # Other conda packages: - # conda install -y -n pytorch -c conda-forge rich tqdm - # Activate pre-existing environment. + # NOTE: Use the `make_env.sh` script to create the environment if you haven't already. 
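A minimal sketch of that workflow, assuming the `make_env.sh` script and the `pytorch` environment name introduced earlier in this patch (the `conda run` check mirrors the one used by the test suite further below):

.. code-block:: bash

   # Create the environment once, either as a batch job or inside `salloc`:
   sbatch docs/examples/frameworks/pytorch_setup/make_env.sh
   # After the job completes, sanity-check that the environment is importable:
   module load anaconda/3
   conda run -n pytorch python -c "import torch, rich, tqdm; print(torch.__version__)"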
conda activate pytorch diff --git a/docs/examples/good_practices/checkpointing/job.sh b/docs/examples/good_practices/checkpointing/job.sh index 3ccebf4b..471e6814 100755 --- a/docs/examples/good_practices/checkpointing/job.sh +++ b/docs/examples/good_practices/checkpointing/job.sh @@ -25,13 +25,8 @@ module load anaconda/3 module load cuda/11.7 -# Creating the environment for the first time: -# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ -# pytorch-cuda=11.7 scipy -c pytorch -c nvidia -# Other conda packages: -# conda install -y -n pytorch -c conda-forge rich tqdm - # Activate pre-existing environment. +# NOTE: Use the `make_env.sh` script to create the environment if you haven't already. conda activate pytorch diff --git a/docs/examples/good_practices/hpo_with_orion/README.rst b/docs/examples/good_practices/hpo_with_orion/README.rst index 3dfa78c5..bab082c2 100644 --- a/docs/examples/good_practices/hpo_with_orion/README.rst +++ b/docs/examples/good_practices/hpo_with_orion/README.rst @@ -36,6 +36,8 @@ The full source code for this example is available on `the mila-docs GitHub repo #SBATCH --mem=16G #SBATCH --time=00:15:00 + +# Exit on error + +set -e # Echo time and hostname into log echo "Date: $(date)" @@ -49,21 +51,19 @@ The full source code for this example is available on `the mila-docs GitHub repo module load anaconda/3 module load cuda/11.7 - # Creating the environment for the first time: - # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ - # pytorch-cuda=11.7 -c pytorch -c nvidia - # Other conda packages: - # conda install -y -n pytorch -c conda-forge rich tqdm - +# Orion package: - +# pip install orion - # Activate pre-existing environment. - conda activate pytorch + # NOTE: Use the `make_env.sh` script to create the environment if you haven't already. + -conda activate pytorch + +ENV_PATH="$SCRATCH/conda/pytorch_orion" + +conda activate $ENV_PATH + +# Install the Orion package: + +# pip install orion # Stage dataset into $SLURM_TMPDIR mkdir -p $SLURM_TMPDIR/data - cp /network/datasets/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/ + -cp /network/datasets/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/ + +cp --update /network/datasets/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/ # General-purpose alternatives combining copy and unpack: # unzip /network/datasets/some/file.zip -d $SLURM_TMPDIR/data/ # tar -xf /network/datasets/some/file.tar -C $SLURM_TMPDIR/data/ @@ -86,7 +86,7 @@ The full source code for this example is available on `the mila-docs GitHub repo +# Then you can specify a search space for each `main.py`'s script parameter +# you want to optimize. Here we optimize only the learning rate. 
+ - +orion hunt -n orion-example --exp-max-trials 10 python main.py --learning-rate~'loguniform(1e-5, 1.0)' + +orion --verbose hunt -n orion-example --exp-max-trials 10 python main.py --learning-rate~'loguniform(1e-5, 1.0)' **main.py** diff --git a/docs/examples/good_practices/hpo_with_orion/job.sh b/docs/examples/good_practices/hpo_with_orion/job.sh old mode 100644 new mode 100755 index 9f8155af..693f2bfd --- a/docs/examples/good_practices/hpo_with_orion/job.sh +++ b/docs/examples/good_practices/hpo_with_orion/job.sh @@ -5,6 +5,8 @@ #SBATCH --mem=16G #SBATCH --time=00:15:00 +# Exit on error +set -e # Echo time and hostname into log echo "Date: $(date)" @@ -18,21 +20,17 @@ module --quiet purge module load anaconda/3 module load cuda/11.7 -# Creating the environment for the first time: -# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ -# pytorch-cuda=11.7 -c pytorch -c nvidia -# Other conda packages: -# conda install -y -n pytorch -c conda-forge rich tqdm -# Orion package: -# pip install orion - # Activate pre-existing environment. -conda activate pytorch +# NOTE: Use the `make_env.sh` script to create the environment if you haven't already. +ENV_PATH="$SCRATCH/conda/pytorch_orion" +conda activate $ENV_PATH +# Install the Orion package: +# pip install orion # Stage dataset into $SLURM_TMPDIR mkdir -p $SLURM_TMPDIR/data -cp /network/datasets/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/ +cp --update /network/datasets/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/ # General-purpose alternatives combining copy and unpack: # unzip /network/datasets/some/file.zip -d $SLURM_TMPDIR/data/ # tar -xf /network/datasets/some/file.tar -C $SLURM_TMPDIR/data/ @@ -53,4 +51,4 @@ unset CUDA_VISIBLE_DEVICES # Then you can specify a search space for each `main.py`'s script parameter # you want to optimize. Here we optimize only the learning rate. 
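# (Illustrative addition, not part of this example.) Orion accepts other priors in the
# same command-line form; assuming `main.py` exposed such flags, extra dimensions could
# be added to the search space, e.g. (check the Orion docs for the exact prior syntax):
#   --batch-size~'uniform(32, 256, discrete=True)'
#   --weight-decay~'loguniform(1e-6, 1e-2)'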
-orion hunt -n orion-example --exp-max-trials 10 python main.py --learning-rate~'loguniform(1e-5, 1.0)' +orion --verbose hunt -n orion-example --exp-max-trials 10 python main.py --learning-rate~'loguniform(1e-5, 1.0)' diff --git a/tests/.gitignore b/tests/.gitignore new file mode 100644 index 00000000..668bc7a3 --- /dev/null +++ b/tests/.gitignore @@ -0,0 +1 @@ +.submitit \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/environment.yaml b/tests/environment.yaml new file mode 100644 index 00000000..a3fc46d5 --- /dev/null +++ b/tests/environment.yaml @@ -0,0 +1,130 @@ +# +# Generated 2023-01-24 +# +# conda create -n py38torch113 python=3.8 pytorch torchvision torchaudio \ +# pytorch-cuda=11.6 -c pytorch -c nvidia +# +name: py38torch113 +channels: + - pytorch + - nvidia + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - blas=1.0=mkl + - brotlipy=0.7.0=py38h27cfd23_1003 + - bzip2=1.0.8=h7b6447c_0 + - ca-certificates=2023.01.10=h06a4308_0 + - certifi=2022.12.7=py38h06a4308_0 + - cffi=1.15.1=py38h5eee18b_3 + - charset-normalizer=2.0.4=pyhd3eb1b0_0 + - cryptography=38.0.4=py38h9ce1e76_0 + - cuda=11.6.1=0 + - cuda-cccl=11.6.55=hf6102b2_0 + - cuda-command-line-tools=11.6.2=0 + - cuda-compiler=11.6.2=0 + - cuda-cudart=11.6.55=he381448_0 + - cuda-cudart-dev=11.6.55=h42ad0f4_0 + - cuda-cuobjdump=11.6.124=h2eeebcb_0 + - cuda-cupti=11.6.124=h86345e5_0 + - cuda-cuxxfilt=11.6.124=hecbf4f6_0 + - cuda-driver-dev=11.6.55=0 + - cuda-gdb=12.0.90=0 + - cuda-libraries=11.6.1=0 + - cuda-libraries-dev=11.6.1=0 + - cuda-memcheck=11.8.86=0 + - cuda-nsight=12.0.78=0 + - cuda-nsight-compute=12.0.0=0 + - cuda-nvcc=11.6.124=hbba6d2d_0 + - cuda-nvdisasm=12.0.76=0 + - cuda-nvml-dev=11.6.55=haa9ef22_0 + - cuda-nvprof=12.0.90=0 + - cuda-nvprune=11.6.124=he22ec0a_0 + - cuda-nvrtc=11.6.124=h020bade_0 + - cuda-nvrtc-dev=11.6.124=h249d397_0 + - cuda-nvtx=11.6.124=h0630a44_0 + - cuda-nvvp=12.0.90=0 + - cuda-runtime=11.6.1=0 + - cuda-samples=11.6.101=h8efea70_0 + - cuda-sanitizer-api=12.0.90=0 + - cuda-toolkit=11.6.1=0 + - cuda-tools=11.6.1=0 + - cuda-visual-tools=11.6.1=0 + - ffmpeg=4.3=hf484d3e_0 + - flit-core=3.6.0=pyhd3eb1b0_0 + - freetype=2.12.1=h4a9f257_0 + - gds-tools=1.5.0.59=0 + - giflib=5.2.1=h7b6447c_0 + - gmp=6.2.1=h295c915_3 + - gnutls=3.6.15=he1e5248_0 + - idna=3.4=py38h06a4308_0 + - intel-openmp=2021.4.0=h06a4308_3561 + - jpeg=9e=h7f8727e_0 + - lame=3.100=h7b6447c_0 + - lcms2=2.12=h3be6417_0 + - ld_impl_linux-64=2.38=h1181459_1 + - lerc=3.0=h295c915_0 + - libcublas=11.9.2.110=h5e84587_0 + - libcublas-dev=11.9.2.110=h5c901ab_0 + - libcufft=10.7.1.112=hf425ae0_0 + - libcufft-dev=10.7.1.112=ha5ce4c0_0 + - libcufile=1.5.0.59=0 + - libcufile-dev=1.5.0.59=0 + - libcurand=10.3.1.50=0 + - libcurand-dev=10.3.1.50=0 + - libcusolver=11.3.4.124=h33c3c4e_0 + - libcusparse=11.7.2.124=h7538f96_0 + - libcusparse-dev=11.7.2.124=hbbe9722_0 + - libdeflate=1.8=h7f8727e_5 + - libffi=3.4.2=h6a678d5_6 + - libgcc-ng=11.2.0=h1234567_1 + - libgomp=11.2.0=h1234567_1 + - libiconv=1.16=h7f8727e_2 + - libidn2=2.3.2=h7f8727e_0 + - libnpp=11.6.3.124=hd2722f0_0 + - libnpp-dev=11.6.3.124=h3c42840_0 + - libnvjpeg=11.6.2.124=hd473ad6_0 + - libnvjpeg-dev=11.6.2.124=hb5906b9_0 + - libpng=1.6.37=hbc83047_0 + - libstdcxx-ng=11.2.0=h1234567_1 + - libtasn1=4.16.0=h27cfd23_0 + - libtiff=4.5.0=hecacb30_0 + - libunistring=0.9.10=h27cfd23_0 + - libwebp=1.2.4=h11a3e52_0 + - libwebp-base=1.2.4=h5eee18b_0 + - lz4-c=1.9.4=h6a678d5_0 + 
- mkl=2021.4.0=h06a4308_640 + - mkl-service=2.4.0=py38h7f8727e_0 + - mkl_fft=1.3.1=py38hd3c417c_0 + - mkl_random=1.2.2=py38h51133e4_0 + - ncurses=6.3=h5eee18b_3 + - nettle=3.7.3=hbbd107a_1 + - nsight-compute=2022.4.0.15=0 + - numpy=1.23.5=py38h14f4228_0 + - numpy-base=1.23.5=py38h31eccc5_0 + - openh264=2.1.1=h4ff587b_0 + - openssl=1.1.1s=h7f8727e_0 + - pillow=9.3.0=py38hace64e9_1 + - pip=22.3.1=py38h06a4308_0 + - pycparser=2.21=pyhd3eb1b0_0 + - pyopenssl=22.0.0=pyhd3eb1b0_0 + - pysocks=1.7.1=py38h06a4308_0 + - python=3.8.16=h7a1cb2a_2 + - pytorch=1.13.1=py3.8_cuda11.6_cudnn8.3.2_0 + - pytorch-cuda=11.6=h867d48c_1 + - pytorch-mutex=1.0=cuda + - readline=8.2=h5eee18b_0 + - requests=2.28.1=py38h06a4308_0 + - setuptools=65.6.3=py38h06a4308_0 + - six=1.16.0=pyhd3eb1b0_1 + - sqlite=3.40.1=h5082296_0 + - tk=8.6.12=h1ccaba5_0 + - torchaudio=0.13.1=py38_cu116 + - torchvision=0.14.1=py38_cu116 + - typing_extensions=4.4.0=py38h06a4308_0 + - urllib3=1.26.14=py38h06a4308_0 + - wheel=0.37.1=pyhd3eb1b0_0 + - xz=5.2.10=h5eee18b_1 + - zlib=1.2.13=h5eee18b_0 + - zstd=1.5.2=ha4553b6_0 diff --git a/tests/test_examples.py b/tests/test_examples.py new file mode 100644 index 00000000..d652a7e7 --- /dev/null +++ b/tests/test_examples.py @@ -0,0 +1,453 @@ +"""Tests that launch the examples as jobs on the Mila cluster and check that they work correctly.""" +from __future__ import annotations + +import logging +import os +import re +import runpy +import shlex +import subprocess +import time +from logging import getLogger as get_logger +from pathlib import Path +from typing import Any + +import pytest +import rich.console +import rich.logging +import rich.traceback +from pytest_regressions.file_regression import FileRegressionFixture + +from .testutils import ( + DEFAULT_SBATCH_PARAMETER_OVERRIDES, + EXAMPLES_DIR, + SUBMITIT_DIR, + TEST_JOB_NAME, + copy_example_files_to_test_dir, + filter_job_output_before_regression_check, + run_example, + run_pytorch_example, +) + +logger = get_logger(__name__) +SCRATCH = Path(os.environ["SCRATCH"]) + + +gpu_types = [ + "1g.10gb", # MIG-ed A100 GPU + "2g.20gb", # MIG-ed A100 GPU + "3g.40gb", # MIG-ed A100 GPU + # "a100", + # "a100l", # Note: needs a reservation. + # "a6000", + "rtx8000", + pytest.param( + "v100", + marks=[ + pytest.mark.xfail(reason="Can take a while to schedule"), + pytest.mark.timeout(120), + ], + ), +] + + +@pytest.fixture(scope="session", autouse=True) +def setup_logging(): + """Setup logging (using a recipe from @JesseFarebro)""" + + LOGLEVEL = os.environ.get("LOGLEVEL", "INFO").upper() + console = rich.console.Console() + + _TRACEBACKS_EXCLUDES = [ + runpy, + "absl", + "click", + "tyro", + "simple_parsing", + "fiddle", + ] + + rich.traceback.install(console=console, suppress=_TRACEBACKS_EXCLUDES, show_locals=False) + logging.basicConfig( + level=LOGLEVEL, + format="%(message)s", + datefmt="[%X]", + force=True, + handlers=[ + rich.logging.RichHandler( + console=console, + rich_tracebacks=True, + tracebacks_show_locals=False, + tracebacks_suppress=_TRACEBACKS_EXCLUDES, + ) + ], + ) + + +def make_conda_env_for_test( + make_env_sh_file: Path, + env_name_in_script: str, + env_path: Path, +): + job_script = make_env_sh_file + example_dir = make_env_sh_file.parent + + # Copy all the python and .sh files from the example dir to the test example dir. + # (NOTE: This is so we can potentially modify the contents before running them in tests.) 
+ test_example_dir = SUBMITIT_DIR / "_".join(example_dir.relative_to(EXAMPLES_DIR).parts) + copy_example_files_to_test_dir(example_dir, test_example_dir) + + outputs = run_example( + job_script=test_example_dir / job_script.name, + conda_env_name_in_script=env_name_in_script, + conda_env=env_path, + sbatch_parameter_overrides=DEFAULT_SBATCH_PARAMETER_OVERRIDES, + ) + assert len(outputs) == 1 + output = outputs[0] + assert output.isspace(), output + return env_path + + +@pytest.fixture(scope="session") +def pytorch_conda_env() -> Path: + """A fixture that launches a job to create the PyTorch + orion conda env.""" + env_name_in_script = "pytorch" # Name in the example + env_name = "pytorch_test" # Name used in the tests + env_path = SCRATCH / "conda" / env_name + make_env_sh_file = EXAMPLES_DIR / "frameworks" / "pytorch_setup" / "make_env.sh" + command_to_test_that_env_is_working = ( + f"conda run --prefix {env_path} python -c 'import torch, tqdm, rich'" + ) + + try: + subprocess.check_call(shlex.split(command_to_test_that_env_is_working)) + except subprocess.CalledProcessError: + logger.info(f"The {env_path} env has not already been created at {env_path}.") + else: + logger.info( + f"The {env_path} env has already been created with all required packages at {env_path}." + ) + return env_path + + make_conda_env_for_test( + env_path=env_path, + make_env_sh_file=make_env_sh_file, + env_name_in_script=env_name_in_script, + ) + return env_path + + +@pytest.fixture(autouse=True, scope="session") +def scancel_jobs_after_tests(): + yield + username = os.environ["USER"] + subprocess.check_call(["scancel", "-u", username, "--name", TEST_JOB_NAME]) + + +def _test_id(arg: Path | bool | dict) -> str: + if isinstance(arg, Path): + path = arg + return str(path.relative_to(EXAMPLES_DIR)) + if isinstance(arg, bool): + return str(arg) + assert isinstance(arg, dict) + return "-".join(f"{k}={v}" for k, v in arg.items()) + + +@pytest.mark.parametrize( + ("example_dir", "make_reproducible", "sbatch_overrides"), + [ + pytest.param( + EXAMPLES_DIR / "frameworks" / "pytorch_setup", + False, + {"gres": f"gpu:{gpu_type}:1"}, + marks=( + [ + pytest.mark.xfail(reason="Can take a while to schedule"), + pytest.mark.timeout(120), + ] + if gpu_type == "v100" + else [] + ), + ) + for gpu_type in [ + "1g.10gb", # MIG-ed A100 GPU + "2g.20gb", # MIG-ed A100 GPU + "3g.40gb", # MIG-ed A100 GPU + # "a100", + # "a100l", # Note: needs a reservation. + # "a6000", + "rtx8000", + "v100", + ] + ] + + [ + (EXAMPLES_DIR / "distributed" / "001_single_gpu", True, {}), + (EXAMPLES_DIR / "distributed" / "002_multi_gpu", True, {}), + pytest.param( + EXAMPLES_DIR / "distributed" / "003_multi_node", + True, + {"partition": "long"}, + marks=[ + # pytest.mark.timeout(300), + # pytest.mark.xfail(raises=) + ], + ), + ], + ids=_test_id, +) +def test_pytorch_example( + example_dir: Path, + make_reproducible: bool, + sbatch_overrides: dict[str, Any] | None, + pytorch_conda_env: Path, + file_regression: FileRegressionFixture, +): + """Launches a pytorch-based example as a slurm job and checks that the output is as expected. + + Some of the examples are modified so their outputs are reproducible. + """ + + filtered_job_outputs = run_pytorch_example( + example_dir=example_dir, + pytorch_conda_env_location=pytorch_conda_env, + sbatch_parameter_overrides=sbatch_overrides, + examples_dir=EXAMPLES_DIR, + make_reproducible=make_reproducible, + ) + if len(filtered_job_outputs) == 1: + # Only one task. 
+ file_regression.check(filtered_job_outputs[0]) + else: + file_regression.check( + "\n".join( + [ + f"Task {i} output:\n" + task_i_output + for i, task_i_output in enumerate(filtered_job_outputs) + ] + ) + ) + + +@pytest.mark.timeout(10 * 60) +def test_checkpointing_example(pytorch_conda_env: Path, file_regression: FileRegressionFixture): + """Tests the checkpointing example. + + This test is quite nice. Here's what it does: + - Launch the job, let it run till completion. + - Launch the job again, and then do `scontrol requeue ` to force it + to be requeued once it has created a checkpoint (reached Epoch 1) + - Check that the exact same result is reached whether it is requeued or not. + """ + example_dir = EXAMPLES_DIR / "good_practices" / "checkpointing" + test_example_dir = SUBMITIT_DIR / "_".join(example_dir.relative_to(EXAMPLES_DIR).parts) + + uninterrupted_job_outputs = run_pytorch_example( + example_dir=example_dir, + pytorch_conda_env_location=pytorch_conda_env, + # Need to specify a GPU so the results are reproducible. + sbatch_parameter_overrides={"gpus_per_task": "rtx8000:1"}, + test_example_dir=test_example_dir, + examples_dir=EXAMPLES_DIR, + make_reproducible=True, + ) + assert len(uninterrupted_job_outputs) == 1 + uninterrupted_job_output = uninterrupted_job_outputs[0] + file_regression.check(uninterrupted_job_output) + + # NOTE: Reusing the exact same job.sh and main.py scripts as were used above: + job_script = test_example_dir / "job.sh" + job = run_example( + job_script, + conda_env=pytorch_conda_env, + conda_env_name_in_script="pytorch", + sbatch_parameter_overrides={"gpus_per_task": "rtx8000:1"}, + wait_for_results=False, + ) + interval_seconds = 5 + + while job.state in ["UNKNOWN", "PENDING"]: + logger.debug(f"Waiting for job {job.job_id} to start running. ({job.state=!r})") + time.sleep(interval_seconds) + assert job.state == "RUNNING" + + output_file = job.paths.stdout + while not output_file.exists() or "Train epoch 1:" not in output_file.read_text(): + output_path = output_file.relative_to(Path.cwd()) + logger.debug( + f"Waiting for job {job.job_id} to reach the second epoch of training. {output_path=}" + ) + time.sleep(interval_seconds) + + requeue_command = f"scontrol requeue {job.job_id}" + logger.info(f"Requeueing the job using {requeue_command=!r}") + subprocess.check_call(shlex.split(requeue_command)) + + # todo: double-check that there aren't other intermediate states I might miss because of the low + # time-resolution. + + while job.state == "RUNNING": + logger.debug(f"Waiting for job {job.job_id} to get requeued. ({job.state=!r})") + time.sleep(interval_seconds) + + # assert job.state == "REQUEUED" + logger.debug(f"Job {job.job_id} is being requeued.") + while job.state == "REQUEUED": + logger.debug(f"Waiting for job {job.job_id} to become pending. ({job.state=!r})") + time.sleep(interval_seconds) + + # NOTE: The state doesn't get updated back to `RUNNING` after doing REQUEUED -> PENDING! + # (Either that, or there's some sort of caching mechanism that would take too long to get + # assert job.state == "PENDING" + logger.debug(f"Job {job.job_id} is now pending.") + # invalidated.) Therefore manually trigger a "cache" update here. + while job.watcher.get_state(job.job_id, mode="force") == "PENDING": + logger.debug(f"Waiting for job {job.job_id} to start running again. 
({job.state=!r})") + time.sleep(interval_seconds) + + assert job.state in ["RUNNING", "COMPLETED"] + logger.info(f"Job {job.job_id} is now running again after having been requeued.") + # Wait for the job to finish (again): + requeued_job_output = job.result() + # Filter out lines that may change between executions: + filtered_requeued_job_output = filter_job_output_before_regression_check(requeued_job_output) + # TODO: Here it *might* be a bad idea for this requeued output to be checked using the + # file_regression fixture, because it could happen that we resume from a different epoch, + # depending on a few things: + # - how fast the output file can actually show us that the job has reached the second epoch + # - how long the job takes to actually stop and get requeued + # - how fast an epoch takes to run (if this were to become << the interval at which we check the + # output, then we might miss the second epoch) + # ALSO: not sure if it's because we're not using `exec`, but it seems like it's taking longer + # for the job to stop running once we ask it to requeue. + file_regression.check(filtered_requeued_job_output, extension="_requeued.txt") + + # todo: Compare the output of the requeued job to the output of the non-requeued job in a way + # that isn't too too hard-coded for that specific example. + # For example, we could extract the accuracies at each epoch and check that they line up. + uninterrupted_values = get_val_loss_and_accuracy_at_each_epoch(uninterrupted_job_output) + interrupted_values = get_val_loss_and_accuracy_at_each_epoch(filtered_requeued_job_output) + + resumed_epoch = min(interrupted_values.keys()) + final_epoch = max(interrupted_values.keys()) + assert set(uninterrupted_values.keys()) > set(interrupted_values.keys()) + for epoch in range(resumed_epoch, final_epoch + 1): + # Compare the values at each epoch, they should match: + assert uninterrupted_values[epoch] == interrupted_values[epoch] + + +def get_val_loss_and_accuracy_at_each_epoch( + filtered_job_output: str, +) -> dict[int, tuple[float, float]]: + # [(date) (time)] INFO Epoch 3: Val loss: 37.565 accuracy: 67.58% + # [(date) (time)] INFO Epoch 4: Val loss: 37.429 accuracy: 68.14% + # [(date) (time)] INFO Epoch 5: Val loss: 40.469 accuracy: 66.78% + # [(date) (time)] INFO Epoch 6: Val loss: 48.439 accuracy: 63.78% + # [(date) (time)] INFO Epoch 7: Val loss: 38.182 accuracy: 71.46% + # [(date) (time)] INFO Epoch 8: Val loss: 40.733 accuracy: 70.60% + # [(date) (time)] INFO Epoch 9: Val loss: 44.822 accuracy: 69.96% + val_losses_and_accuracies: dict[int, tuple[float, float]] = {} + for line in filtered_job_output.splitlines(): + match_epoch = re.search(r"Epoch (\d+):", line) + match_val_loss = re.search(r"Val loss: (\d+\.\d+)", line) + match_val_accuracy = re.search(r"accuracy: (\d+\.\d+)%", line) + if ( + match_epoch is not None + and match_val_loss is not None + and match_val_accuracy is not None + ): + epoch = int(match_epoch.group(1)) + val_loss = float(match_val_loss.group(1)) + val_accuracy = float(match_val_accuracy.group(1)) + val_losses_and_accuracies[epoch] = (val_loss, val_accuracy) + if not val_losses_and_accuracies: + raise RuntimeError( + "Unable to extract the val loss and accuracy! Perhaps the regex here are wrong?" 
+ ) + return val_losses_and_accuracies + + +@pytest.fixture(scope="session") +def pytorch_orion_conda_env() -> Path: + """A fixture that launches a job to create the PyTorch + orion conda env.""" + env_name_in_script = "pytorch_orion" # Name in the example + env_name = "pytorch_orion_test" # Name used in the tests + env_path = SCRATCH / "conda" / env_name + make_env_sh_file = EXAMPLES_DIR / "good_practices" / "hpo_with_orion" / "make_env.sh" + command_to_test_that_env_is_working = ( + f"conda run --prefix {env_path} python -c 'import torch, tqdm, rich, orion'" + ) + try: + subprocess.check_call(shlex.split(command_to_test_that_env_is_working)) + except subprocess.CalledProcessError: + logger.info(f"The {env_path} env has not already been created at {env_path}.") + else: + logger.info( + f"The {env_path} env has already been created with all required packages at {env_path}." + ) + return env_path + + make_conda_env_for_test( + env_path=env_path, + make_env_sh_file=make_env_sh_file, + env_name_in_script=env_name_in_script, + ) + return env_path + + +# TODO: Make this run faster. Times out with 10 minutes, but seems to be reaching the end though, +# which is quite strange. Perhaps we could reduce the number of trials? +@pytest.mark.timeout(20 * 60) +def test_orion_example(pytorch_orion_conda_env: Path, file_regression: FileRegressionFixture): + """Tests the "HPO with Orion" example. + + TODO: This should probably use a different conda environment, instead of adding a + `pip install orion` to the same pytorch env. + """ + example_dir = EXAMPLES_DIR / "good_practices" / "hpo_with_orion" + sbatch_overrides = None + + def modify_job_script_before_running(job_script_path: Path) -> None: + job_script_lines = job_script_path.read_text().splitlines() + # TODO: Make this use a database in $SLURM_TMPDIR or something, so each run is independent. + + last_line = job_script_lines[-1] + assert "hunt" in last_line + + example_dir = job_script_path.parent + # TODO: Create an Orion config so that we can pass the path to the database to use. + import yaml + + orion_config_path = example_dir / "orion_config.yaml" + with open(orion_config_path, "w+") as f: + yaml.dump( + { + "storage": { + "type": "legacy", + "database": { + "type": "pickleddb", + "host": str(example_dir / "database.pkl"), + }, + }, + }, + f, + ) + + last_line = last_line.replace("--exp-max-trials 10", "--exp-max-trials 3") + last_line = last_line.replace("hunt", f"hunt --config {orion_config_path}") + + job_script_lines[-1] = last_line + job_script_path.write_text("\n".join(job_script_lines)) + + filtered_job_outputs = run_pytorch_example( + example_dir=example_dir, + pytorch_conda_env_location=pytorch_orion_conda_env, + sbatch_parameter_overrides=sbatch_overrides, + make_reproducible=True, + examples_dir=EXAMPLES_DIR, + submitit_dir=SUBMITIT_DIR, + modify_job_script_before_running=modify_job_script_before_running, + ) + assert len(filtered_job_outputs) == 1 + file_regression.check(filtered_job_outputs[0]) diff --git a/tests/test_examples/test_checkpointing_example.txt b/tests/test_examples/test_checkpointing_example.txt new file mode 100644 index 00000000..1957b51c --- /dev/null +++ b/tests/test_examples/test_checkpointing_example.txt @@ -0,0 +1,15 @@ +[(date) (time)] INFO No checkpoints found in /$SCRATCH/checkpointing_example/$SLURM_JOB_ID/checkpoints. Training from scratch. 
main.py:117
+Using downloaded and verified file: $SLURM_TMPDIR/data/cifar-10-python.tar.gz
+Extracting $SLURM_TMPDIR/data/cifar-10-python.tar.gz to $SLURM_TMPDIR/data
+Files already downloaded and verified
+[(date) (time)] INFO Epoch 0: Val loss: 50.314 accuracy: 54.26% main.py:204
+[(date) (time)] INFO Epoch 1: Val loss: 46.534 accuracy: 59.04% main.py:204
+[(date) (time)] INFO Epoch 2: Val loss: 42.161 accuracy: 62.84% main.py:204
+[(date) (time)] INFO Epoch 3: Val loss: 37.565 accuracy: 67.58% main.py:204
+[(date) (time)] INFO Epoch 4: Val loss: 37.429 accuracy: 68.14% main.py:204
+[(date) (time)] INFO Epoch 5: Val loss: 40.469 accuracy: 66.78% main.py:204
+[(date) (time)] INFO Epoch 6: Val loss: 48.439 accuracy: 63.78% main.py:204
+[(date) (time)] INFO Epoch 7: Val loss: 38.182 accuracy: 71.46% main.py:204
+[(date) (time)] INFO Epoch 8: Val loss: 40.733 accuracy: 70.60% main.py:204
+[(date) (time)] INFO Epoch 9: Val loss: 44.822 accuracy: 69.96% main.py:204
+Done!
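The requeued variant of this output (the next regression file) is what the checkpointing test produces by interrupting a running job once it has saved a checkpoint. Outside the test suite, roughly the same interruption can be reproduced by hand; a sketch, assuming the example's `job.sh` and the "Train epoch 1:" log line the test uses as its resume marker:

.. code-block:: bash

   # Submit the checkpointing example and keep its job id:
   jobid=$(sbatch --parsable docs/examples/good_practices/checkpointing/job.sh)
   # Once the job's output shows "Train epoch 1:" (i.e. a checkpoint exists),
   # ask Slurm to requeue it; on restart the script resumes from the saved epoch:
   scontrol requeue "$jobid"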
\ No newline at end of file diff --git a/tests/test_examples/test_pytorch_example_distributed_001_single_gpu_True__.txt b/tests/test_examples/test_pytorch_example_distributed_001_single_gpu_True__.txt new file mode 100644 index 00000000..25b636a1 --- /dev/null +++ b/tests/test_examples/test_pytorch_example_distributed_001_single_gpu_True__.txt @@ -0,0 +1,24 @@ +Using downloaded and verified file: $SLURM_TMPDIR/data/cifar-10-python.tar.gz +Extracting $SLURM_TMPDIR/data/cifar-10-python.tar.gz to $SLURM_TMPDIR/data +Files already downloaded and verified +[(date) (time)] INFO INFO:__main__:Epoch 0: Val loss: 50.314 main.py:121 + accuracy: 54.26% +[(date) (time)] INFO INFO:__main__:Epoch 1: Val loss: 46.534 main.py:121 + accuracy: 59.04% +[(date) (time)] INFO INFO:__main__:Epoch 2: Val loss: 42.161 main.py:121 + accuracy: 62.84% +[(date) (time)] INFO INFO:__main__:Epoch 3: Val loss: 37.565 main.py:121 + accuracy: 67.58% +[(date) (time)] INFO INFO:__main__:Epoch 4: Val loss: 37.429 main.py:121 + accuracy: 68.14% +[(date) (time)] INFO INFO:__main__:Epoch 5: Val loss: 40.469 main.py:121 + accuracy: 66.78% +[(date) (time)] INFO INFO:__main__:Epoch 6: Val loss: 48.439 main.py:121 + accuracy: 63.78% +[(date) (time)] INFO INFO:__main__:Epoch 7: Val loss: 38.182 main.py:121 + accuracy: 71.46% +[(date) (time)] INFO INFO:__main__:Epoch 8: Val loss: 40.733 main.py:121 + accuracy: 70.60% +[(date) (time)] INFO INFO:__main__:Epoch 9: Val loss: 44.822 main.py:121 + accuracy: 69.96% +Done! \ No newline at end of file diff --git a/tests/test_examples/test_pytorch_example_distributed_002_multi_gpu_True__.txt b/tests/test_examples/test_pytorch_example_distributed_002_multi_gpu_True__.txt new file mode 100644 index 00000000..3d893328 --- /dev/null +++ b/tests/test_examples/test_pytorch_example_distributed_002_multi_gpu_True__.txt @@ -0,0 +1,51 @@ +Task 0 output: +PyTorch Distributed available. + Backends: + Gloo: True + NCCL: True + MPI: False +[(date) (time)] INFO [0/2] __main__ - World size: 2, global main.py:53 + rank: 0 +Using downloaded and verified file: $SLURM_TMPDIR/data/cifar-10-python.tar.gz +Extracting $SLURM_TMPDIR/data/cifar-10-python.tar.gz to $SLURM_TMPDIR/data +Files already downloaded and verified +[(date) (time)] INFO [0/2] __main__ - Effective batch size: main.py:81 + 256 +[(date) (time)] INFO [0/2] distributed.py:1140 + torch.nn.parallel.distributed - + Reducer buckets have been + rebuilt in this iteration. +[(date) (time)] INFO [0/2] __main__ - Epoch 0: Val loss: main.py:175 + 64.641 accuracy: 43.60% +[(date) (time)] INFO [0/2] __main__ - Epoch 1: Val loss: main.py:175 + 59.205 accuracy: 49.18% +[(date) (time)] INFO [0/2] __main__ - Epoch 2: Val loss: main.py:175 + 40.863 accuracy: 63.34% +[(date) (time)] INFO [0/2] __main__ - Epoch 3: Val loss: main.py:175 + 41.587 accuracy: 65.02% +[(date) (time)] INFO [0/2] __main__ - Epoch 4: Val loss: main.py:175 + 41.128 accuracy: 65.16% +[(date) (time)] INFO [0/2] __main__ - Epoch 5: Val loss: main.py:175 + 40.960 accuracy: 66.24% +[(date) (time)] INFO [0/2] __main__ - Epoch 6: Val loss: main.py:175 + 45.061 accuracy: 65.86% +[(date) (time)] INFO [0/2] __main__ - Epoch 7: Val loss: main.py:175 + 59.227 accuracy: 60.28% +[(date) (time)] INFO [0/2] __main__ - Epoch 8: Val loss: main.py:175 + 50.601 accuracy: 66.42% +[(date) (time)] INFO [0/2] __main__ - Epoch 9: Val loss: main.py:175 + 53.997 accuracy: 66.60% +Done! +Task 1 output: +PyTorch Distributed available. 
+ Backends: + Gloo: True + NCCL: True + MPI: False +[(date) (time)] INFO [1/2] __main__ - World size: 2, global main.py:53 + rank: 1 +[(date) (time)] INFO [1/2] distributed.py:1140 + torch.nn.parallel.distributed - + Reducer buckets have been + rebuilt in this iteration. +Done! \ No newline at end of file diff --git a/tests/test_examples/test_pytorch_example_distributed_003_multi_node_True_partition_long_.txt b/tests/test_examples/test_pytorch_example_distributed_003_multi_node_True_partition_long_.txt new file mode 100644 index 00000000..74cea4e6 --- /dev/null +++ b/tests/test_examples/test_pytorch_example_distributed_003_multi_node_True_partition_long_.txt @@ -0,0 +1,80 @@ +Task 0 output: +PyTorch Distributed available. + Backends: + Gloo: True + NCCL: True + MPI: False +[(date) (time)] INFO [0/4] __main__ - World size: 4, global main.py:55 + rank: 0, local rank: 0 +Using downloaded and verified file: $SLURM_TMPDIR/data/cifar-10-python.tar.gz +Extracting $SLURM_TMPDIR/data/cifar-10-python.tar.gz to $SLURM_TMPDIR/data +Files already downloaded and verified +[(date) (time)] INFO [0/4] __main__ - Effective batch size: main.py:86 + 512 +[(date) (time)] INFO [0/4] distributed.py:1140 + torch.nn.parallel.distributed - + Reducer buckets have been + rebuilt in this iteration. +[(date) (time)] INFO [0/4] __main__ - Epoch 0: Val loss: main.py:180 + 55.032 accuracy: 50.08% +[(date) (time)] INFO [0/4] __main__ - Epoch 1: Val loss: main.py:180 + 48.975 accuracy: 56.76% +[(date) (time)] INFO [0/4] __main__ - Epoch 2: Val loss: main.py:180 + 53.192 accuracy: 55.32% +[(date) (time)] INFO [0/4] __main__ - Epoch 3: Val loss: main.py:180 + 47.434 accuracy: 59.68% +[(date) (time)] INFO [0/4] __main__ - Epoch 4: Val loss: main.py:180 + 44.753 accuracy: 63.28% +[(date) (time)] INFO [0/4] __main__ - Epoch 5: Val loss: main.py:180 + 56.168 accuracy: 59.26% +[(date) (time)] INFO [0/4] __main__ - Epoch 6: Val loss: main.py:180 + 54.097 accuracy: 63.38% +[(date) (time)] INFO [0/4] __main__ - Epoch 7: Val loss: main.py:180 + 54.764 accuracy: 63.02% +[(date) (time)] INFO [0/4] __main__ - Epoch 8: Val loss: main.py:180 + 64.655 accuracy: 61.20% +[(date) (time)] INFO [0/4] __main__ - Epoch 9: Val loss: main.py:180 + 61.904 accuracy: 63.20% +Done! +Task 1 output: +PyTorch Distributed available. + Backends: + Gloo: True + NCCL: True + MPI: False +[(date) (time)] INFO [1/4] __main__ - World size: 4, global main.py:55 + rank: 1, local rank: 1 +[(date) (time)] INFO [1/4] distributed.py:1140 + torch.nn.parallel.distributed - + Reducer buckets have been + rebuilt in this iteration. +Done! +Task 2 output: +PyTorch Distributed available. + Backends: + Gloo: True + NCCL: True + MPI: False +[(date) (time)] INFO [2/4] __main__ - World size: 4, global main.py:55 + rank: 2, local rank: 0 +Using downloaded and verified file: $SLURM_TMPDIR/data/cifar-10-python.tar.gz +Extracting $SLURM_TMPDIR/data/cifar-10-python.tar.gz to $SLURM_TMPDIR/data +Files already downloaded and verified +[(date) (time)] INFO [2/4] distributed.py:1140 + torch.nn.parallel.distributed - + Reducer buckets have been + rebuilt in this iteration. +Done! +Task 3 output: +PyTorch Distributed available. + Backends: + Gloo: True + NCCL: True + MPI: False +[(date) (time)] INFO [3/4] __main__ - World size: 4, global main.py:55 + rank: 3, local rank: 1 +[(date) (time)] INFO [3/4] distributed.py:1140 + torch.nn.parallel.distributed - + Reducer buckets have been + rebuilt in this iteration. +Done! 
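In the multi-node output above, the world size of 4 follows directly from the resources requested in that example's `job.sh`: 2 nodes x 2 tasks per node, with one GPU per task. As a quick illustration (assuming the job was submitted with those `--nodes`/`--ntasks-per-node` values, so both variables are set), the same number can be recovered inside the allocation from Slurm's environment:

.. code-block:: bash

   # 2 nodes x 2 tasks per node = 4 tasks, i.e. global ranks 0..3 in the log above.
   echo $(( SLURM_JOB_NUM_NODES * SLURM_NTASKS_PER_NODE ))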
\ No newline at end of file diff --git a/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_1g_10gb_1_.txt b/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_1g_10gb_1_.txt new file mode 100644 index 00000000..4103bf37 --- /dev/null +++ b/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_1g_10gb_1_.txt @@ -0,0 +1,4 @@ +PyTorch built with CUDA: True +PyTorch detects CUDA available: True +PyTorch-detected #GPUs: 1 + GPU 0: NVIDIA A100-SXM4-80GB MIG 1g.10gb \ No newline at end of file diff --git a/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_2g_20gb_1_.txt b/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_2g_20gb_1_.txt new file mode 100644 index 00000000..7a4327f7 --- /dev/null +++ b/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_2g_20gb_1_.txt @@ -0,0 +1,4 @@ +PyTorch built with CUDA: True +PyTorch detects CUDA available: True +PyTorch-detected #GPUs: 1 + GPU 0: NVIDIA A100-SXM4-80GB MIG 2g.20gb \ No newline at end of file diff --git a/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_3g_40gb_1_.txt b/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_3g_40gb_1_.txt new file mode 100644 index 00000000..6e017995 --- /dev/null +++ b/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_3g_40gb_1_.txt @@ -0,0 +1,4 @@ +PyTorch built with CUDA: True +PyTorch detects CUDA available: True +PyTorch-detected #GPUs: 1 + GPU 0: NVIDIA A100-SXM4-80GB MIG 3g.40gb \ No newline at end of file diff --git a/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_rtx8000_1_.txt b/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_rtx8000_1_.txt new file mode 100644 index 00000000..41daacfe --- /dev/null +++ b/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_rtx8000_1_.txt @@ -0,0 +1,4 @@ +PyTorch built with CUDA: True +PyTorch detects CUDA available: True +PyTorch-detected #GPUs: 1 + GPU 0: Quadro RTX 8000 \ No newline at end of file diff --git a/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_v100_1_.txt b/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_v100_1_.txt new file mode 100644 index 00000000..f0b99729 --- /dev/null +++ b/tests/test_examples/test_pytorch_example_frameworks_pytorch_setup_False_gres_gpu_v100_1_.txt @@ -0,0 +1,4 @@ +PyTorch built with CUDA: True +PyTorch detects CUDA available: True +PyTorch-detected #GPUs: 1 + GPU 0: Tesla V100-SXM2-32GB-LS \ No newline at end of file diff --git a/tests/testutils.py b/tests/testutils.py new file mode 100644 index 00000000..2f48cb13 --- /dev/null +++ b/tests/testutils.py @@ -0,0 +1,421 @@ +""" Idea: Use `submitit` to test that the setup works for this repo on the current cluster. 
+""" +from __future__ import annotations + +import itertools +import json +import re +import shlex +import shutil +import warnings +from logging import getLogger as get_logger +from pathlib import Path +from typing import Any, Callable, Literal, Sequence, TypeVar, overload +import submitit + +logger = get_logger(__name__) + +TEST_JOB_NAME = "example_tests" +ROOT_DIR = Path(__file__).parent.parent +EXAMPLES_DIR = ROOT_DIR / "docs" / "examples" +TESTS_DIR = Path(__file__).parent +SUBMITIT_DIR = TESTS_DIR / ".submitit" + +DEFAULT_SBATCH_PARAMETER_OVERRIDES = dict( + partition="main", + job_name=TEST_JOB_NAME, + stderr_to_stdout=True, +) + + +REPRODUCIBLE_BLOCK_PYTHON = """\ +### NOTE: This block is added to make the example reproducible during unit tests +import random +import numpy + +seed = 123 +random.seed(seed) +numpy.random.seed(seed) +torch.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.use_deterministic_algorithms(True) +### +""" + +REPRODUCIBLE_BLOCK_BATCH_SCRIPT = """\ +## +# Adding this line makes it possible to set `torch.use_deterministic_algorithms(True)` +export CUBLAS_WORKSPACE_CONFIG=:4096:8 +## +""" + + +@overload +def run_example( + job_script: Path, + conda_env: Path, + conda_env_name_in_script: str, + sbatch_parameter_overrides: dict[str, Any] + | None = None, # Actually defaults to `default_overrides` + wait_for_results: Literal[True] = True, +) -> list[str]: + ... + + +@overload +def run_example( + job_script: Path, + conda_env: Path, + conda_env_name_in_script: str, + sbatch_parameter_overrides: dict[str, Any] + | None = None, # Actually defaults to `default_overrides` + wait_for_results: Literal[False] = False, +) -> submitit.Job[str]: + ... + + +@overload +def run_example( + job_script: Path, + conda_env: Path, + conda_env_name_in_script: str, + sbatch_parameter_overrides: dict[str, Any] + | None = None, # Actually defaults to `default_overrides` + wait_for_results: bool = True, +) -> list[str] | submitit.Job[str]: + ... + + +def run_example( + job_script: Path, + conda_env: Path, + conda_env_name_in_script: str, + sbatch_parameter_overrides: dict[str, Any] + | None = None, # Actually defaults to `default_overrides` + wait_for_results: bool = True, +) -> list[str] | submitit.Job[str]: + """Submits the `job.sh` script of an example as a slurm job and returns the output. + + NOTE: The backslashes in the docstring here are there so that the IDE (VsCode) shows the full + text when hovering over an argument. + + Parameters: + job_script: The path to the `job.sh` script of the example to run. + conda_env: The path to the conda environment to use in the example. + conda_env_name_in_script: The name of the conda environment as it appears in the `job.sh` \ + of the example. This is replaced with `conda_env` before the job is submitted. + sbatch_parameter_overrides: SBATCH parameters to override (in python form, e.g. \ + `--ntasks-per-node` becomes "ntasks_per_node"). + wait_for_results: Whether to wait for the job to finish and return results, or just submit \ + the job and return it. + """ + assert job_script.exists() and job_script.is_file() and job_script.suffix == ".sh" + sbatch_parameter_overrides = sbatch_parameter_overrides or DEFAULT_SBATCH_PARAMETER_OVERRIDES + example_dir = job_script.parent + # Adds the --chdir parameter as a SBATCH flag, so the paths work and the outputs are produced in + # the right folder. 
+ sbatch_parameter_overrides.setdefault("additional_parameters", {})["chdir"] = str(example_dir) + + job_script_content = job_script.read_text() + job_script_content = change_conda_env_used_in_job_script( + job_script_content, + conda_env_path=conda_env, + conda_env_name_in_script=conda_env_name_in_script, + ) + example_lines_after_sbatch = [ + stripped_line + for line in job_script_content.splitlines(keepends=False) + if (stripped_line := line.strip()) and not stripped_line.startswith("#SBATCH") + ] + last_non_empty_line_index = -1 + job_setup = example_lines_after_sbatch[:last_non_empty_line_index] + job_command_in_example = example_lines_after_sbatch[last_non_empty_line_index] + + # NOTE: Could be nice to use the new match-case statement for this, but it requires python=3.10 + # match job_command.split(): + # case "python main.py": + srun_args: list[str] = sbatch_parameter_overrides.get("srun_args", []) + _old_srun_args = srun_args.copy() + # NOTE: If there's an `srun` in the job command, this is going to cause an issue, because + # submitit will create a last line that goes + # `srun (...) submitit.load_and_run_ish "srun job.sh"` and the job will hang! + # Therefore, we tweak the last line of the example into something that will work with submitit. + submitit_job_command, srun_args = _get_submitit_job_command_and_srun_args( + job_command_in_example, srun_args=srun_args + ) + sbatch_parameter_overrides["srun_args"] = srun_args + if submitit_job_command != job_command_in_example: + logger.debug(f"{job_command_in_example=!r}") + logger.debug(f"srun args before: {_old_srun_args!r}") + logger.debug(f"{submitit_job_command=!r}") + logger.debug(f"srun args after: {srun_args!r}") + + logger.info(f"Command that will be run by submitit: {submitit_job_command!r}") + logger.info(f"Additional args to be passed to `srun`: {srun_args!r}") + + job_setup = ( + ["set -e"] # Make the job crash if one of the command fails. + + job_setup + + ([f"# NOTE: Command that will be run by submitit: {submitit_job_command!r}"]) + ) + + executor = submitit.SlurmExecutor(folder=example_dir) + job_script_params = get_params_from_job_script(job_script) + executor.update_parameters( + setup=job_setup, + **_recursive_dict_union(job_script_params, sbatch_parameter_overrides), + ) + logger.debug(f"Using the following sbatch params: {json.dumps(executor.parameters, indent=4)}") + + assert "srun" not in submitit_job_command + function = submitit.helpers.CommandFunction( + shlex.split(submitit_job_command), + cwd=example_dir, + ) + job = executor.submit(function) + if wait_for_results: + job_outputs = job.results() + return job_outputs + return job + + +def run_pytorch_example( + example_dir: str | Path, + pytorch_conda_env_location: Path, + sbatch_parameter_overrides: dict[str, Any] | None = None, + test_example_dir: Path | None = None, + examples_dir: Path = EXAMPLES_DIR, + make_reproducible: bool = True, + submitit_dir: Path = SUBMITIT_DIR, + modify_job_script_before_running: Callable[[Path], None] | None = None, +) -> list[str]: + """Runs a pytorch-base example with a main.py and job.sh file. 
+ + Compared with `run_example`, this also: + - Copies the files into a `test_example_dir` directory so they can be modified before being run + - Optionally makes it reproducible by adding a block of code to the main.py and job.sh files + - Filters out the job output to remove lines that may change between executions + """ + example_dir = Path(example_dir) + assert example_dir.is_dir() + assert (example_dir / "job.sh").is_file() + assert (example_dir / "main.py").is_file() + assert example_dir.is_relative_to(examples_dir) + assert pytorch_conda_env_location.is_dir() + if test_example_dir is None: + test_example_dir = submitit_dir / "_".join(example_dir.relative_to(examples_dir).parts) + copy_example_files_to_test_dir(example_dir, test_example_dir) + + if make_reproducible: + logger.info( + f"Making a variant of the main.py and job.sh files from {example_dir} to make them " + f"~100% reproducible." + ) + make_reproducible_version_of_example(example_dir, test_example_dir) + + job_script = test_example_dir / "job.sh" + if modify_job_script_before_running: + modify_job_script_before_running(job_script) + + job_outputs = run_example( + job_script, + conda_env=pytorch_conda_env_location, + sbatch_parameter_overrides=sbatch_parameter_overrides or {}, + conda_env_name_in_script="pytorch", + wait_for_results=True, + ) + # Filter out lines that may change between executions: + return [filter_job_output_before_regression_check(job_output) for job_output in job_outputs] + + +def copy_example_files_to_test_dir( + example_dir: Path, test_example_dir: Path, include_patterns: Sequence[str] = ("*.py", "*.sh") +) -> None: + test_example_dir.mkdir(exist_ok=True, parents=True) + for file in itertools.chain(*[example_dir.glob(pattern) for pattern in include_patterns]): + dest = test_example_dir / file.name + if dest.exists(): + dest.unlink() + shutil.copyfile(file, dest) + + +def make_logging_use_wider_console(python_script_content: str) -> str: + """Make the example use a wider console for logging. + + This is done so we can more easily match or substitute some of the contents of the job outputs + before they are checked using the file regression fixture. + """ + old = "rich.logging.RichHandler(markup=True)" + new = "rich.logging.RichHandler(markup=True, console=rich.console.Console(width=255))" + assert old in python_script_content + return python_script_content.replace(old, new) + + +def _get_submitit_job_command_and_srun_args( + job_command_in_example: str, srun_args: list[str] +) -> tuple[str, list[str]]: + """Adapts the last line of the job script so it can be run using a CommandFunction of submitit. + + TODO: This needs to be customized for each example, unfortunately. + """ + srun_args = srun_args.copy() + + if job_command_in_example == "python main.py": + return job_command_in_example, srun_args + + if job_command_in_example == "srun python main.py": + # submitit already does `srun (...) run_this_command_ish "python main.py"` as the last line, + # so we just remove the srun prefix here. + return "python main.py", srun_args + + if job_command_in_example == "exec python main.py": + # BUG: Getting a FileNotFoundError("exec") here if we leave the `exec python main.py` in! + return "python main.py", srun_args + + if "srun" in job_command_in_example: + # TODO: We need to do something different if we have an `srun` in the last line! + # Make the last line (the job command) just python main.py (...) and move all the srun args + # into the `srun_args` list of submitit. 
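+        # e.g. (illustrative): "srun --ntasks=2 python main.py" would become the command
+        # "python main.py", with " --ntasks=2 " appended to `srun_args`.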
+ # TODO: Need to remove the `srun` and move the srun params into the srun_args, so that + # submitit can do it with a CommandFunction that uses the right conda env! + # job_command = "python main.py" + # raise NotImplementedError(job_command) + srun_part, python, python_args = job_command_in_example.partition("python") + srun_args_str = srun_part.removeprefix("srun") + srun_args.append(srun_args_str) + job_command = python + python_args + return job_command, srun_args + + warnings.warn( + RuntimeWarning( + f"Don't yet know how to adapt the {job_command_in_example=!r} to work " + f"with submitit. Will try to run it as-is." + ) + ) + return job_command_in_example, srun_args + + +def change_conda_env_used_in_job_script( + job_script_content: str, conda_env_path: Path, conda_env_name_in_script: str +) -> str: + """Modify some lines of the source job script before it is run in the unit test.""" + return ( + (job_script_content) + .replace(f"conda activate {conda_env_name_in_script}", f"conda activate {conda_env_path}") + .replace(f"-n {conda_env_name_in_script}", f"--prefix {conda_env_path}") + .replace(f"--name {conda_env_name_in_script}", f"--prefix {conda_env_path}") + .replace(f"-p {conda_env_name_in_script}", f"--prefix {conda_env_path}") + .replace(f"--prefix {conda_env_name_in_script}", f"--prefix {conda_env_path}") + ) + + +def make_reproducible_version_of_example(example_dir: Path, test_example_dir: Path) -> None: + """Create a reproducible version of the examples by inserting some code blocks in the files. + + This modifies the job.sh and main.py scripts in the test example directory. + """ + directory_with_modified_files_for_test = test_example_dir + assert directory_with_modified_files_for_test.is_dir() + + # Modify the Python script in-place to make it ~100% reproducible: + python_script = example_dir / "main.py" + job_script = example_dir / "job.sh" + + modified_job_script = directory_with_modified_files_for_test / job_script.name + modified_python_script = directory_with_modified_files_for_test / python_script.name + + python_script_content = python_script.read_text() + python_script_content = make_logging_use_wider_console(python_script_content) + python_script_lines = python_script_content.splitlines(keepends=False) + # TODO: Where do we add the block? Before the def main()? Inside main? + + insertion_index = python_script_lines.index("def main():") - 1 + python_script_lines = ( + python_script_lines[:insertion_index] + + [""] + + REPRODUCIBLE_BLOCK_PYTHON.splitlines() + + [""] + + python_script_lines[insertion_index:] + ) + job_script_lines = job_script.read_text().splitlines(keepends=False) + insertion_index = -2 + # Somewhere before the end of the script (assuming the last line has the main command.) + job_script_lines = ( + job_script_lines[:insertion_index] + + REPRODUCIBLE_BLOCK_BATCH_SCRIPT.splitlines() + + job_script_lines[insertion_index:] + ) + + modified_python_script.write_text("\n".join(python_script_lines)) + modified_job_script.write_text("\n".join(job_script_lines)) + + +def filter_job_output_before_regression_check( + job_output: str, + prefix_of_lines_to_remove: str | tuple[str, ...] 
= ("Date:", "Hostname:", "INFO:__main__:"), + regex_substitutions: dict[str, str] = { + "/Tmp/slurm.[0-9]+.0/": "$SLURM_TMPDIR/", + r"\[\d{2}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}\]": "[(date) (time)]", + r"network/scratch/[a-z]{1}/[a-z]*/": "$SCRATCH/", + r"/checkpointing_example/[\d]*/checkpoints": ( + "/checkpointing_example/$SLURM_JOB_ID/checkpoints" + ), + }, + # line_matches: Sequence[str] = ("INFO:__main__:Epoch", "accuracy:"), + line_matches: Sequence[str] = (), +) -> str: + outputs = "\n".join( + line for line in job_output.splitlines() if not line.startswith(prefix_of_lines_to_remove) + ) + for regex, replacement in regex_substitutions.items(): + outputs = re.sub(regex, replacement, outputs) + + return "\n".join( + line + for line in outputs.splitlines() + if not any(pattern in line for pattern in line_matches) + ) + + +def get_params_from_job_script(job_script: Path) -> dict[str, Any]: + lines = job_script.read_text().splitlines() + sbatch_lines = [ + line.strip().removeprefix("#SBATCH").split("#", 1)[0].strip() + for line in lines + if line.strip().startswith("#SBATCH") + ] + params: dict[str, Any] = {} + for sbatch_arg_string in sbatch_lines: + value: Any + if "=" not in sbatch_arg_string: + flag = sbatch_arg_string + value = True + else: + flag, _, value = sbatch_arg_string.partition("=") + value = value.strip() + if value.isnumeric(): + value = int(value) + new_key = flag.strip().lstrip("-").replace("-", "_") + params[new_key] = value + for key in ["signal", "requeue"]: + if key in params: + params.setdefault("additional_parameters", {})[key] = params.pop(key) + return params + + +K = TypeVar("K") +V = TypeVar("V") + + +def _recursive_dict_union(*dicts: dict[K, V]) -> dict[K, V]: + """Recursively merge two dictionaries.""" + result: dict[K, V] = {} + for key in set(dicts[0]).union(*dicts[1:]): + values = [d[key] for d in dicts if key in d] + if any(isinstance(value, dict) for value in values): + result[key] = _recursive_dict_union( + *[value for value in values if isinstance(value, dict)] + ) + else: + result[key] = values[-1] + return result