Merge pull request #131 from mila-iqia/development

Release 3.2.0
mila-iqia · Oct 11, 2024 · 904af42 · 904af42
2 parents fd446d2 + af83f07
commit 904af42
Show file tree

Hide file tree

Showing 17 changed files with 372 additions and 239 deletions.
diff --git a/.flake8 b/.flake8
@@ -1,4 +1,5 @@
 [flake8]
 max-line-length=100
 ignore=W503,D104,D100,D401
-docstring-convention=google
+docstring-convention=google
+exclude= .venv
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -15,10 +15,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v2
-    - name: python-3.9
+    - name: python-3.11
       uses: actions/setup-python@v2
       with:
-        python-version: 3.9
+        python-version: 3.11
     - name: linting checks
       run: |
         python -m pip install --upgrade pip
@@ -30,9 +30,12 @@ jobs:
     - name: unit tests
       run: |
         pytest --cov=amlrt-project
-    - name: pytorch-end2end
+    - name: pytorch-end2end-single
       run: |
-        ./tests/end2end_pytorch/run.sh
+        ./tests/end2end_pytorch/run_single.sh
+    - name: pytorch-end2end-orion
+      run: |
+        ./tests/end2end_pytorch/run_orion.sh
     - name: type checking
       run: |
         pytype amlrt_project/

diff --git a/README.md b/README.md
@@ -1,5 +1,22 @@
 # AMLRT Cookiecutter - Initialize a new project
 
+
+## Automatic Install (recommended)
+
+For convenience, you can run:
+
+    bash <(curl -s https://raw.githubusercontent.com/mila-iqia/cookiecutter-pyml/master/scripts/quick_install.sh) --project-name my_new_project
+
+replace `my_new_project` with the name of your project (the default value is `amlrt_project`). This will clone and setup the cookiecutter for you in the newly created folder `my_new_project`.
+
+Once done, go to the [First Commit](#first-commit) section and follow the instructions.
+
+Note: if the `my_new_project` folder already exists, the installation will not proceed.
+
+## Manual Install
+
+Note: Skip to next section if you used the automatic install
+
 First, git clone this project template locally.
 
     git clone https://github.com/mila-iqia/cookiecutter-pyml.git
@@ -21,12 +38,17 @@ Now, initialize the repository with git:
 
     git init
 
-And perform the first commit:
+You can now replace this README.md file with the standard README file for a project.
+    mv scripts/README.md.example README.md
+
+# First Commit
+
+To perform your first commit:
 
     git add .
     git commit -m 'first commit'
 
-Go on github and follow the instructions to create a new project.
+Next, go on github and follow the instructions to create a new project.
 When done, do not add any file, and follow the instructions to
 link your local git to the remote project, which should look like this:
 (PS: these instructions are reported here for your convenience.
@@ -43,199 +65,6 @@ suggest to delete this section ("AMLRT Cookiecutter - Initialize a new project")
 and all you need from now on is just to git clone from the repository you
 just pushed, i.e., `[email protected]:${GITHUB_USERNAME}/${PROJECT_NAME}.git`).
 
-# amlrt_project (change this name to the name of your project)
-
-Replace this line with a short description about your project!
-
-## Instructions to setup the project
-
-### Install the dependencies:
-First, activate a virtual environment (recommended).
-Install the package in `editable` mode so you can modify the source directly:
-
-    pip install -e .
-
-To add new dependencies, simply add them to the setup.py.
-
-### Setup pre-commit hooks:
-These hooks will:
-* validate flake8 before any commit
-* check that jupyter notebook outputs have been stripped
-
-    cd .git/hooks/ && ln -s ../../hooks/pre-commit .
-
-### Setup Continuous Integration
-
-Continuous integration will run the following:
-- Unit tests under `tests`.
-- End-to-end test under `exmaples/local`.
-- `flake8` to check the code syntax.
-- Checks on documentation presence and format (using `sphinx`).
-
-We support the GitHub Actions for running CI.
-
-Github actions are already configured in `.github/workflows/tests.yml`.
-Github actions are already enabled by default when using Github, so, when
-pushing to github, they will be executed automatically for pull requests to
-`main` and to `develop`.
-
-## Running the code
-
-### Run the tests
-Just run (from the root folder):
-
-    pytest
-
-### Run the code/examples.
-Note that the code should already compile at this point.
-
-Running examples can be found under the `examples` folder.
-
-In particular, you will find examples for:
-* local machine (e.g., your laptop).
-* a slurm cluster.
-
-For both these cases, there is the possibility to run with or without Orion.
-(Orion is a hyper-parameter search tool - see https://github.com/Epistimio/orion -
-that is already configured in this project)
-
-In any case, the run script will take multiple config files as arguments (`--configs`).
-This is because the config files will be merged together thanks to OmegaConf (the latter
-takes precedence).
-Note the param `--cli-config-params` can also be used, at CLI time, to modify/add more parameters.
-
-### Loggers
-Currently, Tensorboard, Comet and Aims are supported.
-For Comet, you will have to specify the key and the project.
-This can be done in several way (see the Comet-ML docs); a quick way is to set the env variables:
-```
-COMET_WORKSPACE=...
-COMET_PROJECT_NAME=...
-COMET_API_KEY=...
-```
-
-#### Run locally
-
-For example, to run on your local machine without Orion:
-
-    cd examples/local
-    sh run.sh
-
-This will run a simple MLP on a simple toy task: sum 5 float numbers.
-You should see an almost perfect loss of 0 after a few epochs.
-
-Note you have a new `output` folder which contains models and a summary of results:
-* best_model: the best model checkpoint during training
-* last_model: the last model checkpoint during training
-* lightning_logs: contains the tensorboard logs.
-
-To view tensorboard logs, simply run:
-
-    tensorboard --logdir output
-
-#### Run on a remote cluster (with Slurm)
-
-First, bring you project on the cluster (assuming you didn't create your
-project directly there). To do so, simply login on the cluster and git
-clone your project:
-
-    git clone [email protected]:${GITHUB_USERNAME}/${PROJECT_NAME}.git
-
-Then activate your virtual env, and install the dependencies:
-
-    cd amlrt_project
-    pip install -e .
-
-To run with Slurm, just:
-
-    cd examples/slurm
-    sh run.sh
-
-Check the log to see that you got an almost perfect loss (i.e., 0).
-
-#### Measure GPU time (and others) on the Mila cluster
-
-You can track down the GPU time (and other resources) of your jobs by
-associating a tag to the job (when using `sbatch`).
-To associate a tag to a job, replace `my_tag` with a proper tag,
-and uncomment the line (i.e., remove one #) from the line:
-
-    ##SBATCH --wckey=my_tag
-
-This line is inside the file `examples/slurm_mila/to_submit.sh`.
-
-To get a sumary for a particular tag, just run:
-
-    sacct --allusers --wckeys=my_tag --format=JobID,JobName,Start,Elapsed -X -P --delimiter=','
-
-(again, remember to change `my_tag` into the real tag name)
-
-#### GPU profiling on the Mila cluster
-
-It can be useful to monitor and profile how you utilise your GPU (usage, memory, etc.). For the 
-time being, you can only monitor your profiling in real-time from the Mila cluster, i.e. while your 
-experiments are running. To monitor your GPU, you need to setup port-forwarding on the host your 
-experiments are running on. This can be done in the following way:
-
-Once you have launched your job on the mila cluster, open the log for your current experiment:
-
-`head logs/amlrt_project__<your_slurm_job_id>.err`
-
-You should see printed in the first few lines the hostname of your machine, e.g.,
-
-```
-INFO:amlrt_project.utils.logging_utils:Experiment info:
-hostname: leto35
-git code hash: a51bfc5447d188bd6d31fac3afbd5757650ef524
-data folder: ../data
-data folder (abs): /network/tmp1/bronzimi/20191105_cookiecutter/amlrt_project/examples/data
-```
-
-In a separate shell on your local computer, run the following command:
-
-`ssh -L 19999:<hostname>.server.mila.quebec:19999 <username>@login.server.mila.quebec -p 2222` 
-
-where `<username>` is your user name on the Mila cluster and `<hostname>` is the name of the machine your job is currenty running on (`leto35` in our example). You can then navigate your local browser to `http://localhost:19999/` to view the ressources being used on the cluster and monitor your job. You should see something like this:
-
-![image](https://user-images.githubusercontent.com/18450628/88088807-fe2acd80-cb58-11ea-8ab2-bd090e8a826c.png)
-{%- endif %}
-
-#### Run with Orion on the Slurm cluster
-
-This example will run orion for 2 trials (see the orion config file).
-To do so, go into `examples/slurm_orion`.
-Here you can find the orion config file (`orion_config.yaml`), as well as the config
-file (`config.yaml`) for your project (that contains the hyper-parameters).
-
-In general, you will want to run Orion in parallel over N slurm jobs.
-To do so, simply run `sh run.sh` N times.
-
-When Orion has completed the trials, you will find the orion db file. 
-
-You will also find the output of your experiments in `orion_working_dir`, which
-will contain a folder for every trial.
-Inside these folders, you can find the models (the best one and the last one), the config file with
-the hyper-parameters for this trial, and the log file.
-
-You can check orion status with the following commands:
-(to be run from `examples/slurm_orion`)
-
-    export ORION_DB_ADDRESS='orion_db.pkl'
-    export ORION_DB_TYPE='pickleddb'
-    orion status
-    orion info --name my_exp
-
-### Building docs:
-
-Documentation is built using sphinx. It will automatically document all functions based on docstrings.
-To automatically generate docs for your project, navigate to the `docs` folder and build the documentation: 
-
-    cd docs
-    make html
-
-To view the docs locally, open `docs/_build/html/index.html` in your browser.
-
-
-## YOUR PROJECT README:
+Once you have successfully completed these steps, you can remove this README.md and update it with the template README provided. Adapt it to your needs:
 
-* __TODO__
+    mv scripts/README.new.md README.md
diff --git a/amlrt_project/train.py b/amlrt_project/train.py
@@ -161,14 +161,15 @@ def train_impl(model, datamodule, output, hyper_params, use_progress_bar,
     check_and_log_hp(['max_epoch'], hyper_params)
 
     best_model_path = os.path.join(output, BEST_MODEL_NAME)
+    best_checkpoint_params = hyper_params['best_checkpoint']
     best_checkpoint_callback = ModelCheckpoint(
         dirpath=best_model_path,
         filename='model',
         save_top_k=1,
         verbose=use_progress_bar,
-        monitor="val_loss",
-        mode="min",
-        every_n_epochs=1,
+        monitor=best_checkpoint_params['metric'],
+        mode=best_checkpoint_params['mode'],
+        every_n_epochs=best_checkpoint_params['every_n_epochs']
     )
 
     last_model_path = os.path.join(output, LAST_MODEL_NAME)

diff --git a/examples/config.yaml b/examples/config.yaml
@@ -22,8 +22,20 @@ hidden_dim: 256
 num_classes: 10
 architecture: simple_mlp
 
+# here wew centralize the metric and the mode to use in both early stopping and
+# best checkpoint selection. If instead you want to use different metric/mode,
+# remove this section and define them directly in the early_stopping / best_checkpoint blocks.
+metric_to_use: 'val_loss'
+mode_to_use: 'min'
+
 # early stopping
 early_stopping:
-  metric: val_loss
-  mode: min
+  metric: ${metric_to_use}
+  mode: ${mode_to_use}
   patience: 3
+
+# best checkpoint params
+best_checkpoint:
+  metric: ${metric_to_use}
+  mode: ${mode_to_use}
+  every_n_epochs: 1
diff --git a/examples/local/config.yaml b/examples/local/config.yaml
@@ -1,2 +1,5 @@
+# note how this param is overriding the parent config one (check folder above)
+max_epoch: 2
+
 # architecture
 hidden_dim: 256
diff --git a/examples/local/eval.sh b/examples/local/eval.sh
@@ -0,0 +1 @@
+amlrt_project_eval --data ../data --config ../config.yaml config.yaml --ckpt-path output/best_model/model.ckpt
diff --git a/examples/local/run.sh b/examples/local/run.sh
@@ -1,3 +1 @@
-set -e
-amlrt-train --data ../data --output output --config ../config.yaml config.yaml --start-from-scratch
-amlrt-eval --data ../data --config ../config.yaml config.yaml --ckpt-path output/best_model/model.ckpt
+amlrt_project_train --data ../data --output output --config ../config.yaml config.yaml --start-from-scratch
diff --git a/examples/local_orion/run.sh b/examples/local_orion/run.sh
@@ -2,8 +2,8 @@ set -e
 export ORION_DB_ADDRESS='orion_db.pkl'
 export ORION_DB_TYPE='pickleddb'
 
-merge-configs --config ../config.yaml config.yaml --merged-config-file merged_config.yaml
-orion -vvv -v hunt --config orion_config.yaml amlrt-train --data ../data \
+amlrt_project_merge_configs --config ../config.yaml config.yaml --merged-config-file merged_config.yaml
+orion -vvv -v hunt --config orion_config.yaml amlrt_project_train --data ../data \
     --config merged_config.yaml --disable-progressbar \
     --output '{exp.working_dir}/{trial.id}/' \
     --log '{exp.working_dir}/{trial.id}/exp.log'
diff --git a/examples/slurm/to_submit.sh b/examples/slurm/to_submit.sh
@@ -27,4 +27,4 @@
 
 export MLFLOW_TRACKING_URI='mlruns'
 
-amlrt-train --data ../data --output output --config ../config.yaml config.yaml --tmp-folder ${SLURM_TMPDIR} --disable-progressbar
+amlrt_project_train --data ../data --output output --config ../config.yaml config.yaml --tmp-folder ${SLURM_TMPDIR} --disable-progressbar
diff --git a/examples/slurm_orion/to_submit.sh b/examples/slurm_orion/to_submit.sh
@@ -25,9 +25,9 @@
 export ORION_DB_ADDRESS='orion_db.pkl'
 export ORION_DB_TYPE='pickleddb'
 
-merge-configs --config ../config.yaml config.yaml --merged-config-file merged_config.yaml
+amlrt_project_merge_configs --config ../config.yaml config.yaml --merged-config-file merged_config.yaml
 orion -v hunt --config orion_config.yaml \
-    amlrt-train --data ../data --config merged_config.yaml --disable-progressbar \
+    amlrt_project_train --data ../data --config merged_config.yaml --disable-progressbar \
     --output '{exp.working_dir}/{trial.id}/' \
     --log '{exp.working_dir}/{trial.id}/exp.log' \
     --tmp-folder ${SLURM_TMPDIR}/{trial.id}
diff --git a/hooks/pre-commit b/hooks/pre-commit
@@ -5,7 +5,7 @@ set -e
 
 # linting
 flake8 --ignore D .  # Check everything but docstrings
-flake8 --select D --exclude tests/  # Check only the docstrings
+flake8 --select D . --exclude tests/,.venv # Check only the docstrings
 isort --check .  # Check imports
 
 # Raise error if any staged notebooks contain outputs