diff --git a/.flake8 b/.flake8 index 469e681..56124f5 100644 --- a/.flake8 +++ b/.flake8 @@ -1,4 +1,5 @@ [flake8] max-line-length=100 ignore=W503,D104,D100,D401 -docstring-convention=google \ No newline at end of file +docstring-convention=google +exclude= .venv diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f1d3e96..a41f74b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,10 +15,10 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - name: python-3.9 + - name: python-3.11 uses: actions/setup-python@v2 with: - python-version: 3.9 + python-version: 3.11 - name: linting checks run: | python -m pip install --upgrade pip @@ -30,9 +30,12 @@ jobs: - name: unit tests run: | pytest --cov=amlrt-project - - name: pytorch-end2end + - name: pytorch-end2end-single run: | - ./tests/end2end_pytorch/run.sh + ./tests/end2end_pytorch/run_single.sh + - name: pytorch-end2end-orion + run: | + ./tests/end2end_pytorch/run_orion.sh - name: type checking run: | pytype amlrt_project/ diff --git a/README.md b/README.md index f554816..4a0b853 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,22 @@ # AMLRT Cookiecutter - Initialize a new project + +## Automatic Install (recommended) + +For convenience, you can run: + + bash <(curl -s https://raw.githubusercontent.com/mila-iqia/cookiecutter-pyml/master/scripts/quick_install.sh) --project-name my_new_project + +replace `my_new_project` with the name of your project (the default value is `amlrt_project`). This will clone and setup the cookiecutter for you in the newly created folder `my_new_project`. + +Once done, go to the [First Commit](#first-commit) section and follow the instructions. + +Note: if the `my_new_project` folder already exists, the installation will not proceed. + +## Manual Install + +Note: Skip to next section if you used the automatic install + First, git clone this project template locally. 
git clone https://github.com/mila-iqia/cookiecutter-pyml.git @@ -21,12 +38,17 @@ Now, initialize the repository with git: git init -And perform the first commit: +You can now replace this README.md file with the standard README file for a project. + mv scripts/README.md.example README.md + +# First Commit + +To perform your first commit: git add . git commit -m 'first commit' -Go on github and follow the instructions to create a new project. +Next, go on github and follow the instructions to create a new project. When done, do not add any file, and follow the instructions to link your local git to the remote project, which should look like this: (PS: these instructions are reported here for your convenience. @@ -43,199 +65,6 @@ suggest to delete this section ("AMLRT Cookiecutter - Initialize a new project") and all you need from now on is just to git clone from the repository you just pushed, i.e., `git@github.com:${GITHUB_USERNAME}/${PROJECT_NAME}.git`). -# amlrt_project (change this name to the name of your project) - -Replace this line with a short description about your project! - -## Instructions to setup the project - -### Install the dependencies: -First, activate a virtual environment (recommended). -Install the package in `editable` mode so you can modify the source directly: - - pip install -e . - -To add new dependencies, simply add them to the setup.py. - -### Setup pre-commit hooks: -These hooks will: -* validate flake8 before any commit -* check that jupyter notebook outputs have been stripped - - cd .git/hooks/ && ln -s ../../hooks/pre-commit . - -### Setup Continuous Integration - -Continuous integration will run the following: -- Unit tests under `tests`. -- End-to-end test under `exmaples/local`. -- `flake8` to check the code syntax. -- Checks on documentation presence and format (using `sphinx`). - -We support the GitHub Actions for running CI. - -Github actions are already configured in `.github/workflows/tests.yml`. 
-Github actions are already enabled by default when using Github, so, when -pushing to github, they will be executed automatically for pull requests to -`main` and to `develop`. - -## Running the code - -### Run the tests -Just run (from the root folder): - - pytest - -### Run the code/examples. -Note that the code should already compile at this point. - -Running examples can be found under the `examples` folder. - -In particular, you will find examples for: -* local machine (e.g., your laptop). -* a slurm cluster. - -For both these cases, there is the possibility to run with or without Orion. -(Orion is a hyper-parameter search tool - see https://github.com/Epistimio/orion - -that is already configured in this project) - -In any case, the run script will take multiple config files as arguments (`--configs`). -This is because the config files will be merged together thanks to OmegaConf (the latter -takes precedence). -Note the param `--cli-config-params` can also be used, at CLI time, to modify/add more parameters. - -### Loggers -Currently, Tensorboard, Comet and Aims are supported. -For Comet, you will have to specify the key and the project. -This can be done in several way (see the Comet-ML docs); a quick way is to set the env variables: -``` -COMET_WORKSPACE=... -COMET_PROJECT_NAME=... -COMET_API_KEY=... -``` - -#### Run locally - -For example, to run on your local machine without Orion: - - cd examples/local - sh run.sh - -This will run a simple MLP on a simple toy task: sum 5 float numbers. -You should see an almost perfect loss of 0 after a few epochs. - -Note you have a new `output` folder which contains models and a summary of results: -* best_model: the best model checkpoint during training -* last_model: the last model checkpoint during training -* lightning_logs: contains the tensorboard logs. 
- -To view tensorboard logs, simply run: - - tensorboard --logdir output - -#### Run on a remote cluster (with Slurm) - -First, bring you project on the cluster (assuming you didn't create your -project directly there). To do so, simply login on the cluster and git -clone your project: - - git clone git@github.com:${GITHUB_USERNAME}/${PROJECT_NAME}.git - -Then activate your virtual env, and install the dependencies: - - cd amlrt_project - pip install -e . - -To run with Slurm, just: - - cd examples/slurm - sh run.sh - -Check the log to see that you got an almost perfect loss (i.e., 0). - -#### Measure GPU time (and others) on the Mila cluster - -You can track down the GPU time (and other resources) of your jobs by -associating a tag to the job (when using `sbatch`). -To associate a tag to a job, replace `my_tag` with a proper tag, -and uncomment the line (i.e., remove one #) from the line: - - ##SBATCH --wckey=my_tag - -This line is inside the file `examples/slurm_mila/to_submit.sh`. - -To get a sumary for a particular tag, just run: - - sacct --allusers --wckeys=my_tag --format=JobID,JobName,Start,Elapsed -X -P --delimiter=',' - -(again, remember to change `my_tag` into the real tag name) - -#### GPU profiling on the Mila cluster - -It can be useful to monitor and profile how you utilise your GPU (usage, memory, etc.). For the -time being, you can only monitor your profiling in real-time from the Mila cluster, i.e. while your -experiments are running. To monitor your GPU, you need to setup port-forwarding on the host your -experiments are running on. 
This can be done in the following way: - -Once you have launched your job on the mila cluster, open the log for your current experiment: - -`head logs/amlrt_project__.err` - -You should see printed in the first few lines the hostname of your machine, e.g., - -``` -INFO:amlrt_project.utils.logging_utils:Experiment info: -hostname: leto35 -git code hash: a51bfc5447d188bd6d31fac3afbd5757650ef524 -data folder: ../data -data folder (abs): /network/tmp1/bronzimi/20191105_cookiecutter/amlrt_project/examples/data -``` - -In a separate shell on your local computer, run the following command: - -`ssh -L 19999:.server.mila.quebec:19999 @login.server.mila.quebec -p 2222` - -where `` is your user name on the Mila cluster and `` is the name of the machine your job is currenty running on (`leto35` in our example). You can then navigate your local browser to `http://localhost:19999/` to view the ressources being used on the cluster and monitor your job. You should see something like this: - -![image](https://user-images.githubusercontent.com/18450628/88088807-fe2acd80-cb58-11ea-8ab2-bd090e8a826c.png) -{%- endif %} - -#### Run with Orion on the Slurm cluster - -This example will run orion for 2 trials (see the orion config file). -To do so, go into `examples/slurm_orion`. -Here you can find the orion config file (`orion_config.yaml`), as well as the config -file (`config.yaml`) for your project (that contains the hyper-parameters). - -In general, you will want to run Orion in parallel over N slurm jobs. -To do so, simply run `sh run.sh` N times. - -When Orion has completed the trials, you will find the orion db file. - -You will also find the output of your experiments in `orion_working_dir`, which -will contain a folder for every trial. -Inside these folders, you can find the models (the best one and the last one), the config file with -the hyper-parameters for this trial, and the log file. 
- -You can check orion status with the following commands: -(to be run from `examples/slurm_orion`) - - export ORION_DB_ADDRESS='orion_db.pkl' - export ORION_DB_TYPE='pickleddb' - orion status - orion info --name my_exp - -### Building docs: - -Documentation is built using sphinx. It will automatically document all functions based on docstrings. -To automatically generate docs for your project, navigate to the `docs` folder and build the documentation: - - cd docs - make html - -To view the docs locally, open `docs/_build/html/index.html` in your browser. - - -## YOUR PROJECT README: +Once you have successfully completed these steps, you can remove this README.md and update it with the template README provided. Adapt it to your needs: -* __TODO__ + mv scripts/README.new.md README.md diff --git a/amlrt_project/train.py b/amlrt_project/train.py index 8579478..47ebc75 100644 --- a/amlrt_project/train.py +++ b/amlrt_project/train.py @@ -161,14 +161,15 @@ def train_impl(model, datamodule, output, hyper_params, use_progress_bar, check_and_log_hp(['max_epoch'], hyper_params) best_model_path = os.path.join(output, BEST_MODEL_NAME) + best_checkpoint_params = hyper_params['best_checkpoint'] best_checkpoint_callback = ModelCheckpoint( dirpath=best_model_path, filename='model', save_top_k=1, verbose=use_progress_bar, - monitor="val_loss", - mode="min", - every_n_epochs=1, + monitor=best_checkpoint_params['metric'], + mode=best_checkpoint_params['mode'], + every_n_epochs=best_checkpoint_params['every_n_epochs'] ) last_model_path = os.path.join(output, LAST_MODEL_NAME) diff --git a/examples/config.yaml b/examples/config.yaml index c3fcf64..97556dc 100644 --- a/examples/config.yaml +++ b/examples/config.yaml @@ -22,8 +22,20 @@ hidden_dim: 256 num_classes: 10 architecture: simple_mlp +# here we centralize the metric and the mode to use in both early stopping and
If instead you want to use different metric/mode, +# remove this section and define them directly in the early_stopping / best_checkpoint blocks. +metric_to_use: 'val_loss' +mode_to_use: 'min' + # early stopping early_stopping: - metric: val_loss - mode: min + metric: ${metric_to_use} + mode: ${mode_to_use} patience: 3 + +# best checkpoint params +best_checkpoint: + metric: ${metric_to_use} + mode: ${mode_to_use} + every_n_epochs: 1 \ No newline at end of file diff --git a/examples/local/config.yaml b/examples/local/config.yaml index 38b0ec8..20b8fd0 100644 --- a/examples/local/config.yaml +++ b/examples/local/config.yaml @@ -1,2 +1,5 @@ +# note how this param is overriding the parent config one (check folder above) +max_epoch: 2 + # architecture hidden_dim: 256 diff --git a/examples/local/eval.sh b/examples/local/eval.sh new file mode 100644 index 0000000..8af79a6 --- /dev/null +++ b/examples/local/eval.sh @@ -0,0 +1 @@ +amlrt_project_eval --data ../data --config ../config.yaml config.yaml --ckpt-path output/best_model/model.ckpt diff --git a/examples/local/run.sh b/examples/local/run.sh index 9ad11bc..076a20b 100644 --- a/examples/local/run.sh +++ b/examples/local/run.sh @@ -1,3 +1 @@ -set -e -amlrt-train --data ../data --output output --config ../config.yaml config.yaml --start-from-scratch -amlrt-eval --data ../data --config ../config.yaml config.yaml --ckpt-path output/best_model/model.ckpt +amlrt_project_train --data ../data --output output --config ../config.yaml config.yaml --start-from-scratch diff --git a/examples/local_orion/run.sh b/examples/local_orion/run.sh index a86d49b..4f35338 100644 --- a/examples/local_orion/run.sh +++ b/examples/local_orion/run.sh @@ -2,8 +2,8 @@ set -e export ORION_DB_ADDRESS='orion_db.pkl' export ORION_DB_TYPE='pickleddb' -merge-configs --config ../config.yaml config.yaml --merged-config-file merged_config.yaml -orion -vvv -v hunt --config orion_config.yaml amlrt-train --data ../data \ +amlrt_project_merge_configs --config 
../config.yaml config.yaml --merged-config-file merged_config.yaml +orion -vvv -v hunt --config orion_config.yaml amlrt_project_train --data ../data \ --config merged_config.yaml --disable-progressbar \ --output '{exp.working_dir}/{trial.id}/' \ --log '{exp.working_dir}/{trial.id}/exp.log' diff --git a/examples/slurm/to_submit.sh b/examples/slurm/to_submit.sh index b5c88ef..b27e68e 100644 --- a/examples/slurm/to_submit.sh +++ b/examples/slurm/to_submit.sh @@ -27,4 +27,4 @@ export MLFLOW_TRACKING_URI='mlruns' -amlrt-train --data ../data --output output --config ../config.yaml config.yaml --tmp-folder ${SLURM_TMPDIR} --disable-progressbar +amlrt_project_train --data ../data --output output --config ../config.yaml config.yaml --tmp-folder ${SLURM_TMPDIR} --disable-progressbar diff --git a/examples/slurm_orion/to_submit.sh b/examples/slurm_orion/to_submit.sh index 6337bbd..65d7925 100644 --- a/examples/slurm_orion/to_submit.sh +++ b/examples/slurm_orion/to_submit.sh @@ -25,9 +25,9 @@ export ORION_DB_ADDRESS='orion_db.pkl' export ORION_DB_TYPE='pickleddb' -merge-configs --config ../config.yaml config.yaml --merged-config-file merged_config.yaml +amlrt_project_merge_configs --config ../config.yaml config.yaml --merged-config-file merged_config.yaml orion -v hunt --config orion_config.yaml \ - amlrt-train --data ../data --config merged_config.yaml --disable-progressbar \ + amlrt_project_train --data ../data --config merged_config.yaml --disable-progressbar \ --output '{exp.working_dir}/{trial.id}/' \ --log '{exp.working_dir}/{trial.id}/exp.log' \ --tmp-folder ${SLURM_TMPDIR}/{trial.id} diff --git a/hooks/pre-commit b/hooks/pre-commit index 96715b2..b930ae1 100755 --- a/hooks/pre-commit +++ b/hooks/pre-commit @@ -5,7 +5,7 @@ set -e # linting flake8 --ignore D . # Check everything but docstrings -flake8 --select D --exclude tests/ # Check only the docstrings +flake8 --select D . --exclude tests/,.venv # Check only the docstrings isort --check . 
# Check imports # Raise error if any staged notebooks contain outputs diff --git a/scripts/README.new.md b/scripts/README.new.md new file mode 100644 index 0000000..a869664 --- /dev/null +++ b/scripts/README.new.md @@ -0,0 +1,196 @@ +# amlrt_project (change this name to the name of your project) + +Replace this line with a short description about your project! + +## Instructions to setup the project + +### Install the dependencies: +First, activate a virtual environment (recommended). +Install the package in `editable` mode so you can modify the source directly: + + pip install -e . + +To add new dependencies, simply add them to the setup.py. + +### Setup pre-commit hooks: +These hooks will: +* validate flake8 before any commit +* check that jupyter notebook outputs have been stripped + + cd .git/hooks/ && ln -s ../../hooks/pre-commit . + +### Setup Continuous Integration + +Continuous integration will run the following: +- Unit tests under `tests`. +- End-to-end test under `examples/local`. +- `flake8` to check the code syntax. +- Checks on documentation presence and format (using `sphinx`). + +We support the GitHub Actions for running CI. + +Github actions are already configured in `.github/workflows/tests.yml`. +Github actions are already enabled by default when using Github, so, when +pushing to github, they will be executed automatically for pull requests to +`main` and to `develop`. + +## Running the code + +### Run the tests +Just run (from the root folder): + + pytest + +### Run the code/examples. +Note that the code should already compile at this point. + +Running examples can be found under the `examples` folder. + +In particular, you will find examples for: +* local machine (e.g., your laptop). +* a slurm cluster. + +For both these cases, there is the possibility to run with or without Orion.
+(Orion is a hyper-parameter search tool - see https://github.com/Epistimio/orion - +that is already configured in this project) + +In any case, the run script will take multiple config files as arguments (`--configs`). +This is because the config files will be merged together thanks to OmegaConf (the latter +takes precedence). +Note the param `--cli-config-params` can also be used, at CLI time, to modify/add more parameters. + +### Loggers +Currently, Tensorboard, Comet and Aims are supported. +For Comet, you will have to specify the key and the project. +This can be done in several ways (see the Comet-ML docs); a quick way is to set the env variables: +``` +COMET_WORKSPACE=... +COMET_PROJECT_NAME=... +COMET_API_KEY=... +``` + +#### Run locally + +For example, to run on your local machine without Orion: + + cd examples/local + sh run.sh + +This will run a simple MLP on a simple toy task: sum 5 float numbers. +You should see an almost perfect loss of 0 after a few epochs. + +Note you have a new `output` folder which contains models and a summary of results: +* best_model: the best model checkpoint during training +* last_model: the last model checkpoint during training +* lightning_logs: contains the tensorboard logs. + +To view tensorboard logs, simply run: + + tensorboard --logdir output + +#### Run on a remote cluster (with Slurm) + +First, bring your project on the cluster (assuming you didn't create your +project directly there). To do so, simply login on the cluster and git +clone your project: + + git clone git@github.com:${GITHUB_USERNAME}/${PROJECT_NAME}.git + +Then activate your virtual env, and install the dependencies: + + cd amlrt_project + pip install -e . + +To run with Slurm, just: + + cd examples/slurm + sh run.sh + +Check the log to see that you got an almost perfect loss (i.e., 0). 
+ +#### Measure GPU time (and others) on the Mila cluster + +You can track down the GPU time (and other resources) of your jobs by +associating a tag to the job (when using `sbatch`). +To associate a tag to a job, replace `my_tag` with a proper tag, +and uncomment the line (i.e., remove one #) from the line: + + ##SBATCH --wckey=my_tag + +This line is inside the file `examples/slurm_mila/to_submit.sh`. + +To get a summary for a particular tag, just run: + + sacct --allusers --wckeys=my_tag --format=JobID,JobName,Start,Elapsed -X -P --delimiter=',' + +(again, remember to change `my_tag` into the real tag name) + +#### GPU profiling on the Mila cluster + +It can be useful to monitor and profile how you utilise your GPU (usage, memory, etc.). For the +time being, you can only monitor your profiling in real-time from the Mila cluster, i.e. while your +experiments are running. To monitor your GPU, you need to setup port-forwarding on the host your +experiments are running on. This can be done in the following way: + +Once you have launched your job on the mila cluster, open the log for your current experiment: + +`head logs/amlrt_project__.err` + +You should see printed in the first few lines the hostname of your machine, e.g., + +``` +INFO:amlrt_project.utils.logging_utils:Experiment info: +hostname: leto35 +git code hash: a51bfc5447d188bd6d31fac3afbd5757650ef524 +data folder: ../data +data folder (abs): /network/tmp1/bronzimi/20191105_cookiecutter/amlrt_project/examples/data +``` + +In a separate shell on your local computer, run the following command: + +`ssh -L 19999:<node>.server.mila.quebec:19999 <user>@login.server.mila.quebec -p 2222` + +where `<user>` is your user name on the Mila cluster and `<node>` is the name of the machine your job is currently running on (`leto35` in our example). You can then navigate your local browser to `http://localhost:19999/` to view the resources being used on the cluster and monitor your job.
You should see something like this: + +![image](https://user-images.githubusercontent.com/18450628/88088807-fe2acd80-cb58-11ea-8ab2-bd090e8a826c.png) + +#### Run with Orion on the Slurm cluster + +This example will run orion for 2 trials (see the orion config file). +To do so, go into `examples/slurm_orion`. +Here you can find the orion config file (`orion_config.yaml`), as well as the config +file (`config.yaml`) for your project (that contains the hyper-parameters). + +In general, you will want to run Orion in parallel over N slurm jobs. +To do so, simply run `sh run.sh` N times. + +When Orion has completed the trials, you will find the orion db file. + +You will also find the output of your experiments in `orion_working_dir`, which +will contain a folder for every trial. +Inside these folders, you can find the models (the best one and the last one), the config file with +the hyper-parameters for this trial, and the log file.
+ + +## YOUR PROJECT README: + +* __TODO__ \ No newline at end of file diff --git a/scripts/quick_install.sh b/scripts/quick_install.sh new file mode 100644 index 0000000..8b54e94 --- /dev/null +++ b/scripts/quick_install.sh @@ -0,0 +1,72 @@ +# This script is used to quickly install the cookiecutter-pyml template +set -e + +# Default value for project_name +project_name="amlrt_project" + + +replace_project_name() { + # Replace all instances of amlrt_project with the project name and rename the root folder + local project_name="$1" + + # Check if the OS is macOS or Linux + if [[ "$OSTYPE" == "darwin"* ]]; then + # macOS + + # Replace all instances of amlrt_project with the project name + find . -type f -exec grep -l 'amlrt_project' {} \; | xargs sed -i '' 's/amlrt_project/'"$project_name"'/g' + + # Rename root folder + mv amlrt_project "$project_name" + elif [[ "$OSTYPE" == "linux-gnu"* ]]; then + # Linux + + # Replace all instances of amlrt_project with the project name + find . -type f -exec grep -l 'amlrt_project' {} \; | xargs sed -i 's/amlrt_project/'"$project_name"'/g' + + # Rename root folder + mv amlrt_project "$project_name" + else + echo "Unsupported OS: $OSTYPE" + echo "Your OS is not yet supported. You will have to manually replace all instances of amlrt_project with your project name." + fi +} + + +# Parse command-line arguments +while [[ "$#" -gt 0 ]]; do + case $1 in + --project-name) project_name="$2"; shift ;; + *) echo "Unknown parameter passed: $1"; exit 1 ;; + esac + shift +done + +# Use the project_name variable +echo "Cloning cookie-cutter to: $project_name..." + +# TODO: Once this has been properly merged to master, update ref to point to master +# git clone https://github.com/mila-iqia/cookiecutter-pyml.git $project_name +git clone --branch development https://github.com/mila-iqia/cookiecutter-pyml.git $project_name + +# Remove the .git folder and reinitialize it +echo "Initializing the git repository..." 
+cd $project_name +rm -fr .git +git init + +replace_project_name $project_name + +# Replace the README.md file +mv scripts/README.new.md README.md + +echo "" +echo "Done! You can now visit your project by navigating to it:" +echo "" +echo " cd $project_name" + +echo "" +echo "Remember to point it to your github repository:" +echo "git remote add origin git@github.com:\${GITHUB_USERNAME}/\${PROJECT_NAME}.git" +echo "" +echo "For more information, please visit https://github.com/mila-iqia/cookiecutter-pyml" diff --git a/setup.py b/setup.py index 47b80df..8633a97 100644 --- a/setup.py +++ b/setup.py @@ -2,9 +2,9 @@ setup( name='amlrt_project', - version='0.0.1', + version='3.2.0', packages=find_packages(include=['amlrt_project', 'amlrt_project.*']), - python_requires='>=3.9', + python_requires='>=3.11', install_requires=[ 'aim==3.18.1; os_name!="nt"', 'comet-ml==3.39.3', @@ -33,10 +33,9 @@ ], entry_points={ 'console_scripts': [ - # TODO: change amlrt- prefix, placeholder for now. - 'amlrt-train=amlrt_project.train:main', - 'amlrt-eval=amlrt_project.evaluate:main', - 'merge-configs=amlrt_project.utils.config_utils:main' + 'amlrt_project_train=amlrt_project.train:main', + 'amlrt_project_eval=amlrt_project.evaluate:main', + 'amlrt_project_merge_configs=amlrt_project.utils.config_utils:main' ], } ) diff --git a/tests/end2end_pytorch/run.sh b/tests/end2end_pytorch/run_orion.sh similarity index 62% rename from tests/end2end_pytorch/run.sh rename to tests/end2end_pytorch/run_orion.sh index 247c2b1..963cce3 100755 --- a/tests/end2end_pytorch/run.sh +++ b/tests/end2end_pytorch/run_orion.sh @@ -2,23 +2,6 @@ set -e # go to the examples folder and run the example -cd $GITHUB_WORKSPACE/examples/local -sh run.sh -mv output output_OLD -# re-run the example to check reproducibility -sh run.sh -# check results are the same -echo "results are:" -cat output*/results.txt -DIFF_LINES=`cat output*/results.txt | uniq | wc -l` -if [ ${DIFF_LINES} -gt 1 ]; then - echo "ERROR: two identical 
runs produced different output results - review seed implementation" - exit 1 -else - echo "PASS: two identical runs produced the same output results." -fi - -# run Orion cd $GITHUB_WORKSPACE/examples/local_orion sh run.sh mv orion_working_dir orion_working_dir_OLD diff --git a/tests/end2end_pytorch/run_single.sh b/tests/end2end_pytorch/run_single.sh new file mode 100755 index 0000000..c8ec7c9 --- /dev/null +++ b/tests/end2end_pytorch/run_single.sh @@ -0,0 +1,35 @@ +# exit at the first error +set -e + +# go to the examples folder and run the example +cd $GITHUB_WORKSPACE/examples/local +sh run.sh +mv output output_OLD +# re-run the example to check reproducibility +sh run.sh +# check results are the same +echo "results are:" +cat output*/results.txt +DIFF_LINES=`cat output*/results.txt | uniq | wc -l` +if [ ${DIFF_LINES} -gt 1 ]; then + echo "ERROR: two identical runs produced different output results - review seed implementation" + exit 1 +else + echo "PASS: two identical runs produced the same output results." +fi + +# now run eval and store the validation results in a variable +EVAL_RESULT=`sh eval.sh | grep "Validation Metrics"` +CLEANED_EVAL_RESULT=`echo $EVAL_RESULT | sed 's/.*: //g' | sed 's/}.*//g'` +TRAIN_RESULT=`cat output/results.txt` +CLEANED_TRAIN_RESULT=`echo ${TRAIN_RESULT} | sed 's/.*: //g'` + +echo "train results: ${CLEANED_TRAIN_RESULT} / eval results: ${CLEANED_EVAL_RESULT}" + +# Compare the two values, formatted to 5 decimal places +if ! [ "$(printf "%.5f" "$CLEANED_EVAL_RESULT")" = "$(printf "%.5f" "$CLEANED_TRAIN_RESULT")" ]; then + echo "results are NOT equal up to 5 decimal places." + exit 1 +else + echo "results are equal." +fi \ No newline at end of file