From 4d983ded11f9038198ac0da225e4f147f30413ad Mon Sep 17 00:00:00 2001 From: vsoch Date: Thu, 13 Apr 2023 01:03:59 -0600 Subject: [PATCH 1/3] add wip deepcam workflow I need a larger computer with moooooar power! Signed-off-by: vsoch --- .../merlin/singularity-openfoam/README.md | 9 +-- .../mlcommons-deepcam/README.md | 74 +++++++++++++++++++ .../mlcommons-deepcam/install_mini_dataset.sh | 35 +++++++++ .../mlcommons-deepcam/minicluster.yaml | 50 +++++++++++++ .../mlcommons-deepcam/run_training.sh | 49 ++++++++++++ 5 files changed, 209 insertions(+), 8 deletions(-) create mode 100755 examples/machine-learning/mlcommons-deepcam/README.md create mode 100755 examples/machine-learning/mlcommons-deepcam/install_mini_dataset.sh create mode 100755 examples/machine-learning/mlcommons-deepcam/minicluster.yaml create mode 100644 examples/machine-learning/mlcommons-deepcam/run_training.sh diff --git a/examples/launchers/merlin/singularity-openfoam/README.md b/examples/launchers/merlin/singularity-openfoam/README.md index 81545c5a..f7365277 100755 --- a/examples/launchers/merlin/singularity-openfoam/README.md +++ b/examples/launchers/merlin/singularity-openfoam/README.md @@ -17,14 +17,7 @@ $ kind create cluster --config ../../../kind-config.yaml And the Flux Operator namespace created: ```bash -$ kubectl create -n flux-operator -``` - -And then generate the (separate) pods to run redis and rabbitmq in the flux-operator namespace. -The containers already have shared certificates (just for this test case)! - -```bash -$ kubectl create -f ../services.yaml +$ kubectl create namespace flux-operator ``` And create the MiniCluster to use them! diff --git a/examples/machine-learning/mlcommons-deepcam/README.md b/examples/machine-learning/mlcommons-deepcam/README.md new file mode 100755 index 00000000..16c74c76 --- /dev/null +++ b/examples/machine-learning/mlcommons-deepcam/README.md @@ -0,0 +1,74 @@ +# Deepcam + +> Deep Learning Climate Segmentation Benchmark + +This shows a PyTorch implementation for the climate segmentation benchmark, based on the +Exascale Deep Learning for Climate Analytics paper: https://arxiv.org/abs/1810.01993. +The workflow is provided from [mlcommons/deepcam](https://github.com/mlcommons/hpc/tree/main/deepcam). + +## Create MiniCluster + +First, cd to the directory here, and create the kind cluster: + +```bash +$ kind create cluster --config ../../kind-config.yaml +``` + +And the Flux Operator namespace created: + +```bash +$ kubectl create namespace flux-operator +``` + +And install the flux operator (from the repository here): + +```bash +$ kubectl apply -f ../../dist/flux-operator.yaml +``` + +We don't want to create the minicluster quite yet! We want to prepare the data first. + +## Dataset + +You can read [more about the dataset here](https://github.com/mlcommons/hpc/tree/main/deepcam#dataset). +You'll need to download the dataset from [this globus endpoint](https://app.globus.org/file-manager?origin_id=0b226e2c-4de0-11ea-971a-021304b0cca7&origin_path=%2F) and into the current directory. +Note that I did this by setting up [Globus Connect Personal](https://www.globus.org/globus-connect-personal) and +then downloading to a scoped location on my computer, and then moving to the directory here. 
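+Before extracting, it's worth a quick sanity check that the archive came over intact and
+that the disk has room for the duplicated copies the install script will make. For example
+(using the n512 archive name from this walkthrough):
+
+```bash
+$ ls -lh deepcam-data-n512.tgz
+$ df -h .
+```
+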
+First, extract the data (make sure you have ~50GB of space): + +```bash +$ tar -xzvf deepcam-data-n512.tgz +$ chmod +x install_mini_dataset.sh +``` + +This will extract the data to a directory, `deepcam-data-n512` and then we can run the script to prepare it: + +```bash +$ mkdir -p ./data +$ ./install_mini_dataset.sh ./deepcam-data-n512 ./data +``` + +This will basically copy the data over, and create the needed structure for training, etc. +It should look like this, with most of the files under "training": + +```bash +$ ls ./data +stats.h5 train validation +``` + +Note that the root directory here is bound to /tmp/workflow in our cluster, so it should +show up as `/tmp/workflow/data`. + +## Training + +Now that we have our data ready, we can create the minicluster (which will pull the container to run the job) +Note that we will use default parameters, but you can learn more about the defaults and parameters +[in the repository](https://github.com/mlcommons/hpc/tree/main/deepcam). + +Then create the MiniCluster to use them! Let's hope your computer doesn't run out of space, or something like that. + +```bash +$ kubectl apply -f minicluster.yaml +``` + +**WIP** need larger computer... diff --git a/examples/machine-learning/mlcommons-deepcam/install_mini_dataset.sh b/examples/machine-learning/mlcommons-deepcam/install_mini_dataset.sh new file mode 100755 index 00000000..93af849d --- /dev/null +++ b/examples/machine-learning/mlcommons-deepcam/install_mini_dataset.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# This script will take the downloaded small batch of data files, +# make the required number of duplicates, and install in the specified +# directory in train/ and validation/ subfolders. + +if [ $# -lt 2 ]; then + echo "Usage:" + echo " $0 DOWNLOADED_DATA_DIR INSTALLATION_TARGET_DIR [NUM_COPIES]" + exit 1 +fi + +sourceDir=$1 +targetDir=$2 +numCopies=1 +if [ $# -ge 3 ]; then + numCopies=$3 +fi + +# First, we prepare the train directory by duplicating every file numCopies times +mkdir -p $targetDir/train +for f in $(ls $sourceDir | grep "data-.*.h5"); do + echo $f + for (( i=0; i<$numCopies; i++ )); do + outFile=$targetDir/train/${f/.h5/-$i.h5} + echo " $outFile" + cp $sourceDir/$f $outFile + done +done + +# Copy in the stats file +cp $sourceDir/stats.h5 $targetDir/ + +# Now copy the training directory to the validation directory +cp -r $targetDir/train $targetDir/validation diff --git a/examples/machine-learning/mlcommons-deepcam/minicluster.yaml b/examples/machine-learning/mlcommons-deepcam/minicluster.yaml new file mode 100755 index 00000000..2536a2de --- /dev/null +++ b/examples/machine-learning/mlcommons-deepcam/minicluster.yaml @@ -0,0 +1,50 @@ +apiVersion: flux-framework.org/v1alpha1 +kind: MiniCluster +metadata: + name: flux-sample + namespace: flux-operator +spec: + + # IMPORTANT: see the README.md to see how to prepare data first! 
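+  # After install_mini_dataset.sh has been run (see README), the bound copy at
+  # /tmp/workflow/data should contain stats.h5 plus train/ and validation/ subfolders.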
+ # You should have a local ./data folder with training and stats + # Number of pods to create for MiniCluster + size: 4 + tasks: 2 + interactive: true + + # Make this kind of persistent volume and claim available to pods + # This is a path in minikube (e.g., minikube ssh) + volumes: + data: + storageClass: hostpath + path: /tmp/workflow + + # This is a list because a pod can support multiple containers + containers: + # The container URI to pull (currently needs to be public) + - image: ghcr.io/rse-ops/singularity:tag-mamba + cores: 4 + + # This will run with the defaults, targeting our ./data directory + command: singularity exec --pwd /opt/deepCam/run_scripts ./deepcam.sif /bin/bash ./run_training.sh + workingDir: /tmp/workflow + + # This pulls the container (once) by the broker to workingDir /data + commands: + pre: mkdir -p /tmp/workflow/output + brokerPre: | + if [[ ! -e "/tmp/workflow/deepcam.sif" ]]; then + singularity pull /tmp/workflow/deepcam.sif docker://ghcr.io/rse-ops/mlcommons-deepcam:tag-21.12-py3 + fi + + fluxUser: + name: fluxuser + + # Container will be pre-pulled here only by the broker + volumes: + data: + path: /tmp/workflow + + # Running a container in a container + securityContext: + privileged: true \ No newline at end of file diff --git a/examples/machine-learning/mlcommons-deepcam/run_training.sh b/examples/machine-learning/mlcommons-deepcam/run_training.sh new file mode 100644 index 00000000..74264197 --- /dev/null +++ b/examples/machine-learning/mlcommons-deepcam/run_training.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# The MIT License (MIT) +# +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +# parameters +data_dir="/tmp/workflow/data" +output_dir="/tmp/workflow/output" +run_tag="test_run" +local_batch_size=2 + +python ./train.py \ + --wireup_method "dummy" \ + --run_tag ${run_tag} \ + --data_dir_prefix ${data_dir} \ + --output_dir ${output_dir} \ + --model_prefix "segmentation" \ + --optimizer "LAMB" \ + --adam_eps 1e-6 \ + --start_lr 0.0055 \ + --lr_schedule type="multistep",milestones="800",decay_rate="0.1" \ + --lr_warmup_steps 400 \ + --lr_warmup_factor 1. 
\ + --weight_decay 1e-2 \ + --logging_frequency 10 \ + --save_frequency 0 \ + --max_epochs 200 \ + --max_inter_threads 4 \ + --seed $(date +%s) \ + --batchnorm_group_size 1 \ + --local_batch_size ${local_batch_size} \ No newline at end of file From 8cd64b2545ef256bd72169c58df8519cfa754796 Mon Sep 17 00:00:00 2001 From: vsoch Date: Thu, 13 Apr 2023 01:05:20 -0600 Subject: [PATCH 2/3] add back accidentally removed bit Signed-off-by: vsoch --- examples/launchers/merlin/singularity-openfoam/README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/launchers/merlin/singularity-openfoam/README.md b/examples/launchers/merlin/singularity-openfoam/README.md index f7365277..57ed5df9 100755 --- a/examples/launchers/merlin/singularity-openfoam/README.md +++ b/examples/launchers/merlin/singularity-openfoam/README.md @@ -20,6 +20,13 @@ And the Flux Operator namespace created: $ kubectl create namespace flux-operator ``` +And then generate the (separate) pods to run redis and rabbitmq in the flux-operator namespace. +The containers already have shared certificates (just for this test case)! + +```bash +$ kubectl create -f ../services.yaml +``` + And create the MiniCluster to use them! ```bash From e9b78c71ddf93d8c6a15d70be979cd9a6fdc781d Mon Sep 17 00:00:00 2001 From: vsoch Date: Thu, 13 Apr 2023 14:13:04 -0600 Subject: [PATCH 3/3] save state of mlcommons workflow Signed-off-by: vsoch --- .../mlcommons-deepcam/README.md | 14 +- .../mlcommons-deepcam/minicluster.yaml | 4 +- .../mlcommons-deepcam/run_training.sh | 9 +- .../mlcommons-deepcam/train.py | 260 ++++++++++++++++++ 4 files changed, 279 insertions(+), 8 deletions(-) mode change 100644 => 100755 examples/machine-learning/mlcommons-deepcam/run_training.sh create mode 100755 examples/machine-learning/mlcommons-deepcam/train.py diff --git a/examples/machine-learning/mlcommons-deepcam/README.md b/examples/machine-learning/mlcommons-deepcam/README.md index 16c74c76..ee28503d 100755 --- a/examples/machine-learning/mlcommons-deepcam/README.md +++ b/examples/machine-learning/mlcommons-deepcam/README.md @@ -8,10 +8,18 @@ The workflow is provided from [mlcommons/deepcam](https://github.com/mlcommons/h ## Create MiniCluster -First, cd to the directory here, and create the kind cluster: +First, cd to the directory here, and create the minikube cluster (kind did not work to create a sandbox for the SIF): ```bash -$ kind create cluster --config ../../kind-config.yaml +$ minikube start +``` + +If you use minikube, you'll want to create a mount: + +```bash +$ minikube mount $PWD/:/tmp/workflow +$ docker pull ghcr.io/rse-ops/singularity:tag-mamba +$ minikube image load ghcr.io/rse-ops/singularity:tag-mamba ``` And the Flux Operator namespace created: @@ -71,4 +79,4 @@ Then create the MiniCluster to use them! Let's hope your computer doesn't run ou $ kubectl apply -f minicluster.yaml ``` -**WIP** need larger computer... +**WIP** this likely will work, but needs to be tested on a machine with GPU, etc. It will not work on a CPU. diff --git a/examples/machine-learning/mlcommons-deepcam/minicluster.yaml b/examples/machine-learning/mlcommons-deepcam/minicluster.yaml index 2536a2de..ac04fa34 100755 --- a/examples/machine-learning/mlcommons-deepcam/minicluster.yaml +++ b/examples/machine-learning/mlcommons-deepcam/minicluster.yaml @@ -8,7 +8,7 @@ spec: # IMPORTANT: see the README.md to see how to prepare data first! 
# You should have a local ./data folder with training and stats # Number of pods to create for MiniCluster - size: 4 + size: 2 tasks: 2 interactive: true @@ -26,7 +26,7 @@ spec: cores: 4 # This will run with the defaults, targeting our ./data directory - command: singularity exec --pwd /opt/deepCam/run_scripts ./deepcam.sif /bin/bash ./run_training.sh + command: singularity exec --pwd /opt/deepCam ./deepcam.sif /bin/bash /tmp/workflow/run_training.sh workingDir: /tmp/workflow # This pulls the container (once) by the broker to workingDir /data diff --git a/examples/machine-learning/mlcommons-deepcam/run_training.sh b/examples/machine-learning/mlcommons-deepcam/run_training.sh old mode 100644 new mode 100755 index 74264197..f9e57055 --- a/examples/machine-learning/mlcommons-deepcam/run_training.sh +++ b/examples/machine-learning/mlcommons-deepcam/run_training.sh @@ -22,19 +22,19 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # parameters +# IMPORTANT: use absolute paths data_dir="/tmp/workflow/data" output_dir="/tmp/workflow/output" run_tag="test_run" local_batch_size=2 -python ./train.py \ +python /tmp/workflow/train.py \ --wireup_method "dummy" \ --run_tag ${run_tag} \ --data_dir_prefix ${data_dir} \ --output_dir ${output_dir} \ --model_prefix "segmentation" \ --optimizer "LAMB" \ - --adam_eps 1e-6 \ --start_lr 0.0055 \ --lr_schedule type="multistep",milestones="800",decay_rate="0.1" \ --lr_warmup_steps 400 \ @@ -46,4 +46,7 @@ python ./train.py \ --max_inter_threads 4 \ --seed $(date +%s) \ --batchnorm_group_size 1 \ - --local_batch_size ${local_batch_size} \ No newline at end of file + --local_batch_size ${local_batch_size} + +# Removed (not an argument) +# --adam_eps 1e-6 \ diff --git a/examples/machine-learning/mlcommons-deepcam/train.py b/examples/machine-learning/mlcommons-deepcam/train.py new file mode 100755 index 00000000..06f5cd21 --- /dev/null +++ b/examples/machine-learning/mlcommons-deepcam/train.py @@ -0,0 +1,260 @@ +# The MIT License (MIT) +# +# Copyright (c) 2018 Pyjcsx +# Modifications Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
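+
+# NOTE: run_training.sh invokes this script as /tmp/workflow/train.py; the deepCam
+# helper modules imported below (utils, data, architecture, driver) are expected to
+# live under /opt/deepCam inside the container, which is prepended to sys.path below.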
+ +# Basics +import os +import numpy as np +import datetime as dt +import subprocess as sp +import sys + +# Utils is here +sys.path.insert(0, "/opt/deepCam") + +# logging +import utils.mlperf_log_utils as mll + +# Torch +import torch +import torch.optim as optim +from torch.autograd import Variable + +# Custom +from driver import train_epoch, validate +from utils import parser +from utils import losses +from utils import optimizer_helpers as oh +from utils import bnstats as bns +from data import get_dataloaders, get_datashapes +from architecture import deeplab_xception + +# DDP +import torch.distributed as dist +from torch.nn.parallel.distributed import DistributedDataParallel as DDP + +#comm wrapper +from utils import comm + +#main function +def main(pargs): + + #init distributed training + comm_local_group = comm.init(pargs.wireup_method, pargs.batchnorm_group_size) + comm_rank = comm.get_rank() + comm_local_rank = comm.get_local_rank() + comm_size = comm.get_size() + comm_local_size = comm.get_local_size() + + # set up logging + pargs.logging_frequency = max([pargs.logging_frequency, 0]) + log_file = os.path.normpath(os.path.join(pargs.output_dir, "logs", pargs.run_tag + ".log")) + logger = mll.mlperf_logger(log_file, "deepcam", "Umbrella Corp.") + logger.log_start(key = "init_start", sync = True) + logger.log_event(key = "cache_clear") + + #set seed + seed = pargs.seed + logger.log_event(key = "seed", value = seed) + + # Some setup + torch.manual_seed(seed) + if torch.cuda.is_available(): + device = torch.device("cuda", comm_local_rank) + torch.cuda.manual_seed(seed) + torch.cuda.set_device(device) + torch.backends.cudnn.benchmark = True + else: + device = torch.device("cpu") + + #set up directories + root_dir = os.path.join(pargs.data_dir_prefix) + output_dir = pargs.output_dir + plot_dir = os.path.join(output_dir, "plots") + if comm_rank == 0: + if not os.path.isdir(output_dir): + os.makedirs(output_dir) + + # logging of rank information + logger.log_event(key = "number_of_ranks", value = comm_size) + logger.log_event(key = "number_of_nodes", value = (comm_size // comm_local_size)) + logger.log_event(key = "accelerators_per_node", value = comm_local_size) + + # Logging hyperparameters + logger.log_event(key = "global_batch_size", value = (pargs.local_batch_size * comm_size)) + logger.log_event(key = "batchnorm_group_size", value = pargs.batchnorm_group_size) + logger.log_event(key = "gradient_accumulation_frequency", value = pargs.gradient_accumulation_frequency) + logger.log_event(key = "checkpoint", value = pargs.checkpoint) + + # Define architecture + n_input_channels = len(pargs.channels) + n_output_channels = 3 + net = deeplab_xception.DeepLabv3_plus(n_input = n_input_channels, + n_classes = n_output_channels, + os=16, pretrained=False, + rank = comm_rank, + process_group = comm_local_group) + net.to(device) + + #select loss + #some magic numbers + loss_pow = -0.125 + class_weights = [0.986267818390377**loss_pow, 0.0004578708870701058**loss_pow, 0.01327431072255291**loss_pow] + # extract loss + criterion = losses.CELoss(class_weights).to(device) + criterion = torch.jit.script(criterion) + + #select optimizer + optimizer = oh.get_optimizer(pargs, net, logger) + + #restart from checkpoint if desired + if pargs.checkpoint is not None: + checkpoint = torch.load(pargs.checkpoint, map_location = device) + start_step = checkpoint['step'] + start_epoch = checkpoint['epoch'] + optimizer.load_state_dict(checkpoint['optimizer']) + net.load_state_dict(checkpoint['model']) + else: + 
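+        # no checkpoint supplied: start counting steps and epochs from zero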
start_step = 0 + start_epoch = 0 + + #broadcast model and optimizer state + steptens = torch.tensor(np.array([start_step, start_epoch]), requires_grad=False).to(device) + if dist.is_initialized(): + dist.broadcast(steptens, src = 0) + + #unpack the bcasted tensor + start_step = int(steptens.cpu().numpy()[0]) + start_epoch = int(steptens.cpu().numpy()[1]) + + #select scheduler + scheduler = None + if pargs.lr_schedule: + pargs.lr_schedule["lr_warmup_steps"] = pargs.lr_warmup_steps + pargs.lr_schedule["lr_warmup_factor"] = pargs.lr_warmup_factor + scheduler = oh.get_lr_schedule(pargs.start_lr, pargs.lr_schedule, optimizer, logger, last_step = start_step) + + # print parameters + if comm_rank == 0: + print(net) + print("Total number of elements:", sum(p.numel() for p in net.parameters() if p.requires_grad)) + + # get input shapes for the upcoming model preprocessing + # input_shape: + tshape, _ = get_datashapes(pargs, root_dir) + input_shape = tuple([tshape[2], tshape[0], tshape[1]]) + + #distributed model parameters + bucket_cap_mb = 25 + if pargs.batchnorm_group_size > 1: + bucket_cap_mb = 220 + + # get stream, relevant for graph capture + ddp_net = DDP(net, device_ids=[device.index], + output_device=device.index, + find_unused_parameters=True, + broadcast_buffers=False, + bucket_cap_mb=bucket_cap_mb, + gradient_as_bucket_view=False) + + # get stats handler here + bnstats_handler = bns.BatchNormStatsSynchronize(ddp_net, reduction = "mean", inplace = True) + + # create handles + net_validate = ddp_net + net_train = ddp_net + + # Set up the data feeder + train_loader, train_size, validation_loader, validation_size = get_dataloaders(pargs, root_dir, device, seed, comm_size, comm_rank) + + # log size of datasets + logger.log_event(key = "train_samples", value = train_size) + val_size = validation_size + logger.log_event(key = "eval_samples", value = val_size) + + # get start steps + step = start_step + epoch = start_epoch + current_lr = pargs.start_lr if not pargs.lr_schedule else scheduler.get_last_lr()[0] + stop_training = False + net_train.train() + + # start trining + logger.log_end(key = "init_stop", sync = True) + logger.log_start(key = "run_start", sync = True) + + # training loop + while True: + + # start epoch + logger.log_start(key = "epoch_start", metadata = {'epoch_num': epoch+1, 'step_num': step}, sync=True) + + train_loader.sampler.set_epoch(epoch) + + # training + step = train_epoch(pargs, comm_rank, comm_size, + device, step, epoch, + net_train, criterion, + optimizer, scheduler, + train_loader, + logger) + + # average BN stats + bnstats_handler.synchronize() + + # validation + stop_training = validate(pargs, comm_rank, comm_size, + device, step, epoch, + net_validate, criterion, validation_loader, + logger) + + # log the epoch + logger.log_end(key = "epoch_stop", metadata = {'epoch_num': epoch+1, 'step_num': step}, sync = True) + epoch += 1 + + #save model if desired + if (pargs.save_frequency > 0) and (epoch % pargs.save_frequency == 0): + logger.log_start(key = "save_start", metadata = {'epoch_num': epoch+1, 'step_num': step}, sync = True) + if comm_rank == 0: + checkpoint = { + 'step': step, + 'epoch': epoch, + 'model': net_train.state_dict(), + 'optimizer': optimizer.state_dict() + } + torch.save(checkpoint, os.path.join(output_dir, pargs.model_prefix + "_step_" + str(step) + ".cpt") ) + logger.log_end(key = "save_stop", metadata = {'epoch_num': epoch+1, 'step_num': step}, sync = True) + + # are we done? 
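+        # stop once max_epochs is reached or validate() has signalled early stopping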
+ if (epoch >= pargs.max_epochs) or stop_training: + break + + # run done + logger.log_end(key = "run_stop", sync = True, metadata = {'status' : 'success'}) + + +if __name__ == "__main__": + + #arguments + pargs = parser.parse_arguments() + + #run the stuff + main(pargs)