From 4d983ded11f9038198ac0da225e4f147f30413ad Mon Sep 17 00:00:00 2001 From: vsoch Date: Thu, 13 Apr 2023 01:03:59 -0600 Subject: [PATCH 1/3] add wip deepcam workflow I need a larger computer with moooooar power! Signed-off-by: vsoch --- .../merlin/singularity-openfoam/README.md | 9 +-- .../mlcommons-deepcam/README.md | 74 +++++++++++++++++++ .../mlcommons-deepcam/install_mini_dataset.sh | 35 +++++++++ .../mlcommons-deepcam/minicluster.yaml | 50 +++++++++++++ .../mlcommons-deepcam/run_training.sh | 49 ++++++++++++ 5 files changed, 209 insertions(+), 8 deletions(-) create mode 100755 examples/machine-learning/mlcommons-deepcam/README.md create mode 100755 examples/machine-learning/mlcommons-deepcam/install_mini_dataset.sh create mode 100755 examples/machine-learning/mlcommons-deepcam/minicluster.yaml create mode 100644 examples/machine-learning/mlcommons-deepcam/run_training.sh diff --git a/examples/launchers/merlin/singularity-openfoam/README.md b/examples/launchers/merlin/singularity-openfoam/README.md index 81545c5a..f7365277 100755 --- a/examples/launchers/merlin/singularity-openfoam/README.md +++ b/examples/launchers/merlin/singularity-openfoam/README.md @@ -17,14 +17,7 @@ $ kind create cluster --config ../../../kind-config.yaml And the Flux Operator namespace created: ```bash -$ kubectl create -n flux-operator -``` - -And then generate the (separate) pods to run redis and rabbitmq in the flux-operator namespace. -The containers already have shared certificates (just for this test case)! - -```bash -$ kubectl create -f ../services.yaml +$ kubectl create namespace flux-operator ``` And create the MiniCluster to use them! diff --git a/examples/machine-learning/mlcommons-deepcam/README.md b/examples/machine-learning/mlcommons-deepcam/README.md new file mode 100755 index 00000000..16c74c76 --- /dev/null +++ b/examples/machine-learning/mlcommons-deepcam/README.md @@ -0,0 +1,74 @@ +# Deepcam + +> Deep Learning Climate Segmentation Benchmark + +This shows a PyTorch implementation for the climate segmentation benchmark, based on the +Exascale Deep Learning for Climate Analytics paper: https://arxiv.org/abs/1810.01993. +The workflow is provided from [mlcommons/deepcam](https://github.com/mlcommons/hpc/tree/main/deepcam). + +## Create MiniCluster + +First, cd to the directory here, and create the kind cluster: + +```bash +$ kind create cluster --config ../../kind-config.yaml +``` + +And the Flux Operator namespace created: + +```bash +$ kubectl create namespace flux-operator +``` + +And install the flux operator (from the repository here): + +```bash +$ kubectl apply -f ../../dist/flux-operator.yaml +``` + +We don't want to create the minicluster quite yet! We want to prepare the data first. + +## Dataset + +You can read [more about the dataset here](https://github.com/mlcommons/hpc/tree/main/deepcam#dataset). +You'll need to download the dataset from [this globus endpoint](https://app.globus.org/file-manager?origin_id=0b226e2c-4de0-11ea-971a-021304b0cca7&origin_path=%2F) and into the current directory. +Note that I did this by setting up [Globus Connect Personal](https://www.globus.org/globus-connect-personal) and +then downloading to a scoped location on my computer, and then moving to the directory here. 
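+Before extracting, it's worth a quick sanity check that the archive came over intact and
+that the disk has room for the duplicated copies the install script will make. For example
+(using the n512 archive name from this walkthrough):
+
+```bash
+$ ls -lh deepcam-data-n512.tgz
+$ df -h .
+```
+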
+First, extract the data (make sure you have ~50GB of space): + +```bash +$ tar -xzvf deepcam-data-n512.tgz +$ chmod +x install_mini_dataset.sh +``` + +This will extract the data to a directory, `deepcam-data-n512` and then we can run the script to prepare it: + +```bash +$ mkdir -p ./data +$ ./install_mini_dataset.sh ./deepcam-data-n512 ./data +``` + +This will basically copy the data over, and create the needed structure for training, etc. +It should look like this, with most of the files under "training": + +```bash +$ ls ./data +stats.h5 train validation +``` + +Note that the root directory here is bound to /tmp/workflow in our cluster, so it should +show up as `/tmp/workflow/data`. + +## Training + +Now that we have our data ready, we can create the minicluster (which will pull the container to run the job) +Note that we will use default parameters, but you can learn more about the defaults and parameters +[in the repository](https://github.com/mlcommons/hpc/tree/main/deepcam). + +Then create the MiniCluster to use them! Let's hope your computer doesn't run out of space, or something like that. + +```bash +$ kubectl apply -f minicluster.yaml +``` + +**WIP** need larger computer... diff --git a/examples/machine-learning/mlcommons-deepcam/install_mini_dataset.sh b/examples/machine-learning/mlcommons-deepcam/install_mini_dataset.sh new file mode 100755 index 00000000..93af849d --- /dev/null +++ b/examples/machine-learning/mlcommons-deepcam/install_mini_dataset.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# This script will take the downloaded small batch of data files, +# make the required number of duplicates, and install in the specified +# directory in train/ and validation/ subfolders. + +if [ $# -lt 2 ]; then + echo "Usage:" + echo " $0 DOWNLOADED_DATA_DIR INSTALLATION_TARGET_DIR [NUM_COPIES]" + exit 1 +fi + +sourceDir=$1 +targetDir=$2 +numCopies=1 +if [ $# -ge 3 ]; then + numCopies=$3 +fi + +# First, we prepare the train directory by duplicating every file numCopies times +mkdir -p $targetDir/train +for f in $(ls $sourceDir | grep "data-.*.h5"); do + echo $f + for (( i=0; i<$numCopies; i++ )); do + outFile=$targetDir/train/${f/.h5/-$i.h5} + echo " $outFile" + cp $sourceDir/$f $outFile + done +done + +# Copy in the stats file +cp $sourceDir/stats.h5 $targetDir/ + +# Now copy the training directory to the validation directory +cp -r $targetDir/train $targetDir/validation diff --git a/examples/machine-learning/mlcommons-deepcam/minicluster.yaml b/examples/machine-learning/mlcommons-deepcam/minicluster.yaml new file mode 100755 index 00000000..2536a2de --- /dev/null +++ b/examples/machine-learning/mlcommons-deepcam/minicluster.yaml @@ -0,0 +1,50 @@ +apiVersion: flux-framework.org/v1alpha1 +kind: MiniCluster +metadata: + name: flux-sample + namespace: flux-operator +spec: + + # IMPORTANT: see the README.md to see how to prepare data first! 
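+  # After install_mini_dataset.sh has been run (see README), the bound copy at
+  # /tmp/workflow/data should contain stats.h5 plus train/ and validation/ subfolders.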
+ # You should have a local ./data folder with training and stats + # Number of pods to create for MiniCluster + size: 4 + tasks: 2 + interactive: true + + # Make this kind of persistent volume and claim available to pods + # This is a path in minikube (e.g., minikube ssh) + volumes: + data: + storageClass: hostpath + path: /tmp/workflow + + # This is a list because a pod can support multiple containers + containers: + # The container URI to pull (currently needs to be public) + - image: ghcr.io/rse-ops/singularity:tag-mamba + cores: 4 + + # This will run with the defaults, targeting our ./data directory + command: singularity exec --pwd /opt/deepCam/run_scripts ./deepcam.sif /bin/bash ./run_training.sh + workingDir: /tmp/workflow + + # This pulls the container (once) by the broker to workingDir /data + commands: + pre: mkdir -p /tmp/workflow/output + brokerPre: | + if [[ ! -e "/tmp/workflow/deepcam.sif" ]]; then + singularity pull /tmp/workflow/deepcam.sif docker://ghcr.io/rse-ops/mlcommons-deepcam:tag-21.12-py3 + fi + + fluxUser: + name: fluxuser + + # Container will be pre-pulled here only by the broker + volumes: + data: + path: /tmp/workflow + + # Running a container in a container + securityContext: + privileged: true \ No newline at end of file diff --git a/examples/machine-learning/mlcommons-deepcam/run_training.sh b/examples/machine-learning/mlcommons-deepcam/run_training.sh new file mode 100644 index 00000000..74264197 --- /dev/null +++ b/examples/machine-learning/mlcommons-deepcam/run_training.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# The MIT License (MIT) +# +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +# parameters +data_dir="/tmp/workflow/data" +output_dir="/tmp/workflow/output" +run_tag="test_run" +local_batch_size=2 + +python ./train.py \ + --wireup_method "dummy" \ + --run_tag ${run_tag} \ + --data_dir_prefix ${data_dir} \ + --output_dir ${output_dir} \ + --model_prefix "segmentation" \ + --optimizer "LAMB" \ + --adam_eps 1e-6 \ + --start_lr 0.0055 \ + --lr_schedule type="multistep",milestones="800",decay_rate="0.1" \ + --lr_warmup_steps 400 \ + --lr_warmup_factor 1. 
\ + --weight_decay 1e-2 \ + --logging_frequency 10 \ + --save_frequency 0 \ + --max_epochs 200 \ + --max_inter_threads 4 \ + --seed $(date +%s) \ + --batchnorm_group_size 1 \ + --local_batch_size ${local_batch_size} \ No newline at end of file From 8cd64b2545ef256bd72169c58df8519cfa754796 Mon Sep 17 00:00:00 2001 From: vsoch Date: Thu, 13 Apr 2023 01:05:20 -0600 Subject: [PATCH 2/3] add back accidentally removed bit Signed-off-by: vsoch --- examples/launchers/merlin/singularity-openfoam/README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/launchers/merlin/singularity-openfoam/README.md b/examples/launchers/merlin/singularity-openfoam/README.md index f7365277..57ed5df9 100755 --- a/examples/launchers/merlin/singularity-openfoam/README.md +++ b/examples/launchers/merlin/singularity-openfoam/README.md @@ -20,6 +20,13 @@ And the Flux Operator namespace created: $ kubectl create namespace flux-operator ``` +And then generate the (separate) pods to run redis and rabbitmq in the flux-operator namespace. +The containers already have shared certificates (just for this test case)! + +```bash +$ kubectl create -f ../services.yaml +``` + And create the MiniCluster to use them! ```bash From e9b78c71ddf93d8c6a15d70be979cd9a6fdc781d Mon Sep 17 00:00:00 2001 From: vsoch Date: Thu, 13 Apr 2023 14:13:04 -0600 Subject: [PATCH 3/3] save state of mlcommons workflow Signed-off-by: vsoch --- .../mlcommons-deepcam/README.md | 14 +- .../mlcommons-deepcam/minicluster.yaml | 4 +- .../mlcommons-deepcam/run_training.sh | 9 +- .../mlcommons-deepcam/train.py | 260 ++++++++++++++++++ 4 files changed, 279 insertions(+), 8 deletions(-) mode change 100644 => 100755 examples/machine-learning/mlcommons-deepcam/run_training.sh create mode 100755 examples/machine-learning/mlcommons-deepcam/train.py diff --git a/examples/machine-learning/mlcommons-deepcam/README.md b/examples/machine-learning/mlcommons-deepcam/README.md index 16c74c76..ee28503d 100755 --- a/examples/machine-learning/mlcommons-deepcam/README.md +++ b/examples/machine-learning/mlcommons-deepcam/README.md @@ -8,10 +8,18 @@ The workflow is provided from [mlcommons/deepcam](https://github.com/mlcommons/h ## Create MiniCluster -First, cd to the directory here, and create the kind cluster: +First, cd to the directory here, and create the minikube cluster (kind did not work to create a sandbox for the SIF): ```bash -$ kind create cluster --config ../../kind-config.yaml +$ minikube start +``` + +If you use minikube, you'll want to create a mount: + +```bash +$ minikube mount $PWD/:/tmp/workflow +$ docker pull ghcr.io/rse-ops/singularity:tag-mamba +$ minikube image load ghcr.io/rse-ops/singularity:tag-mamba ``` And the Flux Operator namespace created: @@ -71,4 +79,4 @@ Then create the MiniCluster to use them! Let's hope your computer doesn't run ou $ kubectl apply -f minicluster.yaml ``` -**WIP** need larger computer... +**WIP** this likely will work, but needs to be tested on a machine with GPU, etc. It will not work on a CPU. diff --git a/examples/machine-learning/mlcommons-deepcam/minicluster.yaml b/examples/machine-learning/mlcommons-deepcam/minicluster.yaml index 2536a2de..ac04fa34 100755 --- a/examples/machine-learning/mlcommons-deepcam/minicluster.yaml +++ b/examples/machine-learning/mlcommons-deepcam/minicluster.yaml @@ -8,7 +8,7 @@ spec: # IMPORTANT: see the README.md to see how to prepare data first! 
# You should have a local ./data folder with training and stats # Number of pods to create for MiniCluster - size: 4 + size: 2 tasks: 2 interactive: true @@ -26,7 +26,7 @@ spec: cores: 4 # This will run with the defaults, targeting our ./data directory - command: singularity exec --pwd /opt/deepCam/run_scripts ./deepcam.sif /bin/bash ./run_training.sh + command: singularity exec --pwd /opt/deepCam ./deepcam.sif /bin/bash /tmp/workflow/run_training.sh workingDir: /tmp/workflow # This pulls the container (once) by the broker to workingDir /data diff --git a/examples/machine-learning/mlcommons-deepcam/run_training.sh b/examples/machine-learning/mlcommons-deepcam/run_training.sh old mode 100644 new mode 100755 index 74264197..f9e57055 --- a/examples/machine-learning/mlcommons-deepcam/run_training.sh +++ b/examples/machine-learning/mlcommons-deepcam/run_training.sh @@ -22,19 +22,19 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # parameters +# IMPORTANT: use absolute paths data_dir="/tmp/workflow/data" output_dir="/tmp/workflow/output" run_tag="test_run" local_batch_size=2 -python ./train.py \ +python /tmp/workflow/train.py \ --wireup_method "dummy" \ --run_tag ${run_tag} \ --data_dir_prefix ${data_dir} \ --output_dir ${output_dir} \ --model_prefix "segmentation" \ --optimizer "LAMB" \ - --adam_eps 1e-6 \ --start_lr 0.0055 \ --lr_schedule type="multistep",milestones="800",decay_rate="0.1" \ --lr_warmup_steps 400 \ @@ -46,4 +46,7 @@ python ./train.py \ --max_inter_threads 4 \ --seed $(date +%s) \ --batchnorm_group_size 1 \ - --local_batch_size ${local_batch_size} \ No newline at end of file + --local_batch_size ${local_batch_size} + +# Removed (not an argument) +# --adam_eps 1e-6 \ diff --git a/examples/machine-learning/mlcommons-deepcam/train.py b/examples/machine-learning/mlcommons-deepcam/train.py new file mode 100755 index 00000000..06f5cd21 --- /dev/null +++ b/examples/machine-learning/mlcommons-deepcam/train.py @@ -0,0 +1,260 @@ +# The MIT License (MIT) +# +# Copyright (c) 2018 Pyjcsx +# Modifications Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
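+
+# NOTE: run_training.sh invokes this script as /tmp/workflow/train.py; the deepCam
+# helper modules imported below (utils, data, architecture, driver) are expected to
+# live under /opt/deepCam inside the container, which is prepended to sys.path below.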
+ +# Basics +import os +import numpy as np +import datetime as dt +import subprocess as sp +import sys + +# Utils is here +sys.path.insert(0, "/opt/deepCam") + +# logging +import utils.mlperf_log_utils as mll + +# Torch +import torch +import torch.optim as optim +from torch.autograd import Variable + +# Custom +from driver import train_epoch, validate +from utils import parser +from utils import losses +from utils import optimizer_helpers as oh +from utils import bnstats as bns +from data import get_dataloaders, get_datashapes +from architecture import deeplab_xception + +# DDP +import torch.distributed as dist +from torch.nn.parallel.distributed import DistributedDataParallel as DDP + +#comm wrapper +from utils import comm + +#main function +def main(pargs): + + #init distributed training + comm_local_group = comm.init(pargs.wireup_method, pargs.batchnorm_group_size) + comm_rank = comm.get_rank() + comm_local_rank = comm.get_local_rank() + comm_size = comm.get_size() + comm_local_size = comm.get_local_size() + + # set up logging + pargs.logging_frequency = max([pargs.logging_frequency, 0]) + log_file = os.path.normpath(os.path.join(pargs.output_dir, "logs", pargs.run_tag + ".log")) + logger = mll.mlperf_logger(log_file, "deepcam", "Umbrella Corp.") + logger.log_start(key = "init_start", sync = True) + logger.log_event(key = "cache_clear") + + #set seed + seed = pargs.seed + logger.log_event(key = "seed", value = seed) + + # Some setup + torch.manual_seed(seed) + if torch.cuda.is_available(): + device = torch.device("cuda", comm_local_rank) + torch.cuda.manual_seed(seed) + torch.cuda.set_device(device) + torch.backends.cudnn.benchmark = True + else: + device = torch.device("cpu") + + #set up directories + root_dir = os.path.join(pargs.data_dir_prefix) + output_dir = pargs.output_dir + plot_dir = os.path.join(output_dir, "plots") + if comm_rank == 0: + if not os.path.isdir(output_dir): + os.makedirs(output_dir) + + # logging of rank information + logger.log_event(key = "number_of_ranks", value = comm_size) + logger.log_event(key = "number_of_nodes", value = (comm_size // comm_local_size)) + logger.log_event(key = "accelerators_per_node", value = comm_local_size) + + # Logging hyperparameters + logger.log_event(key = "global_batch_size", value = (pargs.local_batch_size * comm_size)) + logger.log_event(key = "batchnorm_group_size", value = pargs.batchnorm_group_size) + logger.log_event(key = "gradient_accumulation_frequency", value = pargs.gradient_accumulation_frequency) + logger.log_event(key = "checkpoint", value = pargs.checkpoint) + + # Define architecture + n_input_channels = len(pargs.channels) + n_output_channels = 3 + net = deeplab_xception.DeepLabv3_plus(n_input = n_input_channels, + n_classes = n_output_channels, + os=16, pretrained=False, + rank = comm_rank, + process_group = comm_local_group) + net.to(device) + + #select loss + #some magic numbers + loss_pow = -0.125 + class_weights = [0.986267818390377**loss_pow, 0.0004578708870701058**loss_pow, 0.01327431072255291**loss_pow] + # extract loss + criterion = losses.CELoss(class_weights).to(device) + criterion = torch.jit.script(criterion) + + #select optimizer + optimizer = oh.get_optimizer(pargs, net, logger) + + #restart from checkpoint if desired + if pargs.checkpoint is not None: + checkpoint = torch.load(pargs.checkpoint, map_location = device) + start_step = checkpoint['step'] + start_epoch = checkpoint['epoch'] + optimizer.load_state_dict(checkpoint['optimizer']) + net.load_state_dict(checkpoint['model']) + else: + 
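+        # no checkpoint supplied: start counting steps and epochs from zero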
start_step = 0 + start_epoch = 0 + + #broadcast model and optimizer state + steptens = torch.tensor(np.array([start_step, start_epoch]), requires_grad=False).to(device) + if dist.is_initialized(): + dist.broadcast(steptens, src = 0) + + #unpack the bcasted tensor + start_step = int(steptens.cpu().numpy()[0]) + start_epoch = int(steptens.cpu().numpy()[1]) + + #select scheduler + scheduler = None + if pargs.lr_schedule: + pargs.lr_schedule["lr_warmup_steps"] = pargs.lr_warmup_steps + pargs.lr_schedule["lr_warmup_factor"] = pargs.lr_warmup_factor + scheduler = oh.get_lr_schedule(pargs.start_lr, pargs.lr_schedule, optimizer, logger, last_step = start_step) + + # print parameters + if comm_rank == 0: + print(net) + print("Total number of elements:", sum(p.numel() for p in net.parameters() if p.requires_grad)) + + # get input shapes for the upcoming model preprocessing + # input_shape: + tshape, _ = get_datashapes(pargs, root_dir) + input_shape = tuple([tshape[2], tshape[0], tshape[1]]) + + #distributed model parameters + bucket_cap_mb = 25 + if pargs.batchnorm_group_size > 1: + bucket_cap_mb = 220 + + # get stream, relevant for graph capture + ddp_net = DDP(net, device_ids=[device.index], + output_device=device.index, + find_unused_parameters=True, + broadcast_buffers=False, + bucket_cap_mb=bucket_cap_mb, + gradient_as_bucket_view=False) + + # get stats handler here + bnstats_handler = bns.BatchNormStatsSynchronize(ddp_net, reduction = "mean", inplace = True) + + # create handles + net_validate = ddp_net + net_train = ddp_net + + # Set up the data feeder + train_loader, train_size, validation_loader, validation_size = get_dataloaders(pargs, root_dir, device, seed, comm_size, comm_rank) + + # log size of datasets + logger.log_event(key = "train_samples", value = train_size) + val_size = validation_size + logger.log_event(key = "eval_samples", value = val_size) + + # get start steps + step = start_step + epoch = start_epoch + current_lr = pargs.start_lr if not pargs.lr_schedule else scheduler.get_last_lr()[0] + stop_training = False + net_train.train() + + # start trining + logger.log_end(key = "init_stop", sync = True) + logger.log_start(key = "run_start", sync = True) + + # training loop + while True: + + # start epoch + logger.log_start(key = "epoch_start", metadata = {'epoch_num': epoch+1, 'step_num': step}, sync=True) + + train_loader.sampler.set_epoch(epoch) + + # training + step = train_epoch(pargs, comm_rank, comm_size, + device, step, epoch, + net_train, criterion, + optimizer, scheduler, + train_loader, + logger) + + # average BN stats + bnstats_handler.synchronize() + + # validation + stop_training = validate(pargs, comm_rank, comm_size, + device, step, epoch, + net_validate, criterion, validation_loader, + logger) + + # log the epoch + logger.log_end(key = "epoch_stop", metadata = {'epoch_num': epoch+1, 'step_num': step}, sync = True) + epoch += 1 + + #save model if desired + if (pargs.save_frequency > 0) and (epoch % pargs.save_frequency == 0): + logger.log_start(key = "save_start", metadata = {'epoch_num': epoch+1, 'step_num': step}, sync = True) + if comm_rank == 0: + checkpoint = { + 'step': step, + 'epoch': epoch, + 'model': net_train.state_dict(), + 'optimizer': optimizer.state_dict() + } + torch.save(checkpoint, os.path.join(output_dir, pargs.model_prefix + "_step_" + str(step) + ".cpt") ) + logger.log_end(key = "save_stop", metadata = {'epoch_num': epoch+1, 'step_num': step}, sync = True) + + # are we done? 
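+        # stop once max_epochs is reached or validate() has signalled early stopping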
+ if (epoch >= pargs.max_epochs) or stop_training: + break + + # run done + logger.log_end(key = "run_stop", sync = True, metadata = {'status' : 'success'}) + + +if __name__ == "__main__": + + #arguments + pargs = parser.parse_arguments() + + #run the stuff + main(pargs)