From a75ec17c0c2ce289fc952e2ea2c325e9ea5b1197 Mon Sep 17 00:00:00 2001
From: michalm <michalm@nvidia.com>
Date: Tue, 10 Oct 2023 21:30:10 +0200
Subject: [PATCH 01/12] Move epochs to samples

---
 image_segmentation/pytorch/main.py            |  2 +-
 .../pytorch/runtime/training.py               | 25 ++++++++++++-------
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/image_segmentation/pytorch/main.py b/image_segmentation/pytorch/main.py
index c3ab0a27f..a66a84e8b 100644
--- a/image_segmentation/pytorch/main.py
+++ b/image_segmentation/pytorch/main.py
@@ -64,7 +64,7 @@ def main():
 
     if flags.exec_mode == 'train':
         train(flags, model, train_dataloader, val_dataloader, loss_fn, score_fn,
-              device=device, callbacks=callbacks, is_distributed=is_distributed)
+              device=device, callbacks=callbacks, is_distributed=is_distributed, samples_per_epoch=samples_per_epoch)
 
     elif flags.exec_mode == 'evaluate':
         eval_metrics = evaluate(flags, model, val_dataloader, loss_fn, score_fn,
diff --git a/image_segmentation/pytorch/runtime/training.py b/image_segmentation/pytorch/runtime/training.py
index 030adde78..88b294d4d 100644
--- a/image_segmentation/pytorch/runtime/training.py
+++ b/image_segmentation/pytorch/runtime/training.py
@@ -30,7 +30,8 @@ def lr_warmup(optimizer, init_lr, lr, current_epoch, warmup_epochs):
         param_group['lr'] = init_lr + (lr - init_lr) * scale
 
 
-def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, callbacks, is_distributed):
+def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, callbacks,
+          is_distributed, samples_per_epoch):
     rank = get_rank()
     world_size = get_world_size()
     torch.backends.cudnn.benchmark = flags.cudnn_benchmark
@@ -52,6 +53,7 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal
 
     is_successful = False
     diverged = False
+    epoch = 1
     next_eval_at = flags.start_eval_at
     model.train()
     for callback in callbacks:
@@ -61,8 +63,9 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal
         if epoch <= flags.lr_warmup_epochs and flags.lr_warmup_epochs > 0:
             lr_warmup(optimizer, flags.init_learning_rate, flags.learning_rate, epoch, flags.lr_warmup_epochs)
         mllog_start(key=CONSTANTS.BLOCK_START, sync=False,
-                    metadata={CONSTANTS.FIRST_EPOCH_NUM: epoch, CONSTANTS.EPOCH_COUNT: 1})
-        mllog_start(key=CONSTANTS.EPOCH_START, metadata={CONSTANTS.EPOCH_NUM: epoch}, sync=False)
+                    metadata={CONSTANTS.FIRST_EPOCH_NUM: epoch * samples_per_epoch,
+                              CONSTANTS.EPOCH_COUNT: samples_per_epoch})
+        mllog_start(key=CONSTANTS.EPOCH_START, metadata={CONSTANTS.EPOCH_NUM: epoch * samples_per_epoch}, sync=False)
 
         if is_distributed:
             train_loader.sampler.set_epoch(epoch)
@@ -98,7 +101,8 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal
             cumulative_loss.append(loss_value)
 
         mllog_end(key=CONSTANTS.EPOCH_STOP, sync=False,
-                  metadata={CONSTANTS.EPOCH_NUM: epoch, 'current_lr': optimizer.param_groups[0]['lr']})
+                  metadata={CONSTANTS.EPOCH_NUM: epoch * samples_per_epoch,
+                            'current_lr': optimizer.param_groups[0]['lr']})
 
         if flags.lr_decay_epochs:
             scheduler.step()
@@ -106,16 +110,17 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal
         if epoch == next_eval_at:
             next_eval_at += flags.evaluate_every
             del output
-            mllog_start(key=CONSTANTS.EVAL_START, value=epoch, metadata={CONSTANTS.EPOCH_NUM: epoch}, sync=False)
+            mllog_start(key=CONSTANTS.EVAL_START, value=epoch * samples_per_epoch,
+                        metadata={CONSTANTS.EPOCH_NUM: epoch * samples_per_epoch}, sync=False)
 
             eval_metrics = evaluate(flags, model, val_loader, loss_fn, score_fn, device, epoch)
             eval_metrics["train_loss"] = sum(cumulative_loss) / len(cumulative_loss)
 
             mllog_event(key=CONSTANTS.EVAL_ACCURACY,
                         value=eval_metrics["mean_dice"],
-                        metadata={CONSTANTS.EPOCH_NUM: epoch},
+                        metadata={CONSTANTS.EPOCH_NUM: epoch * samples_per_epoch},
                         sync=False)
-            mllog_end(key=CONSTANTS.EVAL_STOP, metadata={CONSTANTS.EPOCH_NUM: epoch}, sync=False)
+            mllog_end(key=CONSTANTS.EVAL_STOP, metadata={CONSTANTS.EPOCH_NUM: epoch * samples_per_epoch}, sync=False)
 
             for callback in callbacks:
                 callback.on_epoch_end(epoch=epoch, metrics=eval_metrics, model=model, optimizer=optimizer)
@@ -127,12 +132,14 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal
                 diverged = True
 
         mllog_end(key=CONSTANTS.BLOCK_STOP, sync=False,
-                  metadata={CONSTANTS.FIRST_EPOCH_NUM: epoch, CONSTANTS.EPOCH_COUNT: 1})
+                  metadata={CONSTANTS.FIRST_EPOCH_NUM: epoch * samples_per_epoch,
+                            CONSTANTS.EPOCH_COUNT: samples_per_epoch})
 
         if is_successful or diverged:
             break
 
     mllog_end(key=CONSTANTS.RUN_STOP, sync=True,
-              metadata={CONSTANTS.STATUS: CONSTANTS.SUCCESS if is_successful else CONSTANTS.ABORTED})
+              metadata={CONSTANTS.STATUS: CONSTANTS.SUCCESS if is_successful else CONSTANTS.ABORTED,
+                        CONSTANTS.EPOCH_COUNT: epoch * samples_per_epoch})
     for callback in callbacks:
         callback.on_fit_end()

From fde6d6f98009d1691383864b7c442cb6efaede7d Mon Sep 17 00:00:00 2001
From: michalm <michalm@nvidia.com>
Date: Sun, 10 Dec 2023 10:46:14 +0100
Subject: [PATCH 02/12] Add dummy training, test epochless training

---
 .../pytorch/data_loading/data_loader.py       |   2 +-
 .../pytorch/data_loading/pytorch_loader.py    |   2 +-
 image_segmentation/pytorch/main.py            |   3 +-
 .../pytorch/runtime/arguments.py              |   4 +-
 .../pytorch/runtime/dummy_training.py         | 106 ++++++++++++++++++
 .../pytorch/runtime/training.py               |  85 ++++++--------
 6 files changed, 149 insertions(+), 53 deletions(-)
 create mode 100644 image_segmentation/pytorch/runtime/dummy_training.py

diff --git a/image_segmentation/pytorch/data_loading/data_loader.py b/image_segmentation/pytorch/data_loading/data_loader.py
index c4c80b51d..c3ef531f1 100644
--- a/image_segmentation/pytorch/data_loading/data_loader.py
+++ b/image_segmentation/pytorch/data_loading/data_loader.py
@@ -90,7 +90,7 @@ def get_data_loaders(flags, num_shards, global_rank):
         raise ValueError(f"Loader {flags.loader} unknown. Valid loaders are: synthetic, pytorch")
 
     # The DistributedSampler seed should be the same for all workers
-    train_sampler = DistributedSampler(train_dataset, seed=flags.shuffling_seed, drop_last=True) if num_shards > 1 else None
+    train_sampler = None, DistributedSampler(train_dataset, seed=flags.shuffling_seed, drop_last=True) if num_shards > 1 else None
     val_sampler = None
 
     train_dataloader = DataLoader(train_dataset,
diff --git a/image_segmentation/pytorch/data_loading/pytorch_loader.py b/image_segmentation/pytorch/data_loading/pytorch_loader.py
index bb71d32f7..aa871153a 100644
--- a/image_segmentation/pytorch/data_loading/pytorch_loader.py
+++ b/image_segmentation/pytorch/data_loading/pytorch_loader.py
@@ -143,7 +143,7 @@ def __init__(self, images, labels, **kwargs):
         self.rand_crop = RandBalancedCrop(patch_size=patch_size, oversampling=oversampling)
 
     def __len__(self):
-        return len(self.images)
+        return 1e9 #len(self.images)
 
     def __getitem__(self, idx):
         data = {"image": np.load(self.images[idx]), "label": np.load(self.labels[idx])}
diff --git a/image_segmentation/pytorch/main.py b/image_segmentation/pytorch/main.py
index a66a84e8b..2f1652945 100644
--- a/image_segmentation/pytorch/main.py
+++ b/image_segmentation/pytorch/main.py
@@ -8,7 +8,8 @@
 
 from data_loading.data_loader import get_data_loaders
 
-from runtime.training import train
+# from runtime.training import train
+from runtime.dummy_training import train
 from runtime.inference import evaluate
 from runtime.arguments import PARSER
 from runtime.distributed_utils import init_distributed, get_world_size, get_device, is_main_process, get_rank
diff --git a/image_segmentation/pytorch/runtime/arguments.py b/image_segmentation/pytorch/runtime/arguments.py
index bc7530633..9ee9c6a67 100644
--- a/image_segmentation/pytorch/runtime/arguments.py
+++ b/image_segmentation/pytorch/runtime/arguments.py
@@ -27,8 +27,8 @@
 PARSER.add_argument('--optimizer', dest='optimizer', default="sgd", choices=["sgd", "adam", "lamb"], type=str)
 PARSER.add_argument('--learning_rate', dest='learning_rate', type=float, default=1.0)
 PARSER.add_argument('--init_learning_rate', dest='init_learning_rate', type=float, default=1e-4)
-PARSER.add_argument('--lr_warmup_epochs', dest='lr_warmup_epochs', type=int, default=0)
-PARSER.add_argument('--lr_decay_epochs', nargs='+', type=int, default=[])
+PARSER.add_argument('--lr_warmup_samples', dest='lr_warmup_samples', type=int, default=0)
+PARSER.add_argument('--lr_decay_samples', nargs='+', type=int, default=[])
 PARSER.add_argument('--lr_decay_factor', dest='lr_decay_factor', type=float, default=1.0)
 PARSER.add_argument('--lamb_betas', nargs='+', type=int, default=[0.9, 0.999])
 PARSER.add_argument('--momentum', dest='momentum', type=float, default=0.9)
diff --git a/image_segmentation/pytorch/runtime/dummy_training.py b/image_segmentation/pytorch/runtime/dummy_training.py
new file mode 100644
index 000000000..31f93cf84
--- /dev/null
+++ b/image_segmentation/pytorch/runtime/dummy_training.py
@@ -0,0 +1,106 @@
+from tqdm import tqdm
+
+import torch
+from torch.optim import Adam, SGD
+from torch.cuda.amp import autocast, GradScaler
+
+from runtime.distributed_utils import get_rank, reduce_tensor, get_world_size
+from runtime.inference import evaluate
+from runtime.logging import mllog_event, mllog_start, mllog_end, CONSTANTS
+
+
+def get_optimizer(params, flags):
+    if flags.optimizer == "adam":
+        optim = Adam(params, lr=flags.learning_rate, weight_decay=flags.weight_decay)
+    elif flags.optimizer == "sgd":
+        optim = SGD(params, lr=flags.learning_rate, momentum=flags.momentum, nesterov=True,
+                    weight_decay=flags.weight_decay)
+    elif flags.optimizer == "lamb":
+        import apex
+        optim = apex.optimizers.FusedLAMB(params, lr=flags.learning_rate, betas=flags.lamb_betas,
+                                          weight_decay=flags.weight_decay)
+    else:
+        raise ValueError("Optimizer {} unknown.".format(flags.optimizer))
+    return optim
+
+
+def lr_warmup(optimizer, init_lr, lr, current_samples, warmup_samples):
+    scale = current_samples / warmup_samples
+    for param_group in optimizer.param_groups:
+        param_group['lr'] = init_lr + (lr - init_lr) * scale
+
+
+def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, callbacks,
+          is_distributed, samples_per_epoch):
+    rank = get_rank()
+    world_size = get_world_size()
+    torch.backends.cudnn.benchmark = flags.cudnn_benchmark
+    torch.backends.cudnn.deterministic = flags.cudnn_deterministic
+
+    optimizer = get_optimizer(model.parameters(), flags)
+    if flags.lr_decay_epochs:
+        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
+                                                         milestones=flags.lr_decay_epochs,
+                                                         gamma=flags.lr_decay_factor)
+    scaler = GradScaler()
+
+    model.to(device)
+    loss_fn.to(device)
+    if is_distributed:
+        model = torch.nn.parallel.DistributedDataParallel(model,
+                                                          device_ids=[flags.local_rank],
+                                                          output_device=flags.local_rank)
+
+    is_successful = False
+    diverged = False
+    total_samples = 0
+    iteration = 0
+    next_eval_at = flags.start_eval_at
+    model.train()
+    train_loader = iter(train_loader)
+    for callback in callbacks:
+        callback.on_fit_start()
+
+    while not diverged and not is_successful:
+        mllog_start(key=CONSTANTS.BLOCK_START, sync=False,
+                    metadata={CONSTANTS.FIRST_EPOCH_NUM: total_samples,
+                              CONSTANTS.EPOCH_COUNT: next_eval_at})
+
+        while total_samples < next_eval_at:
+            if total_samples <= flags.lr_warmup_epochs * samples_per_epoch and flags.lr_warmup_epochs > 0:
+                lr_warmup(optimizer, flags.init_learning_rate, flags.learning_rate, total_samples, flags.lr_warmup_samples)
+
+            optimizer.zero_grad()
+            # for iteration, batch in enumerate(tqdm(train_loader, disable=(rank != 0) or not flags.verbose)):
+
+            batch = next(train_loader)
+            total_samples = flags.batch_size * world_size
+
+            image, label = batch
+            # image, label = image.to(device), label.to(device)
+
+            iteration += 1
+            print(total_samples)
+
+
+        # Evaluation
+        mllog_start(key=CONSTANTS.EVAL_START, value=total_samples,
+                    metadata={CONSTANTS.EPOCH_NUM: total_samples}, sync=False)
+
+
+
+        mllog_end(key=CONSTANTS.EVAL_STOP, metadata={CONSTANTS.EPOCH_NUM: total_samples}, sync=False)
+
+        model.train()
+
+        mllog_end(key=CONSTANTS.BLOCK_STOP, sync=False,
+                  metadata={CONSTANTS.FIRST_EPOCH_NUM: total_samples,
+                            CONSTANTS.EPOCH_COUNT: next_eval_at})
+        next_eval_at += flags.evaluate_every
+
+
+    mllog_end(key=CONSTANTS.RUN_STOP, sync=True,
+              metadata={CONSTANTS.STATUS: CONSTANTS.SUCCESS if is_successful else CONSTANTS.ABORTED,
+                        CONSTANTS.EPOCH_COUNT: total_samples})
+    for callback in callbacks:
+        callback.on_fit_end()
diff --git a/image_segmentation/pytorch/runtime/training.py b/image_segmentation/pytorch/runtime/training.py
index 88b294d4d..8fac8dbda 100644
--- a/image_segmentation/pytorch/runtime/training.py
+++ b/image_segmentation/pytorch/runtime/training.py
@@ -24,8 +24,8 @@ def get_optimizer(params, flags):
     return optim
 
 
-def lr_warmup(optimizer, init_lr, lr, current_epoch, warmup_epochs):
-    scale = current_epoch / warmup_epochs
+def lr_warmup(optimizer, init_lr, lr, current_samples, warmup_samples):
+    scale = current_samples / warmup_samples
     for param_group in optimizer.param_groups:
         param_group['lr'] = init_lr + (lr - init_lr) * scale
 
@@ -53,26 +53,29 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal
 
     is_successful = False
     diverged = False
-    epoch = 1
+    total_samples = 0
+    iteration = 0
     next_eval_at = flags.start_eval_at
     model.train()
+    train_loader = iter(train_loader)
     for callback in callbacks:
         callback.on_fit_start()
-    for epoch in range(1, flags.epochs + 1):
-        cumulative_loss = []
-        if epoch <= flags.lr_warmup_epochs and flags.lr_warmup_epochs > 0:
-            lr_warmup(optimizer, flags.init_learning_rate, flags.learning_rate, epoch, flags.lr_warmup_epochs)
+
+    while not diverged and not is_successful:
         mllog_start(key=CONSTANTS.BLOCK_START, sync=False,
-                    metadata={CONSTANTS.FIRST_EPOCH_NUM: epoch * samples_per_epoch,
-                              CONSTANTS.EPOCH_COUNT: samples_per_epoch})
-        mllog_start(key=CONSTANTS.EPOCH_START, metadata={CONSTANTS.EPOCH_NUM: epoch * samples_per_epoch}, sync=False)
+                    metadata={CONSTANTS.FIRST_EPOCH_NUM: total_samples,
+                              CONSTANTS.EPOCH_COUNT: next_eval_at})
+
+        while total_samples < next_eval_at:
+            if total_samples <= flags.lr_warmup_epochs * samples_per_epoch and flags.lr_warmup_epochs > 0:
+                lr_warmup(optimizer, flags.init_learning_rate, flags.learning_rate, total_samples, flags.lr_warmup_samples)
+
+            optimizer.zero_grad()
+            # for iteration, batch in enumerate(tqdm(train_loader, disable=(rank != 0) or not flags.verbose)):
 
-        if is_distributed:
-            train_loader.sampler.set_epoch(epoch)
+            batch = next(train_loader)
+            total_samples = flags.batch_size * world_size
 
-        loss_value = None
-        optimizer.zero_grad()
-        for iteration, batch in enumerate(tqdm(train_loader, disable=(rank != 0) or not flags.verbose)):
             image, label = batch
             image, label = image.to(device), label.to(device)
             for callback in callbacks:
@@ -96,50 +99,36 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal
                     optimizer.step()
 
                 optimizer.zero_grad()
+            iteration += 1
 
-            loss_value = reduce_tensor(loss_value, world_size).detach().cpu().numpy()
-            cumulative_loss.append(loss_value)
 
-        mllog_end(key=CONSTANTS.EPOCH_STOP, sync=False,
-                  metadata={CONSTANTS.EPOCH_NUM: epoch * samples_per_epoch,
-                            'current_lr': optimizer.param_groups[0]['lr']})
+        # Evaluation
 
-        if flags.lr_decay_epochs:
-            scheduler.step()
+        del output
+        mllog_start(key=CONSTANTS.EVAL_START, value=total_samples,
+                    metadata={CONSTANTS.EPOCH_NUM: total_samples}, sync=False)
 
-        if epoch == next_eval_at:
-            next_eval_at += flags.evaluate_every
-            del output
-            mllog_start(key=CONSTANTS.EVAL_START, value=epoch * samples_per_epoch,
-                        metadata={CONSTANTS.EPOCH_NUM: epoch * samples_per_epoch}, sync=False)
+        eval_metrics = evaluate(flags, model, val_loader, loss_fn, score_fn, device, total_samples)
 
-            eval_metrics = evaluate(flags, model, val_loader, loss_fn, score_fn, device, epoch)
-            eval_metrics["train_loss"] = sum(cumulative_loss) / len(cumulative_loss)
+        mllog_event(key=CONSTANTS.EVAL_ACCURACY, value=eval_metrics["mean_dice"],
+                    metadata={CONSTANTS.EPOCH_NUM: total_samples}, sync=False)
+        mllog_end(key=CONSTANTS.EVAL_STOP, metadata={CONSTANTS.EPOCH_NUM: total_samples}, sync=False)
 
-            mllog_event(key=CONSTANTS.EVAL_ACCURACY,
-                        value=eval_metrics["mean_dice"],
-                        metadata={CONSTANTS.EPOCH_NUM: epoch * samples_per_epoch},
-                        sync=False)
-            mllog_end(key=CONSTANTS.EVAL_STOP, metadata={CONSTANTS.EPOCH_NUM: epoch * samples_per_epoch}, sync=False)
-
-            for callback in callbacks:
-                callback.on_epoch_end(epoch=epoch, metrics=eval_metrics, model=model, optimizer=optimizer)
-            model.train()
-            if eval_metrics["mean_dice"] >= flags.quality_threshold:
-                is_successful = True
-            elif eval_metrics["mean_dice"] < 1e-6:
-                print("MODEL DIVERGED. ABORTING.")
-                diverged = True
+        model.train()
+        if eval_metrics["mean_dice"] >= flags.quality_threshold:
+            is_successful = True
+        elif eval_metrics["mean_dice"] < 1e-6:
+            print("MODEL DIVERGED. ABORTING.")
+            diverged = True
 
         mllog_end(key=CONSTANTS.BLOCK_STOP, sync=False,
-                  metadata={CONSTANTS.FIRST_EPOCH_NUM: epoch * samples_per_epoch,
-                            CONSTANTS.EPOCH_COUNT: samples_per_epoch})
+                  metadata={CONSTANTS.FIRST_EPOCH_NUM: total_samples,
+                            CONSTANTS.EPOCH_COUNT: next_eval_at})
+        next_eval_at += flags.evaluate_every
 
-        if is_successful or diverged:
-            break
 
     mllog_end(key=CONSTANTS.RUN_STOP, sync=True,
               metadata={CONSTANTS.STATUS: CONSTANTS.SUCCESS if is_successful else CONSTANTS.ABORTED,
-                        CONSTANTS.EPOCH_COUNT: epoch * samples_per_epoch})
+                        CONSTANTS.EPOCH_COUNT: total_samples})
     for callback in callbacks:
         callback.on_fit_end()

From b718348bccf46f256d6dc773536c92c541216db9 Mon Sep 17 00:00:00 2001
From: michalm <michalm@nvidia.com>
Date: Sun, 10 Dec 2023 11:08:06 +0100
Subject: [PATCH 03/12] Syntax fixes

---
 image_segmentation/pytorch/data_loading/data_loader.py    | 2 +-
 image_segmentation/pytorch/data_loading/pytorch_loader.py | 2 +-
 image_segmentation/pytorch/runtime/dummy_training.py      | 6 +++---
 image_segmentation/pytorch/runtime/logging.py             | 4 ++--
 image_segmentation/pytorch/runtime/training.py            | 6 +++---
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/image_segmentation/pytorch/data_loading/data_loader.py b/image_segmentation/pytorch/data_loading/data_loader.py
index c3ef531f1..727d44b10 100644
--- a/image_segmentation/pytorch/data_loading/data_loader.py
+++ b/image_segmentation/pytorch/data_loading/data_loader.py
@@ -90,7 +90,7 @@ def get_data_loaders(flags, num_shards, global_rank):
         raise ValueError(f"Loader {flags.loader} unknown. Valid loaders are: synthetic, pytorch")
 
     # The DistributedSampler seed should be the same for all workers
-    train_sampler = None, DistributedSampler(train_dataset, seed=flags.shuffling_seed, drop_last=True) if num_shards > 1 else None
+    train_sampler = None#, DistributedSampler(train_dataset, seed=flags.shuffling_seed, drop_last=True) if num_shards > 1 else None
     val_sampler = None
 
     train_dataloader = DataLoader(train_dataset,
diff --git a/image_segmentation/pytorch/data_loading/pytorch_loader.py b/image_segmentation/pytorch/data_loading/pytorch_loader.py
index aa871153a..05c1fd538 100644
--- a/image_segmentation/pytorch/data_loading/pytorch_loader.py
+++ b/image_segmentation/pytorch/data_loading/pytorch_loader.py
@@ -143,7 +143,7 @@ def __init__(self, images, labels, **kwargs):
         self.rand_crop = RandBalancedCrop(patch_size=patch_size, oversampling=oversampling)
 
     def __len__(self):
-        return 1e9 #len(self.images)
+        return int(168*10000) #len(self.images)
 
     def __getitem__(self, idx):
         data = {"image": np.load(self.images[idx]), "label": np.load(self.labels[idx])}
diff --git a/image_segmentation/pytorch/runtime/dummy_training.py b/image_segmentation/pytorch/runtime/dummy_training.py
index 31f93cf84..5fddaf8d2 100644
--- a/image_segmentation/pytorch/runtime/dummy_training.py
+++ b/image_segmentation/pytorch/runtime/dummy_training.py
@@ -38,9 +38,9 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal
     torch.backends.cudnn.deterministic = flags.cudnn_deterministic
 
     optimizer = get_optimizer(model.parameters(), flags)
-    if flags.lr_decay_epochs:
+    if flags.lr_decay_samples:
         scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
-                                                         milestones=flags.lr_decay_epochs,
+                                                         milestones=flags.lr_decay_samples,
                                                          gamma=flags.lr_decay_factor)
     scaler = GradScaler()
 
@@ -67,7 +67,7 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal
                               CONSTANTS.EPOCH_COUNT: next_eval_at})
 
         while total_samples < next_eval_at:
-            if total_samples <= flags.lr_warmup_epochs * samples_per_epoch and flags.lr_warmup_epochs > 0:
+            if total_samples <= flags.lr_warmup_samples and flags.lr_warmup_samples > 0:
                 lr_warmup(optimizer, flags.init_learning_rate, flags.learning_rate, total_samples, flags.lr_warmup_samples)
 
             optimizer.zero_grad()
diff --git a/image_segmentation/pytorch/runtime/logging.py b/image_segmentation/pytorch/runtime/logging.py
index 2d76c9216..92efe9984 100644
--- a/image_segmentation/pytorch/runtime/logging.py
+++ b/image_segmentation/pytorch/runtime/logging.py
@@ -83,9 +83,9 @@ def mlperf_submission_log():
 def mlperf_run_param_log(flags):
     mllog_event(key=mllog.constants.OPT_NAME, value=flags.optimizer)
     mllog_event(key=mllog.constants.OPT_BASE_LR, value=flags.learning_rate)
-    mllog_event(key=mllog.constants.OPT_LR_WARMUP_EPOCHS, value=flags.lr_warmup_epochs)
+    mllog_event(key=mllog.constants.OPT_LR_WARMUP_EPOCHS, value=flags.lr_warmup_samples)
     # mllog_event(key=mllog.constants.OPT_LR_WARMUP_FACTOR, value=flags.lr_warmup_factor)
-    mllog_event(key=mllog.constants.OPT_LR_DECAY_BOUNDARY_EPOCHS, value=flags.lr_decay_epochs)
+    mllog_event(key=mllog.constants.OPT_LR_DECAY_BOUNDARY_EPOCHS, value=flags.lr_decay_samples)
     mllog_event(key=mllog.constants.OPT_LR_DECAY_FACTOR, value=flags.lr_decay_factor)
     mllog_event(key=mllog.constants.OPT_WEIGHT_DECAY, value=flags.weight_decay)
     mllog_event(key="opt_momentum", value=flags.momentum)
diff --git a/image_segmentation/pytorch/runtime/training.py b/image_segmentation/pytorch/runtime/training.py
index 8fac8dbda..f2e881bcb 100644
--- a/image_segmentation/pytorch/runtime/training.py
+++ b/image_segmentation/pytorch/runtime/training.py
@@ -38,9 +38,9 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal
     torch.backends.cudnn.deterministic = flags.cudnn_deterministic
 
     optimizer = get_optimizer(model.parameters(), flags)
-    if flags.lr_decay_epochs:
+    if flags.lr_decay_samples:
         scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
-                                                         milestones=flags.lr_decay_epochs,
+                                                         milestones=flags.lr_decay_samples,
                                                          gamma=flags.lr_decay_factor)
     scaler = GradScaler()
 
@@ -67,7 +67,7 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal
                               CONSTANTS.EPOCH_COUNT: next_eval_at})
 
         while total_samples < next_eval_at:
-            if total_samples <= flags.lr_warmup_epochs * samples_per_epoch and flags.lr_warmup_epochs > 0:
+            if total_samples <= flags.lr_warmup_samples and flags.lr_warmup_epochs > 0:
                 lr_warmup(optimizer, flags.init_learning_rate, flags.learning_rate, total_samples, flags.lr_warmup_samples)
 
             optimizer.zero_grad()

From bc6d796d47e1660b31a02631adaa51cf58330189 Mon Sep 17 00:00:00 2001
From: michalm <michalm@nvidia.com>
Date: Sun, 10 Dec 2023 11:16:29 +0100
Subject: [PATCH 04/12] Use dummy dataset items and reorder Dockerfile ADD

---
 image_segmentation/pytorch/Dockerfile                |  2 +-
 .../pytorch/data_loading/pytorch_loader.py           | 12 +++++++++---
 image_segmentation/pytorch/runtime/dummy_training.py |  3 +++
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/image_segmentation/pytorch/Dockerfile b/image_segmentation/pytorch/Dockerfile
index fbe42e6a8..5616daa00 100644
--- a/image_segmentation/pytorch/Dockerfile
+++ b/image_segmentation/pytorch/Dockerfile
@@ -2,7 +2,6 @@ ARG FROM_IMAGE_NAME=pytorch/pytorch:1.7.1-cuda11.0-cudnn8-runtime
 #ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.02-py3
 FROM ${FROM_IMAGE_NAME}
 
-ADD . /workspace/unet3d
 WORKDIR /workspace/unet3d
 
 RUN apt-get update && \
@@ -13,4 +12,5 @@ RUN apt-get install -y vim
 RUN pip install --upgrade pip
 RUN pip install --disable-pip-version-check -r requirements.txt
 
+ADD . /workspace/unet3d
 #RUN pip uninstall -y apex; pip uninstall -y apex; git clone --branch seryilmaz/fused_dropout_softmax  https://github.com/seryilmaz/apex.git; cd apex;  pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--xentropy" --global-option="--deprecated_fused_adam" --global-option="--deprecated_fused_lamb" --global-option="--fast_multihead_attn" .
diff --git a/image_segmentation/pytorch/data_loading/pytorch_loader.py b/image_segmentation/pytorch/data_loading/pytorch_loader.py
index 05c1fd538..7a8ca1429 100644
--- a/image_segmentation/pytorch/data_loading/pytorch_loader.py
+++ b/image_segmentation/pytorch/data_loading/pytorch_loader.py
@@ -141,14 +141,20 @@ def __init__(self, images, labels, **kwargs):
         patch_size, oversampling = kwargs["patch_size"], kwargs["oversampling"]
         self.patch_size = patch_size
         self.rand_crop = RandBalancedCrop(patch_size=patch_size, oversampling=oversampling)
+        self.real_len = len(self.images)
+
+        self.x = list(range(24))
+        self.y = list(range(24))
 
     def __len__(self):
         return int(168*10000) #len(self.images)
 
     def __getitem__(self, idx):
-        data = {"image": np.load(self.images[idx]), "label": np.load(self.labels[idx])}
-        data = self.rand_crop(data)
-        data = self.train_transforms(data)
+        # data = {"image": np.load(self.images[idx % self.real_len]), "label": np.load(self.labels[idx % self.real_len])}
+        # data = self.rand_crop(data)
+        # data = self.train_transforms(data)
+
+        data = {"image": self.x[idx % 24], "label": self.y[idx % 24]}
         return data["image"], data["label"]
 
 
diff --git a/image_segmentation/pytorch/runtime/dummy_training.py b/image_segmentation/pytorch/runtime/dummy_training.py
index 5fddaf8d2..9cb781bab 100644
--- a/image_segmentation/pytorch/runtime/dummy_training.py
+++ b/image_segmentation/pytorch/runtime/dummy_training.py
@@ -61,6 +61,7 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal
     for callback in callbacks:
         callback.on_fit_start()
 
+    counts = {}
     while not diverged and not is_successful:
         mllog_start(key=CONSTANTS.BLOCK_START, sync=False,
                     metadata={CONSTANTS.FIRST_EPOCH_NUM: total_samples,
@@ -81,6 +82,8 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal
 
             iteration += 1
             print(total_samples)
+            for b in batch:
+                print(*b)
 
 
         # Evaluation

From b50801b73797bbfbfed289a24603a994dcb60a31 Mon Sep 17 00:00:00 2001
From: michalm <michalm@nvidia.com>
Date: Sun, 10 Dec 2023 11:18:40 +0100
Subject: [PATCH 05/12] Switch Dockerfile base image to nvcr.io pytorch:23.04

---
 image_segmentation/pytorch/Dockerfile | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/image_segmentation/pytorch/Dockerfile b/image_segmentation/pytorch/Dockerfile
index 5616daa00..f501ce8fc 100644
--- a/image_segmentation/pytorch/Dockerfile
+++ b/image_segmentation/pytorch/Dockerfile
@@ -1,16 +1,15 @@
-ARG FROM_IMAGE_NAME=pytorch/pytorch:1.7.1-cuda11.0-cudnn8-runtime
-#ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.02-py3
+#ARG FROM_IMAGE_NAME=pytorch/pytorch:1.7.1-cuda11.0-cudnn8-runtime
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:23.04-py3
 FROM ${FROM_IMAGE_NAME}
 
-WORKDIR /workspace/unet3d
+#RUN apt-get update && \
+#    apt-get upgrade -y && \
+#    apt-get install -y git
+#RUN apt-get install -y vim
 
-RUN apt-get update && \
-    apt-get upgrade -y && \
-    apt-get install -y git
-RUN apt-get install -y vim
+ADD . /workspace/unet3d
+WORKDIR /workspace/unet3d
 
 RUN pip install --upgrade pip
 RUN pip install --disable-pip-version-check -r requirements.txt
-
-ADD . /workspace/unet3d
 #RUN pip uninstall -y apex; pip uninstall -y apex; git clone --branch seryilmaz/fused_dropout_softmax  https://github.com/seryilmaz/apex.git; cd apex;  pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--xentropy" --global-option="--deprecated_fused_adam" --global-option="--deprecated_fused_lamb" --global-option="--fast_multihead_attn" .

From f1f01b381c8d602f542369b3322a08b57b84a016 Mon Sep 17 00:00:00 2001
From: michalm <michalm@nvidia.com>
Date: Sun, 10 Dec 2023 11:28:53 +0100
Subject: [PATCH 06/12] Restore original dataloader

---
 .../pytorch/data_loading/pytorch_loader.py    |  11 +-
 .../pytorch/runtime/dummy_training.py         | 109 ------------------
 2 files changed, 3 insertions(+), 117 deletions(-)
 delete mode 100644 image_segmentation/pytorch/runtime/dummy_training.py

diff --git a/image_segmentation/pytorch/data_loading/pytorch_loader.py b/image_segmentation/pytorch/data_loading/pytorch_loader.py
index 7a8ca1429..2eae826ab 100644
--- a/image_segmentation/pytorch/data_loading/pytorch_loader.py
+++ b/image_segmentation/pytorch/data_loading/pytorch_loader.py
@@ -143,18 +143,13 @@ def __init__(self, images, labels, **kwargs):
         self.rand_crop = RandBalancedCrop(patch_size=patch_size, oversampling=oversampling)
         self.real_len = len(self.images)
 
-        self.x = list(range(24))
-        self.y = list(range(24))
-
     def __len__(self):
         return int(168*10000) #len(self.images)
 
     def __getitem__(self, idx):
-        # data = {"image": np.load(self.images[idx % self.real_len]), "label": np.load(self.labels[idx % self.real_len])}
-        # data = self.rand_crop(data)
-        # data = self.train_transforms(data)
-
-        data = {"image": self.x[idx % 24], "label": self.y[idx % 24]}
+        data = {"image": np.load(self.images[idx % self.real_len]), "label": np.load(self.labels[idx % self.real_len])}
+        data = self.rand_crop(data)
+        data = self.train_transforms(data)
         return data["image"], data["label"]
 
 
diff --git a/image_segmentation/pytorch/runtime/dummy_training.py b/image_segmentation/pytorch/runtime/dummy_training.py
deleted file mode 100644
index 9cb781bab..000000000
--- a/image_segmentation/pytorch/runtime/dummy_training.py
+++ /dev/null
@@ -1,109 +0,0 @@
-from tqdm import tqdm
-
-import torch
-from torch.optim import Adam, SGD
-from torch.cuda.amp import autocast, GradScaler
-
-from runtime.distributed_utils import get_rank, reduce_tensor, get_world_size
-from runtime.inference import evaluate
-from runtime.logging import mllog_event, mllog_start, mllog_end, CONSTANTS
-
-
-def get_optimizer(params, flags):
-    if flags.optimizer == "adam":
-        optim = Adam(params, lr=flags.learning_rate, weight_decay=flags.weight_decay)
-    elif flags.optimizer == "sgd":
-        optim = SGD(params, lr=flags.learning_rate, momentum=flags.momentum, nesterov=True,
-                    weight_decay=flags.weight_decay)
-    elif flags.optimizer == "lamb":
-        import apex
-        optim = apex.optimizers.FusedLAMB(params, lr=flags.learning_rate, betas=flags.lamb_betas,
-                                          weight_decay=flags.weight_decay)
-    else:
-        raise ValueError("Optimizer {} unknown.".format(flags.optimizer))
-    return optim
-
-
-def lr_warmup(optimizer, init_lr, lr, current_samples, warmup_samples):
-    scale = current_samples / warmup_samples
-    for param_group in optimizer.param_groups:
-        param_group['lr'] = init_lr + (lr - init_lr) * scale
-
-
-def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, callbacks,
-          is_distributed, samples_per_epoch):
-    rank = get_rank()
-    world_size = get_world_size()
-    torch.backends.cudnn.benchmark = flags.cudnn_benchmark
-    torch.backends.cudnn.deterministic = flags.cudnn_deterministic
-
-    optimizer = get_optimizer(model.parameters(), flags)
-    if flags.lr_decay_samples:
-        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
-                                                         milestones=flags.lr_decay_samples,
-                                                         gamma=flags.lr_decay_factor)
-    scaler = GradScaler()
-
-    model.to(device)
-    loss_fn.to(device)
-    if is_distributed:
-        model = torch.nn.parallel.DistributedDataParallel(model,
-                                                          device_ids=[flags.local_rank],
-                                                          output_device=flags.local_rank)
-
-    is_successful = False
-    diverged = False
-    total_samples = 0
-    iteration = 0
-    next_eval_at = flags.start_eval_at
-    model.train()
-    train_loader = iter(train_loader)
-    for callback in callbacks:
-        callback.on_fit_start()
-
-    counts = {}
-    while not diverged and not is_successful:
-        mllog_start(key=CONSTANTS.BLOCK_START, sync=False,
-                    metadata={CONSTANTS.FIRST_EPOCH_NUM: total_samples,
-                              CONSTANTS.EPOCH_COUNT: next_eval_at})
-
-        while total_samples < next_eval_at:
-            if total_samples <= flags.lr_warmup_samples and flags.lr_warmup_samples > 0:
-                lr_warmup(optimizer, flags.init_learning_rate, flags.learning_rate, total_samples, flags.lr_warmup_samples)
-
-            optimizer.zero_grad()
-            # for iteration, batch in enumerate(tqdm(train_loader, disable=(rank != 0) or not flags.verbose)):
-
-            batch = next(train_loader)
-            total_samples = flags.batch_size * world_size
-
-            image, label = batch
-            # image, label = image.to(device), label.to(device)
-
-            iteration += 1
-            print(total_samples)
-            for b in batch:
-                print(*b)
-
-
-        # Evaluation
-        mllog_start(key=CONSTANTS.EVAL_START, value=total_samples,
-                    metadata={CONSTANTS.EPOCH_NUM: total_samples}, sync=False)
-
-
-
-        mllog_end(key=CONSTANTS.EVAL_STOP, metadata={CONSTANTS.EPOCH_NUM: total_samples}, sync=False)
-
-        model.train()
-
-        mllog_end(key=CONSTANTS.BLOCK_STOP, sync=False,
-                  metadata={CONSTANTS.FIRST_EPOCH_NUM: total_samples,
-                            CONSTANTS.EPOCH_COUNT: next_eval_at})
-        next_eval_at += flags.evaluate_every
-
-
-    mllog_end(key=CONSTANTS.RUN_STOP, sync=True,
-              metadata={CONSTANTS.STATUS: CONSTANTS.SUCCESS if is_successful else CONSTANTS.ABORTED,
-                        CONSTANTS.EPOCH_COUNT: total_samples})
-    for callback in callbacks:
-        callback.on_fit_end()

From cc8716c2db5e4673bcd169d2dc008f3dc4f5b997 Mon Sep 17 00:00:00 2001
From: michalm <michalm@nvidia.com>
Date: Sun, 10 Dec 2023 11:32:53 +0100
Subject: [PATCH 07/12] Fix start eval at and evaluate every

---
 image_segmentation/pytorch/main.py              | 3 ---
 image_segmentation/pytorch/oldREADME.md         | 2 --
 image_segmentation/pytorch/run_and_time.sh      | 4 +---
 image_segmentation/pytorch/runtime/arguments.py | 2 --
 image_segmentation/pytorch/runtime/training.py  | 8 ++++++--
 5 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/image_segmentation/pytorch/main.py b/image_segmentation/pytorch/main.py
index 2f1652945..8a8bd4459 100644
--- a/image_segmentation/pytorch/main.py
+++ b/image_segmentation/pytorch/main.py
@@ -53,9 +53,6 @@ def main():
     train_dataloader, val_dataloader = get_data_loaders(flags, num_shards=world_size, global_rank=local_rank)
     samples_per_epoch = world_size * len(train_dataloader) * flags.batch_size
     mllog_event(key='samples_per_epoch', value=samples_per_epoch, sync=False)
-    flags.evaluate_every = flags.evaluate_every or ceil(20*DATASET_SIZE/samples_per_epoch)
-    flags.start_eval_at = flags.start_eval_at or ceil(1000*DATASET_SIZE/samples_per_epoch)
-
     mllog_event(key=constants.GLOBAL_BATCH_SIZE, value=flags.batch_size * world_size * flags.ga_steps, sync=False)
     mllog_event(key=constants.GRADIENT_ACCUMULATION_STEPS, value=flags.ga_steps)
     loss_fn = DiceCELoss(to_onehot_y=True, use_softmax=True, layout=flags.layout,
diff --git a/image_segmentation/pytorch/oldREADME.md b/image_segmentation/pytorch/oldREADME.md
index a87187135..e11831292 100644
--- a/image_segmentation/pytorch/oldREADME.md
+++ b/image_segmentation/pytorch/oldREADME.md
@@ -165,8 +165,6 @@ The complete list of the available parameters for the main.py script contains:
 * `--batch_size`: Size of each minibatch per GPU (default: `2`).
 * `--ga_steps`: Number of steps for gradient accumulation (default: `1`).
 * `--epochs`: Maximum number of epochs for training (default: `1`).
-* `--evaluate_every`: Epoch interval for evaluation (default: `20`).
-* `--start_eval_at`: First epoch to start running evaluation at (default: `1000`).
 * `--layout`: Data layout (default: `NCDHW`. `NDHWC` is not implemented).
 * `--input_shape`: Input shape for images during training (default: `[128, 128, 128]`).
 * `--val_input_shape`: Input shape for images during evaluation (default: `[128, 128, 128]`).
diff --git a/image_segmentation/pytorch/run_and_time.sh b/image_segmentation/pytorch/run_and_time.sh
index bfdeb5948..57e6a8d21 100644
--- a/image_segmentation/pytorch/run_and_time.sh
+++ b/image_segmentation/pytorch/run_and_time.sh
@@ -33,15 +33,13 @@ mllog_event(key=constants.CACHE_CLEAR, value=True)"
 
   python main.py --data_dir ${DATASET_DIR} \
     --epochs ${MAX_EPOCHS} \
-    --evaluate_every ${EVALUATE_EVERY} \
-    --start_eval_at ${START_EVAL_AT} \
     --quality_threshold ${QUALITY_THRESHOLD} \
     --batch_size ${BATCH_SIZE} \
     --optimizer sgd \
     --ga_steps ${GRADIENT_ACCUMULATION_STEPS} \
     --learning_rate ${LEARNING_RATE} \
     --seed ${SEED} \
-    --lr_warmup_epochs ${LR_WARMUP_EPOCHS}
+    --lr_warmup_samples ${LR_WARMUP_SAMPLES}
 
 	# end timing
 	end=$(date +%s)
diff --git a/image_segmentation/pytorch/runtime/arguments.py b/image_segmentation/pytorch/runtime/arguments.py
index 9ee9c6a67..92eab1732 100644
--- a/image_segmentation/pytorch/runtime/arguments.py
+++ b/image_segmentation/pytorch/runtime/arguments.py
@@ -33,8 +33,6 @@
 PARSER.add_argument('--lamb_betas', nargs='+', type=int, default=[0.9, 0.999])
 PARSER.add_argument('--momentum', dest='momentum', type=float, default=0.9)
 PARSER.add_argument('--weight_decay', dest='weight_decay', type=float, default=0.0)
-PARSER.add_argument('--evaluate_every', '--eval_every', dest='evaluate_every', type=int, default=None)
-PARSER.add_argument('--start_eval_at', dest='start_eval_at', type=int, default=None)
 PARSER.add_argument('--verbose', '-v', dest='verbose', action='store_true', default=False)
 PARSER.add_argument('--normalization', dest='normalization', type=str,
                     choices=['instancenorm', 'batchnorm'], default='instancenorm')
diff --git a/image_segmentation/pytorch/runtime/training.py b/image_segmentation/pytorch/runtime/training.py
index f2e881bcb..6bc4a0f78 100644
--- a/image_segmentation/pytorch/runtime/training.py
+++ b/image_segmentation/pytorch/runtime/training.py
@@ -9,6 +9,10 @@
 from runtime.logging import mllog_event, mllog_start, mllog_end, CONSTANTS
 
 
+START_EVAL_AT = 168*1000
+EVALUATE_EVERY = 168*20
+
+
 def get_optimizer(params, flags):
     if flags.optimizer == "adam":
         optim = Adam(params, lr=flags.learning_rate, weight_decay=flags.weight_decay)
@@ -55,7 +59,7 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal
     diverged = False
     total_samples = 0
     iteration = 0
-    next_eval_at = flags.start_eval_at
+    next_eval_at = START_EVAL_AT
     model.train()
     train_loader = iter(train_loader)
     for callback in callbacks:
@@ -124,7 +128,7 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal
         mllog_end(key=CONSTANTS.BLOCK_STOP, sync=False,
                   metadata={CONSTANTS.FIRST_EPOCH_NUM: total_samples,
                             CONSTANTS.EPOCH_COUNT: next_eval_at})
-        next_eval_at += flags.evaluate_every
+        next_eval_at += EVALUATE_EVERY
 
 
     mllog_end(key=CONSTANTS.RUN_STOP, sync=True,

From a95b87eebad24debd21a2a36fa273df76709f104 Mon Sep 17 00:00:00 2001
From: michalm <michalm@nvidia.com>
Date: Sun, 10 Dec 2023 11:37:05 +0100
Subject: [PATCH 08/12] Use real training loop, update default hyperparameters, fix lr warmup check

---
 image_segmentation/pytorch/main.py              | 3 +--
 image_segmentation/pytorch/runtime/arguments.py | 6 +++---
 image_segmentation/pytorch/runtime/training.py  | 2 +-
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/image_segmentation/pytorch/main.py b/image_segmentation/pytorch/main.py
index 8a8bd4459..d40bab8bc 100644
--- a/image_segmentation/pytorch/main.py
+++ b/image_segmentation/pytorch/main.py
@@ -8,8 +8,7 @@
 
 from data_loading.data_loader import get_data_loaders
 
-# from runtime.training import train
-from runtime.dummy_training import train
+from runtime.training import train
 from runtime.inference import evaluate
 from runtime.arguments import PARSER
 from runtime.distributed_utils import init_distributed, get_world_size, get_device, is_main_process, get_rank
diff --git a/image_segmentation/pytorch/runtime/arguments.py b/image_segmentation/pytorch/runtime/arguments.py
index 92eab1732..a574152d2 100644
--- a/image_segmentation/pytorch/runtime/arguments.py
+++ b/image_segmentation/pytorch/runtime/arguments.py
@@ -14,7 +14,7 @@
 PARSER.add_argument('--quality_threshold', dest='quality_threshold', type=float, default=0.908)
 PARSER.add_argument('--ga_steps', dest='ga_steps', type=int, default=1)
 PARSER.add_argument('--warmup_steps', dest='warmup_steps', type=int, default=4)
-PARSER.add_argument('--batch_size', dest='batch_size', type=int, default=2)
+PARSER.add_argument('--batch_size', dest='batch_size', type=int, default=7)
 PARSER.add_argument('--layout', dest='layout', type=str, choices=['NCDHW'], default='NCDHW')
 PARSER.add_argument('--input_shape', nargs='+', type=int, default=[128, 128, 128])
 PARSER.add_argument('--val_input_shape', nargs='+', type=int, default=[128, 128, 128])
@@ -25,9 +25,9 @@
 PARSER.add_argument('--benchmark', dest='benchmark', action='store_true', default=False)
 PARSER.add_argument('--amp', dest='amp', action='store_true', default=False)
 PARSER.add_argument('--optimizer', dest='optimizer', default="sgd", choices=["sgd", "adam", "lamb"], type=str)
-PARSER.add_argument('--learning_rate', dest='learning_rate', type=float, default=1.0)
+PARSER.add_argument('--learning_rate', dest='learning_rate', type=float, default=2.0)
 PARSER.add_argument('--init_learning_rate', dest='init_learning_rate', type=float, default=1e-4)
-PARSER.add_argument('--lr_warmup_samples', dest='lr_warmup_samples', type=int, default=0)
+PARSER.add_argument('--lr_warmup_samples', dest='lr_warmup_samples', type=int, default=168000)
 PARSER.add_argument('--lr_decay_samples', nargs='+', type=int, default=[])
 PARSER.add_argument('--lr_decay_factor', dest='lr_decay_factor', type=float, default=1.0)
 PARSER.add_argument('--lamb_betas', nargs='+', type=int, default=[0.9, 0.999])
diff --git a/image_segmentation/pytorch/runtime/training.py b/image_segmentation/pytorch/runtime/training.py
index 6bc4a0f78..c52404f5e 100644
--- a/image_segmentation/pytorch/runtime/training.py
+++ b/image_segmentation/pytorch/runtime/training.py
@@ -71,7 +71,7 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal
                               CONSTANTS.EPOCH_COUNT: next_eval_at})
 
         while total_samples < next_eval_at:
-            if total_samples <= flags.lr_warmup_samples and flags.lr_warmup_epochs > 0:
+            if total_samples <= flags.lr_warmup_samples and flags.lr_warmup_samples > 0:
                 lr_warmup(optimizer, flags.init_learning_rate, flags.learning_rate, total_samples, flags.lr_warmup_samples)
 
             optimizer.zero_grad()

From 0c4f4db1cebb299a8e72437eb7e555a7937cfaba Mon Sep 17 00:00:00 2001
From: michalm <michalm@nvidia.com>
Date: Sun, 10 Dec 2023 15:07:25 +0100
Subject: [PATCH 09/12] Add support for lr decay

---
 image_segmentation/pytorch/main.py            |  4 +-
 .../pytorch/runtime/distributed_utils.py      |  2 +-
 .../pytorch/runtime/training.py               | 57 +++++++++++--------
 3 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/image_segmentation/pytorch/main.py b/image_segmentation/pytorch/main.py
index d40bab8bc..de87e4b1e 100644
--- a/image_segmentation/pytorch/main.py
+++ b/image_segmentation/pytorch/main.py
@@ -50,8 +50,8 @@ def main():
     mllog_end(key=constants.INIT_STOP, sync=True)
     mllog_start(key=constants.RUN_START, sync=True)
     train_dataloader, val_dataloader = get_data_loaders(flags, num_shards=world_size, global_rank=local_rank)
-    samples_per_epoch = world_size * len(train_dataloader) * flags.batch_size
-    mllog_event(key='samples_per_epoch', value=samples_per_epoch, sync=False)
+    # samples_per_epoch = world_size * len(train_dataloader) * flags.batch_size
+    # mllog_event(key='samples_per_epoch', value=samples_per_epoch, sync=False)
     mllog_event(key=constants.GLOBAL_BATCH_SIZE, value=flags.batch_size * world_size * flags.ga_steps, sync=False)
     mllog_event(key=constants.GRADIENT_ACCUMULATION_STEPS, value=flags.ga_steps)
     loss_fn = DiceCELoss(to_onehot_y=True, use_softmax=True, layout=flags.layout,
diff --git a/image_segmentation/pytorch/runtime/distributed_utils.py b/image_segmentation/pytorch/runtime/distributed_utils.py
index 0a5a5cab4..b467969ee 100644
--- a/image_segmentation/pytorch/runtime/distributed_utils.py
+++ b/image_segmentation/pytorch/runtime/distributed_utils.py
@@ -97,7 +97,7 @@ def get_world_size():
 def reduce_tensor(tensor, num_gpus):
     if num_gpus > 1:
         rt = tensor.clone()
-        dist.all_reduce(rt, op=dist.reduce_op.SUM)
+        dist.all_reduce(rt, op=dist.ReduceOp.SUM)
         if rt.is_floating_point():
             rt = rt / num_gpus
         else:
diff --git a/image_segmentation/pytorch/runtime/training.py b/image_segmentation/pytorch/runtime/training.py
index c52404f5e..dc1d0e726 100644
--- a/image_segmentation/pytorch/runtime/training.py
+++ b/image_segmentation/pytorch/runtime/training.py
@@ -1,4 +1,5 @@
 from tqdm import tqdm
+from time import time
 
 import torch
 from torch.optim import Adam, SGD
@@ -34,18 +35,21 @@ def lr_warmup(optimizer, init_lr, lr, current_samples, warmup_samples):
         param_group['lr'] = init_lr + (lr - init_lr) * scale
 
 
-def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, callbacks,
-          is_distributed, samples_per_epoch):
-    rank = get_rank()
+def lr_decay(optimizer, lr_decay_samples, lr_decay_factor, total_samples):
+    if total_samples > lr_decay_samples[0]:
+        lr_decay_samples = lr_decay_samples[1:]
+        for param_group in optimizer.param_groups:
+            param_group['lr'] *= lr_decay_factor
+    return lr_decay_samples
+
+
+def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, callbacks, is_distributed):
+
     world_size = get_world_size()
     torch.backends.cudnn.benchmark = flags.cudnn_benchmark
     torch.backends.cudnn.deterministic = flags.cudnn_deterministic
 
     optimizer = get_optimizer(model.parameters(), flags)
-    if flags.lr_decay_samples:
-        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
-                                                         milestones=flags.lr_decay_samples,
-                                                         gamma=flags.lr_decay_factor)
     scaler = GradScaler()
 
     model.to(device)
@@ -59,7 +63,8 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal
     diverged = False
     total_samples = 0
     iteration = 0
-    next_eval_at = START_EVAL_AT
+    lr_decay_samples = flags.lr_decay_samples
+    next_eval_at = EVALUATE_EVERY
     model.train()
     train_loader = iter(train_loader)
     for callback in callbacks:
@@ -70,15 +75,17 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal
                     metadata={CONSTANTS.FIRST_EPOCH_NUM: total_samples,
                               CONSTANTS.EPOCH_COUNT: next_eval_at})
 
+        t0 = time()
         while total_samples < next_eval_at:
             if total_samples <= flags.lr_warmup_samples and flags.lr_warmup_samples > 0:
                 lr_warmup(optimizer, flags.init_learning_rate, flags.learning_rate, total_samples, flags.lr_warmup_samples)
+            if len(flags.lr_decay_samples) > 0:
+                lr_decay_samples = lr_decay(optimizer, lr_decay_samples, flags.lr_decay_factor, total_samples)
 
             optimizer.zero_grad()
-            # for iteration, batch in enumerate(tqdm(train_loader, disable=(rank != 0) or not flags.verbose)):
 
             batch = next(train_loader)
-            total_samples = flags.batch_size * world_size
+            total_samples += flags.batch_size * world_size
 
             image, label = batch
             image, label = image.to(device), label.to(device)
@@ -105,34 +112,34 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal
                 optimizer.zero_grad()
             iteration += 1
 
-
+        print(f"Throughput: {round(EVALUATE_EVERY / (time() - t0), 2)} samples/s. Time {time() - t0}")
         # Evaluation
-
         del output
-        mllog_start(key=CONSTANTS.EVAL_START, value=total_samples,
-                    metadata={CONSTANTS.EPOCH_NUM: total_samples}, sync=False)
+        if total_samples >= START_EVAL_AT:
+            mllog_start(key=CONSTANTS.EVAL_START, value=total_samples,
+                        metadata={CONSTANTS.EPOCH_NUM: total_samples}, sync=False)
 
-        eval_metrics = evaluate(flags, model, val_loader, loss_fn, score_fn, device, total_samples)
+            eval_metrics = evaluate(flags, model, val_loader, loss_fn, score_fn, device, total_samples)
 
-        mllog_event(key=CONSTANTS.EVAL_ACCURACY, value=eval_metrics["mean_dice"],
-                    metadata={CONSTANTS.EPOCH_NUM: total_samples}, sync=False)
-        mllog_end(key=CONSTANTS.EVAL_STOP, metadata={CONSTANTS.EPOCH_NUM: total_samples}, sync=False)
+            mllog_event(key=CONSTANTS.EVAL_ACCURACY, value=eval_metrics["mean_dice"],
+                        metadata={CONSTANTS.EPOCH_NUM: total_samples}, sync=False)
+            mllog_end(key=CONSTANTS.EVAL_STOP, metadata={CONSTANTS.EPOCH_NUM: total_samples}, sync=False)
 
-        model.train()
-        if eval_metrics["mean_dice"] >= flags.quality_threshold:
-            is_successful = True
-        elif eval_metrics["mean_dice"] < 1e-6:
-            print("MODEL DIVERGED. ABORTING.")
-            diverged = True
+            model.train()
+            if eval_metrics["mean_dice"] >= flags.quality_threshold:
+                is_successful = True
+            elif eval_metrics["mean_dice"] < 1e-6:
+                print("MODEL DIVERGED. ABORTING.")
+                diverged = True
 
         mllog_end(key=CONSTANTS.BLOCK_STOP, sync=False,
                   metadata={CONSTANTS.FIRST_EPOCH_NUM: total_samples,
                             CONSTANTS.EPOCH_COUNT: next_eval_at})
         next_eval_at += EVALUATE_EVERY
 
-
     mllog_end(key=CONSTANTS.RUN_STOP, sync=True,
               metadata={CONSTANTS.STATUS: CONSTANTS.SUCCESS if is_successful else CONSTANTS.ABORTED,
                         CONSTANTS.EPOCH_COUNT: total_samples})
+
     for callback in callbacks:
         callback.on_fit_end()

From 59699418bb77581e24a319d32481b5240e5102da Mon Sep 17 00:00:00 2001
From: michalm <michalm@nvidia.com>
Date: Mon, 11 Dec 2023 08:45:18 +0100
Subject: [PATCH 10/12] Remove stale samples_per_epoch argument from train() call

---
 image_segmentation/pytorch/main.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/image_segmentation/pytorch/main.py b/image_segmentation/pytorch/main.py
index de87e4b1e..7cc2c8445 100644
--- a/image_segmentation/pytorch/main.py
+++ b/image_segmentation/pytorch/main.py
@@ -50,8 +50,6 @@ def main():
     mllog_end(key=constants.INIT_STOP, sync=True)
     mllog_start(key=constants.RUN_START, sync=True)
     train_dataloader, val_dataloader = get_data_loaders(flags, num_shards=world_size, global_rank=local_rank)
-    # samples_per_epoch = world_size * len(train_dataloader) * flags.batch_size
-    # mllog_event(key='samples_per_epoch', value=samples_per_epoch, sync=False)
     mllog_event(key=constants.GLOBAL_BATCH_SIZE, value=flags.batch_size * world_size * flags.ga_steps, sync=False)
     mllog_event(key=constants.GRADIENT_ACCUMULATION_STEPS, value=flags.ga_steps)
     loss_fn = DiceCELoss(to_onehot_y=True, use_softmax=True, layout=flags.layout,
@@ -61,7 +59,7 @@ def main():
 
     if flags.exec_mode == 'train':
         train(flags, model, train_dataloader, val_dataloader, loss_fn, score_fn,
-              device=device, callbacks=callbacks, is_distributed=is_distributed, samples_per_epoch=samples_per_epoch)
+              device=device, callbacks=callbacks, is_distributed=is_distributed)
 
     elif flags.exec_mode == 'evaluate':
         eval_metrics = evaluate(flags, model, val_dataloader, loss_fn, score_fn,

From b5cc52ea63c31033aaa820244c65209118a8f96e Mon Sep 17 00:00:00 2001
From: michalm <michalm@nvidia.com>
Date: Tue, 12 Dec 2023 09:48:23 +0100
Subject: [PATCH 11/12] Guard lr_decay against empty schedule; log EVALUATE_EVERY as epoch count

---
 image_segmentation/pytorch/main.py             | 1 -
 image_segmentation/pytorch/runtime/training.py | 7 +++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/image_segmentation/pytorch/main.py b/image_segmentation/pytorch/main.py
index 7cc2c8445..7581c3f7a 100644
--- a/image_segmentation/pytorch/main.py
+++ b/image_segmentation/pytorch/main.py
@@ -20,7 +20,6 @@
 
 
 def main():
-    mllog.config(filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'unet3d.log'))
     mllog.config(filename=os.path.join("/results", 'unet3d.log'))
     mllogger = mllog.get_mllogger()
     mllogger.logger.propagate = False
diff --git a/image_segmentation/pytorch/runtime/training.py b/image_segmentation/pytorch/runtime/training.py
index dc1d0e726..19cbe7432 100644
--- a/image_segmentation/pytorch/runtime/training.py
+++ b/image_segmentation/pytorch/runtime/training.py
@@ -36,7 +36,7 @@ def lr_warmup(optimizer, init_lr, lr, current_samples, warmup_samples):
 
 
 def lr_decay(optimizer, lr_decay_samples, lr_decay_factor, total_samples):
-    if total_samples > lr_decay_samples[0]:
+    if len(lr_decay_samples) > 0 and total_samples > lr_decay_samples[0]:
         lr_decay_samples = lr_decay_samples[1:]
         for param_group in optimizer.param_groups:
             param_group['lr'] *= lr_decay_factor
@@ -73,7 +73,7 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal
     while not diverged and not is_successful:
         mllog_start(key=CONSTANTS.BLOCK_START, sync=False,
                     metadata={CONSTANTS.FIRST_EPOCH_NUM: total_samples,
-                              CONSTANTS.EPOCH_COUNT: next_eval_at})
+                              CONSTANTS.EPOCH_COUNT: EVALUATE_EVERY})
 
         t0 = time()
         while total_samples < next_eval_at:
@@ -112,7 +112,6 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal
                 optimizer.zero_grad()
             iteration += 1
 
-        print(f"Throughput: {round(EVALUATE_EVERY / (time() - t0), 2)} samples/s. Time {time() - t0}")
         # Evaluation
         del output
         if total_samples >= START_EVAL_AT:
@@ -134,7 +133,7 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal
 
         mllog_end(key=CONSTANTS.BLOCK_STOP, sync=False,
                   metadata={CONSTANTS.FIRST_EPOCH_NUM: total_samples,
-                            CONSTANTS.EPOCH_COUNT: next_eval_at})
+                            CONSTANTS.EPOCH_COUNT: EVALUATE_EVERY})
         next_eval_at += EVALUATE_EVERY
 
     mllog_end(key=CONSTANTS.RUN_STOP, sync=True,

From 56ed46d33fda6d9b7c5d9e15fef1bd232083827d Mon Sep 17 00:00:00 2001
From: michalm <michalm@nvidia.com>
Date: Wed, 17 Jan 2024 17:55:35 +0100
Subject: [PATCH 12/12] Revert Dockerfile changes

---
 image_segmentation/pytorch/Dockerfile | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/image_segmentation/pytorch/Dockerfile b/image_segmentation/pytorch/Dockerfile
index f501ce8fc..cea560919 100644
--- a/image_segmentation/pytorch/Dockerfile
+++ b/image_segmentation/pytorch/Dockerfile
@@ -1,11 +1,11 @@
-#ARG FROM_IMAGE_NAME=pytorch/pytorch:1.7.1-cuda11.0-cudnn8-runtime
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:23.04-py3
+ARG FROM_IMAGE_NAME=pytorch/pytorch:1.7.1-cuda11.0-cudnn8-runtime
+#ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.02-py3
 FROM ${FROM_IMAGE_NAME}
 
-#RUN apt-get update && \
-#    apt-get upgrade -y && \
-#    apt-get install -y git
-#RUN apt-get install -y vim
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install -y git
+RUN apt-get install -y vim
 
 ADD . /workspace/unet3d
 WORKDIR /workspace/unet3d