From a75ec17c0c2ce289fc952e2ea2c325e9ea5b1197 Mon Sep 17 00:00:00 2001 From: michalm Date: Tue, 10 Oct 2023 21:30:10 +0200 Subject: [PATCH 01/12] Move epochs to samples --- image_segmentation/pytorch/main.py | 2 +- .../pytorch/runtime/training.py | 25 ++++++++++++------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/image_segmentation/pytorch/main.py b/image_segmentation/pytorch/main.py index c3ab0a27f..a66a84e8b 100644 --- a/image_segmentation/pytorch/main.py +++ b/image_segmentation/pytorch/main.py @@ -64,7 +64,7 @@ def main(): if flags.exec_mode == 'train': train(flags, model, train_dataloader, val_dataloader, loss_fn, score_fn, - device=device, callbacks=callbacks, is_distributed=is_distributed) + device=device, callbacks=callbacks, is_distributed=is_distributed, samples_per_epoch=samples_per_epoch) elif flags.exec_mode == 'evaluate': eval_metrics = evaluate(flags, model, val_dataloader, loss_fn, score_fn, diff --git a/image_segmentation/pytorch/runtime/training.py b/image_segmentation/pytorch/runtime/training.py index 030adde78..88b294d4d 100644 --- a/image_segmentation/pytorch/runtime/training.py +++ b/image_segmentation/pytorch/runtime/training.py @@ -30,7 +30,8 @@ def lr_warmup(optimizer, init_lr, lr, current_epoch, warmup_epochs): param_group['lr'] = init_lr + (lr - init_lr) * scale -def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, callbacks, is_distributed): +def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, callbacks, + is_distributed, samples_per_epoch): rank = get_rank() world_size = get_world_size() torch.backends.cudnn.benchmark = flags.cudnn_benchmark @@ -52,6 +53,7 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal is_successful = False diverged = False + epoch = 1 next_eval_at = flags.start_eval_at model.train() for callback in callbacks: @@ -61,8 +63,9 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal if epoch <= flags.lr_warmup_epochs and flags.lr_warmup_epochs > 0: lr_warmup(optimizer, flags.init_learning_rate, flags.learning_rate, epoch, flags.lr_warmup_epochs) mllog_start(key=CONSTANTS.BLOCK_START, sync=False, - metadata={CONSTANTS.FIRST_EPOCH_NUM: epoch, CONSTANTS.EPOCH_COUNT: 1}) - mllog_start(key=CONSTANTS.EPOCH_START, metadata={CONSTANTS.EPOCH_NUM: epoch}, sync=False) + metadata={CONSTANTS.FIRST_EPOCH_NUM: epoch * samples_per_epoch, + CONSTANTS.EPOCH_COUNT: samples_per_epoch}) + mllog_start(key=CONSTANTS.EPOCH_START, metadata={CONSTANTS.EPOCH_NUM: epoch * samples_per_epoch}, sync=False) if is_distributed: train_loader.sampler.set_epoch(epoch) @@ -98,7 +101,8 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal cumulative_loss.append(loss_value) mllog_end(key=CONSTANTS.EPOCH_STOP, sync=False, - metadata={CONSTANTS.EPOCH_NUM: epoch, 'current_lr': optimizer.param_groups[0]['lr']}) + metadata={CONSTANTS.EPOCH_NUM: epoch * samples_per_epoch, + 'current_lr': optimizer.param_groups[0]['lr']}) if flags.lr_decay_epochs: scheduler.step() @@ -106,16 +110,17 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal if epoch == next_eval_at: next_eval_at += flags.evaluate_every del output - mllog_start(key=CONSTANTS.EVAL_START, value=epoch, metadata={CONSTANTS.EPOCH_NUM: epoch}, sync=False) + mllog_start(key=CONSTANTS.EVAL_START, value=epoch * samples_per_epoch, + metadata={CONSTANTS.EPOCH_NUM: epoch * samples_per_epoch}, sync=False) eval_metrics = evaluate(flags, model, val_loader, loss_fn, score_fn, device, epoch) eval_metrics["train_loss"] = sum(cumulative_loss) / len(cumulative_loss) mllog_event(key=CONSTANTS.EVAL_ACCURACY, value=eval_metrics["mean_dice"], - metadata={CONSTANTS.EPOCH_NUM: epoch}, + metadata={CONSTANTS.EPOCH_NUM: epoch * samples_per_epoch}, sync=False) - mllog_end(key=CONSTANTS.EVAL_STOP, metadata={CONSTANTS.EPOCH_NUM: epoch}, sync=False) + mllog_end(key=CONSTANTS.EVAL_STOP, metadata={CONSTANTS.EPOCH_NUM: epoch * samples_per_epoch}, sync=False) for callback in callbacks: callback.on_epoch_end(epoch=epoch, metrics=eval_metrics, model=model, optimizer=optimizer) @@ -127,12 +132,14 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal diverged = True mllog_end(key=CONSTANTS.BLOCK_STOP, sync=False, - metadata={CONSTANTS.FIRST_EPOCH_NUM: epoch, CONSTANTS.EPOCH_COUNT: 1}) + metadata={CONSTANTS.FIRST_EPOCH_NUM: epoch * samples_per_epoch, + CONSTANTS.EPOCH_COUNT: samples_per_epoch}) if is_successful or diverged: break mllog_end(key=CONSTANTS.RUN_STOP, sync=True, - metadata={CONSTANTS.STATUS: CONSTANTS.SUCCESS if is_successful else CONSTANTS.ABORTED}) + metadata={CONSTANTS.STATUS: CONSTANTS.SUCCESS if is_successful else CONSTANTS.ABORTED, + CONSTANTS.EPOCH_COUNT: epoch * samples_per_epoch}) for callback in callbacks: callback.on_fit_end() From fde6d6f98009d1691383864b7c442cb6efaede7d Mon Sep 17 00:00:00 2001 From: michalm Date: Sun, 10 Dec 2023 10:46:14 +0100 Subject: [PATCH 02/12] Add dummy training, test epochless training --- .../pytorch/data_loading/data_loader.py | 2 +- .../pytorch/data_loading/pytorch_loader.py | 2 +- image_segmentation/pytorch/main.py | 3 +- .../pytorch/runtime/arguments.py | 4 +- .../pytorch/runtime/dummy_training.py | 106 ++++++++++++++++++ .../pytorch/runtime/training.py | 85 ++++++-------- 6 files changed, 149 insertions(+), 53 deletions(-) create mode 100644 image_segmentation/pytorch/runtime/dummy_training.py diff --git a/image_segmentation/pytorch/data_loading/data_loader.py b/image_segmentation/pytorch/data_loading/data_loader.py index c4c80b51d..c3ef531f1 100644 --- a/image_segmentation/pytorch/data_loading/data_loader.py +++ b/image_segmentation/pytorch/data_loading/data_loader.py @@ -90,7 +90,7 @@ def get_data_loaders(flags, num_shards, global_rank): raise ValueError(f"Loader {flags.loader} unknown. Valid loaders are: synthetic, pytorch") # The DistributedSampler seed should be the same for all workers - train_sampler = DistributedSampler(train_dataset, seed=flags.shuffling_seed, drop_last=True) if num_shards > 1 else None + train_sampler = None, DistributedSampler(train_dataset, seed=flags.shuffling_seed, drop_last=True) if num_shards > 1 else None val_sampler = None train_dataloader = DataLoader(train_dataset, diff --git a/image_segmentation/pytorch/data_loading/pytorch_loader.py b/image_segmentation/pytorch/data_loading/pytorch_loader.py index bb71d32f7..aa871153a 100644 --- a/image_segmentation/pytorch/data_loading/pytorch_loader.py +++ b/image_segmentation/pytorch/data_loading/pytorch_loader.py @@ -143,7 +143,7 @@ def __init__(self, images, labels, **kwargs): self.rand_crop = RandBalancedCrop(patch_size=patch_size, oversampling=oversampling) def __len__(self): - return len(self.images) + return 1e9 #len(self.images) def __getitem__(self, idx): data = {"image": np.load(self.images[idx]), "label": np.load(self.labels[idx])} diff --git a/image_segmentation/pytorch/main.py b/image_segmentation/pytorch/main.py index a66a84e8b..2f1652945 100644 --- a/image_segmentation/pytorch/main.py +++ b/image_segmentation/pytorch/main.py @@ -8,7 +8,8 @@ from data_loading.data_loader import get_data_loaders -from runtime.training import train +# from runtime.training import train +from runtime.dummy_training import train from runtime.inference import evaluate from runtime.arguments import PARSER from runtime.distributed_utils import init_distributed, get_world_size, get_device, is_main_process, get_rank diff --git a/image_segmentation/pytorch/runtime/arguments.py b/image_segmentation/pytorch/runtime/arguments.py index bc7530633..9ee9c6a67 100644 --- a/image_segmentation/pytorch/runtime/arguments.py +++ b/image_segmentation/pytorch/runtime/arguments.py @@ -27,8 +27,8 @@ PARSER.add_argument('--optimizer', dest='optimizer', default="sgd", choices=["sgd", "adam", "lamb"], type=str) PARSER.add_argument('--learning_rate', dest='learning_rate', type=float, default=1.0) PARSER.add_argument('--init_learning_rate', dest='init_learning_rate', type=float, default=1e-4) -PARSER.add_argument('--lr_warmup_epochs', dest='lr_warmup_epochs', type=int, default=0) -PARSER.add_argument('--lr_decay_epochs', nargs='+', type=int, default=[]) +PARSER.add_argument('--lr_warmup_samples', dest='lr_warmup_samples', type=int, default=0) +PARSER.add_argument('--lr_decay_samples', nargs='+', type=int, default=[]) PARSER.add_argument('--lr_decay_factor', dest='lr_decay_factor', type=float, default=1.0) PARSER.add_argument('--lamb_betas', nargs='+', type=int, default=[0.9, 0.999]) PARSER.add_argument('--momentum', dest='momentum', type=float, default=0.9) diff --git a/image_segmentation/pytorch/runtime/dummy_training.py b/image_segmentation/pytorch/runtime/dummy_training.py new file mode 100644 index 000000000..31f93cf84 --- /dev/null +++ b/image_segmentation/pytorch/runtime/dummy_training.py @@ -0,0 +1,106 @@ +from tqdm import tqdm + +import torch +from torch.optim import Adam, SGD +from torch.cuda.amp import autocast, GradScaler + +from runtime.distributed_utils import get_rank, reduce_tensor, get_world_size +from runtime.inference import evaluate +from runtime.logging import mllog_event, mllog_start, mllog_end, CONSTANTS + + +def get_optimizer(params, flags): + if flags.optimizer == "adam": + optim = Adam(params, lr=flags.learning_rate, weight_decay=flags.weight_decay) + elif flags.optimizer == "sgd": + optim = SGD(params, lr=flags.learning_rate, momentum=flags.momentum, nesterov=True, + weight_decay=flags.weight_decay) + elif flags.optimizer == "lamb": + import apex + optim = apex.optimizers.FusedLAMB(params, lr=flags.learning_rate, betas=flags.lamb_betas, + weight_decay=flags.weight_decay) + else: + raise ValueError("Optimizer {} unknown.".format(flags.optimizer)) + return optim + + +def lr_warmup(optimizer, init_lr, lr, current_samples, warmup_samples): + scale = current_samples / warmup_samples + for param_group in optimizer.param_groups: + param_group['lr'] = init_lr + (lr - init_lr) * scale + + +def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, callbacks, + is_distributed, samples_per_epoch): + rank = get_rank() + world_size = get_world_size() + torch.backends.cudnn.benchmark = flags.cudnn_benchmark + torch.backends.cudnn.deterministic = flags.cudnn_deterministic + + optimizer = get_optimizer(model.parameters(), flags) + if flags.lr_decay_epochs: + scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, + milestones=flags.lr_decay_epochs, + gamma=flags.lr_decay_factor) + scaler = GradScaler() + + model.to(device) + loss_fn.to(device) + if is_distributed: + model = torch.nn.parallel.DistributedDataParallel(model, + device_ids=[flags.local_rank], + output_device=flags.local_rank) + + is_successful = False + diverged = False + total_samples = 0 + iteration = 0 + next_eval_at = flags.start_eval_at + model.train() + train_loader = iter(train_loader) + for callback in callbacks: + callback.on_fit_start() + + while not diverged and not is_successful: + mllog_start(key=CONSTANTS.BLOCK_START, sync=False, + metadata={CONSTANTS.FIRST_EPOCH_NUM: total_samples, + CONSTANTS.EPOCH_COUNT: next_eval_at}) + + while total_samples < next_eval_at: + if total_samples <= flags.lr_warmup_epochs * samples_per_epoch and flags.lr_warmup_epochs > 0: + lr_warmup(optimizer, flags.init_learning_rate, flags.learning_rate, total_samples, flags.lr_warmup_samples) + + optimizer.zero_grad() + # for iteration, batch in enumerate(tqdm(train_loader, disable=(rank != 0) or not flags.verbose)): + + batch = next(train_loader) + total_samples = flags.batch_size * world_size + + image, label = batch + # image, label = image.to(device), label.to(device) + + iteration += 1 + print(total_samples) + + + # Evaluation + mllog_start(key=CONSTANTS.EVAL_START, value=total_samples, + metadata={CONSTANTS.EPOCH_NUM: total_samples}, sync=False) + + + + mllog_end(key=CONSTANTS.EVAL_STOP, metadata={CONSTANTS.EPOCH_NUM: total_samples}, sync=False) + + model.train() + + mllog_end(key=CONSTANTS.BLOCK_STOP, sync=False, + metadata={CONSTANTS.FIRST_EPOCH_NUM: total_samples, + CONSTANTS.EPOCH_COUNT: next_eval_at}) + next_eval_at += flags.evaluate_every + + + mllog_end(key=CONSTANTS.RUN_STOP, sync=True, + metadata={CONSTANTS.STATUS: CONSTANTS.SUCCESS if is_successful else CONSTANTS.ABORTED, + CONSTANTS.EPOCH_COUNT: total_samples}) + for callback in callbacks: + callback.on_fit_end() diff --git a/image_segmentation/pytorch/runtime/training.py b/image_segmentation/pytorch/runtime/training.py index 88b294d4d..8fac8dbda 100644 --- a/image_segmentation/pytorch/runtime/training.py +++ b/image_segmentation/pytorch/runtime/training.py @@ -24,8 +24,8 @@ def get_optimizer(params, flags): return optim -def lr_warmup(optimizer, init_lr, lr, current_epoch, warmup_epochs): - scale = current_epoch / warmup_epochs +def lr_warmup(optimizer, init_lr, lr, current_samples, warmup_samples): + scale = current_samples / warmup_samples for param_group in optimizer.param_groups: param_group['lr'] = init_lr + (lr - init_lr) * scale @@ -53,26 +53,29 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal is_successful = False diverged = False - epoch = 1 + total_samples = 0 + iteration = 0 next_eval_at = flags.start_eval_at model.train() + train_loader = iter(train_loader) for callback in callbacks: callback.on_fit_start() - for epoch in range(1, flags.epochs + 1): - cumulative_loss = [] - if epoch <= flags.lr_warmup_epochs and flags.lr_warmup_epochs > 0: - lr_warmup(optimizer, flags.init_learning_rate, flags.learning_rate, epoch, flags.lr_warmup_epochs) + + while not diverged and not is_successful: mllog_start(key=CONSTANTS.BLOCK_START, sync=False, - metadata={CONSTANTS.FIRST_EPOCH_NUM: epoch * samples_per_epoch, - CONSTANTS.EPOCH_COUNT: samples_per_epoch}) - mllog_start(key=CONSTANTS.EPOCH_START, metadata={CONSTANTS.EPOCH_NUM: epoch * samples_per_epoch}, sync=False) + metadata={CONSTANTS.FIRST_EPOCH_NUM: total_samples, + CONSTANTS.EPOCH_COUNT: next_eval_at}) + + while total_samples < next_eval_at: + if total_samples <= flags.lr_warmup_epochs * samples_per_epoch and flags.lr_warmup_epochs > 0: + lr_warmup(optimizer, flags.init_learning_rate, flags.learning_rate, total_samples, flags.lr_warmup_samples) + + optimizer.zero_grad() + # for iteration, batch in enumerate(tqdm(train_loader, disable=(rank != 0) or not flags.verbose)): - if is_distributed: - train_loader.sampler.set_epoch(epoch) + batch = next(train_loader) + total_samples = flags.batch_size * world_size - loss_value = None - optimizer.zero_grad() - for iteration, batch in enumerate(tqdm(train_loader, disable=(rank != 0) or not flags.verbose)): image, label = batch image, label = image.to(device), label.to(device) for callback in callbacks: @@ -96,50 +99,36 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal optimizer.step() optimizer.zero_grad() + iteration += 1 - loss_value = reduce_tensor(loss_value, world_size).detach().cpu().numpy() - cumulative_loss.append(loss_value) - mllog_end(key=CONSTANTS.EPOCH_STOP, sync=False, - metadata={CONSTANTS.EPOCH_NUM: epoch * samples_per_epoch, - 'current_lr': optimizer.param_groups[0]['lr']}) + # Evaluation - if flags.lr_decay_epochs: - scheduler.step() + del output + mllog_start(key=CONSTANTS.EVAL_START, value=total_samples, + metadata={CONSTANTS.EPOCH_NUM: total_samples}, sync=False) - if epoch == next_eval_at: - next_eval_at += flags.evaluate_every - del output - mllog_start(key=CONSTANTS.EVAL_START, value=epoch * samples_per_epoch, - metadata={CONSTANTS.EPOCH_NUM: epoch * samples_per_epoch}, sync=False) + eval_metrics = evaluate(flags, model, val_loader, loss_fn, score_fn, device, total_samples) - eval_metrics = evaluate(flags, model, val_loader, loss_fn, score_fn, device, epoch) - eval_metrics["train_loss"] = sum(cumulative_loss) / len(cumulative_loss) + mllog_event(key=CONSTANTS.EVAL_ACCURACY, value=eval_metrics["mean_dice"], + metadata={CONSTANTS.EPOCH_NUM: total_samples}, sync=False) + mllog_end(key=CONSTANTS.EVAL_STOP, metadata={CONSTANTS.EPOCH_NUM: total_samples}, sync=False) - mllog_event(key=CONSTANTS.EVAL_ACCURACY, - value=eval_metrics["mean_dice"], - metadata={CONSTANTS.EPOCH_NUM: epoch * samples_per_epoch}, - sync=False) - mllog_end(key=CONSTANTS.EVAL_STOP, metadata={CONSTANTS.EPOCH_NUM: epoch * samples_per_epoch}, sync=False) - - for callback in callbacks: - callback.on_epoch_end(epoch=epoch, metrics=eval_metrics, model=model, optimizer=optimizer) - model.train() - if eval_metrics["mean_dice"] >= flags.quality_threshold: - is_successful = True - elif eval_metrics["mean_dice"] < 1e-6: - print("MODEL DIVERGED. ABORTING.") - diverged = True + model.train() + if eval_metrics["mean_dice"] >= flags.quality_threshold: + is_successful = True + elif eval_metrics["mean_dice"] < 1e-6: + print("MODEL DIVERGED. ABORTING.") + diverged = True mllog_end(key=CONSTANTS.BLOCK_STOP, sync=False, - metadata={CONSTANTS.FIRST_EPOCH_NUM: epoch * samples_per_epoch, - CONSTANTS.EPOCH_COUNT: samples_per_epoch}) + metadata={CONSTANTS.FIRST_EPOCH_NUM: total_samples, + CONSTANTS.EPOCH_COUNT: next_eval_at}) + next_eval_at += flags.evaluate_every - if is_successful or diverged: - break mllog_end(key=CONSTANTS.RUN_STOP, sync=True, metadata={CONSTANTS.STATUS: CONSTANTS.SUCCESS if is_successful else CONSTANTS.ABORTED, - CONSTANTS.EPOCH_COUNT: epoch * samples_per_epoch}) + CONSTANTS.EPOCH_COUNT: total_samples}) for callback in callbacks: callback.on_fit_end() From b718348bccf46f256d6dc773536c92c541216db9 Mon Sep 17 00:00:00 2001 From: michalm Date: Sun, 10 Dec 2023 11:08:06 +0100 Subject: [PATCH 03/12] Syntax fixes --- image_segmentation/pytorch/data_loading/data_loader.py | 2 +- image_segmentation/pytorch/data_loading/pytorch_loader.py | 2 +- image_segmentation/pytorch/runtime/dummy_training.py | 6 +++--- image_segmentation/pytorch/runtime/logging.py | 4 ++-- image_segmentation/pytorch/runtime/training.py | 6 +++--- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/image_segmentation/pytorch/data_loading/data_loader.py b/image_segmentation/pytorch/data_loading/data_loader.py index c3ef531f1..727d44b10 100644 --- a/image_segmentation/pytorch/data_loading/data_loader.py +++ b/image_segmentation/pytorch/data_loading/data_loader.py @@ -90,7 +90,7 @@ def get_data_loaders(flags, num_shards, global_rank): raise ValueError(f"Loader {flags.loader} unknown. Valid loaders are: synthetic, pytorch") # The DistributedSampler seed should be the same for all workers - train_sampler = None, DistributedSampler(train_dataset, seed=flags.shuffling_seed, drop_last=True) if num_shards > 1 else None + train_sampler = None#, DistributedSampler(train_dataset, seed=flags.shuffling_seed, drop_last=True) if num_shards > 1 else None val_sampler = None train_dataloader = DataLoader(train_dataset, diff --git a/image_segmentation/pytorch/data_loading/pytorch_loader.py b/image_segmentation/pytorch/data_loading/pytorch_loader.py index aa871153a..05c1fd538 100644 --- a/image_segmentation/pytorch/data_loading/pytorch_loader.py +++ b/image_segmentation/pytorch/data_loading/pytorch_loader.py @@ -143,7 +143,7 @@ def __init__(self, images, labels, **kwargs): self.rand_crop = RandBalancedCrop(patch_size=patch_size, oversampling=oversampling) def __len__(self): - return 1e9 #len(self.images) + return int(168*10000) #len(self.images) def __getitem__(self, idx): data = {"image": np.load(self.images[idx]), "label": np.load(self.labels[idx])} diff --git a/image_segmentation/pytorch/runtime/dummy_training.py b/image_segmentation/pytorch/runtime/dummy_training.py index 31f93cf84..5fddaf8d2 100644 --- a/image_segmentation/pytorch/runtime/dummy_training.py +++ b/image_segmentation/pytorch/runtime/dummy_training.py @@ -38,9 +38,9 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal torch.backends.cudnn.deterministic = flags.cudnn_deterministic optimizer = get_optimizer(model.parameters(), flags) - if flags.lr_decay_epochs: + if flags.lr_decay_samples: scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, - milestones=flags.lr_decay_epochs, + milestones=flags.lr_decay_samples, gamma=flags.lr_decay_factor) scaler = GradScaler() @@ -67,7 +67,7 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal CONSTANTS.EPOCH_COUNT: next_eval_at}) while total_samples < next_eval_at: - if total_samples <= flags.lr_warmup_epochs * samples_per_epoch and flags.lr_warmup_epochs > 0: + if total_samples <= flags.lr_warmup_samples and flags.lr_warmup_samples > 0: lr_warmup(optimizer, flags.init_learning_rate, flags.learning_rate, total_samples, flags.lr_warmup_samples) optimizer.zero_grad() diff --git a/image_segmentation/pytorch/runtime/logging.py b/image_segmentation/pytorch/runtime/logging.py index 2d76c9216..92efe9984 100644 --- a/image_segmentation/pytorch/runtime/logging.py +++ b/image_segmentation/pytorch/runtime/logging.py @@ -83,9 +83,9 @@ def mlperf_submission_log(): def mlperf_run_param_log(flags): mllog_event(key=mllog.constants.OPT_NAME, value=flags.optimizer) mllog_event(key=mllog.constants.OPT_BASE_LR, value=flags.learning_rate) - mllog_event(key=mllog.constants.OPT_LR_WARMUP_EPOCHS, value=flags.lr_warmup_epochs) + mllog_event(key=mllog.constants.OPT_LR_WARMUP_EPOCHS, value=flags.lr_warmup_samples) # mllog_event(key=mllog.constants.OPT_LR_WARMUP_FACTOR, value=flags.lr_warmup_factor) - mllog_event(key=mllog.constants.OPT_LR_DECAY_BOUNDARY_EPOCHS, value=flags.lr_decay_epochs) + mllog_event(key=mllog.constants.OPT_LR_DECAY_BOUNDARY_EPOCHS, value=flags.lr_decay_samples) mllog_event(key=mllog.constants.OPT_LR_DECAY_FACTOR, value=flags.lr_decay_factor) mllog_event(key=mllog.constants.OPT_WEIGHT_DECAY, value=flags.weight_decay) mllog_event(key="opt_momentum", value=flags.momentum) diff --git a/image_segmentation/pytorch/runtime/training.py b/image_segmentation/pytorch/runtime/training.py index 8fac8dbda..f2e881bcb 100644 --- a/image_segmentation/pytorch/runtime/training.py +++ b/image_segmentation/pytorch/runtime/training.py @@ -38,9 +38,9 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal torch.backends.cudnn.deterministic = flags.cudnn_deterministic optimizer = get_optimizer(model.parameters(), flags) - if flags.lr_decay_epochs: + if flags.lr_decay_samples: scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, - milestones=flags.lr_decay_epochs, + milestones=flags.lr_decay_samples, gamma=flags.lr_decay_factor) scaler = GradScaler() @@ -67,7 +67,7 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal CONSTANTS.EPOCH_COUNT: next_eval_at}) while total_samples < next_eval_at: - if total_samples <= flags.lr_warmup_epochs * samples_per_epoch and flags.lr_warmup_epochs > 0: + if total_samples <= flags.lr_warmup_samples and flags.lr_warmup_epochs > 0: lr_warmup(optimizer, flags.init_learning_rate, flags.learning_rate, total_samples, flags.lr_warmup_samples) optimizer.zero_grad() From bc6d796d47e1660b31a02631adaa51cf58330189 Mon Sep 17 00:00:00 2001 From: michalm Date: Sun, 10 Dec 2023 11:16:29 +0100 Subject: [PATCH 04/12] Syntax fixes --- image_segmentation/pytorch/Dockerfile | 2 +- .../pytorch/data_loading/pytorch_loader.py | 12 +++++++++--- image_segmentation/pytorch/runtime/dummy_training.py | 3 +++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/image_segmentation/pytorch/Dockerfile b/image_segmentation/pytorch/Dockerfile index fbe42e6a8..5616daa00 100644 --- a/image_segmentation/pytorch/Dockerfile +++ b/image_segmentation/pytorch/Dockerfile @@ -2,7 +2,6 @@ ARG FROM_IMAGE_NAME=pytorch/pytorch:1.7.1-cuda11.0-cudnn8-runtime #ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.02-py3 FROM ${FROM_IMAGE_NAME} -ADD . /workspace/unet3d WORKDIR /workspace/unet3d RUN apt-get update && \ @@ -13,4 +12,5 @@ RUN apt-get install -y vim RUN pip install --upgrade pip RUN pip install --disable-pip-version-check -r requirements.txt +ADD . /workspace/unet3d #RUN pip uninstall -y apex; pip uninstall -y apex; git clone --branch seryilmaz/fused_dropout_softmax https://github.com/seryilmaz/apex.git; cd apex; pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--xentropy" --global-option="--deprecated_fused_adam" --global-option="--deprecated_fused_lamb" --global-option="--fast_multihead_attn" . diff --git a/image_segmentation/pytorch/data_loading/pytorch_loader.py b/image_segmentation/pytorch/data_loading/pytorch_loader.py index 05c1fd538..7a8ca1429 100644 --- a/image_segmentation/pytorch/data_loading/pytorch_loader.py +++ b/image_segmentation/pytorch/data_loading/pytorch_loader.py @@ -141,14 +141,20 @@ def __init__(self, images, labels, **kwargs): patch_size, oversampling = kwargs["patch_size"], kwargs["oversampling"] self.patch_size = patch_size self.rand_crop = RandBalancedCrop(patch_size=patch_size, oversampling=oversampling) + self.real_len = len(self.images) + + self.x = list(range(24)) + self.y = list(range(24)) def __len__(self): return int(168*10000) #len(self.images) def __getitem__(self, idx): - data = {"image": np.load(self.images[idx]), "label": np.load(self.labels[idx])} - data = self.rand_crop(data) - data = self.train_transforms(data) + # data = {"image": np.load(self.images[idx % self.real_len]), "label": np.load(self.labels[idx % self.real_len])} + # data = self.rand_crop(data) + # data = self.train_transforms(data) + + data = {"image": self.x[idx % 24], "label": self.y[idx % 24]} return data["image"], data["label"] diff --git a/image_segmentation/pytorch/runtime/dummy_training.py b/image_segmentation/pytorch/runtime/dummy_training.py index 5fddaf8d2..9cb781bab 100644 --- a/image_segmentation/pytorch/runtime/dummy_training.py +++ b/image_segmentation/pytorch/runtime/dummy_training.py @@ -61,6 +61,7 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal for callback in callbacks: callback.on_fit_start() + counts = {} while not diverged and not is_successful: mllog_start(key=CONSTANTS.BLOCK_START, sync=False, metadata={CONSTANTS.FIRST_EPOCH_NUM: total_samples, @@ -81,6 +82,8 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal iteration += 1 print(total_samples) + for b in batch: + print(*b) # Evaluation From b50801b73797bbfbfed289a24603a994dcb60a31 Mon Sep 17 00:00:00 2001 From: michalm Date: Sun, 10 Dec 2023 11:18:40 +0100 Subject: [PATCH 05/12] Syntax fixes --- image_segmentation/pytorch/Dockerfile | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/image_segmentation/pytorch/Dockerfile b/image_segmentation/pytorch/Dockerfile index 5616daa00..f501ce8fc 100644 --- a/image_segmentation/pytorch/Dockerfile +++ b/image_segmentation/pytorch/Dockerfile @@ -1,16 +1,15 @@ -ARG FROM_IMAGE_NAME=pytorch/pytorch:1.7.1-cuda11.0-cudnn8-runtime -#ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.02-py3 +#ARG FROM_IMAGE_NAME=pytorch/pytorch:1.7.1-cuda11.0-cudnn8-runtime +ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:23.04-py3 FROM ${FROM_IMAGE_NAME} -WORKDIR /workspace/unet3d +#RUN apt-get update && \ +# apt-get upgrade -y && \ +# apt-get install -y git +#RUN apt-get install -y vim -RUN apt-get update && \ - apt-get upgrade -y && \ - apt-get install -y git -RUN apt-get install -y vim +ADD . /workspace/unet3d +WORKDIR /workspace/unet3d RUN pip install --upgrade pip RUN pip install --disable-pip-version-check -r requirements.txt - -ADD . /workspace/unet3d #RUN pip uninstall -y apex; pip uninstall -y apex; git clone --branch seryilmaz/fused_dropout_softmax https://github.com/seryilmaz/apex.git; cd apex; pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--xentropy" --global-option="--deprecated_fused_adam" --global-option="--deprecated_fused_lamb" --global-option="--fast_multihead_attn" . From f1f01b381c8d602f542369b3322a08b57b84a016 Mon Sep 17 00:00:00 2001 From: michalm Date: Sun, 10 Dec 2023 11:28:53 +0100 Subject: [PATCH 06/12] Restore original dataloader --- .../pytorch/data_loading/pytorch_loader.py | 11 +- .../pytorch/runtime/dummy_training.py | 109 ------------------ 2 files changed, 3 insertions(+), 117 deletions(-) delete mode 100644 image_segmentation/pytorch/runtime/dummy_training.py diff --git a/image_segmentation/pytorch/data_loading/pytorch_loader.py b/image_segmentation/pytorch/data_loading/pytorch_loader.py index 7a8ca1429..2eae826ab 100644 --- a/image_segmentation/pytorch/data_loading/pytorch_loader.py +++ b/image_segmentation/pytorch/data_loading/pytorch_loader.py @@ -143,18 +143,13 @@ def __init__(self, images, labels, **kwargs): self.rand_crop = RandBalancedCrop(patch_size=patch_size, oversampling=oversampling) self.real_len = len(self.images) - self.x = list(range(24)) - self.y = list(range(24)) - def __len__(self): return int(168*10000) #len(self.images) def __getitem__(self, idx): - # data = {"image": np.load(self.images[idx % self.real_len]), "label": np.load(self.labels[idx % self.real_len])} - # data = self.rand_crop(data) - # data = self.train_transforms(data) - - data = {"image": self.x[idx % 24], "label": self.y[idx % 24]} + data = {"image": np.load(self.images[idx % self.real_len]), "label": np.load(self.labels[idx % self.real_len])} + data = self.rand_crop(data) + data = self.train_transforms(data) return data["image"], data["label"] diff --git a/image_segmentation/pytorch/runtime/dummy_training.py b/image_segmentation/pytorch/runtime/dummy_training.py deleted file mode 100644 index 9cb781bab..000000000 --- a/image_segmentation/pytorch/runtime/dummy_training.py +++ /dev/null @@ -1,109 +0,0 @@ -from tqdm import tqdm - -import torch -from torch.optim import Adam, SGD -from torch.cuda.amp import autocast, GradScaler - -from runtime.distributed_utils import get_rank, reduce_tensor, get_world_size -from runtime.inference import evaluate -from runtime.logging import mllog_event, mllog_start, mllog_end, CONSTANTS - - -def get_optimizer(params, flags): - if flags.optimizer == "adam": - optim = Adam(params, lr=flags.learning_rate, weight_decay=flags.weight_decay) - elif flags.optimizer == "sgd": - optim = SGD(params, lr=flags.learning_rate, momentum=flags.momentum, nesterov=True, - weight_decay=flags.weight_decay) - elif flags.optimizer == "lamb": - import apex - optim = apex.optimizers.FusedLAMB(params, lr=flags.learning_rate, betas=flags.lamb_betas, - weight_decay=flags.weight_decay) - else: - raise ValueError("Optimizer {} unknown.".format(flags.optimizer)) - return optim - - -def lr_warmup(optimizer, init_lr, lr, current_samples, warmup_samples): - scale = current_samples / warmup_samples - for param_group in optimizer.param_groups: - param_group['lr'] = init_lr + (lr - init_lr) * scale - - -def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, callbacks, - is_distributed, samples_per_epoch): - rank = get_rank() - world_size = get_world_size() - torch.backends.cudnn.benchmark = flags.cudnn_benchmark - torch.backends.cudnn.deterministic = flags.cudnn_deterministic - - optimizer = get_optimizer(model.parameters(), flags) - if flags.lr_decay_samples: - scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, - milestones=flags.lr_decay_samples, - gamma=flags.lr_decay_factor) - scaler = GradScaler() - - model.to(device) - loss_fn.to(device) - if is_distributed: - model = torch.nn.parallel.DistributedDataParallel(model, - device_ids=[flags.local_rank], - output_device=flags.local_rank) - - is_successful = False - diverged = False - total_samples = 0 - iteration = 0 - next_eval_at = flags.start_eval_at - model.train() - train_loader = iter(train_loader) - for callback in callbacks: - callback.on_fit_start() - - counts = {} - while not diverged and not is_successful: - mllog_start(key=CONSTANTS.BLOCK_START, sync=False, - metadata={CONSTANTS.FIRST_EPOCH_NUM: total_samples, - CONSTANTS.EPOCH_COUNT: next_eval_at}) - - while total_samples < next_eval_at: - if total_samples <= flags.lr_warmup_samples and flags.lr_warmup_samples > 0: - lr_warmup(optimizer, flags.init_learning_rate, flags.learning_rate, total_samples, flags.lr_warmup_samples) - - optimizer.zero_grad() - # for iteration, batch in enumerate(tqdm(train_loader, disable=(rank != 0) or not flags.verbose)): - - batch = next(train_loader) - total_samples = flags.batch_size * world_size - - image, label = batch - # image, label = image.to(device), label.to(device) - - iteration += 1 - print(total_samples) - for b in batch: - print(*b) - - - # Evaluation - mllog_start(key=CONSTANTS.EVAL_START, value=total_samples, - metadata={CONSTANTS.EPOCH_NUM: total_samples}, sync=False) - - - - mllog_end(key=CONSTANTS.EVAL_STOP, metadata={CONSTANTS.EPOCH_NUM: total_samples}, sync=False) - - model.train() - - mllog_end(key=CONSTANTS.BLOCK_STOP, sync=False, - metadata={CONSTANTS.FIRST_EPOCH_NUM: total_samples, - CONSTANTS.EPOCH_COUNT: next_eval_at}) - next_eval_at += flags.evaluate_every - - - mllog_end(key=CONSTANTS.RUN_STOP, sync=True, - metadata={CONSTANTS.STATUS: CONSTANTS.SUCCESS if is_successful else CONSTANTS.ABORTED, - CONSTANTS.EPOCH_COUNT: total_samples}) - for callback in callbacks: - callback.on_fit_end() From cc8716c2db5e4673bcd169d2dc008f3dc4f5b997 Mon Sep 17 00:00:00 2001 From: michalm Date: Sun, 10 Dec 2023 11:32:53 +0100 Subject: [PATCH 07/12] Fix start eval at and evaluate every --- image_segmentation/pytorch/main.py | 3 --- image_segmentation/pytorch/oldREADME.md | 2 -- image_segmentation/pytorch/run_and_time.sh | 4 +--- image_segmentation/pytorch/runtime/arguments.py | 2 -- image_segmentation/pytorch/runtime/training.py | 8 ++++++-- 5 files changed, 7 insertions(+), 12 deletions(-) diff --git a/image_segmentation/pytorch/main.py b/image_segmentation/pytorch/main.py index 2f1652945..8a8bd4459 100644 --- a/image_segmentation/pytorch/main.py +++ b/image_segmentation/pytorch/main.py @@ -53,9 +53,6 @@ def main(): train_dataloader, val_dataloader = get_data_loaders(flags, num_shards=world_size, global_rank=local_rank) samples_per_epoch = world_size * len(train_dataloader) * flags.batch_size mllog_event(key='samples_per_epoch', value=samples_per_epoch, sync=False) - flags.evaluate_every = flags.evaluate_every or ceil(20*DATASET_SIZE/samples_per_epoch) - flags.start_eval_at = flags.start_eval_at or ceil(1000*DATASET_SIZE/samples_per_epoch) - mllog_event(key=constants.GLOBAL_BATCH_SIZE, value=flags.batch_size * world_size * flags.ga_steps, sync=False) mllog_event(key=constants.GRADIENT_ACCUMULATION_STEPS, value=flags.ga_steps) loss_fn = DiceCELoss(to_onehot_y=True, use_softmax=True, layout=flags.layout, diff --git a/image_segmentation/pytorch/oldREADME.md b/image_segmentation/pytorch/oldREADME.md index a87187135..e11831292 100644 --- a/image_segmentation/pytorch/oldREADME.md +++ b/image_segmentation/pytorch/oldREADME.md @@ -165,8 +165,6 @@ The complete list of the available parameters for the main.py script contains: * `--batch_size`: Size of each minibatch per GPU (default: `2`). * `--ga_steps`: Number of steps for gradient accumulation (default: `1`). * `--epochs`: Maximum number of epochs for training (default: `1`). -* `--evaluate_every`: Epoch interval for evaluation (default: `20`). -* `--start_eval_at`: First epoch to start running evaluation at (default: `1000`). * `--layout`: Data layout (default: `NCDHW`. `NDHWC` is not implemented). * `--input_shape`: Input shape for images during training (default: `[128, 128, 128]`). * `--val_input_shape`: Input shape for images during evaluation (default: `[128, 128, 128]`). diff --git a/image_segmentation/pytorch/run_and_time.sh b/image_segmentation/pytorch/run_and_time.sh index bfdeb5948..57e6a8d21 100644 --- a/image_segmentation/pytorch/run_and_time.sh +++ b/image_segmentation/pytorch/run_and_time.sh @@ -33,15 +33,13 @@ mllog_event(key=constants.CACHE_CLEAR, value=True)" python main.py --data_dir ${DATASET_DIR} \ --epochs ${MAX_EPOCHS} \ - --evaluate_every ${EVALUATE_EVERY} \ - --start_eval_at ${START_EVAL_AT} \ --quality_threshold ${QUALITY_THRESHOLD} \ --batch_size ${BATCH_SIZE} \ --optimizer sgd \ --ga_steps ${GRADIENT_ACCUMULATION_STEPS} \ --learning_rate ${LEARNING_RATE} \ --seed ${SEED} \ - --lr_warmup_epochs ${LR_WARMUP_EPOCHS} + --lr_warmup_samples ${LR_WARMUP_SAMPLES} # end timing end=$(date +%s) diff --git a/image_segmentation/pytorch/runtime/arguments.py b/image_segmentation/pytorch/runtime/arguments.py index 9ee9c6a67..92eab1732 100644 --- a/image_segmentation/pytorch/runtime/arguments.py +++ b/image_segmentation/pytorch/runtime/arguments.py @@ -33,8 +33,6 @@ PARSER.add_argument('--lamb_betas', nargs='+', type=int, default=[0.9, 0.999]) PARSER.add_argument('--momentum', dest='momentum', type=float, default=0.9) PARSER.add_argument('--weight_decay', dest='weight_decay', type=float, default=0.0) -PARSER.add_argument('--evaluate_every', '--eval_every', dest='evaluate_every', type=int, default=None) -PARSER.add_argument('--start_eval_at', dest='start_eval_at', type=int, default=None) PARSER.add_argument('--verbose', '-v', dest='verbose', action='store_true', default=False) PARSER.add_argument('--normalization', dest='normalization', type=str, choices=['instancenorm', 'batchnorm'], default='instancenorm') diff --git a/image_segmentation/pytorch/runtime/training.py b/image_segmentation/pytorch/runtime/training.py index f2e881bcb..6bc4a0f78 100644 --- a/image_segmentation/pytorch/runtime/training.py +++ b/image_segmentation/pytorch/runtime/training.py @@ -9,6 +9,10 @@ from runtime.logging import mllog_event, mllog_start, mllog_end, CONSTANTS +START_EVAL_AT = 168*1000 +EVALUATE_EVERY = 168*20 + + def get_optimizer(params, flags): if flags.optimizer == "adam": optim = Adam(params, lr=flags.learning_rate, weight_decay=flags.weight_decay) @@ -55,7 +59,7 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal diverged = False total_samples = 0 iteration = 0 - next_eval_at = flags.start_eval_at + next_eval_at = START_EVAL_AT model.train() train_loader = iter(train_loader) for callback in callbacks: @@ -124,7 +128,7 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal mllog_end(key=CONSTANTS.BLOCK_STOP, sync=False, metadata={CONSTANTS.FIRST_EPOCH_NUM: total_samples, CONSTANTS.EPOCH_COUNT: next_eval_at}) - next_eval_at += flags.evaluate_every + next_eval_at += EVALUATE_EVERY mllog_end(key=CONSTANTS.RUN_STOP, sync=True, From a95b87eebad24debd21a2a36fa273df76709f104 Mon Sep 17 00:00:00 2001 From: michalm Date: Sun, 10 Dec 2023 11:37:05 +0100 Subject: [PATCH 08/12] Fix start eval at and evaluate every --- image_segmentation/pytorch/main.py | 3 +-- image_segmentation/pytorch/runtime/arguments.py | 6 +++--- image_segmentation/pytorch/runtime/training.py | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/image_segmentation/pytorch/main.py b/image_segmentation/pytorch/main.py index 8a8bd4459..d40bab8bc 100644 --- a/image_segmentation/pytorch/main.py +++ b/image_segmentation/pytorch/main.py @@ -8,8 +8,7 @@ from data_loading.data_loader import get_data_loaders -# from runtime.training import train -from runtime.dummy_training import train +from runtime.training import train from runtime.inference import evaluate from runtime.arguments import PARSER from runtime.distributed_utils import init_distributed, get_world_size, get_device, is_main_process, get_rank diff --git a/image_segmentation/pytorch/runtime/arguments.py b/image_segmentation/pytorch/runtime/arguments.py index 92eab1732..a574152d2 100644 --- a/image_segmentation/pytorch/runtime/arguments.py +++ b/image_segmentation/pytorch/runtime/arguments.py @@ -14,7 +14,7 @@ PARSER.add_argument('--quality_threshold', dest='quality_threshold', type=float, default=0.908) PARSER.add_argument('--ga_steps', dest='ga_steps', type=int, default=1) PARSER.add_argument('--warmup_steps', dest='warmup_steps', type=int, default=4) -PARSER.add_argument('--batch_size', dest='batch_size', type=int, default=2) +PARSER.add_argument('--batch_size', dest='batch_size', type=int, default=7) PARSER.add_argument('--layout', dest='layout', type=str, choices=['NCDHW'], default='NCDHW') PARSER.add_argument('--input_shape', nargs='+', type=int, default=[128, 128, 128]) PARSER.add_argument('--val_input_shape', nargs='+', type=int, default=[128, 128, 128]) @@ -25,9 +25,9 @@ PARSER.add_argument('--benchmark', dest='benchmark', action='store_true', default=False) PARSER.add_argument('--amp', dest='amp', action='store_true', default=False) PARSER.add_argument('--optimizer', dest='optimizer', default="sgd", choices=["sgd", "adam", "lamb"], type=str) -PARSER.add_argument('--learning_rate', dest='learning_rate', type=float, default=1.0) +PARSER.add_argument('--learning_rate', dest='learning_rate', type=float, default=2.0) PARSER.add_argument('--init_learning_rate', dest='init_learning_rate', type=float, default=1e-4) -PARSER.add_argument('--lr_warmup_samples', dest='lr_warmup_samples', type=int, default=0) +PARSER.add_argument('--lr_warmup_samples', dest='lr_warmup_samples', type=int, default=168000) PARSER.add_argument('--lr_decay_samples', nargs='+', type=int, default=[]) PARSER.add_argument('--lr_decay_factor', dest='lr_decay_factor', type=float, default=1.0) PARSER.add_argument('--lamb_betas', nargs='+', type=int, default=[0.9, 0.999]) diff --git a/image_segmentation/pytorch/runtime/training.py b/image_segmentation/pytorch/runtime/training.py index 6bc4a0f78..c52404f5e 100644 --- a/image_segmentation/pytorch/runtime/training.py +++ b/image_segmentation/pytorch/runtime/training.py @@ -71,7 +71,7 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal CONSTANTS.EPOCH_COUNT: next_eval_at}) while total_samples < next_eval_at: - if total_samples <= flags.lr_warmup_samples and flags.lr_warmup_epochs > 0: + if total_samples <= flags.lr_warmup_samples and flags.lr_warmup_samples > 0: lr_warmup(optimizer, flags.init_learning_rate, flags.learning_rate, total_samples, flags.lr_warmup_samples) optimizer.zero_grad() From 0c4f4db1cebb299a8e72437eb7e555a7937cfaba Mon Sep 17 00:00:00 2001 From: michalm Date: Sun, 10 Dec 2023 15:07:25 +0100 Subject: [PATCH 09/12] Add support for lr decay --- image_segmentation/pytorch/main.py | 4 +- .../pytorch/runtime/distributed_utils.py | 2 +- .../pytorch/runtime/training.py | 57 +++++++++++-------- 3 files changed, 35 insertions(+), 28 deletions(-) diff --git a/image_segmentation/pytorch/main.py b/image_segmentation/pytorch/main.py index d40bab8bc..de87e4b1e 100644 --- a/image_segmentation/pytorch/main.py +++ b/image_segmentation/pytorch/main.py @@ -50,8 +50,8 @@ def main(): mllog_end(key=constants.INIT_STOP, sync=True) mllog_start(key=constants.RUN_START, sync=True) train_dataloader, val_dataloader = get_data_loaders(flags, num_shards=world_size, global_rank=local_rank) - samples_per_epoch = world_size * len(train_dataloader) * flags.batch_size - mllog_event(key='samples_per_epoch', value=samples_per_epoch, sync=False) + # samples_per_epoch = world_size * len(train_dataloader) * flags.batch_size + # mllog_event(key='samples_per_epoch', value=samples_per_epoch, sync=False) mllog_event(key=constants.GLOBAL_BATCH_SIZE, value=flags.batch_size * world_size * flags.ga_steps, sync=False) mllog_event(key=constants.GRADIENT_ACCUMULATION_STEPS, value=flags.ga_steps) loss_fn = DiceCELoss(to_onehot_y=True, use_softmax=True, layout=flags.layout, diff --git a/image_segmentation/pytorch/runtime/distributed_utils.py b/image_segmentation/pytorch/runtime/distributed_utils.py index 0a5a5cab4..b467969ee 100644 --- a/image_segmentation/pytorch/runtime/distributed_utils.py +++ b/image_segmentation/pytorch/runtime/distributed_utils.py @@ -97,7 +97,7 @@ def get_world_size(): def reduce_tensor(tensor, num_gpus): if num_gpus > 1: rt = tensor.clone() - dist.all_reduce(rt, op=dist.reduce_op.SUM) + dist.all_reduce(rt, op=dist.ReduceOp.SUM) if rt.is_floating_point(): rt = rt / num_gpus else: diff --git a/image_segmentation/pytorch/runtime/training.py b/image_segmentation/pytorch/runtime/training.py index c52404f5e..dc1d0e726 100644 --- a/image_segmentation/pytorch/runtime/training.py +++ b/image_segmentation/pytorch/runtime/training.py @@ -1,4 +1,5 @@ from tqdm import tqdm +from time import time import torch from torch.optim import Adam, SGD @@ -34,18 +35,21 @@ def lr_warmup(optimizer, init_lr, lr, current_samples, warmup_samples): param_group['lr'] = init_lr + (lr - init_lr) * scale -def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, callbacks, - is_distributed, samples_per_epoch): - rank = get_rank() +def lr_decay(optimizer, lr_decay_samples, lr_decay_factor, total_samples): + if total_samples > lr_decay_samples[0]: + lr_decay_samples = lr_decay_samples[1:] + for param_group in optimizer.param_groups: + param_group['lr'] *= lr_decay_factor + return lr_decay_samples + + +def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, callbacks, is_distributed): + world_size = get_world_size() torch.backends.cudnn.benchmark = flags.cudnn_benchmark torch.backends.cudnn.deterministic = flags.cudnn_deterministic optimizer = get_optimizer(model.parameters(), flags) - if flags.lr_decay_samples: - scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, - milestones=flags.lr_decay_samples, - gamma=flags.lr_decay_factor) scaler = GradScaler() model.to(device) @@ -59,7 +63,8 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal diverged = False total_samples = 0 iteration = 0 - next_eval_at = START_EVAL_AT + lr_decay_samples = flags.lr_decay_samples + next_eval_at = EVALUATE_EVERY model.train() train_loader = iter(train_loader) for callback in callbacks: @@ -70,15 +75,17 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal metadata={CONSTANTS.FIRST_EPOCH_NUM: total_samples, CONSTANTS.EPOCH_COUNT: next_eval_at}) + t0 = time() while total_samples < next_eval_at: if total_samples <= flags.lr_warmup_samples and flags.lr_warmup_samples > 0: lr_warmup(optimizer, flags.init_learning_rate, flags.learning_rate, total_samples, flags.lr_warmup_samples) + if len(flags.lr_decay_samples) > 0: + lr_decay_samples = lr_decay(optimizer, lr_decay_samples, flags.lr_decay_factor, total_samples) optimizer.zero_grad() - # for iteration, batch in enumerate(tqdm(train_loader, disable=(rank != 0) or not flags.verbose)): batch = next(train_loader) - total_samples = flags.batch_size * world_size + total_samples += flags.batch_size * world_size image, label = batch image, label = image.to(device), label.to(device) @@ -105,34 +112,34 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal optimizer.zero_grad() iteration += 1 - + print(f"Throughput: {round(EVALUATE_EVERY / (time() - t0), 2)} samples/s. Time {time() - t0}") # Evaluation - del output - mllog_start(key=CONSTANTS.EVAL_START, value=total_samples, - metadata={CONSTANTS.EPOCH_NUM: total_samples}, sync=False) + if total_samples >= START_EVAL_AT: + mllog_start(key=CONSTANTS.EVAL_START, value=total_samples, + metadata={CONSTANTS.EPOCH_NUM: total_samples}, sync=False) - eval_metrics = evaluate(flags, model, val_loader, loss_fn, score_fn, device, total_samples) + eval_metrics = evaluate(flags, model, val_loader, loss_fn, score_fn, device, total_samples) - mllog_event(key=CONSTANTS.EVAL_ACCURACY, value=eval_metrics["mean_dice"], - metadata={CONSTANTS.EPOCH_NUM: total_samples}, sync=False) - mllog_end(key=CONSTANTS.EVAL_STOP, metadata={CONSTANTS.EPOCH_NUM: total_samples}, sync=False) + mllog_event(key=CONSTANTS.EVAL_ACCURACY, value=eval_metrics["mean_dice"], + metadata={CONSTANTS.EPOCH_NUM: total_samples}, sync=False) + mllog_end(key=CONSTANTS.EVAL_STOP, metadata={CONSTANTS.EPOCH_NUM: total_samples}, sync=False) - model.train() - if eval_metrics["mean_dice"] >= flags.quality_threshold: - is_successful = True - elif eval_metrics["mean_dice"] < 1e-6: - print("MODEL DIVERGED. ABORTING.") - diverged = True + model.train() + if eval_metrics["mean_dice"] >= flags.quality_threshold: + is_successful = True + elif eval_metrics["mean_dice"] < 1e-6: + print("MODEL DIVERGED. ABORTING.") + diverged = True mllog_end(key=CONSTANTS.BLOCK_STOP, sync=False, metadata={CONSTANTS.FIRST_EPOCH_NUM: total_samples, CONSTANTS.EPOCH_COUNT: next_eval_at}) next_eval_at += EVALUATE_EVERY - mllog_end(key=CONSTANTS.RUN_STOP, sync=True, metadata={CONSTANTS.STATUS: CONSTANTS.SUCCESS if is_successful else CONSTANTS.ABORTED, CONSTANTS.EPOCH_COUNT: total_samples}) + for callback in callbacks: callback.on_fit_end() From 59699418bb77581e24a319d32481b5240e5102da Mon Sep 17 00:00:00 2001 From: michalm Date: Mon, 11 Dec 2023 08:45:18 +0100 Subject: [PATCH 10/12] syntax fixes --- image_segmentation/pytorch/main.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/image_segmentation/pytorch/main.py b/image_segmentation/pytorch/main.py index de87e4b1e..7cc2c8445 100644 --- a/image_segmentation/pytorch/main.py +++ b/image_segmentation/pytorch/main.py @@ -50,8 +50,6 @@ def main(): mllog_end(key=constants.INIT_STOP, sync=True) mllog_start(key=constants.RUN_START, sync=True) train_dataloader, val_dataloader = get_data_loaders(flags, num_shards=world_size, global_rank=local_rank) - # samples_per_epoch = world_size * len(train_dataloader) * flags.batch_size - # mllog_event(key='samples_per_epoch', value=samples_per_epoch, sync=False) mllog_event(key=constants.GLOBAL_BATCH_SIZE, value=flags.batch_size * world_size * flags.ga_steps, sync=False) mllog_event(key=constants.GRADIENT_ACCUMULATION_STEPS, value=flags.ga_steps) loss_fn = DiceCELoss(to_onehot_y=True, use_softmax=True, layout=flags.layout, @@ -61,7 +59,7 @@ def main(): if flags.exec_mode == 'train': train(flags, model, train_dataloader, val_dataloader, loss_fn, score_fn, - device=device, callbacks=callbacks, is_distributed=is_distributed, samples_per_epoch=samples_per_epoch) + device=device, callbacks=callbacks, is_distributed=is_distributed) elif flags.exec_mode == 'evaluate': eval_metrics = evaluate(flags, model, val_dataloader, loss_fn, score_fn, From b5cc52ea63c31033aaa820244c65209118a8f96e Mon Sep 17 00:00:00 2001 From: michalm Date: Tue, 12 Dec 2023 09:48:23 +0100 Subject: [PATCH 11/12] syntax fixes --- image_segmentation/pytorch/main.py | 1 - image_segmentation/pytorch/runtime/training.py | 7 +++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/image_segmentation/pytorch/main.py b/image_segmentation/pytorch/main.py index 7cc2c8445..7581c3f7a 100644 --- a/image_segmentation/pytorch/main.py +++ b/image_segmentation/pytorch/main.py @@ -20,7 +20,6 @@ def main(): - mllog.config(filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'unet3d.log')) mllog.config(filename=os.path.join("/results", 'unet3d.log')) mllogger = mllog.get_mllogger() mllogger.logger.propagate = False diff --git a/image_segmentation/pytorch/runtime/training.py b/image_segmentation/pytorch/runtime/training.py index dc1d0e726..19cbe7432 100644 --- a/image_segmentation/pytorch/runtime/training.py +++ b/image_segmentation/pytorch/runtime/training.py @@ -36,7 +36,7 @@ def lr_warmup(optimizer, init_lr, lr, current_samples, warmup_samples): def lr_decay(optimizer, lr_decay_samples, lr_decay_factor, total_samples): - if total_samples > lr_decay_samples[0]: + if len(lr_decay_samples) > 0 and total_samples > lr_decay_samples[0]: lr_decay_samples = lr_decay_samples[1:] for param_group in optimizer.param_groups: param_group['lr'] *= lr_decay_factor @@ -73,7 +73,7 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal while not diverged and not is_successful: mllog_start(key=CONSTANTS.BLOCK_START, sync=False, metadata={CONSTANTS.FIRST_EPOCH_NUM: total_samples, - CONSTANTS.EPOCH_COUNT: next_eval_at}) + CONSTANTS.EPOCH_COUNT: EVALUATE_EVERY}) t0 = time() while total_samples < next_eval_at: @@ -112,7 +112,6 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal optimizer.zero_grad() iteration += 1 - print(f"Throughput: {round(EVALUATE_EVERY / (time() - t0), 2)} samples/s. Time {time() - t0}") # Evaluation del output if total_samples >= START_EVAL_AT: @@ -134,7 +133,7 @@ def train(flags, model, train_loader, val_loader, loss_fn, score_fn, device, cal mllog_end(key=CONSTANTS.BLOCK_STOP, sync=False, metadata={CONSTANTS.FIRST_EPOCH_NUM: total_samples, - CONSTANTS.EPOCH_COUNT: next_eval_at}) + CONSTANTS.EPOCH_COUNT: EVALUATE_EVERY}) next_eval_at += EVALUATE_EVERY mllog_end(key=CONSTANTS.RUN_STOP, sync=True, From 56ed46d33fda6d9b7c5d9e15fef1bd232083827d Mon Sep 17 00:00:00 2001 From: michalm Date: Wed, 17 Jan 2024 17:55:35 +0100 Subject: [PATCH 12/12] Reverse dockerfile changes --- image_segmentation/pytorch/Dockerfile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/image_segmentation/pytorch/Dockerfile b/image_segmentation/pytorch/Dockerfile index f501ce8fc..cea560919 100644 --- a/image_segmentation/pytorch/Dockerfile +++ b/image_segmentation/pytorch/Dockerfile @@ -1,11 +1,11 @@ -#ARG FROM_IMAGE_NAME=pytorch/pytorch:1.7.1-cuda11.0-cudnn8-runtime -ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:23.04-py3 +ARG FROM_IMAGE_NAME=pytorch/pytorch:1.7.1-cuda11.0-cudnn8-runtime +#ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.02-py3 FROM ${FROM_IMAGE_NAME} -#RUN apt-get update && \ -# apt-get upgrade -y && \ -# apt-get install -y git -#RUN apt-get install -y vim +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install -y git +RUN apt-get install -y vim ADD . /workspace/unet3d WORKDIR /workspace/unet3d