From 5c4c0c64f2746098b2f64efb24066061f1f22950 Mon Sep 17 00:00:00 2001
From: AleHC
Date: Wed, 20 Nov 2024 11:02:40 +0100
Subject: [PATCH] Fixed wrong lr initialization when loading checkpoints

---
 src/nanotron/trainer.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py
index 21251a32..9a01b7a1 100644
--- a/src/nanotron/trainer.py
+++ b/src/nanotron/trainer.py
@@ -209,6 +209,13 @@ def __init__(
                 parallel_context=self.parallel_context,
                 root_folder=self.init_checkpoint_path,
             )
+            # Update optimizer learning rate because otherwise it is set to zero in the first iteration.
+            param_groups = self.optimizer.get_base_optimizer().param_groups
+            last_lrs = self.lr_scheduler.get_last_lr()
+            assert len(param_groups) == len(last_lrs)
+            for group, last_lr in zip(param_groups, last_lrs):
+                assert "lr" in group
+                group["lr"] = last_lr
 
         # Define iteration start state
         if self.init_checkpoint_path is not None:
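
Note (not part of the patch): below is a minimal standalone sketch of the behavior this patch addresses, using plain torch.optim.SGD and LambdaLR in place of nanotron's optimizer and scheduler wrappers; the names warmup and ckpt are illustrative only. When a scheduler's state_dict is loaded on resume, the optimizer's param_groups keep the lr set at construction time (zero under a warmup lambda) until the next scheduler.step(), so the first resumed iteration would otherwise run with the wrong lr. Copying get_last_lr() back into the param groups, as the patch does, restores the correct value.

import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import LambdaLR

param = torch.nn.Parameter(torch.zeros(1))

# Illustrative warmup schedule: lr factor ramps linearly from 0 to 1 over 10 steps.
def warmup(step):
    return min(step / 10, 1.0)

# Original run: train a few steps, then checkpoint the scheduler state.
opt = SGD([param], lr=1.0)
sched = LambdaLR(opt, lr_lambda=warmup)
for _ in range(5):
    opt.step()
    sched.step()
ckpt = {"lr_scheduler": sched.state_dict()}

# Resume: rebuild optimizer and scheduler, then restore the scheduler state.
opt = SGD([param], lr=1.0)
sched = LambdaLR(opt, lr_lambda=warmup)
sched.load_state_dict(ckpt["lr_scheduler"])

# The scheduler remembers the right lr, but the optimizer still holds the value
# set at construction time (warmup(0) == 0.0); it is only refreshed by the next
# sched.step(), i.e. after the first resumed iteration has already run.
print(sched.get_last_lr()[0])        # 0.5
print(opt.param_groups[0]["lr"])     # 0.0

# The fix mirrors the patch: copy the scheduler's last lr back into the param groups.
for group, last_lr in zip(opt.param_groups, sched.get_last_lr()):
    group["lr"] = last_lr
print(opt.param_groups[0]["lr"])     # 0.5

The patch applies the same copy right after load_lr_scheduler, on the base optimizer's param_groups, so the first post-resume step uses the checkpointed learning rate instead of the warmup's initial zero.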