Avoid duplicated checkpoint save (NVIDIA#7555) (NVIDIA#7566)
Signed-off-by: Mikołaj Błaż <[email protected]>
Co-authored-by: mikolajblaz <[email protected]>
Signed-off-by: Elena Rastorgueva <[email protected]>
2 people authored and erastorgueva-nv committed Oct 11, 2023
1 parent 5541f60 commit affdb5d
Showing 1 changed file with 22 additions and 13 deletions.
35 changes: 22 additions & 13 deletions nemo/collections/nlp/parts/nlp_overrides.py
@@ -291,6 +291,10 @@ def save_checkpoint(
             checkpoint_dir = ckpt_to_dir(filepath)
 
             fs = get_filesystem(checkpoint_dir)
+            if fs.isdir(checkpoint_dir) and dist_checkpointing.check_is_distributed_checkpoint(checkpoint_dir):
+                logging.info(f'Distributed checkpoint at path {checkpoint_dir} already exists, skipping saving')
+                return
+
             if is_global_rank_zero():
                 fs.makedirs(checkpoint_dir, exist_ok=True)
 
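The hunk above adds an early-exit guard: if the target directory already contains a finished distributed checkpoint, the save is skipped instead of being redone. Below is a minimal, self-contained sketch of the same pattern. The helpers looks_like_distributed_checkpoint and write_shards are hypothetical stand-ins for dist_checkpointing.check_is_distributed_checkpoint and dist_checkpointing.save, not the NeMo/Megatron API.

# Sketch of the "skip if a distributed checkpoint already exists" guard,
# using hypothetical stand-in helpers so it runs on its own.
import logging
import os
import pickle


def looks_like_distributed_checkpoint(checkpoint_dir: str) -> bool:
    # Stand-in check: a directory with a metadata marker counts as a finished checkpoint.
    return os.path.isdir(checkpoint_dir) and os.path.exists(os.path.join(checkpoint_dir, "metadata.pkl"))


def write_shards(sharded_state_dict: dict, checkpoint_dir: str) -> None:
    # Stand-in for the real sharded save: dump the state dict plus a metadata marker.
    with open(os.path.join(checkpoint_dir, "shard_0.pkl"), "wb") as f:
        pickle.dump(sharded_state_dict, f)
    with open(os.path.join(checkpoint_dir, "metadata.pkl"), "wb") as f:
        pickle.dump({"num_shards": 1}, f)


def save_checkpoint(sharded_state_dict: dict, checkpoint_dir: str) -> None:
    if looks_like_distributed_checkpoint(checkpoint_dir):
        logging.info("Distributed checkpoint at path %s already exists, skipping saving", checkpoint_dir)
        return
    os.makedirs(checkpoint_dir, exist_ok=True)
    write_shards(sharded_state_dict, checkpoint_dir)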
@@ -477,19 +481,24 @@ def save_to(self, model, save_path: str):
                 # model weights is a directory
                 dist_ckpt_dir = ckpt_to_dir(os.path.join(dir_name, self.model_weights_ckpt))
                 fs = get_filesystem(dist_ckpt_dir)
-                if is_global_rank_zero():
-                    fs.makedirs(dist_ckpt_dir, exist_ok=True)
-                sharded_state_dict = model.sharded_state_dict()
-                # dist checkpoint needs torch.distributed to save the checkpoint
-                if parallel_state.is_unitialized():
-
-                    def dummy():
-                        return
-
-                    if model.trainer.strategy.launcher is not None:
-                        model.trainer.strategy.launcher.launch(dummy, trainer=model.trainer)
-                    model.trainer.strategy.setup_environment()
-                dist_checkpointing.save(sharded_state_dict=sharded_state_dict, checkpoint_dir=dist_ckpt_dir)
+
+                if fs.isdir(dist_ckpt_dir) and dist_checkpointing.check_is_distributed_checkpoint(dist_ckpt_dir):
+                    logging.info(f'Distributed checkpoint at path {dist_ckpt_dir} already exists, skipping saving')
+                else:
+                    if is_global_rank_zero():
+                        fs.makedirs(dist_ckpt_dir, exist_ok=True)
+
+                    sharded_state_dict = model.sharded_state_dict()
+                    # dist checkpoint needs torch.distributed to save the checkpoint
+                    if parallel_state.is_unitialized():
+
+                        def dummy():
+                            return
+
+                        if model.trainer.strategy.launcher is not None:
+                            model.trainer.strategy.launcher.launch(dummy, trainer=model.trainer)
+                        model.trainer.strategy.setup_environment()
+                    dist_checkpointing.save(sharded_state_dict=sharded_state_dict, checkpoint_dir=dist_ckpt_dir)
 
             else:
 
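The second hunk keeps the existing lazy-initialization step: dist_checkpointing.save is a collective call, so torch.distributed must be running before it executes, which the real code ensures by launching a no-op through model.trainer.strategy. The sketch below shows the same idea in standalone form, assuming a single-process gloo fallback group; save_all_shards is a hypothetical stand-in for dist_checkpointing.save, not the NeMo code path.

# Sketch of "make sure torch.distributed is initialized before a collective save".
import os

import torch
import torch.distributed as dist


def ensure_process_group() -> None:
    # Initialize a process group if none is running yet (single-process fallback for illustration).
    if not dist.is_initialized():
        os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
        os.environ.setdefault("MASTER_PORT", "29500")
        dist.init_process_group(backend="gloo", rank=0, world_size=1)


def save_all_shards(sharded_state_dict: dict, checkpoint_dir: str) -> None:
    ensure_process_group()
    os.makedirs(checkpoint_dir, exist_ok=True)
    # Each rank writes its own shard; with the fallback group there is only rank 0.
    torch.save(sharded_state_dict, os.path.join(checkpoint_dir, f"shard_rank{dist.get_rank()}.pt"))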