Merge branch 'main' into sortformer/pr_01
tango4j authored Nov 26, 2024
2 parents 01085ab + 7198fa4 commit 470579d
Showing 4 changed files with 30 additions and 14 deletions.
3 changes: 0 additions & 3 deletions nemo/lightning/io/pl.py
@@ -155,9 +155,6 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio
         checkpoint_dir = ckpt_to_weights_subdir(path, is_saving=True)

         fs = get_filesystem(checkpoint_dir)
-        if fs.isdir(checkpoint_dir) and dist_checkpointing.check_is_distributed_checkpoint(checkpoint_dir):
-            logging.info(f'Distributed checkpoint at path {checkpoint_dir} already exists, skipping saving')
-            return
         fs.makedirs(checkpoint_dir, exist_ok=True)

         validate_sharding_integrity = not (self.validated_consistency and self.assume_constant_structure)
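The deleted guard made save_checkpoint a silent no-op whenever a distributed checkpoint already existed at the destination; after this change the directory is always (re)created and the save proceeds. A minimal before/after sketch, with hypothetical helper names, reusing the same dist_checkpointing.check_is_distributed_checkpoint call from megatron.core that the removed lines relied on:

from pathlib import Path

from megatron.core import dist_checkpointing


def prepare_dir_old(checkpoint_dir: Path) -> bool:
    # Removed behavior: bail out if a distributed checkpoint
    # is already present at the target directory.
    if checkpoint_dir.is_dir() and dist_checkpointing.check_is_distributed_checkpoint(str(checkpoint_dir)):
        return False  # caller skipped the save entirely
    checkpoint_dir.mkdir(parents=True, exist_ok=True)
    return True


def prepare_dir_new(checkpoint_dir: Path) -> bool:
    # Current behavior: always create the directory and proceed,
    # even if a checkpoint already exists there.
    checkpoint_dir.mkdir(parents=True, exist_ok=True)
    return True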
1 change: 1 addition & 0 deletions nemo/lightning/pytorch/strategies/fsdp_strategy.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import os
 import shutil
 from collections import OrderedDict
 from pathlib import Path
38 changes: 27 additions & 11 deletions nemo/lightning/resume.py
@@ -37,17 +37,25 @@


 def _try_restore_tokenizer(model, ckpt_path):
+    from nemo.collections.common.tokenizers import TokenizerSpec
     from nemo.lightning.io import load_context

     try:
         tokenizer = load_context(ckpt_path, "model.tokenizer")
+    except ValueError as e:
+        logging.warning(
+            f"Encountered error while trying to restore tokenizer. Tokenizer is not restored. " f"Original error: {e}"
+        )
+        return model
+
+    if isinstance(tokenizer, TokenizerSpec):
         model.tokenizer = tokenizer
         model.__io__.tokenizer = tokenizer.__io__
-    except:
-        # Ignore if the ckpt doesn't have a tokenizer.
-        pass
-    finally:
-        return model
+    else:
+        # Ignore if the ckpt doesn't have a tokenizer. type(tokenizer)==TrainerContext in this case.
+        logging.warning("Checkpoint does not have model.tokenizer field. Tokenizer is not restored.")
+
+    return model


 @dataclass(kw_only=True)
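The rewrite changes the helper's failure modes: a ValueError from load_context now warns and returns the model unchanged, and a successful load that is not a TokenizerSpec (e.g. a TrainerContext) also warns instead of being silently swallowed by the old bare except/finally. A sketch of the contract a caller can rely on, using hypothetical model and ckpt_path stand-ins:

from nemo.collections.common.tokenizers import TokenizerSpec

restored = _try_restore_tokenizer(model, ckpt_path)  # never raises
assert restored is model  # the same model object always comes back
if isinstance(getattr(restored, "tokenizer", None), TokenizerSpec):
    # The checkpoint carried a tokenizer and it is now attached.
    pass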
@@ -56,8 +64,10 @@ class AutoResume:
     checkpoints in NeMo.
     Attributes:
-        restore_config (Optional[RestoreConfig]): Optional config for selectively restoring specific parts like model weights, optimizer states, etc.
-            If the config contains a path from HF or another non-NeMo checkpoint format, the checkpoint will be automatically converted to a NeMo compatible format.
+        restore_config (Optional[RestoreConfig]): Optional config for selectively restoring specific parts like model
+            weights, optimizer states, etc.
+            If the config contains a path from HF or another non-NeMo checkpoint format, the checkpoint will be
+            automatically converted to a NeMo compatible format.
             resume_from_folder or the run's log_dir takes precedence over restore_config.
         resume_from_directory (str): Path to the checkpointing directory to restore from.
         resume_from_path (str): Path to a specific checkpoint to restore from.
@@ -209,17 +219,22 @@ def _find_trainer_ckpt_path(self) -> Optional[Path]:

         if not checkpoint_dir.exists() or (not len(end_checkpoints) > 0 and not len(last_checkpoints) > 0):
             if self.resume_ignore_no_checkpoint:
-                warn = f"There were no checkpoints found in checkpoint_dir or no checkpoint folder at checkpoint_dir :{checkpoint_dir}. "
+                warn = (
+                    f"There were no checkpoints found in checkpoint_dir or no checkpoint folder at checkpoint_dir "
+                    f":{checkpoint_dir}. "
+                )
                 if checkpoint is None:
                     warn += "Training from scratch."
                 logging.warning(warn)
             else:
                 if self.restore_config:
-                    # resume_if_exists is True but run is not resumable. Do not fail and try to do selective restore later instead.
+                    # resume_if_exists is True but run is not resumable. Do not fail and try to do selective restore
+                    # later instead.
                     return None
                 else:
                     raise NotFoundError(
-                        f"There were no checkpoints found in checkpoint_dir or no checkpoint folder at checkpoint_dir :{checkpoint_dir}. Cannot resume."
+                        f"There were no checkpoints found in checkpoint_dir or no checkpoint folder at checkpoint_dir "
+                        f":{checkpoint_dir}. Cannot resume."
                     )
         elif len(end_checkpoints) > 0:
             if not self.resume_past_end:
@@ -240,7 +255,8 @@ def _find_trainer_ckpt_path(self) -> Optional[Path]:
                 # Select the checkpoint with the latest modified time
                 checkpoint = sorted(last_checkpoints, key=lambda pth: pth.lstat().st_mtime, reverse=True)[0]
                 logging.warning(
-                    f"Multiple checkpoints {last_checkpoints} matches *last.ckpt. Selecting one with the latest modified time."
+                    f"Multiple checkpoints {last_checkpoints} matches *last.ckpt. Selecting one with the latest "
+                    f"modified time."
                 )
             else:
                 checkpoint = last_checkpoints[0]
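Taken together, the reflowed messages above trace AutoResume's three outcomes when no checkpoint is found: warn and train from scratch (resume_ignore_no_checkpoint=True), defer to selective restore (restore_config set), or raise NotFoundError. A minimal configuration sketch, assuming the NeMo 2.x AutoResume API defined in this file:

from nemo.lightning.resume import AutoResume

resume = AutoResume(
    resume_if_exists=True,             # pick up *last.ckpt from the run's checkpoint dir if present
    resume_ignore_no_checkpoint=True,  # warn and start from scratch instead of raising NotFoundError
)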
2 changes: 2 additions & 0 deletions tests/collections/llm/test_mnist_model_nemo2_fsdp.py
@@ -525,6 +525,7 @@ def run_train_mnist_litautoencoder_with_fsdp_strategy_single_gpu():
             every_n_train_steps=5,
             # Enables the .nemo file-like checkpointing where all IOMixins are under SerDe
             always_save_context=True,
+            filename="{model_name}--{val_loss:.2f}-{step}-{consumed_samples}",
         )
         root_dir = tmpdir
         save_dir = root_dir / name
@@ -572,6 +573,7 @@ def run_train_mnist_litautoencoder_with_fsdp_strategy_single_gpu():
                 global_batch_size=2,
                 output_log=False,  # Disable logs to support predict_step
             ),
+            ckpt_load_optimizer=False,
         )
         predict_trainer = nl.Trainer(
             accelerator="gpu",
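The two one-line test additions pin an explicit checkpoint filename template and turn off optimizer-state loading for the predict trainer. A rough sketch of that wiring in isolation (nl is nemo.lightning; treating ckpt_load_optimizer as an FSDP strategy flag here is an assumption):

import nemo.lightning as nl

checkpoint_callback = nl.ModelCheckpoint(
    every_n_train_steps=5,
    always_save_context=True,
    # Predictable names make saved checkpoints easy to locate on resume.
    filename="{model_name}--{val_loss:.2f}-{step}-{consumed_samples}",
)

predict_strategy = nl.FSDPStrategy(
    # Prediction does not need optimizer state, so skip restoring it.
    ckpt_load_optimizer=False,
)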
