Merge branch 'main' into sortformer/pr_01
tango4j authored Nov 26, 2024
2 parents 01085ab + 7198fa4 commit 470579d
Showing 4 changed files with 30 additions and 14 deletions.
3 changes: 0 additions & 3 deletions nemo/lightning/io/pl.py
@@ -155,9 +155,6 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio
         checkpoint_dir = ckpt_to_weights_subdir(path, is_saving=True)

         fs = get_filesystem(checkpoint_dir)
-        if fs.isdir(checkpoint_dir) and dist_checkpointing.check_is_distributed_checkpoint(checkpoint_dir):
-            logging.info(f'Distributed checkpoint at path {checkpoint_dir} already exists, skipping saving')
-            return
         fs.makedirs(checkpoint_dir, exist_ok=True)

         validate_sharding_integrity = not (self.validated_consistency and self.assume_constant_structure)
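The deleted guard made save_checkpoint a silent no-op whenever a distributed checkpoint already existed at the destination; after this change the directory is always (re)created and the save proceeds. A minimal before/after sketch, with hypothetical helper names, reusing the same dist_checkpointing.check_is_distributed_checkpoint call from megatron.core that the removed lines relied on:

from pathlib import Path

from megatron.core import dist_checkpointing


def prepare_dir_old(checkpoint_dir: Path) -> bool:
    # Removed behavior: bail out if a distributed checkpoint
    # is already present at the target directory.
    if checkpoint_dir.is_dir() and dist_checkpointing.check_is_distributed_checkpoint(str(checkpoint_dir)):
        return False  # caller skipped the save entirely
    checkpoint_dir.mkdir(parents=True, exist_ok=True)
    return True


def prepare_dir_new(checkpoint_dir: Path) -> bool:
    # Current behavior: always create the directory and proceed,
    # even if a checkpoint already exists there.
    checkpoint_dir.mkdir(parents=True, exist_ok=True)
    return True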
1 change: 1 addition & 0 deletions nemo/lightning/pytorch/strategies/fsdp_strategy.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import os
 import shutil
 from collections import OrderedDict
 from pathlib import Path
38 changes: 27 additions & 11 deletions nemo/lightning/resume.py
@@ -37,17 +37,25 @@


 def _try_restore_tokenizer(model, ckpt_path):
+    from nemo.collections.common.tokenizers import TokenizerSpec
     from nemo.lightning.io import load_context

     try:
         tokenizer = load_context(ckpt_path, "model.tokenizer")
+    except ValueError as e:
+        logging.warning(
+            f"Encountered error while trying to restore tokenizer. Tokenizer is not restored. " f"Original error: {e}"
+        )
+        return model
+
+    if isinstance(tokenizer, TokenizerSpec):
         model.tokenizer = tokenizer
         model.__io__.tokenizer = tokenizer.__io__
-    except:
-        # Ignore if the ckpt doesn't have a tokenizer.
-        pass
-    finally:
-        return model
+    else:
+        # Ignore if the ckpt doesn't have a tokenizer. type(tokenizer)==TrainerContext in this case.
+        logging.warning("Checkpoint does not have model.tokenizer field. Tokenizer is not restored.")
+
+    return model


 @dataclass(kw_only=True)
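The rewrite changes the helper's failure modes: a ValueError from load_context now warns and returns the model unchanged, and a successful load that is not a TokenizerSpec (e.g. a TrainerContext) also warns instead of being silently swallowed by the old bare except/finally. A sketch of the contract a caller can rely on, using hypothetical model and ckpt_path stand-ins:

from nemo.collections.common.tokenizers import TokenizerSpec

restored = _try_restore_tokenizer(model, ckpt_path)  # never raises
assert restored is model  # the same model object always comes back
if isinstance(getattr(restored, "tokenizer", None), TokenizerSpec):
    # The checkpoint carried a tokenizer and it is now attached.
    pass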
@@ -56,8 +64,10 @@ class AutoResume:
     checkpoints in NeMo.
     Attributes:
-        restore_config (Optional[RestoreConfig]): Optional config for selectively restoring specific parts like model weights, optimizer states, etc.
-            If the config contains a path from HF or another non-NeMo checkpoint format, the checkpoint will be automatically converted to a NeMo compatible format.
+        restore_config (Optional[RestoreConfig]): Optional config for selectively restoring specific parts like model
+            weights, optimizer states, etc.
+            If the config contains a path from HF or another non-NeMo checkpoint format, the checkpoint will be
+            automatically converted to a NeMo compatible format.
             resume_from_folder or the run's log_dir takes precedence over restore_config.
         resume_from_directory (str): Path to the checkpointing directory to restore from.
         resume_from_path (str): Path to a specific checkpoint to restore from.
@@ -209,17 +219,22 @@ def _find_trainer_ckpt_path(self) -> Optional[Path]:

         if not checkpoint_dir.exists() or (not len(end_checkpoints) > 0 and not len(last_checkpoints) > 0):
             if self.resume_ignore_no_checkpoint:
-                warn = f"There were no checkpoints found in checkpoint_dir or no checkpoint folder at checkpoint_dir :{checkpoint_dir}. "
+                warn = (
+                    f"There were no checkpoints found in checkpoint_dir or no checkpoint folder at checkpoint_dir "
+                    f":{checkpoint_dir}. "
+                )
                 if checkpoint is None:
                     warn += "Training from scratch."
                 logging.warning(warn)
             else:
                 if self.restore_config:
-                    # resume_if_exists is True but run is not resumable. Do not fail and try to do selective restore later instead.
+                    # resume_if_exists is True but run is not resumable. Do not fail and try to do selective restore
+                    # later instead.
                     return None
                 else:
                     raise NotFoundError(
-                        f"There were no checkpoints found in checkpoint_dir or no checkpoint folder at checkpoint_dir :{checkpoint_dir}. Cannot resume."
+                        f"There were no checkpoints found in checkpoint_dir or no checkpoint folder at checkpoint_dir "
+                        f":{checkpoint_dir}. Cannot resume."
                     )
         elif len(end_checkpoints) > 0:
             if not self.resume_past_end:
@@ -240,7 +255,8 @@ def _find_trainer_ckpt_path(self) -> Optional[Path]:
                 # Select the checkpoint with the latest modified time
                 checkpoint = sorted(last_checkpoints, key=lambda pth: pth.lstat().st_mtime, reverse=True)[0]
                 logging.warning(
-                    f"Multiple checkpoints {last_checkpoints} matches *last.ckpt. Selecting one with the latest modified time."
+                    f"Multiple checkpoints {last_checkpoints} matches *last.ckpt. Selecting one with the latest "
+                    f"modified time."
                 )
             else:
                 checkpoint = last_checkpoints[0]
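Taken together, the reflowed messages above trace AutoResume's three outcomes when no checkpoint is found: warn and train from scratch (resume_ignore_no_checkpoint=True), defer to selective restore (restore_config set), or raise NotFoundError. A minimal configuration sketch, assuming the NeMo 2.x AutoResume API defined in this file:

from nemo.lightning.resume import AutoResume

resume = AutoResume(
    resume_if_exists=True,             # pick up *last.ckpt from the run's checkpoint dir if present
    resume_ignore_no_checkpoint=True,  # warn and start from scratch instead of raising NotFoundError
)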
2 changes: 2 additions & 0 deletions tests/collections/llm/test_mnist_model_nemo2_fsdp.py
@@ -525,6 +525,7 @@ def run_train_mnist_litautoencoder_with_fsdp_strategy_single_gpu():
             every_n_train_steps=5,
             # Enables the .nemo file-like checkpointing where all IOMixins are under SerDe
             always_save_context=True,
+            filename="{model_name}--{val_loss:.2f}-{step}-{consumed_samples}",
         )
         root_dir = tmpdir
         save_dir = root_dir / name
@@ -572,6 +573,7 @@ def run_train_mnist_litautoencoder_with_fsdp_strategy_single_gpu():
                 global_batch_size=2,
                 output_log=False,  # Disable logs to support predict_step
             ),
+            ckpt_load_optimizer=False,
         )
         predict_trainer = nl.Trainer(
             accelerator="gpu",
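The two one-line test additions pin an explicit checkpoint filename template and turn off optimizer-state loading for the predict trainer. A rough sketch of that wiring in isolation (nl is nemo.lightning; treating ckpt_load_optimizer as an FSDP strategy flag here is an assumption):

import nemo.lightning as nl

checkpoint_callback = nl.ModelCheckpoint(
    every_n_train_steps=5,
    always_save_context=True,
    # Predictable names make saved checkpoints easy to locate on resume.
    filename="{model_name}--{val_loss:.2f}-{step}-{consumed_samples}",
)

predict_strategy = nl.FSDPStrategy(
    # Prediction does not need optimizer state, so skip restoring it.
    ckpt_load_optimizer=False,
)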
