Skip to content

Commit

Permalink
Remove logic to skip checkpoint save if checkpoint exists (#11362)
Browse files Browse the repository at this point in the history
* Remove the logic that skips saving a checkpoint when a distributed checkpoint already exists at the target path

Signed-off-by: ashors1 <[email protected]>

* Update the checkpoint filename pattern in the MNIST FSDP test

Signed-off-by: ashors1 <[email protected]>

* Add the missing `os` import in fsdp_strategy.py

Signed-off-by: ashors1 <[email protected]>

* Fix the MNIST FSDP test (disable optimizer-state loading via `ckpt_load_optimizer=False`)

Signed-off-by: ashors1 <[email protected]>

* Apply isort and black reformatting

Signed-off-by: ashors1 <[email protected]>

---------

Signed-off-by: ashors1 <[email protected]>
Signed-off-by: ashors1 <[email protected]>
Co-authored-by: ashors1 <[email protected]>
  • Loading branch information
ashors1 and ashors1 authored Nov 26, 2024
1 parent 613d0f2 commit 706eb09
Show file tree
Hide file tree
Showing 3 changed files with 3 additions and 3 deletions.
3 changes: 0 additions & 3 deletions nemo/lightning/io/pl.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,9 +155,6 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio
checkpoint_dir = ckpt_to_weights_subdir(path, is_saving=True)

fs = get_filesystem(checkpoint_dir)
if fs.isdir(checkpoint_dir) and dist_checkpointing.check_is_distributed_checkpoint(checkpoint_dir):
logging.info(f'Distributed checkpoint at path {checkpoint_dir} already exists, skipping saving')
return
fs.makedirs(checkpoint_dir, exist_ok=True)

validate_sharding_integrity = not (self.validated_consistency and self.assume_constant_structure)
Expand Down
1 change: 1 addition & 0 deletions nemo/lightning/pytorch/strategies/fsdp_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import shutil
from collections import OrderedDict
from pathlib import Path
Expand Down
2 changes: 2 additions & 0 deletions tests/collections/llm/test_mnist_model_nemo2_fsdp.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,6 +525,7 @@ def run_train_mnist_litautoencoder_with_fsdp_strategy_single_gpu():
every_n_train_steps=5,
# Enables the .nemo file-like checkpointing where all IOMixins are under SerDe
always_save_context=True,
filename="{model_name}--{val_loss:.2f}-{step}-{consumed_samples}",
)
root_dir = tmpdir
save_dir = root_dir / name
Expand Down Expand Up @@ -572,6 +573,7 @@ def run_train_mnist_litautoencoder_with_fsdp_strategy_single_gpu():
global_batch_size=2,
output_log=False, # Disable logs to support predict_step
),
ckpt_load_optimizer=False,
)
predict_trainer = nl.Trainer(
accelerator="gpu",
Expand Down

0 comments on commit 706eb09

Please sign in to comment.