Skip to content

Commit

Permalink
rm rm (NVIDIA#11116)
Browse files (browse the repository at this point in the history)
Signed-off-by: Alexandros Koumparoulis <[email protected]>
Signed-off-by: Hainan Xu <[email protected]>
  • Loading branch information
akoumpa authored and Hainan Xu committed Nov 5, 2024
1 parent d05f686 commit d6995bf
Showing 1 changed file with 7 additions and 15 deletions.
22 changes: 7 additions & 15 deletions nemo/lightning/io/pl.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,21 +142,13 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio
validate_sharding_integrity = not (self.validated_consistency and self.assume_constant_structure)
self.validated_consistency = True

try:
return dist_checkpointing.save(
sharded_state_dict=checkpoint,
checkpoint_dir=checkpoint_dir,
sharded_strategy=self.save_sharded_strategy,
validate_access_integrity=validate_sharding_integrity,
async_sharded_save=self.async_save,
)
except:
logging.error(f"Failed to save checkpoint to {checkpoint_dir}")
# Do cleanup.
import shutil

shutil.rmtree(checkpoint_dir)
raise
return dist_checkpointing.save(
sharded_state_dict=checkpoint,
checkpoint_dir=checkpoint_dir,
sharded_strategy=self.save_sharded_strategy,
validate_access_integrity=validate_sharding_integrity,
async_sharded_save=self.async_save,
)

@override
def load_checkpoint(
Expand Down

0 comments on commit d6995bf

Please sign in to comment.