From 6b977d0beb48d4eb0943046912519b539517b9e3 Mon Sep 17 00:00:00 2001 From: Pete Date: Thu, 7 Sep 2023 09:46:32 -0700 Subject: [PATCH] handle race conditions when saving to NFS on cirrascale (#255) --- olmo/train.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/olmo/train.py b/olmo/train.py index 6988a81ad..020ecc1d3 100644 --- a/olmo/train.py +++ b/olmo/train.py @@ -261,12 +261,26 @@ def save_sharded_checkpoint(self) -> Path: if get_fs_local_rank() == 0: # Replace temp directory with target checkpoint directory. - checkpoint_dir_tmp.replace(checkpoint_dir) + try: + checkpoint_dir_tmp.replace(checkpoint_dir) + except FileNotFoundError: + # Caught when another (file-system) local rank 0 has already replaced the tmp directory. + # This can happen when nodes are saving to a common NFS drive but otherwise have distinct + # file-systems. + if not checkpoint_dir.exists(): + raise # Link to 'latest'. latest_path = Path(self.cfg.save_folder) / "latest" latest_path.unlink(missing_ok=True) - latest_path.symlink_to(checkpoint_dir.name, target_is_directory=True) + try: + latest_path.symlink_to(checkpoint_dir.name, target_is_directory=True) + except FileExistsError: + # Same as above, caught when another (file-system) local rank 0 has already made the 'latest' symlink. + # This can happen when nodes are saving to a common NFS drive but otherwise have distinct + # file-systems. + if latest_path.resolve().name != checkpoint_dir.name: + raise # In the cases where we're using a shared NFS drive between ranks to save checkpoints, # replacing the temp directory with the final directory from rank 0 might not be immediately