diff --git a/Dockerfile.ci b/Dockerfile.ci
index 15cd016073ca..07b0c6a0ab57 100644
--- a/Dockerfile.ci
+++ b/Dockerfile.ci
@@ -34,7 +34,7 @@ WORKDIR /workspace
 # Install NeMo requirements
 ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
 ARG MODELOPT_VERSION=0.13.0
-ARG MCORE_TAG=c7a1f82d761577e6ca0338d3521eac82f2aa0904
+ARG MCORE_TAG=338af51452a53982d202e8386db6233adad1ce86
 ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
 RUN \
   --mount=type=bind,source=requirements,target=requirements \
diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
index 809ca30ca5ed..ea469fade004 100755
--- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
@@ -174,7 +174,7 @@ model:
   fsdp_sharded_checkpoint: False # Store and load FSDP shared checkpoint.
 
   # Distributed checkpoint setup
-  dist_ckpt_format: 'zarr' # Set to 'torch_dist' to use PyTorch distributed checkpoint format.
+  dist_ckpt_format: 'torch_dist' # Set to 'torch_dist' to use PyTorch distributed checkpoint format.
   dist_ckpt_load_on_device: True # whether to load checkpoint weights directly on GPU or to CPU
   dist_ckpt_parallel_save: True # if true, each worker will write its own part of the dist checkpoint
   dist_ckpt_parallel_save_within_dp: False # if true, save will be parallelized only within a DP group (whole world otherwise), which might slightly reduce the save overhead
diff --git a/nemo/lightning/io/pl.py b/nemo/lightning/io/pl.py
index d0749fbeead7..6625ca0a1622 100644
--- a/nemo/lightning/io/pl.py
+++ b/nemo/lightning/io/pl.py
@@ -208,6 +208,13 @@ def _determine_dist_ckpt_save_strategy(self):
         are passed in config or in case of a fully parallel save in which case
         a parallelization wrapper is applied.
         """
+        if self.save_ckpt_format == 'zarr':
+            logging.warning(
+                f'`zarr` distributed checkpoint backend is deprecated.'
+                f' Distributed optimizer checkpoint saving might be extremely slow.'
+                f' Please switch to PyTorch Distributed format (model.dist_ckpt_format=torch_dist).'
+            )
+
         if self.async_save and self.save_ckpt_format != 'torch_dist':
             raise ValueError('Async dist-ckpt save supported only for torch_dist format')
diff --git a/nemo/utils/callbacks/dist_ckpt_io.py b/nemo/utils/callbacks/dist_ckpt_io.py
index 9348779051bb..1f4e949ec340 100644
--- a/nemo/utils/callbacks/dist_ckpt_io.py
+++ b/nemo/utils/callbacks/dist_ckpt_io.py
@@ -242,7 +242,7 @@ def from_config(cls, model_cfg: dict, async_save: bool = False):
                 it should be provided separately. Defaults to False.
         """
         return cls(
-            save_ckpt_format=model_cfg.get('dist_ckpt_format', 'zarr'),
+            save_ckpt_format=model_cfg.get('dist_ckpt_format', 'torch_dist'),
             load_directly_on_device=model_cfg.get('dist_ckpt_load_on_device', True),
             load_strictness=model_cfg.get('dist_ckpt_load_strictness', None),
             async_save=async_save,
@@ -360,6 +360,13 @@ def _determine_dist_ckpt_save_strategy(self):
         are passed in config or in case of a fully parallel save in which case
         a parallelization wrapper is applied.
         """
+        if self.save_ckpt_format == 'zarr':
+            logging.warning(
+                f'`zarr` distributed checkpoint backend is deprecated.'
+                f' Distributed optimizer checkpoint saving might be extremely slow.'
+                f' Please switch to PyTorch Distributed format (model.dist_ckpt_format=torch_dist).'
+            )
+
         if self.async_save and self.save_ckpt_format != 'torch_dist':
             raise ValueError('Async dist-ckpt save supported only for torch_dist format')
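
For downstream code that builds the checkpoint IO from a model config, the change only affects what an absent `dist_ckpt_format` resolves to; an explicit `zarr` setting keeps working but now logs a deprecation warning when the save strategy is determined. A minimal sketch of the resulting behaviour, assuming the class in nemo/utils/callbacks/dist_ckpt_io.py is named DistributedCheckpointIO and is importable as shown (the class name is not visible in the hunks above; the from_config signature and constructor kwargs are taken from the diff):

    # Sketch only: import path and class name are assumed, not shown in the diff.
    from nemo.utils.callbacks.dist_ckpt_io import DistributedCheckpointIO

    # A config without 'dist_ckpt_format' now resolves to the PyTorch Distributed backend.
    ckpt_io = DistributedCheckpointIO.from_config(model_cfg={}, async_save=False)
    assert ckpt_io.save_ckpt_format == 'torch_dist'

    # Explicitly requesting 'zarr' is still accepted, but save-strategy resolution
    # now emits the deprecation warning, and async save remains torch_dist-only.
    legacy_io = DistributedCheckpointIO.from_config({'dist_ckpt_format': 'zarr'})

Jobs that pin model.dist_ckpt_format=zarr in YAML or on the command line keep their current behaviour apart from the warning; dropping the override picks up torch_dist.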