From ca6035aa77b96425a5cf40fdc24cd95b233e4947 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 23 Oct 2024 21:02:53 +0200 Subject: [PATCH 1/8] llm.generate fixes (#10983) (#11007) * fix context path, disable optimizer init, add tp * format * address comments, require user to provide trainer * minor fix * minor fixes --------- Signed-off-by: HuiyingLi Co-authored-by: Huiying --- nemo/collections/llm/api.py | 2 +- nemo/collections/llm/inference/base.py | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 71e006472db9..a9b3d4361f5b 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -436,7 +436,7 @@ def export_ckpt( def generate( path: Union[Path, str], prompts: list[str], - trainer: Optional[nl.Trainer] = None, + trainer: nl.Trainer, params_dtype: torch.dtype = torch.bfloat16, max_batch_size: int = 4, random_seed: Optional[int] = None, diff --git a/nemo/collections/llm/inference/base.py b/nemo/collections/llm/inference/base.py index 95da536fde06..0171f1c2dd5c 100644 --- a/nemo/collections/llm/inference/base.py +++ b/nemo/collections/llm/inference/base.py @@ -16,6 +16,7 @@ import nemo.lightning as nl from nemo.lightning import io +from nemo.lightning.ckpt_utils import ckpt_to_context_subdir from nemo.lightning.pytorch.strategies.megatron_strategy import MegatronStrategy from nemo.lightning.pytorch.strategies.utils import RestoreConfig @@ -44,6 +45,7 @@ def _setup_trainer_and_restore_model(path: Path, trainer: nl.Trainer, model: pl. load_optim_state=False, ) trainer.strategy.restore_config = restore_config + trainer.strategy._setup_optimizers = False trainer.ckpt_path = None trainer.strategy.connect(model) if trainer.strategy.launcher is not None: @@ -61,16 +63,22 @@ def _setup_trainer_and_restore_model(path: Path, trainer: nl.Trainer, model: pl. def setup_model_and_tokenizer( path: Path, - trainer: Optional[nl.Trainer] = None, + trainer: nl.Trainer, params_dtype: torch.dtype = torch.bfloat16, inference_batch_times_seqlen_threshold: int = 1000, ) -> tuple[MCoreGPTModel, MCoreTokenizerWrappper]: - model: io.TrainerContext = io.load_context(path=path, subpath="model") - trainer = trainer or io.load_context(path=path, subpath="trainer") + model: io.TrainerContext = io.load_context(path=ckpt_to_context_subdir(path), subpath="model") _setup_trainer_and_restore_model(path=path, trainer=trainer, model=model) # This is to get the MCore model required in GPTInferenceWrapper. 
- mcore_model = model.module.module.module + mcore_model = model + while mcore_model: + if type(mcore_model) is MCoreGPTModel: + break + mcore_model = getattr(mcore_model, "module", None) + if mcore_model is None or type(mcore_model) is not MCoreGPTModel: + raise ValueError("Exact McoreGPTModel instance not found in the model structure.") + inference_wrapped_model = GPTInferenceWrapper( mcore_model, InferenceWrapperConfig( From a8bd34945611586899c699a3c19ed33e885db39e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 24 Oct 2024 15:48:08 +0200 Subject: [PATCH 2/8] Add a build option to load_context (#10713) (#11023) * Add a build option to load_context * Adding test * Trying to fix failing CPU test * cherry-pick fix --------- Signed-off-by: Marc Romeijn Signed-off-by: Alexandros Koumparoulis Co-authored-by: Marc Romeyn Co-authored-by: Alexandros Koumparoulis --- nemo/lightning/io/api.py | 18 ++++++++++++++---- nemo/lightning/io/mixin.py | 5 ++++- tests/lightning/io/test_api.py | 13 +++++++------ 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/nemo/lightning/io/api.py b/nemo/lightning/io/api.py index 643b671d1d85..7a702edb7f21 100644 --- a/nemo/lightning/io/api.py +++ b/nemo/lightning/io/api.py @@ -1,5 +1,6 @@ from pathlib import Path -from typing import Callable, Optional, Type +from typing import Callable, Optional, Type, overload +import fiddle as fdl import pytorch_lightning as pl @@ -7,14 +8,23 @@ from nemo.lightning.io.pl import TrainerContext -def load_context(path: Path, subpath: Optional[str] = None) -> TrainerContext: +@overload +def load_context(path: Path, subpath: Optional[str] = None, build: bool = True) -> TrainerContext: ... + + +@overload +def load_context(path: Path, subpath: Optional[str] = None, build: bool = False) -> fdl.Config[TrainerContext]: ... + + +def load_context(path: Path, subpath: Optional[str] = None, build: bool = True): """ Loads a TrainerContext from a json-file or directory. Args: path (Path): The path to the json-file or directory containing 'io.json'. subpath (Optional[str]): Subpath to selectively load only specific objects inside the TrainerContext. Defaults to None. - + build (bool): Whether to build the TrainerContext. Defaults to True. + Otherwise, the TrainerContext is returned as a Config[TrainerContext] object. Returns ------- TrainerContext: The loaded TrainerContext instance. @@ -27,7 +37,7 @@ def load_context(path: Path, subpath: Optional[str] = None) -> TrainerContext: checkpoint: TrainerContext = load_ckpt("/path/to/checkpoint", subpath="model.config") """ - return load(path, output_type=TrainerContext, subpath=subpath) + return load(path, output_type=TrainerContext, subpath=subpath, build=build) def model_importer(target: Type[ConnectorMixin], ext: str) -> Callable[[Type[ConnT]], Type[ConnT]]: diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py index ee53fe010145..2d1162bb2156 100644 --- a/nemo/lightning/io/mixin.py +++ b/nemo/lightning/io/mixin.py @@ -634,7 +634,7 @@ def _artifact_transform_load(cfg: fdl.Config, path: Path): pass -def load(path: Path, output_type: Type[CkptType] = Any, subpath: Optional[str] = None) -> CkptType: +def load(path: Path, output_type: Type[CkptType] = Any, subpath: Optional[str] = None, build: bool = True) -> CkptType: """ Loads a configuration from a pickle file and constructs an object of the specified type. 
@@ -698,4 +698,7 @@ def load(path: Path, output_type: Type[CkptType] = Any, subpath: Optional[str] = config = serialization.Deserialization(json_config).result _artifact_transform_load(config, path) + if not build: + return config + return fdl.build(config) diff --git a/tests/lightning/io/test_api.py b/tests/lightning/io/test_api.py index 386bd5b5fdab..a4d458cef17b 100644 --- a/tests/lightning/io/test_api.py +++ b/tests/lightning/io/test_api.py @@ -16,6 +16,7 @@ from functools import partial from pathlib import Path +import fiddle as fdl import pytest import yaml from pytorch_lightning.loggers import TensorBoardLogger @@ -69,9 +70,9 @@ def test_reload_ckpt(self, tmpdir, partial_function_with_pos_and_key_args): loaded_func = loaded.extra["dummy"] assert loaded_func(b=2) == partial_function_with_pos_and_key_args(b=2) - model_yaml = Path(tmpdir) / "model.yaml" - assert model_yaml.exists() - - observed = yaml.safe_load(model_yaml.read_text()) - expected = yaml.safe_load((Path(ARTIFACTS_DIR) / "model.yaml").read_text()) - assert observed.keys() == expected.keys() + config = io.load_context(tmpdir, build=False) + assert isinstance(config, fdl.Config) + assert config.model.config.seq_length == ckpt.model.config.seq_length + assert config.model.tokenizer.vocab_file.startswith(str(tmpdir)) + assert config.model.tokenizer.merges_file.startswith(str(tmpdir)) + assert config.extra["dummy"] == fdl.Partial(dummy_extra, 10, c=15) From 4714421d0c3c76e769bda1680f0999b37c8d8637 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 24 Oct 2024 15:51:04 +0200 Subject: [PATCH 3/8] Change default for always_save_context to True (#11014) (#11020) Signed-off-by: Abhishree Co-authored-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Co-authored-by: Pablo Garay --- nemo/lightning/pytorch/callbacks/model_checkpoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/lightning/pytorch/callbacks/model_checkpoint.py b/nemo/lightning/pytorch/callbacks/model_checkpoint.py index 97df7bfbcfa5..ee46630791ab 100644 --- a/nemo/lightning/pytorch/callbacks/model_checkpoint.py +++ b/nemo/lightning/pytorch/callbacks/model_checkpoint.py @@ -73,7 +73,7 @@ def __init__( train_time_interval: Optional[timedelta] = None, save_on_train_epoch_end: Optional[bool] = False, # Save after training, not after validation save_optim_on_train_end: Optional[bool] = False, - always_save_context: bool = False, + always_save_context: bool = True, save_context_on_train_end: bool = True, **kwargs, ): From e34e04bd44c81d99c27c5f3ced324baaea3548ec Mon Sep 17 00:00:00 2001 From: malay-nagda <164242706+malay-nagda@users.noreply.github.com> Date: Thu, 24 Oct 2024 23:01:32 +0530 Subject: [PATCH 4/8] Performance mode (#10926) and gpt3 175b cli (#10985) (#11021) * Performance mode (#10926) * llama3 performance mode Signed-off-by: Malay Nagda * llama3 performance mode tests Signed-off-by: Malay Nagda * mixtral performance mode Signed-off-by: Malay Nagda * remove unused Signed-off-by: Malay Nagda * nemotron perf mode Signed-off-by: Malay Nagda * 405b, 174b perf mode Signed-off-by: Malay Nagda * perf mode comment Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda --------- Signed-off-by: Malay Nagda Signed-off-by: malay-nagda <164242706+malay-nagda@users.noreply.github.com> Signed-off-by: malay-nagda Co-authored-by: malay-nagda * gpt3 175b cli (#10985) * gpt3 175b cli Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda * 
Apply isort and black reformatting Signed-off-by: malay-nagda --------- Signed-off-by: Malay Nagda Signed-off-by: malay-nagda Signed-off-by: malay-nagda <164242706+malay-nagda@users.noreply.github.com> Co-authored-by: malay-nagda --------- Signed-off-by: Malay Nagda Signed-off-by: malay-nagda <164242706+malay-nagda@users.noreply.github.com> Signed-off-by: malay-nagda Co-authored-by: malay-nagda --- nemo/collections/llm/recipes/__init__.py | 2 + nemo/collections/llm/recipes/gpt3_175b.py | 52 ++++++++----------- nemo/collections/llm/recipes/llama31_405b.py | 52 ++++++++----------- nemo/collections/llm/recipes/llama3_70b.py | 51 +++++++++--------- nemo/collections/llm/recipes/llama3_8b.py | 43 ++++++--------- nemo/collections/llm/recipes/mixtral_8x22b.py | 49 ++++++++--------- nemo/collections/llm/recipes/mixtral_8x7b.py | 49 ++++++++--------- nemo/collections/llm/recipes/nemotron3_8b.py | 36 +++++-------- nemo/collections/llm/recipes/nemotron4_15b.py | 37 +++++-------- nemo/collections/llm/recipes/nemotron4_22b.py | 45 ++++++---------- .../collections/llm/recipes/nemotron4_340b.py | 45 ++++++---------- .../llm/recipes/test_llama3_70b.py | 6 +-- .../collections/llm/recipes/test_llama3_8b.py | 6 +-- tests/lightning/test_nemo_run.py | 1 + 14 files changed, 196 insertions(+), 278 deletions(-) diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py index 555d29f2ceb1..bb5c41bc600c 100644 --- a/nemo/collections/llm/recipes/__init__.py +++ b/nemo/collections/llm/recipes/__init__.py @@ -14,6 +14,7 @@ from nemo.collections.llm.recipes import ( + gpt3_175b, llama3_8b, llama3_8b_16k, llama3_8b_64k, @@ -61,6 +62,7 @@ "nemotron4_22b_16k", "nemotron4_22b_64k", "nemotron4_340b", + "gpt3_175b", "adam", "default_log", "default_resume", diff --git a/nemo/collections/llm/recipes/gpt3_175b.py b/nemo/collections/llm/recipes/gpt3_175b.py index 7e016154aa3e..1abe8a218e82 100644 --- a/nemo/collections/llm/recipes/gpt3_175b.py +++ b/nemo/collections/llm/recipes/gpt3_175b.py @@ -142,7 +142,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for GPT3 175B model. @@ -155,6 +160,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -172,7 +178,7 @@ def pretrain_recipe( Note: This recipe is optimized for the large 175B model and requires significant computational resources. 
""" - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -186,49 +192,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 1, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for GPT3 175B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - CLI usage: - $ nemo llm pretrain --factory "gpt3_175b.pretrain_recipe_performance(num_nodes=64, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="gpt3_175b_perf", num_nodes=64) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. - # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. 
recipe.trainer.callbacks.append( run.Config( diff --git a/nemo/collections/llm/recipes/llama31_405b.py b/nemo/collections/llm/recipes/llama31_405b.py index 45efedc3cbd6..055e9a06fcba 100644 --- a/nemo/collections/llm/recipes/llama31_405b.py +++ b/nemo/collections/llm/recipes/llama31_405b.py @@ -144,7 +144,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Llama3.1 405B model. @@ -157,6 +162,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -174,7 +180,7 @@ def pretrain_recipe( Note: This recipe is optimized for the large 405B model and requires significant computational resources. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -188,49 +194,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 1, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Llama3.1 405B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - CLI usage: - $ nemo llm pretrain --factory "llama31_405b.pretrain_recipe_performance(num_nodes=4, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="llama31_405b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. 
- # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. recipe.trainer.callbacks.append( run.Config( diff --git a/nemo/collections/llm/recipes/llama3_70b.py b/nemo/collections/llm/recipes/llama3_70b.py index ffd4a833885e..b283c68b222b 100644 --- a/nemo/collections/llm/recipes/llama3_70b.py +++ b/nemo/collections/llm/recipes/llama3_70b.py @@ -13,7 +13,7 @@ # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -142,7 +142,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 4, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Llama3 70B model. @@ -155,6 +160,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -172,7 +178,8 @@ def pretrain_recipe( Note: This recipe is optimized for the large 70B model and requires significant computational resources. """ - return run.Partial( + + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -186,45 +193,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 4, num_gpus_per_node: int = 8, fn=pretrain -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Llama3 70B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. 
- Examples: - CLI usage: - $ nemo llm pretrain --factory "llama3_70b.pretrain_recipe_performance(num_nodes=4, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="llama3_70b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. - # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. recipe.trainer.callbacks.append( run.Config( diff --git a/nemo/collections/llm/recipes/llama3_8b.py b/nemo/collections/llm/recipes/llama3_8b.py index dd162ed29914..269eb7865dcf 100644 --- a/nemo/collections/llm/recipes/llama3_8b.py +++ b/nemo/collections/llm/recipes/llama3_8b.py @@ -143,7 +143,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Llama3 8B model. @@ -156,6 +161,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -174,7 +180,7 @@ def pretrain_recipe( For more details on pre-training LLMs with NeMo, see the pre-training guide in the `examples/llm/pretrain/` directory. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -188,44 +194,29 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 1, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Llama3 8B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. 
It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - $ nemo llm pretrain --factory llama3_8b_optimized - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="llama3_8b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - recipe.trainer.callbacks.append( run.Config( MegatronCommOverlapCallback, diff --git a/nemo/collections/llm/recipes/mixtral_8x22b.py b/nemo/collections/llm/recipes/mixtral_8x22b.py index 028ad25ad794..fd065a540cbf 100644 --- a/nemo/collections/llm/recipes/mixtral_8x22b.py +++ b/nemo/collections/llm/recipes/mixtral_8x22b.py @@ -13,7 +13,7 @@ # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -146,7 +146,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 8, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 16, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Mixtral 8x22B model. @@ -159,6 +164,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -173,7 +179,7 @@ def pretrain_recipe( >>> recipe = pretrain_recipe(name="mixtral_pretrain", num_nodes=2) >>> print(recipe) """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -185,44 +191,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -def pretrain_recipe_performance( - name: str = "default", dir: Optional[str] = None, num_nodes: int = 8, num_gpus_per_node: int = 8, fn=pretrain -) -> Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Mixtral 8x22B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. 
- fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: Partial: Partial configuration for performance-optimized pre-training. - Examples: - CLI usage: - $ nemo llm pretrain --factory "mixtral_8x22b.pretrain_recipe_performance(num_nodes=8, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="mixtral_8x22b_perf", num_nodes=8) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. - # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. recipe.trainer.callbacks.extend( [ diff --git a/nemo/collections/llm/recipes/mixtral_8x7b.py b/nemo/collections/llm/recipes/mixtral_8x7b.py index ead3d03edeac..1933f7768382 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b.py @@ -13,7 +13,7 @@ # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -142,7 +142,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 2, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 8, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Mixtral 8x7B model. @@ -155,6 +160,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. 
Returns: @@ -169,7 +175,7 @@ def pretrain_recipe( >>> recipe = pretrain_recipe(name="mixtral_8x7b_pretrain", num_nodes=2) >>> print(recipe) """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -181,44 +187,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -def pretrain_recipe_performance( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 8, num_gpus_per_node: int = 8, fn=pretrain -) -> Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Mixtral 8x7B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: Partial: Partial configuration for performance-optimized pre-training. - Examples: - CLI usage: - $ nemo llm pretrain --factory "mixtral_8x7b.pretrain_recipe_performance(num_nodes=8, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="mixtral_8x7b_perf", num_nodes=8) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. - # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. recipe.trainer.callbacks.extend( [ diff --git a/nemo/collections/llm/recipes/nemotron3_8b.py b/nemo/collections/llm/recipes/nemotron3_8b.py index c1563e3beb15..156fc543725b 100644 --- a/nemo/collections/llm/recipes/nemotron3_8b.py +++ b/nemo/collections/llm/recipes/nemotron3_8b.py @@ -83,6 +83,7 @@ def pretrain_recipe( constant_steps=0, min_lr=3.0e-5, max_lr=3e-4, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -118,6 +119,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. 
+ performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -135,7 +137,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -174,43 +176,29 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 1, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Nemotron3 8B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - $ nemo llm pretrain --factory nemotron3_8b_optimized - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="nemotron3_8b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) recipe.trainer.callbacks.append( run.Config( diff --git a/nemo/collections/llm/recipes/nemotron4_15b.py b/nemo/collections/llm/recipes/nemotron4_15b.py index 9f184a92d94b..16ae7b2b1e79 100644 --- a/nemo/collections/llm/recipes/nemotron4_15b.py +++ b/nemo/collections/llm/recipes/nemotron4_15b.py @@ -80,6 +80,7 @@ def pretrain_recipe( constant_steps=0, min_lr=4.5e-5, max_lr=4.5e-5, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -115,6 +116,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -132,7 +134,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. 
""" - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -171,44 +173,29 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 8, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Nemotron4 15B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - $ nemo llm pretrain --factory nemotron4_15b_optimized - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="nemotron4_15b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - recipe.trainer.callbacks.append( run.Config( MegatronCommOverlapCallback, diff --git a/nemo/collections/llm/recipes/nemotron4_22b.py b/nemo/collections/llm/recipes/nemotron4_22b.py index 4fb697c006fc..a20afedfea56 100644 --- a/nemo/collections/llm/recipes/nemotron4_22b.py +++ b/nemo/collections/llm/recipes/nemotron4_22b.py @@ -80,6 +80,7 @@ def pretrain_recipe( constant_steps=0, min_lr=1e-5, max_lr=1e-4, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -115,6 +116,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -132,7 +134,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -171,48 +173,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 8, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Nemotron4 22B model. 
- This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - $ nemo llm pretrain --factory nemotron4_22b_optimized - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="nemotron4_22b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. - # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. recipe.trainer.callbacks.append( run.Config( diff --git a/nemo/collections/llm/recipes/nemotron4_340b.py b/nemo/collections/llm/recipes/nemotron4_340b.py index 62d25641f48c..31380f71c939 100644 --- a/nemo/collections/llm/recipes/nemotron4_340b.py +++ b/nemo/collections/llm/recipes/nemotron4_340b.py @@ -83,6 +83,7 @@ def pretrain_recipe( constant_steps=0, min_lr=1.0e-5, max_lr=1.0e-4, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -118,6 +119,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -135,7 +137,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. 
""" - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -174,48 +176,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 16, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Nemotron4 340B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - $ nemo llm pretrain --factory nemotron4_340b_optimized - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="nemotron4_340b_perf", num_nodes=16) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. - # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. 
recipe.trainer.callbacks.append( run.Config( diff --git a/tests/collections/llm/recipes/test_llama3_70b.py b/tests/collections/llm/recipes/test_llama3_70b.py index cc77ec921de7..d47b674b7b70 100644 --- a/tests/collections/llm/recipes/test_llama3_70b.py +++ b/tests/collections/llm/recipes/test_llama3_70b.py @@ -79,10 +79,8 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_pretrain_recipe_performance(self, recipe_module): - recipe = recipe_module.pretrain_recipe_performance( - name="test_perf", dir="/tmp", num_nodes=4, num_gpus_per_node=8 - ) + def test_pretrain_performance_optimizations(self, recipe_module): + recipe = recipe_module.pretrain_recipe(performance_mode=True) assert any( isinstance(cb, run.Config) and cb.__fn_or_cls__ == MegatronCommOverlapCallback for cb in recipe.trainer.callbacks diff --git a/tests/collections/llm/recipes/test_llama3_8b.py b/tests/collections/llm/recipes/test_llama3_8b.py index df4f05eec2ae..88fab6d6325a 100644 --- a/tests/collections/llm/recipes/test_llama3_8b.py +++ b/tests/collections/llm/recipes/test_llama3_8b.py @@ -90,10 +90,8 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_pretrain_recipe_performance(self, recipe_module): - recipe = recipe_module.pretrain_recipe_performance( - name="test_perf", dir="/tmp", num_nodes=1, num_gpus_per_node=8 - ) + def test_pretrain_performance_optimizations(self, recipe_module): + recipe = recipe_module.pretrain_recipe(performance_mode=True) assert any(cb.__fn_or_cls__.__name__ == "MegatronCommOverlapCallback" for cb in recipe.trainer.callbacks) def test_trainer_parallelism_options(self, recipe_module): diff --git a/tests/lightning/test_nemo_run.py b/tests/lightning/test_nemo_run.py index c708c71ea8a1..aa73df50613a 100644 --- a/tests/lightning/test_nemo_run.py +++ b/tests/lightning/test_nemo_run.py @@ -36,6 +36,7 @@ # ("nemotron4_22b_64k", "pretrain_recipe", "nemotron4_22b_64k_pretrain"), # ("nemotron4_340b", "pretrain_recipe", "nemotron4_340b_pretrain"), # ("nemotron4_340b", "finetune_recipe", "nemotron4_340b_finetune"), + ("gpt3_175b", "pretrain_recipe", "gpt3_175b_pretrain"), ], ) def test_recipes_with_nemo_run(module, recipe, name, tmpdir, monkeypatch): From 395c502422198771828f87f73b142b551e69403e Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Fri, 25 Oct 2024 09:16:15 -0700 Subject: [PATCH 5/8] Fix _strategy_lib tests (#11033) (#11039) * fix world size and don't mock * cleanup global state * check app state instead * fix syntax nemo logger test --------- Signed-off-by: Maanu Grover Co-authored-by: Maanu Grover <109391026+maanug-nv@users.noreply.github.com> --- tests/conftest.py | 8 ++++++ tests/lightning/test_nemo_logger.py | 3 +- tests/lightning/test_strategy_lib.py | 42 +++++++++++++++++----------- 3 files changed, 35 insertions(+), 18 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 6298ed051c68..118e978e63c7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -25,6 +25,8 @@ import pytest +from nemo.utils.metaclasses import Singleton + # Those variables probably should go to main NeMo configuration file (config.yaml). 
__TEST_DATA_FILENAME = "test_data.tar.gz" __TEST_DATA_URL = "https://github.com/NVIDIA/NeMo/releases/download/v1.0.0rc1/" @@ -115,6 +117,11 @@ def cleanup_local_folder(): rmtree('./nemo_experiments', ignore_errors=True) +@pytest.fixture(autouse=True) +def reset_singletons(): + Singleton._Singleton__instances = {} + + @pytest.fixture(scope="session") def test_data_dir(): """ @@ -173,6 +180,7 @@ def k2_cuda_is_enabled(k2_is_appropriate) -> Tuple[bool, str]: return k2_is_appropriate import torch # noqa: E402 + from nemo.core.utils.k2_guard import k2 # noqa: E402 if torch.cuda.is_available() and k2.with_cuda: diff --git a/tests/lightning/test_nemo_logger.py b/tests/lightning/test_nemo_logger.py index 3f8f7a1e0bb8..a5a5ec32c886 100644 --- a/tests/lightning/test_nemo_logger.py +++ b/tests/lightning/test_nemo_logger.py @@ -115,7 +115,8 @@ def test_resume(self, trainer, tmp_path): resume_ignore_no_checkpoint=True, ).setup(trainer) - path = Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints").mkdir(parents=True) + path = Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints") + path.mkdir(parents=True) # Error because checkpoints do not exist in folder with pytest.raises(NotFoundError): nl.AutoResume( diff --git a/tests/lightning/test_strategy_lib.py b/tests/lightning/test_strategy_lib.py index 36143cedb8c4..61727d5612e7 100644 --- a/tests/lightning/test_strategy_lib.py +++ b/tests/lightning/test_strategy_lib.py @@ -57,8 +57,10 @@ def configure_model(self): assert model.config.pipeline_dtype == torch.float32 -@patch('nemo.collections.nlp.modules.common.megatron.megatron_init.initialize_model_parallel_for_nemo') -def test_init_parallel_ranks(mock_initialize_model_parallel) -> None: +def test_init_parallel_ranks() -> None: + from megatron.core.num_microbatches_calculator import destroy_num_microbatches_calculator + from megatron.core.parallel_state import destroy_model_parallel + from nemo.utils import AppState app_state = AppState() @@ -80,27 +82,33 @@ def test_init_parallel_ranks(mock_initialize_model_parallel) -> None: mock_parallel_config.pipeline_model_parallel_split_rank = None _strategy_lib.init_parallel_ranks( - world_size=2, + world_size=24, global_rank=1, local_rank=0, parallel_config=mock_parallel_config, seed=1234, fp8=False, ) - mock_initialize_model_parallel.assert_called_once_with( - world_size=2, - global_rank=1, - local_rank=0, - tensor_model_parallel_size=2, - pipeline_model_parallel_size=3, - virtual_pipeline_model_parallel_size=4, - context_parallel_size=2, - expert_model_parallel_size=2, - seed=1234, - pipeline_model_parallel_split_rank=None, - use_fp8=False, - init_mpi_proc_group=False, - ) + expected_app_state = { + "world_size": 24, + "global_rank": 1, + "local_rank": 0, + "tensor_model_parallel_size": 2, + "pipeline_model_parallel_size": 3, + "virtual_pipeline_model_parallel_size": 4, + "context_parallel_size": 2, + "expert_model_parallel_size": 2, + "pipeline_model_parallel_split_rank": None, + "use_fp8": False, + "init_mpi_proc_group": False, + } + for k, v in expected_app_state.items(): + assert hasattr(app_state, k), f"Expected to find {k} in AppState" + app_attr = getattr(app_state, k) + assert app_attr == v, f"{k} in AppState is incorrect, Expected: {v} Actual: {app_attr}" + + destroy_model_parallel() + destroy_num_microbatches_calculator() @patch('torch.distributed.is_initialized', return_value=True) From 4fac15ab3463218966a774204e720fa921f4a3f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 25 Oct 2024 
18:22:04 +0200 Subject: [PATCH 6/8] Update `BaseMegatronSampler` for compatibility with PTL's `_BatchProgress` (#11016) (#11034) * Revert "[NeMo-UX] Use custom `BatchProgress` class which does not restore states (#10383)" This reverts commit b5798ded9f27168db9d7d77cbe4f9da80bf49268. * make megatron sampler return the total number of batches in the dataset --------- Signed-off-by: ashors1 Co-authored-by: Anna Shors <71393111+ashors1@users.noreply.github.com> --- nemo/lightning/data.py | 7 +++---- nemo/lightning/pytorch/strategies/fsdp_strategy.py | 8 -------- nemo/lightning/pytorch/strategies/megatron_strategy.py | 7 ------- nemo/lightning/pytorch/strategies/utils.py | 10 ---------- 4 files changed, 3 insertions(+), 29 deletions(-) diff --git a/nemo/lightning/data.py b/nemo/lightning/data.py index 0f30dfe22851..7051e87841ca 100644 --- a/nemo/lightning/data.py +++ b/nemo/lightning/data.py @@ -287,17 +287,16 @@ def __init__( ) def __len__(self): - num_available_samples: int = self.total_samples - self.consumed_samples if self.global_batch_size is not None: if self.drop_last: - num_global_batches = num_available_samples // self.global_batch_size + num_global_batches = self.total_samples // self.global_batch_size else: - num_global_batches = (num_available_samples + self.global_batch_size - 1) // self.global_batch_size + num_global_batches = (self.total_samples + self.global_batch_size - 1) // self.global_batch_size # return len of dataloader in terms of micro batches to avoid discrepancy between len of dataloader and # num of batches fetched (as training step fetches in terms of micro batches) return num_global_batches * (self.global_batch_size // self.micro_batch_times_data_parallel_size) else: - return (num_available_samples - 1) // self.micro_batch_times_data_parallel_size + 1 + return (self.total_samples - 1) // self.micro_batch_times_data_parallel_size + 1 @abc.abstractmethod def __iter__(self): ... 
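The effect of the `__len__` change above is that the sampler now reports the number of batches in the full dataset rather than the batches remaining after `consumed_samples`, which is what PTL's restored `_BatchProgress` expects when resuming. A minimal, self-contained sketch of the new computation follows (attribute names mirror the sampler's; this is an illustration for reference, not code from the patch):

from typing import Optional

def sampler_len(total_samples: int,
                micro_batch_size: int,
                data_parallel_size: int,
                global_batch_size: Optional[int] = None,
                drop_last: bool = True) -> int:
    # Mirrors BaseMegatronSampler.__len__ after this patch: length depends only on
    # total_samples, never on consumed_samples.
    micro_batch_times_data_parallel_size = micro_batch_size * data_parallel_size
    if global_batch_size is not None:
        if drop_last:
            num_global_batches = total_samples // global_batch_size
        else:
            num_global_batches = (total_samples + global_batch_size - 1) // global_batch_size
        # Reported in micro batches so it matches what the training step fetches.
        return num_global_batches * (global_batch_size // micro_batch_times_data_parallel_size)
    return (total_samples - 1) // micro_batch_times_data_parallel_size + 1

# Example: 1000 samples, micro batch 2, data parallel 4, global batch 32
# -> 31 global batches x 4 micro batches each = 124, regardless of consumed_samples.
assert sampler_len(1000, micro_batch_size=2, data_parallel_size=4, global_batch_size=32) == 124
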
diff --git a/nemo/lightning/pytorch/strategies/fsdp_strategy.py b/nemo/lightning/pytorch/strategies/fsdp_strategy.py index 5f24d988396b..d34d1716e6b4 100644 --- a/nemo/lightning/pytorch/strategies/fsdp_strategy.py +++ b/nemo/lightning/pytorch/strategies/fsdp_strategy.py @@ -35,7 +35,6 @@ from nemo.lightning import io from nemo.lightning.pytorch.strategies.utils import ( - _MegatronBatchProgress, ckpt_to_dir, create_checkpoint_io, fix_progress_bar, @@ -74,7 +73,6 @@ def __init__( ckpt_load_optimizer: bool = True, ckpt_save_optimizer: bool = True, data_sampler=None, - overwrite_batch_progress: bool = True, **kwargs, ): super().__init__(auto_wrap_policy=auto_wrap_policy, state_dict_type=state_dict_type, **kwargs) @@ -82,7 +80,6 @@ def __init__( self.data_sampler = data_sampler self.ckpt_load_optimizer = ckpt_load_optimizer self.ckpt_save_optimizer = ckpt_save_optimizer - self.overwrite_batch_progress = overwrite_batch_progress @override def setup_environment(self) -> None: @@ -95,11 +92,6 @@ def setup(self, trainer: pl.Trainer) -> None: self.trainer = trainer setup_data_sampler(self.trainer) fix_progress_bar(trainer) - - trainer_fn = trainer.state.fn - if trainer_fn == TrainerFn.FITTING and self.overwrite_batch_progress: - trainer.fit_loop.epoch_loop.batch_progress = _MegatronBatchProgress() - super().setup(trainer) def _get_loss_reduction(self, step_type: str): diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py index e68b67c86f2d..839f1249cbb1 100644 --- a/nemo/lightning/pytorch/strategies/megatron_strategy.py +++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py @@ -62,7 +62,6 @@ from nemo.lightning.pytorch.callbacks import ModelTransform from nemo.lightning.pytorch.strategies.utils import ( RestoreConfig, - _MegatronBatchProgress, ckpt_to_dir, create_checkpoint_io, fix_progress_bar, @@ -155,8 +154,6 @@ class MegatronStrategy(DDPStrategy, io.IOMixin): that prints the metrics to stdout. Suitable for non-interactive settings. progress_interval (int): How frequently to print progress to stdout. Only used when replace_progress_bar is True. - overwrite_batch_progress (bool): Whether to overwrite _BatchProgress class used in PTL by default with - _MegatronBatchProgress. This should be True whenever you're using a Megatron-based dataset. **kwargs: Additional keyword arguments. 
Note: @@ -199,7 +196,6 @@ def __init__( replace_progress_bar: bool = True, progress_interval: int = 1, restore_config: Optional[RestoreConfig] = None, - overwrite_batch_progress: bool = True, **kwargs, ) -> None: super().__init__( @@ -240,7 +236,6 @@ def __init__( self.replace_progress_bar = replace_progress_bar self.progress_interval = progress_interval - self.overwrite_batch_progress = overwrite_batch_progress self.restore_config = restore_config @@ -338,8 +333,6 @@ def setup(self, trainer: pl.Trainer) -> None: self.configure_ddp() trainer.fit_loop.epoch_loop.automatic_optimization = _MegatronAutomaticOptimization(trainer) - if self.overwrite_batch_progress: - trainer.fit_loop.epoch_loop.batch_progress = _MegatronBatchProgress() import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD diff --git a/nemo/lightning/pytorch/strategies/utils.py b/nemo/lightning/pytorch/strategies/utils.py index 415392f2bef0..a7f0e7339def 100644 --- a/nemo/lightning/pytorch/strategies/utils.py +++ b/nemo/lightning/pytorch/strategies/utils.py @@ -25,12 +25,10 @@ from megatron.core.dist_checkpointing.strategies.torch import sharded_tensor_to_torch_sharded_tensor from megatron.core.transformer.utils import _get_extra_state_offsets from pytorch_lightning.callbacks import TQDMProgressBar -from pytorch_lightning.loops.progress import _BatchProgress from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO from torch.distributed._sharded_tensor import ShardedTensor as TorchShardedTensor from torch.distributed._tensor import DTensor, Replicate, Shard from torch.distributed.device_mesh import DeviceMesh -from typing_extensions import override from nemo.lightning import _strategy_lib from nemo.lightning.io.pl import MegatronCheckpointIO @@ -48,14 +46,6 @@ class RestoreConfig: load_artifacts: bool = True -class _MegatronBatchProgress(_BatchProgress): - @override - def load_state_dict(self, state_dict: dict) -> None: - ## in megatron, we want to start the batch progress over when - ## restoring from a checkpoint - return - - def setup_parallel_ranks(strategy: pl.strategies.Strategy): from megatron.core.model_parallel_config import ModelParallelConfig From 5dd81f09181298a2dcaa4eba9a0949b5d871e269 Mon Sep 17 00:00:00 2001 From: Shriya Rishab <69161273+ShriyaPalsamudram@users.noreply.github.com> Date: Fri, 25 Oct 2024 13:36:03 -0400 Subject: [PATCH 7/8] Change dist ckpt defaults (#10913) (#11031) * Enable ckpt features by default (async ckpt), ckpt every 15mins and reduce preemption time to 1min * fix ssm tests * Make note that ckpt_async_save is disabled for SSMs * Enable async ckpt for SSMs with fix * Disable async ckpt in the peft test as it is a known bug, add note. 
* Fix failing unit tests * Ashors/peft async ckpt (#11010) * [WIP] prototype for supporting async checkpointing with peft * Enable async ckpt for the peft test * Fix peft setup test --------- Signed-off-by: Shriya Palsamudram Signed-off-by: ashors1 Co-authored-by: ataghibakhsh Co-authored-by: Pablo Garay --- nemo/collections/llm/recipes/log/default.py | 3 +- nemo/lightning/io/connector.py | 2 ++ nemo/lightning/pytorch/callbacks/peft.py | 28 ++++++++++++++++--- .../pytorch/strategies/megatron_strategy.py | 8 +++--- nemo/lightning/pytorch/strategies/utils.py | 4 ++- nemo/lightning/run/plugins.py | 4 +-- .../collections/llm/test_mnist_model_nemo2.py | 1 + .../lightning/pytorch/callbacks/test_peft.py | 4 ++- tests/lightning/test_dist_ckpt.py | 1 + 9 files changed, 42 insertions(+), 13 deletions(-) diff --git a/nemo/collections/llm/recipes/log/default.py b/nemo/collections/llm/recipes/log/default.py index 93bd9f9470fa..d83580a1a543 100644 --- a/nemo/collections/llm/recipes/log/default.py +++ b/nemo/collections/llm/recipes/log/default.py @@ -13,6 +13,7 @@ # limitations under the License. +from datetime import timedelta from typing import Optional from nemo_run import Config, cli @@ -50,7 +51,7 @@ def default_log( nl.ModelCheckpoint, save_last=True, save_top_k=10, - every_n_train_steps=200, + train_time_interval=Config(timedelta, minutes=15), filename="{model_name}--{val_loss:.2f}-{step}-{consumed_samples}", ) diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py index 3ccbef536b99..41ce2d8f1117 100644 --- a/nemo/lightning/io/connector.py +++ b/nemo/lightning/io/connector.py @@ -183,6 +183,8 @@ def nemo_save(self, output_path: Path, trainer: pl.Trainer, dump_io: bool = True output_path = Path(output_path) output_path.mkdir(parents=True, exist_ok=True) trainer.save_checkpoint(ckpt_to_weights_subdir(output_path)) + if getattr(trainer.strategy, "async_save", False): + trainer.strategy.checkpoint_io.maybe_finalize_save_checkpoint(blocking=True) from nemo.lightning.io.pl import TrainerContext from nemo.utils.get_rank import is_global_rank_zero diff --git a/nemo/lightning/pytorch/callbacks/peft.py b/nemo/lightning/pytorch/callbacks/peft.py index 1e3cde0bbcde..15d0dd8ac2ab 100644 --- a/nemo/lightning/pytorch/callbacks/peft.py +++ b/nemo/lightning/pytorch/callbacks/peft.py @@ -14,6 +14,7 @@ import json from abc import ABC, abstractmethod +from functools import partial from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple @@ -27,6 +28,7 @@ from nemo.lightning.io.pl import ckpt_to_dir from nemo.lightning.pytorch.callbacks.model_transform import ModelTransform from nemo.utils import logging +from nemo.utils.callbacks.dist_ckpt_io import AsyncCompatibleCheckpointIO if TYPE_CHECKING: from megatron.core.dist_checkpointing.mapping import ShardedStateDict @@ -97,11 +99,28 @@ def __call__(self, model: nn.Module) -> nn.Module: return model def setup(self, trainer: pl.Trainer, pl_module: pl.LightningModule, stage: str) -> None: + from nemo.lightning.pytorch.strategies.utils import create_checkpoint_io + super().setup(trainer, pl_module, stage=stage) trainer.strategy.trainer = trainer - self.wrapped_io = WrappedAdapterIO(trainer.strategy.checkpoint_io, self) - trainer.strategy._checkpoint_io = self.wrapped_io + wrapped_io = partial(WrappedAdapterIO, peft=self) + ckpt_io_kwargs = { + "save_ckpt_format": trainer.strategy.save_ckpt_format, + "async_save": trainer.strategy.async_save, + "torch_dist_multiproc": trainer.strategy.torch_dist_multiproc, + 
"assume_constant_structure": trainer.strategy.assume_constant_structure, + "parallel_save": trainer.strategy.parallel_save, + "parallel_save_within_dp": trainer.strategy.parallel_save_within_dp, + "parallel_load": trainer.strategy.parallel_load, + "load_directly_on_device": trainer.strategy.load_directly_on_device, + } + trainer.strategy._checkpoint_io = create_checkpoint_io(wrapping_ckpt_io=wrapped_io, **ckpt_io_kwargs) + self.wrapped_io = ( + trainer.strategy._checkpoint_io._checkpoint_io + if trainer.strategy.async_save + else trainer.strategy._checkpoint_io + ) trainer.strategy._init_model_parallel = False trainer.strategy._setup_optimizers = False @@ -257,7 +276,7 @@ def load_state_dict(self, state_dict, strict=True): self.adapter.load_state_dict(adapter_state_dict, strict) -class WrappedAdapterIO(_WrappingCheckpointIO): +class WrappedAdapterIO(_WrappingCheckpointIO, AsyncCompatibleCheckpointIO): peft: Optional[PEFT] = None model_ckpt_path: Optional[Path] = None adapter_ckpt_path: Optional[Path] = None @@ -273,7 +292,7 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio checkpoint['sharded_state_dict'] = dict( filter(lambda item: self.peft.adapter_key_filter(item[0]), checkpoint['sharded_state_dict'].items()) ) - self.checkpoint_io.save_checkpoint(checkpoint, path, storage_options=storage_options) + request = self.checkpoint_io.save_checkpoint(checkpoint, path, storage_options=storage_options) from nemo.utils.get_rank import is_global_rank_zero @@ -282,6 +301,7 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio adapter_meta_path = ckpt_to_dir(path) / _ADAPTER_META_FILENAME with open(adapter_meta_path, "w") as f: json.dump(metadata, f) + return request @override def load_checkpoint( diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py index 839f1249cbb1..342c437f0e32 100644 --- a/nemo/lightning/pytorch/strategies/megatron_strategy.py +++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py @@ -129,7 +129,7 @@ class MegatronStrategy(DDPStrategy, io.IOMixin): save_ckpt_format (str): Distributed checkpoint format to use for checkpoint saving. Should be one of 'torch_dist' or 'zarr'. Defaults to 'torch_dist'. ckpt_async_save (bool): Whether to save checkpoints asynchronously to reduce checkpointing overhead. - Defaults to False. + Defaults to True. ckpt_torch_dist_multiproc (int): Number of extra processes per rank used during ckpt save with PyTorch distributed format. Defaults to None. ckpt_assume_constant_structure (bool): Allows caching some computation across checkpoint saves. @@ -139,7 +139,7 @@ class MegatronStrategy(DDPStrategy, io.IOMixin): ckpt_parallel_save_within_dp (bool): If true, save will be parallelized only within a DP group (whole world otherwise), which might slightly reduce the save overhead. Defaults to False. ckpt_parallel_load (bool): If true, each worker will load part of the dist checkpoint - and exchange with NCCL. Might use some extra GPU memory. Defaults to False. + and exchange with NCCL. Might use some extra GPU memory. Defaults to True. ckpt_parallel_save_optim (bool): Parallel save/load of a DistributedOptimizer. 'True' allows performant save and reshardable checkpoints. Set to 'False' only in order to minimize the number of checkpoint files. 
@@ -183,12 +183,12 @@ def __init__( lazy_init: bool = False, pipeline_dtype: Optional[torch.dtype] = None, save_ckpt_format: str = "torch_dist", - ckpt_async_save: bool = False, + ckpt_async_save: bool = True, ckpt_torch_dist_multiproc: int = None, ## TODO(ashors): put elsewhere? ckpt_assume_constant_structure: bool = False, ckpt_parallel_save: bool = True, ckpt_parallel_save_within_dp: bool = False, - ckpt_parallel_load: bool = False, + ckpt_parallel_load: bool = True, ckpt_parallel_save_optim: bool = True, ckpt_load_directly_on_device: bool = True, setup_optimizers: bool = True, diff --git a/nemo/lightning/pytorch/strategies/utils.py b/nemo/lightning/pytorch/strategies/utils.py index a7f0e7339def..43a5a9243aa5 100644 --- a/nemo/lightning/pytorch/strategies/utils.py +++ b/nemo/lightning/pytorch/strategies/utils.py @@ -117,8 +117,10 @@ def ckpt_to_dir(filepath: Union[str, Path]) -> Path: return filepath -def create_checkpoint_io(**kwargs): +def create_checkpoint_io(wrapping_ckpt_io=None, **kwargs): checkpoint_io = MegatronCheckpointIO(**kwargs) + if wrapping_ckpt_io: + checkpoint_io = wrapping_ckpt_io(checkpoint_io) if kwargs.get("async_save", False): checkpoint_io = AsyncFinalizableCheckpointIO(checkpoint_io) diff --git a/nemo/lightning/run/plugins.py b/nemo/lightning/run/plugins.py index dfcc7c1650ce..c9a38c5979ca 100644 --- a/nemo/lightning/run/plugins.py +++ b/nemo/lightning/run/plugins.py @@ -52,14 +52,14 @@ class PreemptionPlugin(run.Plugin): preempt_time (int): The time, in seconds, before the task's time limit at which the executor will send a SIGTERM preemption signal. This allows tasks to be gracefully stopped before reaching their time limit, reducing waste and - promoting fair resource usage. The default value is 300 seconds (5 minutes). + promoting fair resource usage. The default value is 60 seconds (1 minute). This is only supported for ``run.SlurmExecutor``. callbacks (list[run.Config[Callback]]): A list of callback configurations that the plugin will merge with the task's existing callbacks. By default, the list includes NeMo's preemption callback. 
""" - preempt_time: int = 300 + preempt_time: int = 60 callbacks: list[run.Config[Callback]] = field(default_factory=lambda: [run.Config(PreemptionCallback)]) def setup(self, task: run.Partial | run.Script, executor: run.Executor): diff --git a/tests/collections/llm/test_mnist_model_nemo2.py b/tests/collections/llm/test_mnist_model_nemo2.py index 3f0b804e8bd6..a5c2aa96fc03 100644 --- a/tests/collections/llm/test_mnist_model_nemo2.py +++ b/tests/collections/llm/test_mnist_model_nemo2.py @@ -501,6 +501,7 @@ def run_train_mnist_litautoencoder_with_megatron_strategy_single_gpu(): monitor="val_loss", save_top_k=1, every_n_train_steps=5, + filename="{model_name}--{val_loss:.2f}-{step}-{consumed_samples}", # Enables the .nemo file-like checkpointing where all IOMixins are under SerDe always_save_context=True, ) diff --git a/tests/lightning/pytorch/callbacks/test_peft.py b/tests/lightning/pytorch/callbacks/test_peft.py index 53f9016a3bac..95caca4d2784 100644 --- a/tests/lightning/pytorch/callbacks/test_peft.py +++ b/tests/lightning/pytorch/callbacks/test_peft.py @@ -18,6 +18,7 @@ from pytorch_lightning.trainer.states import TrainerFn from nemo.collections.llm import fn from nemo.lightning.pytorch.callbacks.peft import PEFT, WrappedAdapterIO +from nemo.utils.callbacks.dist_ckpt_io import AsyncFinalizableCheckpointIO class TestPEFT: @@ -48,7 +49,8 @@ def test_peft_setup(self): pl_module.model_transform = peft peft.setup(trainer, pl_module, "fit") - assert isinstance(trainer.strategy._checkpoint_io, WrappedAdapterIO) + assert isinstance(trainer.strategy._checkpoint_io, AsyncFinalizableCheckpointIO) + assert isinstance(trainer.strategy._checkpoint_io._checkpoint_io, WrappedAdapterIO) assert peft.model_transform is not None assert peft._needs_to_call is True diff --git a/tests/lightning/test_dist_ckpt.py b/tests/lightning/test_dist_ckpt.py index e6ea381fdf0b..5deb8085aa30 100644 --- a/tests/lightning/test_dist_ckpt.py +++ b/tests/lightning/test_dist_ckpt.py @@ -35,6 +35,7 @@ def set_env(): def _get_strategy(): strategy = nl.MegatronStrategy( enable_nemo_ckpt_io=False, + ckpt_async_save=False, ) return strategy From fe4d09b63e21be0e935dd6ed5fb9a25b231c3690 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Fri, 25 Oct 2024 11:49:05 -0700 Subject: [PATCH 8/8] Fix pip install (#11026) (#11028) * Move AutoTokenizer inline * Move einops to common requirements * Move AutoTokenizer import to top-level again in fine_tuning * Move megatron init inside nemo.lightning * Make megatron_lazy_init_context work when transformer-engine is not installed * Only import get_nmt_tokenizer when needed * Apply isort and black reformatting --------- Signed-off-by: Marc Romeyn Signed-off-by: marcromeyn Co-authored-by: Marc Romeyn Co-authored-by: marcromeyn --- nemo/collections/llm/gpt/data/mock.py | 8 +- nemo/lightning/_strategy_lib.py | 28 +- nemo/lightning/megatron_init.py | 413 ++++++++++++++++++++++++++ requirements/requirements_common.txt | 1 + requirements/requirements_nlp.txt | 1 - 5 files changed, 441 insertions(+), 10 deletions(-) create mode 100644 nemo/lightning/megatron_init.py diff --git a/nemo/collections/llm/gpt/data/mock.py b/nemo/collections/llm/gpt/data/mock.py index 1c5e01c89bbd..5678597eda0b 100644 --- a/nemo/collections/llm/gpt/data/mock.py +++ b/nemo/collections/llm/gpt/data/mock.py @@ -56,9 +56,13 @@ def __init__( self.persistent_workers = persistent_workers self.create_attention_mask = create_attention_mask or not HAVE_TE - from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + 
if tokenizer is None: + from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + + self.tokenizer = get_nmt_tokenizer("megatron", "GPT2BPETokenizer") + else: + self.tokenizer = tokenizer - self.tokenizer = tokenizer or get_nmt_tokenizer("megatron", "GPT2BPETokenizer") self.data_sampler = MegatronDataSampler( seq_len=self.seq_length, micro_batch_size=micro_batch_size, diff --git a/nemo/lightning/_strategy_lib.py b/nemo/lightning/_strategy_lib.py index d11031feded6..c3adf2a133e5 100644 --- a/nemo/lightning/_strategy_lib.py +++ b/nemo/lightning/_strategy_lib.py @@ -21,6 +21,8 @@ import torch from torch import nn +from nemo.lightning.megatron_init import initialize_model_parallel_for_nemo + NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE = "NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE" @@ -56,7 +58,6 @@ def init_parallel_ranks( seed (int, optional): The seed for random number generation. Defaults to 1234. fp8 (bool, optional): Whether to use fp8 precision for model parameters. Defaults to False. """ - from nemo.collections.nlp.modules.common.megatron.megatron_init import initialize_model_parallel_for_nemo from nemo.utils import AppState app_state = AppState() @@ -161,13 +162,20 @@ def set_model_parallel_attributes(model, parallelism): @contextmanager def megatron_lazy_init_context(config) -> Generator[None, None, None]: - def monkey_patched(c): - return {"device": "meta"} + try: + from megatron.core.extensions import transformer_engine as _te + + original = _te._get_extra_te_kwargs # noqa: SLF001 - from megatron.core.extensions import transformer_engine as _te + def _get_extra_te_kwargs_meta(c): + """Forces device to meta""" + kwargs = original(c) + kwargs['device'] = 'meta' + return kwargs - original = _te._get_extra_te_kwargs # noqa: SLF001 - _te._get_extra_te_kwargs = monkey_patched # noqa: SLF001 + _te._get_extra_te_kwargs = _get_extra_te_kwargs_meta # noqa: SLF001 + except ImportError: + pass _orig_perform_initialization = config.perform_initialization _orig_use_cpu_initialization = config.use_cpu_initialization @@ -177,7 +185,13 @@ def monkey_patched(c): yield - _te._get_extra_te_kwargs = original # noqa: SLF001 + try: + from megatron.core.extensions import transformer_engine as _te + + _te._get_extra_te_kwargs = original # noqa: SLF001 + except ImportError: + pass + config.perform_initialization = _orig_perform_initialization config.use_cpu_initialization = _orig_use_cpu_initialization diff --git a/nemo/lightning/megatron_init.py b/nemo/lightning/megatron_init.py new file mode 100644 index 000000000000..c060d140cb8c --- /dev/null +++ b/nemo/lightning/megatron_init.py @@ -0,0 +1,413 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import random + +import numpy as np +import torch + +from nemo.utils import AppState, logging + +try: + from apex.transformer.log_util import set_logging_level + + HAVE_APEX = True + +except (ImportError, ModuleNotFoundError): + + HAVE_APEX = False + +try: + from megatron.core import tensor_parallel + from megatron.core.parallel_state import ( + RankGenerator, + get_pipeline_model_parallel_rank, + set_expert_model_parallel_rank, + set_expert_model_parallel_world_size, + set_pipeline_model_parallel_rank, + set_pipeline_model_parallel_split_rank, + set_pipeline_model_parallel_world_size, + set_tensor_model_parallel_rank, + set_tensor_model_parallel_world_size, + set_virtual_pipeline_model_parallel_rank, + ) + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False + +try: + from megatron.core.num_microbatches_calculator import ( + ConstantNumMicroBatchesCalculator, + get_current_global_batch_size, + get_micro_batch_size, + get_num_microbatches, + init_num_microbatches_calculator, + ) + + MCORE_MB_CALCULATOR = True + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.microbatches import ConstantNumMicroBatches as ConstantNumMicroBatchesCalculator + from apex.transformer.pipeline_parallel.utils import ( + get_current_global_batch_size, + get_micro_batch_size, + get_num_microbatches, + ) + from apex.transformer.pipeline_parallel.utils import ( + setup_microbatch_calculator as init_num_microbatches_calculator, + ) + + MCORE_MB_CALCULATOR = False + + +try: + from megatron.core.parallel_state import set_virtual_pipeline_model_parallel_world_size + + HAVE_INTERLEAVED = True + +except: + + HAVE_INTERLEAVED = False + + +def initialize_model_parallel_for_nemo( + world_size, + global_rank, + local_rank, + tensor_model_parallel_size=1, + expert_model_parallel_size=1, + pipeline_model_parallel_size=1, + virtual_pipeline_model_parallel_size=None, + pipeline_model_parallel_split_rank=None, + context_parallel_size=1, + micro_batch_size=None, + global_batch_size=None, + rampup_batch_size=None, + use_fp8=False, + init_mpi_proc_group=False, + seed=1234, + apex_transformer_log_level=30, + use_tp_pp_dp_mapping=False, + use_te_rng_tracker=False, +): + + if virtual_pipeline_model_parallel_size is not None and not HAVE_INTERLEAVED: + raise ValueError("set_virtual_pipeline_model_parallel_world_size is needed in megatron-core for interleaved.") + + # updating NeMo globals + app_state = AppState() + app_state.global_rank = global_rank + app_state.world_size = world_size + app_state.local_rank = local_rank + app_state.use_tp_pp_dp_mapping = use_tp_pp_dp_mapping + app_state.expert_model_parallel_size = expert_model_parallel_size + app_state.tensor_model_parallel_size = tensor_model_parallel_size + app_state.pipeline_model_parallel_size = pipeline_model_parallel_size + app_state.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size + app_state.context_parallel_size = context_parallel_size + app_state.use_fp8 = use_fp8 + app_state.init_mpi_proc_group = init_mpi_proc_group + ( + app_state.tensor_model_parallel_rank, + app_state.pipeline_model_parallel_rank, + app_state.expert_model_parallel_rank, + app_state.model_parallel_size, + app_state.data_parallel_size, + app_state.pipeline_model_parallel_split_rank, + app_state.virtual_pipeline_model_parallel_rank, + ) = fake_initialize_model_parallel( + world_size=world_size, + rank=global_rank, + 
tensor_model_parallel_size_=tensor_model_parallel_size, + pipeline_model_parallel_size_=pipeline_model_parallel_size, + virtual_pipeline_model_parallel_size_=virtual_pipeline_model_parallel_size, + pipeline_model_parallel_split_rank_=pipeline_model_parallel_split_rank, + context_parallel_size_=context_parallel_size, + expert_model_parallel_size_=expert_model_parallel_size, + use_tp_pp_dp_mapping=use_tp_pp_dp_mapping, + ) + + # update apex.transformer globals + set_tensor_model_parallel_world_size(app_state.tensor_model_parallel_size) + set_tensor_model_parallel_rank(app_state.tensor_model_parallel_rank) + + set_expert_model_parallel_world_size(app_state.expert_model_parallel_size) + set_expert_model_parallel_rank(app_state.expert_model_parallel_rank) + + set_pipeline_model_parallel_rank(app_state.pipeline_model_parallel_rank) + if HAVE_INTERLEAVED: + set_virtual_pipeline_model_parallel_world_size(app_state.virtual_pipeline_model_parallel_size) + set_virtual_pipeline_model_parallel_rank(app_state.virtual_pipeline_model_parallel_rank) + set_pipeline_model_parallel_world_size(app_state.pipeline_model_parallel_size) + set_pipeline_model_parallel_split_rank(app_state.pipeline_model_parallel_split_rank) + + tensor_parallel.random.initialize_rng_tracker(use_te_rng_tracker=use_te_rng_tracker) + if seed is not None: + # @chcui not setting seed is for model conversion. always set seed for training/inference. + _set_random_seed(seed) + + if global_batch_size and micro_batch_size is not None: + # TODO: add rampup_batch_size here when we have it implemented + if MCORE_MB_CALCULATOR: + from megatron.core.num_microbatches_calculator import _GLOBAL_NUM_MICROBATCHES_CALCULATOR + + if _GLOBAL_NUM_MICROBATCHES_CALCULATOR is None: + init_num_microbatches_calculator( + rank=global_rank, + global_batch_size=global_batch_size, + micro_batch_size=micro_batch_size, + data_parallel_size=app_state.data_parallel_size, + rampup_batch_size=rampup_batch_size, + ) + else: + if isinstance(_GLOBAL_NUM_MICROBATCHES_CALCULATOR, ConstantNumMicroBatchesCalculator): + assert get_current_global_batch_size() == global_batch_size + assert get_micro_batch_size() == micro_batch_size + assert get_num_microbatches() == global_batch_size // ( + micro_batch_size * app_state.data_parallel_size + ) + else: + raise Exception("Microbatch calculator already initialized.") + else: + from apex.transformer.pipeline_parallel.utils import _GLOBAL_NUM_MICROBATCHES_CALCULATOR + + if _GLOBAL_NUM_MICROBATCHES_CALCULATOR is None: + init_num_microbatches_calculator( + rank=global_rank, + global_batch_size=global_batch_size, + micro_batch_size=micro_batch_size, + data_parallel_size=app_state.data_parallel_size, + rampup_batch_size=rampup_batch_size, + ) + else: + if isinstance(_GLOBAL_NUM_MICROBATCHES_CALCULATOR, ConstantNumMicroBatchesCalculator): + assert get_current_global_batch_size() == global_batch_size + assert get_micro_batch_size() == micro_batch_size + assert get_num_microbatches() == global_batch_size // ( + micro_batch_size * app_state.data_parallel_size + ) + else: + raise Exception("Microbatch calculator already initialized.") + + app_state._is_megatron_initialized = True + + if HAVE_APEX: + set_logging_level(apex_transformer_log_level) + + +def _set_random_seed(seed_): + """Set random seed for reproducability.""" + if seed_ is not None and seed_ > 0: + # Ensure that different pipeline MP stages get different seeds. 
+ seed = seed_ + (100 * get_pipeline_model_parallel_rank()) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.device_count() > 0: + tensor_parallel.model_parallel_cuda_manual_seed(seed) + else: + raise ValueError('Seed ({}) should be a positive integer.'.format(seed_)) + + +def set_jit_fusion_options(): + """Set PyTorch JIT layer fusion options.""" + # set flags if we are using the 21.10 container + if torch.__version__ == "1.10.0a0+0aef44c": + # nvfuser + torch._C._jit_set_profiling_executor(True) + torch._C._jit_set_profiling_mode(True) + torch._C._jit_override_can_fuse_on_cpu(False) + torch._C._jit_override_can_fuse_on_gpu(False) + torch._C._jit_set_texpr_fuser_enabled(False) + torch._C._jit_set_nvfuser_enabled(True) + torch._C._debug_set_autodiff_subgraph_inlining(False) + + +def fake_initialize_model_parallel( + world_size, + rank, + tensor_model_parallel_size_, + pipeline_model_parallel_size_, + pipeline_model_parallel_split_rank_=None, + virtual_pipeline_model_parallel_size_=None, + expert_model_parallel_size_=1, + context_parallel_size_=1, + use_tp_pp_dp_mapping=False, +): + """ + Fake initialize model data parallel groups so that we can instantiate model parallel models before DDP is initialized. + This is needed because PTL execution flow is init model, init trainer -> call trainer.fit(model). DDP is initialized during .fit. + This function is taken from megatron.core.parallel_state and modified so that the distributed groups are not created. + We only need the tensor parallel and pipeline parallel ranks to instantiate the model. + + Arguments: + tensor_model_parallel_size: number of GPUs used to parallelize model tensor. + pipeline_model_parallel_size: number of GPUs used to parallelize model pipeline. + context_parallel_size: number of GPUs used to parallelize tokens of each input. + + Let's say we have a total of 16 GPUs denoted by g0 ... g15 and we + use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize + the model pipeline. The present function will + create 8 tensor model-parallel groups, 4 pipeline model-parallel groups + and 8 data-parallel groups as: + 8 data_parallel groups: + [g0, g2], [g1, g3], [g4, g6], [g5, g7], [g8, g10], [g9, g11], [g12, g14], [g13, g15] + 8 tensor model-parallel groups: + [g0, g1], [g2, g3], [g4, g5], [g6, g7], [g8, g9], [g10, g11], [g12, g13], [g14, g15] + 4 pipeline model-parallel groups: + [g0, g4, g8, g12], [g1, g5, g9, g13], [g2, g6, g10, g14], [g3, g7, g11, g15] + Note that for efficiency, the caller should make sure adjacent ranks + are on the same DGX box. For example if we are using 2 DGX-1 boxes + with a total of 16 GPUs, rank 0 to 7 belong to the first box and + ranks 8 to 15 belong to the second box. + """ + + # Get world size and rank. Ensure some consistencies. 
+    tensor_model_parallel_size = min(tensor_model_parallel_size_, world_size)
+    pipeline_model_parallel_size = min(pipeline_model_parallel_size_, world_size)
+    model_parallel_size = tensor_model_parallel_size * pipeline_model_parallel_size
+    context_parallel_size = min(context_parallel_size_, world_size)
+
+    assert (
+        world_size % (tensor_model_parallel_size * pipeline_model_parallel_size * context_parallel_size) == 0
+    ), f'world_size: {world_size} must be divisible by tensor_model_parallel_size: {tensor_model_parallel_size} times pipeline_model_parallel_size {pipeline_model_parallel_size} times context_parallel_size {context_parallel_size}'
+    data_parallel_size = world_size // (
+        tensor_model_parallel_size * pipeline_model_parallel_size * context_parallel_size
+    )
+
+    num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size
+    num_pipeline_model_parallel_groups = world_size // pipeline_model_parallel_size
+
+    virtual_pipeline_model_parallel_rank = None
+    if virtual_pipeline_model_parallel_size_ is not None:
+        virtual_pipeline_model_parallel_rank = 0
+
+    rank_generator = RankGenerator(
+        tp=tensor_model_parallel_size,
+        ep=expert_model_parallel_size_,
+        dp=data_parallel_size,
+        pp=pipeline_model_parallel_size,
+        cp=context_parallel_size,
+        order='tp-pp-dp' if use_tp_pp_dp_mapping else 'tp-cp-ep-dp-pp',
+    )
+
+    # Build the data-parallel groups.
+    all_data_parallel_group_ranks_with_cp = []
+    for ranks in rank_generator.get_ranks('dp'):
+        if rank in ranks:
+            data_parallel_group = list(ranks)
+            logging.info(f'Rank {rank} has data parallel group : {data_parallel_group}')
+
+    for ranks_with_cp in rank_generator.get_ranks('dp-cp'):
+        all_data_parallel_group_ranks_with_cp.append(ranks_with_cp)
+        if rank in ranks_with_cp:
+            data_parallel_group_with_cp = ranks_with_cp
+            logging.info(
+                f'Rank {rank} has combined group of data parallel and context parallel : {data_parallel_group_with_cp}'
+            )
+
+    data_parallel_rank = data_parallel_group.index(rank)
+    logging.info(
+        f'All data parallel group ranks with context parallel combined: {all_data_parallel_group_ranks_with_cp}'
+    )
+    logging.info(f'Rank {rank} has data parallel rank: {data_parallel_rank}')
+
+    # Build the context-parallel groups.
+    all_context_parallel_group_ranks = []
+    for ranks in rank_generator.get_ranks('cp'):
+        all_context_parallel_group_ranks.append(ranks)
+        if rank in ranks:
+            context_parallel_group = ranks
+            logging.info(f'Rank {rank} has context parallel group: {context_parallel_group}')
+
+    context_parallel_rank = context_parallel_group.index(rank)
+    logging.info(f'All context parallel group ranks: {all_context_parallel_group_ranks}')
+    logging.info(f'Rank {rank} has context parallel rank: {context_parallel_rank}')
+
+    # Build the model-parallel groups.
+    all_model_parallel_group_ranks = []
+    for ranks in rank_generator.get_ranks('tp-pp'):
+        all_model_parallel_group_ranks.append(ranks)
+        if rank in ranks:
+            logging.info(f'Rank {rank} has model parallel group: {list(ranks)}')
+    logging.info(f'All model parallel group ranks: {all_model_parallel_group_ranks}')
+
+    # Build the tensor model-parallel groups.
+    all_tensor_model_parallel_group_ranks = []
+    tensor_model_parallel_group = None
+    for ranks in rank_generator.get_ranks('tp'):
+        all_tensor_model_parallel_group_ranks.append(ranks)
+        if rank in ranks:
+            tensor_model_parallel_group = ranks
+            logging.info(f'Rank {rank} has tensor model parallel group: {tensor_model_parallel_group}')
+
+    tensor_model_parallel_rank = tensor_model_parallel_group.index(rank)
+
+    logging.info(f'All tensor model parallel group ranks: {all_tensor_model_parallel_group_ranks}')
+    logging.info(f'Rank {rank} has tensor model parallel rank: {tensor_model_parallel_rank}')
+
+    # EP rank
+    expert_model_parallel_rank = 0
+    if expert_model_parallel_size_ is not None and expert_model_parallel_size_ > 1:
+        for ranks in rank_generator.get_ranks('ep', independent_ep=True):
+            if rank in ranks:
+                expert_model_parallel_rank = list(ranks).index(rank)
+
+    # Build the pipeline model-parallel groups and embedding groups
+    # (first and last rank in each pipeline model-parallel group).
+    all_pipeline_model_parallel_group_ranks = []
+    all_embedding_group_ranks = []
+    pipeline_model_parallel_group = None
+    embedding_group = None
+    embedding_rank = None
+    for ranks in rank_generator.get_ranks('pp'):
+        all_pipeline_model_parallel_group_ranks.append(ranks)
+        if rank in ranks:
+            pipeline_model_parallel_group = ranks
+            logging.info(f'Rank {rank} has pipeline model parallel group: {pipeline_model_parallel_group}')
+
+        # Setup embedding group (to exchange gradients between
+        # first and last stages).
+        if len(ranks) > 1:
+            embedding_ranks = [ranks[0], ranks[-1]]
+            all_embedding_group_ranks.append(embedding_ranks)
+        else:
+            embedding_ranks = ranks
+            all_embedding_group_ranks.append(list(embedding_ranks))
+        if rank in embedding_ranks:
+            embedding_group = list(embedding_ranks)
+            logging.info(f'Rank {rank} has embedding group: {embedding_group}')
+
+    pipeline_model_parallel_rank = pipeline_model_parallel_group.index(rank)
+    if embedding_group is not None:
+        embedding_rank = embedding_group.index(rank)
+
+    logging.info(f'All pipeline model parallel group ranks: {all_pipeline_model_parallel_group_ranks}')
+    logging.info(f'Rank {rank} has pipeline model parallel rank {pipeline_model_parallel_rank}')
+    logging.info(f'All embedding group ranks: {all_embedding_group_ranks}')
+    logging.info(f'Rank {rank} has embedding rank: {embedding_rank}')
+
+    return (
+        tensor_model_parallel_rank,
+        pipeline_model_parallel_rank,
+        expert_model_parallel_rank,
+        model_parallel_size,
+        data_parallel_size,
+        pipeline_model_parallel_split_rank_,
+        virtual_pipeline_model_parallel_rank,
+    )
diff --git a/requirements/requirements_common.txt b/requirements/requirements_common.txt
index 616381ed5933..d8ad52452c7c 100644
--- a/requirements/requirements_common.txt
+++ b/requirements/requirements_common.txt
@@ -1,4 +1,5 @@
 datasets
+einops
 inflect
 pandas
 sacremoses>=0.0.43
diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt
index f2b074a5975b..b7cac6a3b827 100644
--- a/requirements/requirements_nlp.txt
+++ b/requirements/requirements_nlp.txt
@@ -1,6 +1,5 @@
 accelerated-scan
 boto3
-einops
 faiss-cpu
 fasttext
 flask_restful
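To make the parallel-layout bookkeeping in the new nemo/lightning/megatron_init.py easier to follow, here is a small self-contained sketch, not part of the patch, of the divisibility check and group counts that `fake_initialize_model_parallel` computes, using the 16-GPU example from its docstring (tensor parallel 2, pipeline parallel 4, context parallel 1):

world_size = 16
tensor_model_parallel_size = 2
pipeline_model_parallel_size = 4
context_parallel_size = 1

# world_size must be divisible by tp * pp * cp, otherwise the layout is invalid.
assert world_size % (
    tensor_model_parallel_size * pipeline_model_parallel_size * context_parallel_size
) == 0

data_parallel_size = world_size // (
    tensor_model_parallel_size * pipeline_model_parallel_size * context_parallel_size
)
num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size
num_pipeline_model_parallel_groups = world_size // pipeline_model_parallel_size
num_data_parallel_groups = world_size // data_parallel_size

print(data_parallel_size)                  # 2 ranks per data-parallel group
print(num_tensor_model_parallel_groups)    # 8 tensor model-parallel groups
print(num_pipeline_model_parallel_groups)  # 4 pipeline model-parallel groups
print(num_data_parallel_groups)            # 8 data-parallel groups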