From 58da88610f73a712684429b8207a5d9039924869 Mon Sep 17 00:00:00 2001 From: Marc Romeyn Date: Tue, 22 Oct 2024 17:58:21 +0200 Subject: [PATCH 01/12] Reflect CLI change nemorun -> nemo (#10443) Signed-off-by: Marc Romeijn Co-authored-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> --- examples/llm/pretrain/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/llm/pretrain/README.md b/examples/llm/pretrain/README.md index c9bb7331f972..61f64d7792bb 100644 --- a/examples/llm/pretrain/README.md +++ b/examples/llm/pretrain/README.md @@ -3,7 +3,7 @@ ### Listing the available recipes for pretraining ```bash -nemorun llm pretrain --help +nemo llm pretrain --help ``` ![recipe-listing](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/list-recipes.png) @@ -12,7 +12,7 @@ nemorun llm pretrain --help ### Run pre-training with a default recipe ```bash -nemorun llm pretrain --factory llama3_8b +nemo llm pretrain --factory llama3_8b ``` ![llama3_70b](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/llama3_70b.png) @@ -20,7 +20,7 @@ nemorun llm pretrain --factory llama3_8b We can also call the factory function with custom parameters: ```bash -nemorun llm pretrain --factory "llama3_70b(num_nodes=128)" +nemo llm pretrain --factory "llama3_70b(num_nodes=128)" ``` ![llama3_70b-128-nodes](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/llama3_70b_128nodes.png) @@ -29,13 +29,13 @@ nemorun llm pretrain --factory "llama3_70b(num_nodes=128)" The CLI allows you to overwrite any parameter. For example, to run the recipe with 2000 steps: ```bash -nemorun llm pretrain --factory llama3_70b trainer.max_steps=2000 +nemo llm pretrain --factory llama3_70b trainer.max_steps=2000 ``` The syntax of the CLI is the same as the Python code. Which is great but in some cases you might want to inspect & edit a recipe interactively. An easy way to do this using the cli is the use the `--repl` flag. ```bash -nemorun llm pretrain --factory llama3_70b --repl +nemo llm pretrain --factory llama3_70b --repl ``` ![repl](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/repl.gif) From 746203add92094e385a97bfe54f819b1dd45146e Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Date: Tue, 22 Oct 2024 19:02:32 +0300 Subject: [PATCH 02/12] minor fix (#10990) Co-authored-by: Ali Taghibakhshi --- scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py | 2 +- .../checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py index f395e34765d0..42d3e77ce4c8 100644 --- a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py +++ b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py @@ -15,7 +15,7 @@ r""" Conversion script to convert Huggingface LLaMA checkpoints into nemo checkpoint. 
Example to run this conversion script: - python convert_llama_hf_to_nemo.py \ + python convert_llama_hf_to_nemo_load.py \ --input_name_or_path \ --input_state_dict \ --output_path \ diff --git a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py index 940a9df5f9a8..f7096996e5b1 100644 --- a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py +++ b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py @@ -15,7 +15,7 @@ r""" Conversion script to convert Huggingface LLaMA checkpoints into nemo checkpoint. Example to run this conversion script: - python convert_llama_hf_to_nemo.py \ + python convert_llama_hf_to_nemo_save_dict.py \ --input_name_or_path \ --output_path --precision bf16 From 70d8cc191b322d25fdb9428396c21a66d19f3ffb Mon Sep 17 00:00:00 2001 From: anteju <108555623+anteju@users.noreply.github.com> Date: Tue, 22 Oct 2024 09:33:55 -0700 Subject: [PATCH 03/12] Fixed sampler override and audio_key in prepare_audio_data (#10980) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ante Jukić --- examples/audio/process_audio.py | 4 ++-- nemo/collections/asr/parts/utils/transcribe_utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/audio/process_audio.py b/examples/audio/process_audio.py index e28fb4e69627..ec88bda34954 100644 --- a/examples/audio/process_audio.py +++ b/examples/audio/process_audio.py @@ -159,8 +159,8 @@ def main(cfg: ProcessConfig) -> ProcessConfig: audio_to_audio_model.set_trainer(trainer) audio_to_audio_model = audio_to_audio_model.eval() - # override sampler - if cfg.sampler is not None: + # override sampler if necessary + if cfg.sampler: logging.info('Overriding sampler with %s', cfg.sampler) if hasattr(audio_to_audio_model, 'sampler'): diff --git a/nemo/collections/asr/parts/utils/transcribe_utils.py b/nemo/collections/asr/parts/utils/transcribe_utils.py index c1e712c44aeb..0d4f4c895bcf 100644 --- a/nemo/collections/asr/parts/utils/transcribe_utils.py +++ b/nemo/collections/asr/parts/utils/transcribe_utils.py @@ -314,7 +314,7 @@ def prepare_audio_data(cfg: DictConfig) -> Tuple[List[str], bool]: with NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: for item in read_and_maybe_sort_manifest(cfg.dataset_manifest, try_sort=cfg.presort_manifest): audio_file = get_full_path(audio_file=item[audio_key], manifest_file=cfg.dataset_manifest) - item[audio_key] = audio_file + item['audio_filepath'] = audio_file filepaths.append(audio_file) f.write(json.dumps(item) + "\n") sorted_manifest_path = f.name From c20e8922c434ccc22b7a8bf62acdb3276bd7a9f7 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Tue, 22 Oct 2024 14:08:43 -0400 Subject: [PATCH 04/12] Add more recipes (#10957) * add recipes Signed-off-by: Chen Cui * adjust finetuning recipe Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx --------- Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: cuichenx --- nemo/collections/llm/gpt/model/baichuan.py | 2 +- nemo/collections/llm/gpt/model/chatglm.py | 4 +- nemo/collections/llm/recipes/__init__.py | 8 + nemo/collections/llm/recipes/baichuan2_7b.py | 285 ++++++++++++++++++ nemo/collections/llm/recipes/chatglm3_6b.py | 283 +++++++++++++++++ .../llm/recipes/finetune_default.py | 8 +- nemo/collections/llm/recipes/gemma_2b.py | 285 ++++++++++++++++++ nemo/collections/llm/recipes/gemma_7b.py | 285 ++++++++++++++++++ 
nemo/collections/llm/recipes/optim/adam.py | 8 +- 9 files changed, 1158 insertions(+), 10 deletions(-) create mode 100644 nemo/collections/llm/recipes/baichuan2_7b.py create mode 100644 nemo/collections/llm/recipes/chatglm3_6b.py create mode 100644 nemo/collections/llm/recipes/gemma_2b.py create mode 100644 nemo/collections/llm/recipes/gemma_7b.py diff --git a/nemo/collections/llm/gpt/model/baichuan.py b/nemo/collections/llm/gpt/model/baichuan.py index 56231978061f..c283b802a118 100644 --- a/nemo/collections/llm/gpt/model/baichuan.py +++ b/nemo/collections/llm/gpt/model/baichuan.py @@ -215,7 +215,7 @@ def _import_qkv(ctx: io.TransformCTX, qkv_weights): q = qkv_weights[0].squeeze().view(*new_q_tensor_shape) k = qkv_weights[1].squeeze().view(*new_kv_tensor_shape) v = qkv_weights[2].squeeze().view(*new_kv_tensor_shape) - qkv_weights = torch.empty((0, head_size) + old_tensor_shape[1:]) + qkv_weights = torch.empty((0, head_size) + old_tensor_shape[1:]).type_as(qkv_weights) for i in range(num_query_groups): qkv_weights = torch.cat((qkv_weights, q[i * heads_per_group : (i + 1) * heads_per_group, :, :])) qkv_weights = torch.cat((qkv_weights, k[i : i + 1, :, :])) diff --git a/nemo/collections/llm/gpt/model/chatglm.py b/nemo/collections/llm/gpt/model/chatglm.py index 5bd1319102e2..e7450a8db28d 100644 --- a/nemo/collections/llm/gpt/model/chatglm.py +++ b/nemo/collections/llm/gpt/model/chatglm.py @@ -221,7 +221,7 @@ def _import_qkv_weight(ctx: io.TransformCTX, hf_qkv_weights): k = k.view(*new_kv_tensor_shape) v = v.view(*new_kv_tensor_shape) - qkv_weights = torch.empty((0, head_size, old_tensor_shape[1])) + qkv_weights = torch.empty((0, head_size, old_tensor_shape[1])).type_as(hf_qkv_weights) for i in range(num_query_groups): qkv_weights = torch.cat((qkv_weights, q[i * heads_per_group : (i + 1) * heads_per_group, :, :])) qkv_weights = torch.cat((qkv_weights, k[i : i + 1, :, :])) @@ -251,7 +251,7 @@ def _import_qkv_bias(ctx: io.TransformCTX, hf_qkv_bias): q = q.view(*new_q_tensor_shape) k = k.view(*new_kv_tensor_shape) v = v.view(*new_kv_tensor_shape) - qkv_bias = torch.empty((0, head_size)) + qkv_bias = torch.empty((0, head_size)).type_as(hf_qkv_bias) for i in range(num_query_groups): qkv_bias = torch.cat((qkv_bias, q[i * heads_per_group : (i + 1) * heads_per_group, :])) qkv_bias = torch.cat((qkv_bias, k[i : i + 1, :])) diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py index 47cc4e71448d..ff81c3b383fc 100644 --- a/nemo/collections/llm/recipes/__init__.py +++ b/nemo/collections/llm/recipes/__init__.py @@ -14,6 +14,10 @@ from nemo.collections.llm.recipes import ( + baichuan2_7b, + chatglm3_6b, + gemma_2b, + gemma_7b, llama3_8b, llama3_8b_16k, llama3_8b_64k, @@ -49,6 +53,10 @@ from nemo.collections.llm.recipes.optim import adam __all__ = [ + "baichuan2_7b", + "chatglm3_6b", + "gemma_2b", + "gemma_7b", "llama3_8b", "llama3_8b_16k", "llama3_8b_64k", diff --git a/nemo/collections/llm/recipes/baichuan2_7b.py b/nemo/collections/llm/recipes/baichuan2_7b.py new file mode 100644 index 000000000000..3ebb643af779 --- /dev/null +++ b/nemo/collections/llm/recipes/baichuan2_7b.py @@ -0,0 +1,285 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm import Baichuan2Config7B, Baichuan2Model +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "baichuan2_7b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Baichuan2 7B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Baichuan2 7B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=baichuan2_7b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(Baichuan2Model, config=run.Config(Baichuan2Config7B)) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Baichuan2 7B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=baichuan2_7b ... 
+ + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Baichuan2 7B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory baichuan2_7b + $ nemo llm pretrain --factory "baichuan2_7b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="baichuan2_7b_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=pretrain, name=NAME + "_optimized") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Baichuan2 7B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. 
+ num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + $ nemo llm pretrain --factory baichuan2_7b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="baichuan2_7b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=False, + ) + ) + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', +) -> run.Partial: + """ + Create a fine-tuning recipe for Baichuan2 7B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory baichuan2_7b + + Python API usage: + >>> recipe = finetune_recipe(name="baichuan2_7b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + """ + recipe = default_finetune_recipe( + model(), "baichuan-inc/Baichuan2-7B-Base", dir, name, num_nodes, num_gpus_per_node + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 2 + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() == 'lora': + recipe.peft = run.Config(LoRA) + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/chatglm3_6b.py b/nemo/collections/llm/recipes/chatglm3_6b.py new file mode 100644 index 000000000000..f5d580a9c6ea --- /dev/null +++ b/nemo/collections/llm/recipes/chatglm3_6b.py @@ -0,0 +1,283 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm import ChatGLM3Config6B, ChatGLMModel +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "chatglm3_6b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a ChatGLM3 6B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the ChatGLM3 6B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=chatglm3_6b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(ChatGLMModel, config=run.Config(ChatGLM3Config6B)) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for ChatGLM3 6B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=chatglm3_6b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. 
+ """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for ChatGLM3 6B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory chatglm3_6b + $ nemo llm pretrain --factory "chatglm3_6b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="chatglm3_6b_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=pretrain, name=NAME + "_optimized") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for ChatGLM3 6B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. 
+ + Examples: + $ nemo llm pretrain --factory chatglm3_6b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="chatglm3_6b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=False, + ) + ) + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', +) -> run.Partial: + """ + Create a fine-tuning recipe for ChatGLM3 6B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory chatglm3_6b + + Python API usage: + >>> recipe = finetune_recipe(name="chatglm3_6b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. 
+ """ + recipe = default_finetune_recipe(model(), "THUDM/chatglm3-6b", dir, name, num_nodes, num_gpus_per_node) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 2 + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() == 'lora': + recipe.peft = run.Config(LoRA) + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/finetune_default.py b/nemo/collections/llm/recipes/finetune_default.py index 89c982613126..255763abbf50 100644 --- a/nemo/collections/llm/recipes/finetune_default.py +++ b/nemo/collections/llm/recipes/finetune_default.py @@ -60,7 +60,7 @@ def default_finetune_recipe( ), data=run.Config(llm.SquadDataModule, seq_length=2048, global_batch_size=128, micro_batch_size=1), log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), - optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50, adam_beta2=0.98), resume=nemo_resume(resume_path), ) @@ -77,9 +77,9 @@ def default_finetune_trainer( num_nodes=1, num_gpus_per_node=8, max_steps=1000, - limit_test_batches=None, - limit_val_batches=None, - val_check_interval=5, + limit_test_batches=1, + limit_val_batches=1, + val_check_interval=30, ): strategy = run.Config( nl.MegatronStrategy, diff --git a/nemo/collections/llm/recipes/gemma_2b.py b/nemo/collections/llm/recipes/gemma_2b.py new file mode 100644 index 000000000000..cbcd340c1e92 --- /dev/null +++ b/nemo/collections/llm/recipes/gemma_2b.py @@ -0,0 +1,285 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm import GemmaConfig2B, GemmaModel +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "gemma_2b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Gemma 2B model configuration. 
+ + Returns: + run.Config[pl.LightningModule]: Configuration for the Gemma 2B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=gemma_2b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(GemmaModel, config=run.Config(GemmaConfig2B)) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Gemma 2B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=gemma_2b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Gemma 2B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. 
+ num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory gemma_2b + $ nemo llm pretrain --factory "gemma_2b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="gemma_2b_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=pretrain, name=NAME + "_optimized") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Gemma 2B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + $ nemo llm pretrain --factory gemma_2b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="gemma_2b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=False, + ) + ) + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', +) -> run.Partial: + """ + Create a fine-tuning recipe for Gemma 2B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + + Returns: + run.Partial: Partial configuration for fine-tuning. 
+ + Examples: + CLI usage: + $ nemo llm finetune --factory gemma_2b + + Python API usage: + >>> recipe = finetune_recipe(name="gemma_2b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + """ + # Disable cuDNN attention since TE 1.8 does not support head dim > 128 + os.environ['NVTE_FUSED_ATTN'] = "0" + + recipe = default_finetune_recipe(model(), "google/gemma-2b", dir, name, num_nodes, num_gpus_per_node) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 2 + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() == 'lora': + recipe.peft = run.Config(LoRA) + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/gemma_7b.py b/nemo/collections/llm/recipes/gemma_7b.py new file mode 100644 index 000000000000..3b0e206d9ce7 --- /dev/null +++ b/nemo/collections/llm/recipes/gemma_7b.py @@ -0,0 +1,285 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from typing import Callable, Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm import GemmaConfig7B, GemmaModel +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "gemma_7b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Gemma 7B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Gemma 7B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=gemma_7b ... 
+ + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(GemmaModel, config=run.Config(GemmaConfig7B)) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Gemma 7B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=gemma_7b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Gemma 7B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. 
+ + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory gemma_7b + $ nemo llm pretrain --factory "gemma_7b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="gemma_7b_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=pretrain, name=NAME + "_optimized") +def pretrain_recipe_performance( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Gemma 7B model. + + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + $ nemo llm pretrain --factory gemma_7b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="gemma_7b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) + + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=False, + ) + ) + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', +) -> run.Partial: + """ + Create a fine-tuning recipe for Gemma 7B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None. + + Returns: + run.Partial: Partial configuration for fine-tuning. 
+ + Examples: + CLI usage: + $ nemo llm finetune --factory gemma_7b + + Python API usage: + >>> recipe = finetune_recipe(name="gemma_7b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + """ + # Disable cuDNN attention since TE 1.8 does not support head dim > 128 + os.environ['NVTE_FUSED_ATTN'] = "0" + + recipe = default_finetune_recipe(model(), "google/gemma-7b", dir, name, num_nodes, num_gpus_per_node) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 2 + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() == 'lora': + recipe.peft = run.Config(LoRA) + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/optim/adam.py b/nemo/collections/llm/recipes/optim/adam.py index 5be87ac71e9d..c6510577711d 100644 --- a/nemo/collections/llm/recipes/optim/adam.py +++ b/nemo/collections/llm/recipes/optim/adam.py @@ -25,6 +25,8 @@ def distributed_fused_adam_with_cosine_annealing( precision: str = "bf16-mixed", # or "16-mixed" warmup_steps: int = 2000, constant_steps: int = 0, + adam_beta1: float = 0.9, + adam_beta2: float = 0.95, max_lr: float = 1e-4, min_lr: Optional[float] = None, clip_grad: float = 1.0, @@ -37,14 +39,14 @@ def distributed_fused_adam_with_cosine_annealing( weight_decay=0.1, bf16=precision == "bf16-mixed", fp16=precision == "16-mixed", - adam_beta1=0.9, - adam_beta2=0.95, + adam_beta1=adam_beta1, + adam_beta2=adam_beta2, adam_eps=1e-5, use_distributed_optimizer=True, clip_grad=clip_grad, ) - min_lr = min_lr or (0.1 * max_lr) + min_lr = min_lr if min_lr is not None else (0.1 * max_lr) sched = run.Config( CosineAnnealingScheduler, warmup_steps=warmup_steps, From 1ba8bb1c623c2b7e938549002f5b77ab02fbe4bc Mon Sep 17 00:00:00 2001 From: meatybobby Date: Tue, 22 Oct 2024 11:48:51 -0700 Subject: [PATCH 05/12] Fix parallel_embedding (#10975) --- nemo/export/trt_llm/converter/model_converter.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/nemo/export/trt_llm/converter/model_converter.py b/nemo/export/trt_llm/converter/model_converter.py index 366206c948eb..e5e9f8154d24 100755 --- a/nemo/export/trt_llm/converter/model_converter.py +++ b/nemo/export/trt_llm/converter/model_converter.py @@ -260,9 +260,7 @@ def model_to_trtllm_ckpt( if mapping.is_first_pp_rank(): embedding_weight = ( - np.ascontiguousarray( - split(weights_dict["transformer.vocab_embedding.weight"], mapping.tp_size, mapping.tp_rank) - ) + split(weights_dict["transformer.vocab_embedding.weight"], mapping.tp_size, mapping.tp_rank) if use_parallel_embedding else weights_dict["transformer.vocab_embedding.weight"] ) @@ -272,9 +270,7 @@ def model_to_trtllm_ckpt( pos_embedding_weight = weights_dict.get("transformer.position_embedding.weight") if pos_embedding_weight is not None: if use_parallel_embedding: - pos_embedding_weight = np.ascontiguousarray( - split(pos_embedding_weight, mapping.tp_size, mapping.tp_rank) - ) + pos_embedding_weight = split(pos_embedding_weight, mapping.tp_size, mapping.tp_rank) weights_dict_local["transformer.position_embedding.weight"] = pos_embedding_weight if mapping.is_last_pp_rank(): From 69e3c3f31ce2c6ba9b7f29c5101426d49b63bd63 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Tue, 22 Oct 2024 18:15:00 -0400 Subject: [PATCH 06/12] 
Upgrade transformers (#10854) Signed-off-by: Chen Cui --- requirements/requirements_lightning.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements_lightning.txt b/requirements/requirements_lightning.txt index 171abce41f37..e8020f244821 100644 --- a/requirements/requirements_lightning.txt +++ b/requirements/requirements_lightning.txt @@ -4,6 +4,6 @@ hydra-core>1.3,<=1.3.2 omegaconf<=2.3 pytorch-lightning>2.2.1 torchmetrics>=0.11.0 -transformers>=4.44.0 +transformers>=4.45.0 wandb webdataset>=0.2.86 From 8f26236a6af9e963b214130a481a9cb67b941943 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Tue, 22 Oct 2024 22:11:32 -0700 Subject: [PATCH 07/12] Add support and recipes for HF models via AutoModelForCausalLM (#10962) * initial hf_lit_module Signed-off-by: Alexandros Koumparoulis * make sft gpt dataset sanity check optional Signed-off-by: Alexandros Koumparoulis * HF sft example Signed-off-by: Alexandros Koumparoulis * Rename HfLitModule to HfAutoModel Signed-off-by: Alexandros Koumparoulis * update default model id Signed-off-by: Alexandros Koumparoulis * move rank&world_size as params Signed-off-by: Alexandros Koumparoulis * fix mbs in example Signed-off-by: Alexandros Koumparoulis * fix for fsdp and logger Signed-off-by: Alexandros Koumparoulis * make loss_fn configurable Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * remove optim from HfAutoModel Signed-off-by: Alexandros Koumparoulis * add pytorch native optim Signed-off-by: Alexandros Koumparoulis * add hfAutoModel pretrain nemorun recipe Signed-off-by: Alexandros Koumparoulis * remove debug Signed-off-by: Alexandros Koumparoulis * remove stale imports Signed-off-by: Alexandros Koumparoulis * remove stale import Signed-off-by: Alexandros Koumparoulis * rm stale imports Signed-off-by: Alexandros Koumparoulis * rm stale imports Signed-off-by: Alexandros Koumparoulis * tokenizer fix Signed-off-by: Alexandros Koumparoulis * update example Signed-off-by: Alexandros Koumparoulis * rename pytorch_adam to pytorch_adam_with_cosine_annealing Signed-off-by: Alexandros Koumparoulis * small refactor Signed-off-by: Alexandros Koumparoulis * fix no_weight_decay_cond Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * switch to flat_lr optim for example Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * remove imports & update docstrings Signed-off-by: Alexandros Koumparoulis * add a tokenizer setter to allow it to work with nemo/collections/llm/api.py::_use_tokenizer Signed-off-by: Alexandros Koumparoulis * remove unused import Signed-off-by: Alexandros Koumparoulis * allow loss_mask to be none Signed-off-by: Alexandros Koumparoulis * Add HF-dataset lightning module Signed-off-by: Alexandros Koumparoulis * check if pad_token_id is None Signed-off-by: Alexandros Koumparoulis * rename hf_lit_module.py to hf_auto_model.py Signed-off-by: Alexandros Koumparoulis * class rename Signed-off-by: Alexandros Koumparoulis * rename Signed-off-by: Alexandros Koumparoulis * update example Signed-off-by: Alexandros Koumparoulis * HfAutoModelForCausalLM Signed-off-by: Alexandros Koumparoulis * rm stale import Signed-off-by: Alexandros Koumparoulis * add option to start with random weights Signed-off-by: Alexandros Koumparoulis * add check in megatron-strategy Signed-off-by: Alexandros 
Koumparoulis * rename param Signed-off-by: Alexandros Koumparoulis * drop mcore sampler from squadmodule Signed-off-by: Alexandros Koumparoulis * make megatron_sampler optional in HfDatasetDataModule Signed-off-by: Alexandros Koumparoulis * copyright Signed-off-by: Alexandros Koumparoulis * use is_hf_model to mark hf classes Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa --- examples/llm/sft/hf.py | 91 ++++++++++ nemo/collections/llm/__init__.py | 3 + nemo/collections/llm/gpt/data/__init__.py | 10 +- nemo/collections/llm/gpt/data/fine_tuning.py | 5 + nemo/collections/llm/gpt/data/hf_dataset.py | 103 +++++++++++ nemo/collections/llm/gpt/data/squad.py | 2 + nemo/collections/llm/gpt/model/__init__.py | 2 + .../gpt/model/hf_auto_model_for_causal_lm.py | 108 +++++++++++ nemo/collections/llm/recipes/__init__.py | 2 + .../recipes/hf_auto_model_for_causal_lm.py | 168 ++++++++++++++++++ nemo/collections/llm/recipes/optim/adam.py | 59 +++++- .../megatron/dataset_utils.py | 39 ++-- .../megatron/gpt_sft_dataset.py | 4 + nemo/lightning/data.py | 17 +- nemo/lightning/pytorch/optim/__init__.py | 2 + nemo/lightning/pytorch/optim/pytorch.py | 132 ++++++++++++++ .../lightning/pytorch/plugins/data_sampler.py | 7 + .../pytorch/strategies/megatron_strategy.py | 2 + 18 files changed, 733 insertions(+), 23 deletions(-) create mode 100644 examples/llm/sft/hf.py create mode 100644 nemo/collections/llm/gpt/data/hf_dataset.py create mode 100644 nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py create mode 100644 nemo/collections/llm/recipes/hf_auto_model_for_causal_lm.py create mode 100644 nemo/lightning/pytorch/optim/pytorch.py diff --git a/examples/llm/sft/hf.py b/examples/llm/sft/hf.py new file mode 100644 index 000000000000..b7e12d8fb2de --- /dev/null +++ b/examples/llm/sft/hf.py @@ -0,0 +1,91 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import fiddle as fdl +import pytorch_lightning as pl +from pytorch_lightning.loggers import WandbLogger +from torch.utils.data import DataLoader + +from nemo import lightning as nl +from nemo.collections import llm + + +class SquadDataModuleWithPthDataloader(llm.SquadDataModule): + def _create_dataloader(self, dataset, **kwargs) -> DataLoader: + return DataLoader( + dataset, + num_workers=self.num_workers, + pin_memory=self.pin_memory, + persistent_workers=self.persistent_workers, + collate_fn=dataset.collate_fn, + batch_size=self.micro_batch_size, + **kwargs, + ) + + +def squad(tokenizer) -> pl.LightningDataModule: + return SquadDataModuleWithPthDataloader( + tokenizer=tokenizer, + seq_length=2048, + micro_batch_size=2, + global_batch_size=128, # assert gbs == mbs * accumulate_grad_batches + num_workers=0, + sanity_check_dist_workers=False, + ) + + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument('--model', default='meta-llama/Llama-3.2-1B') + parser.add_argument('--strategy', type=str, default='auto', choices=['auto', 'ddp', 'fsdp']) + parser.add_argument('--devices', default=1) + parser.add_argument('--accelerator', default='gpu', choices=['gpu']) + parser.add_argument('--max-steps', type=int, default=100) + parser.add_argument('--wandb-project', type=str, default=None) + args = parser.parse_args() + + wandb = None + if args.wandb_project is not None: + model = '_'.join(args.model.split('/')[-2:]) + wandb = WandbLogger( + project=args.wandb_project, + name=f'{model}_dev{args.devices}_strat_{args.strategy}', + ) + grad_clip = 0.5 + if args.strategy == 'fsdp': + # See: https://github.com/Lightning-AI/pytorch-lightning/blob/8ad3e29816a63d8ce5c00ac104b14729a4176f4f/src/lightning/pytorch/plugins/precision/fsdp.py#L81 + grad_clip = None + use_dist_samp = False + + llm.api.finetune( + model=llm.HfAutoModelForCausalLM(args.model), + data=squad(llm.HfAutoModelForCausalLM.configure_tokenizer(args.model)), + trainer=nl.Trainer( + devices=args.devices, + max_steps=args.max_steps, + accelerator=args.accelerator, + strategy=args.strategy, + log_every_n_steps=1, + limit_val_batches=0.0, + num_sanity_val_steps=0, + accumulate_grad_batches=10, + gradient_clip_val=grad_clip, + use_distributed_sampler=use_dist_samp, + logger=wandb, + ), + optim=fdl.build(llm.adam.pytorch_adam_with_flat_lr(max_lr=1e-5, clip_grad=0.5)), + log=None, + ) diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 4205c401eea8..6dde88079567 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -21,6 +21,7 @@ from nemo.collections.llm.gpt.data import ( DollyDataModule, FineTuningDataModule, + HfDatasetDataModule, MockDataModule, PreTrainingDataModule, SquadDataModule, @@ -57,6 +58,7 @@ GPTConfig126M, GPTConfig175B, GPTModel, + HfAutoModelForCausalLM, Llama2Config7B, Llama2Config13B, Llama2Config70B, @@ -182,6 +184,7 @@ "squad", "dolly", "peft", + "HfAutoModelForCausalLM", ] diff --git a/nemo/collections/llm/gpt/data/__init__.py b/nemo/collections/llm/gpt/data/__init__.py index 45ca0788874f..f4e97d91e5cd 100644 --- a/nemo/collections/llm/gpt/data/__init__.py +++ b/nemo/collections/llm/gpt/data/__init__.py @@ -14,8 +14,16 @@ from nemo.collections.llm.gpt.data.dolly import DollyDataModule from nemo.collections.llm.gpt.data.fine_tuning import FineTuningDataModule +from nemo.collections.llm.gpt.data.hf_dataset import HfDatasetDataModule from nemo.collections.llm.gpt.data.mock import MockDataModule from 
nemo.collections.llm.gpt.data.pre_training import PreTrainingDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule -__all__ = ["FineTuningDataModule", "SquadDataModule", "DollyDataModule", "MockDataModule", "PreTrainingDataModule"] +__all__ = [ + "FineTuningDataModule", + "SquadDataModule", + "DollyDataModule", + "MockDataModule", + "PreTrainingDataModule", + "HfDatasetDataModule", +] diff --git a/nemo/collections/llm/gpt/data/fine_tuning.py b/nemo/collections/llm/gpt/data/fine_tuning.py index 01cf617a094d..2545bbc93f1d 100644 --- a/nemo/collections/llm/gpt/data/fine_tuning.py +++ b/nemo/collections/llm/gpt/data/fine_tuning.py @@ -70,6 +70,7 @@ def __init__( persistent_workers: bool = False, pad_to_max_length: bool = False, packed_sequence_specs: Optional["PackedSequenceSpecs"] = None, + sanity_check_dist_workers: bool = True, ): super().__init__() self.seq_length = seq_length @@ -89,6 +90,7 @@ def __init__( self.packed_sequence_specs = packed_sequence_specs self.packed_sequence_size = -1 if not packed_sequence_specs else packed_sequence_specs.packed_sequence_size self.validate_batch_size_for_packed_sequence() + self._sanity_check_dist_workers = sanity_check_dist_workers def validate_batch_size_for_packed_sequence(self): if self.packed_sequence_size > 0 and self.micro_batch_size > 1: @@ -134,6 +136,7 @@ def train_dataloader(self) -> DataLoader: self.train_path if self.packed_sequence_size <= 0 else self.train_path_packed, max_num_samples=self.max_train_samples, pad_to_max_length=self.pad_to_max_length, + sanity_check_dist_workers=self._sanity_check_dist_workers, ) ) @@ -143,6 +146,7 @@ def val_dataloader(self) -> DataLoader: self.validation_path, is_test=True, pad_to_max_length=self.pad_to_max_length, + sanity_check_dist_workers=self._sanity_check_dist_workers, ), ) @@ -153,6 +157,7 @@ def test_dataloader(self) -> DataLoader: tokens_to_generate=32, is_test=True, pad_to_max_length=self.pad_to_max_length, + sanity_check_dist_workers=self._sanity_check_dist_workers, ) ) diff --git a/nemo/collections/llm/gpt/data/hf_dataset.py b/nemo/collections/llm/gpt/data/hf_dataset.py new file mode 100644 index 000000000000..7e70a970913e --- /dev/null +++ b/nemo/collections/llm/gpt/data/hf_dataset.py @@ -0,0 +1,103 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
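+
+# HfDatasetDataModule wraps an already-tokenized dataset (for example one built with
+# Hugging Face `datasets`) in a LightningDataModule: its collate_fn pads every sample in a
+# micro-batch to the longest sequence in that batch and stacks the 'tokens' and 'labels'
+# keys into LongTensors, and the train dataloader can optionally be wrapped with a
+# Megatron sampler via use_mcore_sampler=True. A minimal usage sketch (the dataset name
+# below is hypothetical):
+#
+#   datamodule = HfDatasetDataModule(my_tokenized_dataset, micro_batch_size=2, pad_token_id=0)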
+ +import pytorch_lightning as pl +import torch +from torch.utils.data import DataLoader + + +class HfDatasetDataModule(pl.LightningDataModule): + def __init__( + self, + dataset, + num_workers=2, + pin_memory=True, + persistent_workers=True, + micro_batch_size=2, + global_batch_size=2, + pad_token_id=0, + use_mcore_sampler=False, + mcore_dataloader_type='cyclic', + ) -> None: + super().__init__() + assert pad_token_id is not None + + self.dataset = dataset + self.num_workers = num_workers + self.pin_memory = pin_memory + self.persistent_workers = persistent_workers + self.micro_batch_size = micro_batch_size + self.global_batch_size = global_batch_size + self.pad_token_id = pad_token_id + + self.use_mcore_sampler = use_mcore_sampler + self.mcore_dataloader_type = mcore_dataloader_type + + @staticmethod + def collate_fn(batch, pad_token_id=0): + def batchify(tensor): + if tensor.ndim == 1: + return tensor.unsqueeze_(0) + return tensor + + def extract_key_from_dicts(batch, key): + return list(map(lambda x: x[key], batch)) + + def pad_within_micro(batch, pad_token_id): + max_len = max(map(len, batch)) + return [item + [pad_token_id] * (max_len - len(item)) for item in batch] + + return { + key: batchify( + torch.LongTensor( + pad_within_micro( + extract_key_from_dicts(batch, key), + pad_token_id, + ) + ) + ) + for key in ['tokens', 'labels'] + } + + def train_dataloader(self, collate_fn=None): + from nemo.lightning.data import add_megatron_sampler + + if collate_fn is None: + collate_fn = lambda x: HfDatasetDataModule.collate_fn(x, pad_token_id=self.pad_token_id) + + dataloader = DataLoader( + self.dataset, + num_workers=self.num_workers, + pin_memory=self.pin_memory, + persistent_workers=self.persistent_workers, + collate_fn=collate_fn, + batch_size=self.micro_batch_size, + ) + if not self.use_mcore_sampler: + return dataloader + + rank = 0 + world_size = 1 + if torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + world_size = torch.distributed.get_world_size() + + return add_megatron_sampler( + dataloader, + self.micro_batch_size, + self.global_batch_size, + dataloader_type=self.mcore_dataloader_type, + rank=rank, + world_size=world_size, + ) diff --git a/nemo/collections/llm/gpt/data/squad.py b/nemo/collections/llm/gpt/data/squad.py index f872db94077d..cabbd444c0cf 100644 --- a/nemo/collections/llm/gpt/data/squad.py +++ b/nemo/collections/llm/gpt/data/squad.py @@ -56,6 +56,7 @@ def __init__( persistent_workers: bool = False, pad_to_max_length: bool = False, packed_sequence_specs: Optional["PackedSequenceSpecs"] = None, + sanity_check_dist_workers: bool = True, ): self.force_redownload = force_redownload self.delete_raw = delete_raw @@ -74,6 +75,7 @@ def __init__( persistent_workers=persistent_workers, pad_to_max_length=pad_to_max_length, packed_sequence_specs=packed_sequence_specs, + sanity_check_dist_workers=sanity_check_dist_workers, ) def prepare_data(self) -> None: diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index ebecc06140fe..26b8d67cb53d 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -37,6 +37,7 @@ GemmaConfig7B, GemmaModel, ) +from nemo.collections.llm.gpt.model.hf_auto_model_for_causal_lm import HfAutoModelForCausalLM from nemo.collections.llm.gpt.model.llama import ( CodeLlamaConfig7B, CodeLlamaConfig13B, @@ -166,4 +167,5 @@ "gpt_forward_step", "transformer_engine_layer_spec", "local_layer_spec", + "HfAutoModelForCausalLM", ] diff --git 
a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py new file mode 100644 index 000000000000..794c39738dbe --- /dev/null +++ b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py @@ -0,0 +1,108 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytorch_lightning as pl +import torch +import torch.nn.functional as F +from transformers import AutoModelForCausalLM + +from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer +from nemo.lightning import io + + +def _extract_non_bias_params(model): + return list(map(lambda x: x[1], filter(lambda x: not 'bias' in x[0], model.named_parameters()))) + + +def masked_cross_entropy(logits, targets, mask=None): + if mask is not None: + loss = F.cross_entropy(logits, targets, reduction='none') + return torch.mean(loss[mask == 1]) + else: + return F.cross_entropy(logits, targets) + + +class HfAutoModelForCausalLM(pl.LightningModule, io.IOMixin): + def __init__(self, model_name='gpt2', load_pretrained_weights=True, tokenizer=None, loss_fn=masked_cross_entropy): + super().__init__() + self.save_hyperparameters() + self.model_name = model_name + self._tokenizer = None + self.model = None + self.loss_fn = loss_fn + self.load_pretrained_weights = load_pretrained_weights + self.is_hf_model = True + + @property + def tokenizer(self): + if self._tokenizer is None: + self._tokenizer = HfAutoModelForCausalLM.configure_tokenizer(self.model_name) + return self._tokenizer + + @tokenizer.setter + def tokenizer(self, value): + assert self._tokenizer is None + self._tokenizer = value + + @staticmethod + def configure_tokenizer(model_name): + return AutoTokenizer(model_name) + + def configure_model(self): + # create all your layers here + if self.load_pretrained_weights: + self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype='auto') + else: + from transformers import AutoConfig + + config = AutoConfig.from_pretrained(self.model_name) + self.model = AutoModelForCausalLM.from_config(config) + self.model.train() + + def forward(self, input_ids, attention_mask=None, labels=None, loss_mask=None): + outputs = self.model( + input_ids=input_ids.to(self.model.device), + attention_mask=attention_mask, + ) + labels = labels.to(self.model.device) + if loss_mask is not None: + loss_mask = loss_mask.to(self.model.device).view(-1) + n_cls = outputs.logits.shape[-1] + outputs.loss = self.loss_fn(outputs.logits.view(-1, n_cls), labels.view(-1), loss_mask) + return outputs + + def training_step(self, batch): + tokens = batch['tokens'] + labels = batch['labels'] + loss_mask = batch.get('loss_mask', None) + output = self.forward( + input_ids=tokens, + labels=labels, + loss_mask=loss_mask, + ) + + loss = output.loss + self.log('train_log', loss, on_step=True, on_epoch=True, prog_bar=True) + return loss + + def validation_step(self, batch, batch_idx): + tokens = batch['tokens'] + labels =
batch['labels'] + output = self.forward( + input_ids=tokens, + labels=labels, + ) + + loss = output.loss + self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True) diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py index ff81c3b383fc..b1fc15aee07c 100644 --- a/nemo/collections/llm/recipes/__init__.py +++ b/nemo/collections/llm/recipes/__init__.py @@ -18,6 +18,7 @@ chatglm3_6b, gemma_2b, gemma_7b, + hf_auto_model_for_causal_lm, llama3_8b, llama3_8b_16k, llama3_8b_64k, @@ -73,6 +74,7 @@ "mamba2_hybrid_8b", "mistral_7b", "mistral_nemo_12b", + "hf_auto_model_for_causal_lm", "mixtral_8x7b", "mixtral_8x7b_16k", "mixtral_8x7b_64k", diff --git a/nemo/collections/llm/recipes/hf_auto_model_for_causal_lm.py b/nemo/collections/llm/recipes/hf_auto_model_for_causal_lm.py new file mode 100644 index 000000000000..6c81bf922152 --- /dev/null +++ b/nemo/collections/llm/recipes/hf_auto_model_for_causal_lm.py @@ -0,0 +1,168 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.model.hf_auto_model_for_causal_lm import HfAutoModelForCausalLM +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import pytorch_adam_with_cosine_annealing +from nemo.utils.exp_manager import TimingCallback + +NAME = "hf_auto_model_for_causal_lm" + + +@run.cli.factory(name=NAME) +def model(model_name) -> run.Config[pl.LightningModule]: + """ + Factory function to create HfAutoModelForCausalLM model configurations. + + Args: + model_name (str): Model id on HF. + + Returns: + run.Config[pl.LightningModule]: Configuration for the HfAutoModelForCausalLM. + + Examples: + CLI usage: + $ nemo llm pretrain --factory 'HfAutoModelForCausalLM(model_name="mistralai/Mistral-Nemo-Instruct-2407")' + + Python API usage: + >>> model_config = model(model_name="mistralai/Mistral-Nemo-Instruct-2407") + >>> print(model_config) + """ + return run.Config(HfAutoModelForCausalLM, model_name=model_name) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 100, + callbacks: Optional[list[run.Config[Callback]]] = None, + strategy: Optional[str] = 'ddp', + gradient_clip_val: float = 1.0, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for HfAutoModelForCausalLM. 
+ + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + strategy: Optional[str] = 'ddp': Parallelism strategy. + gradient_clip_val: float = 1.0: gradient-clip value. + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=HfAutoModelForCausalLM ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + """ + strategy = str(strategy).lower() + assert strategy in ['', 'ddp', 'fsdp'], strategy + if strategy == 'fsdp': + # See: https://github.com/Lightning-AI/pytorch-lightning/blob/8ad3e29816a63d8ce5c00ac104b14729a4176f4f/src/lightning/pytorch/plugins/precision/fsdp.py#L81 + gradient_clip_val = None + + trainer = run.Config( + nl.Trainer, + devices=num_gpus_per_node, + max_steps=max_steps, + accelerator='gpu', + strategy=strategy, + log_every_n_steps=1, + limit_val_batches=0.0, + num_sanity_val_steps=0, + accumulate_grad_batches=10, + callbacks=callbacks, + gradient_clip_val=gradient_clip_val, + use_distributed_sampler=False, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, + model_name: str = '', +) -> run.Partial: + """ + Create a pre-training recipe for Mistral 7B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. 
+ + Examples: + CLI usage: + $ nemo llm pretrain --factory 'HfAutoModelForCausalLM(model_name="mistralai/Mistral-Nemo-Instruct-2407")' + + Python API usage: + >>> recipe = pretrain_recipe(name="auto_pretrain", num_nodes=2, model_name="mistralai/Mistral-Nemo-Instruct-2407") + >>> print(recipe) + """ + return run.Partial( + fn, + model=model(model_name), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=4096, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=pytorch_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) diff --git a/nemo/collections/llm/recipes/optim/adam.py b/nemo/collections/llm/recipes/optim/adam.py index c6510577711d..4148d19c6635 100644 --- a/nemo/collections/llm/recipes/optim/adam.py +++ b/nemo/collections/llm/recipes/optim/adam.py @@ -17,7 +17,12 @@ import nemo_run as run from megatron.core.optimizer import OptimizerConfig -from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule, OptimizerModule +from nemo.lightning.pytorch.optim import ( + CosineAnnealingScheduler, + MegatronOptimizerModule, + OptimizerModule, + PytorchOptimizerModule, +) @run.cli.factory @@ -59,3 +64,55 @@ def distributed_fused_adam_with_cosine_annealing( config=opt_cfg, lr_scheduler=sched, ) + + +@run.cli.factory +def pytorch_adam_with_cosine_annealing( + precision: str = "bf16-mixed", # or "16-mixed" + warmup_steps: int = 2000, + constant_steps: int = 0, + max_lr: float = 1e-5, + min_lr: Optional[float] = None, + clip_grad: float = 1.0, +) -> run.Config[OptimizerModule]: + from torch.optim import Adam + + return run.Config( + PytorchOptimizerModule, + optim_cls=Adam, + config=dict( + lr=max_lr, + weight_decay=0.1, + betas=(0.9, 0.95), + eps=1e-8, + ), + lr_scheduler=run.Config( + CosineAnnealingScheduler, + warmup_steps=warmup_steps, + constant_steps=constant_steps, + min_lr=min_lr or (0.1 * max_lr), + ), + ) + + +@run.cli.factory +def pytorch_adam_with_flat_lr( + precision: str = "bf16-mixed", # or "16-mixed" + warmup_steps: int = 2000, + constant_steps: int = 0, + max_lr: float = 1e-5, + min_lr: Optional[float] = None, + clip_grad: float = 1.0, +) -> run.Config[OptimizerModule]: + from torch.optim import Adam + + return run.Config( + PytorchOptimizerModule, + optim_cls=Adam, + config=dict( + lr=max_lr, + weight_decay=0.1, + betas=(0.9, 0.95), + eps=1e-8, + ), + ) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py b/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py index 17ffc01fb7f4..4ce9701e76b4 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py @@ -209,7 +209,7 @@ def create_masked_lm_predictions( # on-the-fly whole word masking is possible. token_boundary = [0] * len(tokens) skip_mask_idx = None # Store the index of token that cannot be masked. - for (i, token) in enumerate(tokens): + for i, token in enumerate(tokens): if token == skip_masking_id: skip_mask_idx = i if token == cls_id or token == sep_id: @@ -285,7 +285,10 @@ def create_masked_lm_predictions( available_ngrams = list(cand_index_set.keys()) # n - 1 because pvals is 0-indexed and available ngrams are 1-indexed. 
pvals_current = np.array([pvals[n - 1] for n in available_ngrams]) - n = np_rng.choice(available_ngrams, p=pvals_current / pvals_current.sum(keepdims=True),) + n = np_rng.choice( + available_ngrams, + p=pvals_current / pvals_current.sum(keepdims=True), + ) else: # Sampling "n" from the geometric distribution and clipping it to # the max_ngrams. Using p=0.2 default from the SpanBERT paper @@ -488,7 +491,10 @@ def create_extreme_masked_lm_predictions( if span_length_distribution == LengthDistribution.uniform: available_ngrams = list(cand_index_set.keys()) pvals_current = np.array([pvals[n] for n in available_ngrams]) - n = np_rng.choice(available_ngrams, p=pvals_current / pvals_current.sum(keepdims=True),) + n = np_rng.choice( + available_ngrams, + p=pvals_current / pvals_current.sum(keepdims=True), + ) elif span_length_distribution == LengthDistribution.geometric: # Sampling "n" from the geometric distribution and clipping it to # the max_ngrams. Using p=0.2 default from the SpanBERT paper @@ -914,7 +920,13 @@ def build_train_valid_test_datasets( seed, ) test_ds = MockT5Dataset( - cfg, tokenizer, "test", int(train_valid_test_num_samples[2]), max_seq_length, max_seq_length_dec, seed, + cfg, + tokenizer, + "test", + int(train_valid_test_num_samples[2]), + max_seq_length, + max_seq_length_dec, + seed, ) return train_ds, valid_ds, test_ds else: @@ -1257,6 +1269,7 @@ def get_samples_mapping( binary_head, index_mapping_dir: str = None, samples_mapping: Any = None, + sanity_check_dist_workers: bool = True, ): """Get a list that maps a sample index to a starting sentence index, end sentence index, and length""" @@ -1328,14 +1341,16 @@ def get_samples_mapping( logging.info( ' > elasped time to build and save samples mapping ' '(seconds): {:4f}'.format(time.time() - start_time) ) - torch.distributed.barrier() - counts = torch.cuda.LongTensor([1]) - torch.distributed.all_reduce(counts, group=parallel_state.get_data_parallel_group(with_context_parallel=True)) - torch.distributed.all_reduce(counts, group=parallel_state.get_pipeline_model_parallel_group()) - assert counts[0].item() == ( - torch.distributed.get_world_size() - // torch.distributed.get_world_size(group=parallel_state.get_tensor_model_parallel_group()) - ) + + if sanity_check_dist_workers: + torch.distributed.barrier() + counts = torch.cuda.LongTensor([1]) + torch.distributed.all_reduce(counts, group=parallel_state.get_data_parallel_group(with_context_parallel=True)) + torch.distributed.all_reduce(counts, group=parallel_state.get_pipeline_model_parallel_group()) + assert counts[0].item() == ( + torch.distributed.get_world_size() + // torch.distributed.get_world_size(group=parallel_state.get_tensor_model_parallel_group()) + ) # Load indexed dataset if not given externally. if samples_mapping is None: logging.info(' > loading indexed mapping from {}'.format(indexmap_filename)) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py index c42249cec2f2..898ddb7d716b 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py @@ -64,6 +64,7 @@ def __init__( output_original_text: bool = False, ceil_to_power_2: bool = False, get_attention_mask_from_fusion: bool = False, + sanity_check_dist_workers: bool = True, ): """ file_path: Path to a JSONL GPT supervised fine-tuning dataset. 
Data is formatted as multiple JSON lines with each line formatted as follows. {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} @@ -89,6 +90,7 @@ def __init__( special_tokens: special tokens for the chat prompts, a dictionary of {token_type: token}. Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} is_test: Whether this dataset is the test split. output_original_text (bool): if true, will keep the original text in the output alongside the tokenized ids. + sanity_check_dist_workers (bool): if true, will run sanity check across workers when making mapping. """ self.tokenizer = tokenizer self.file_path = file_path @@ -117,6 +119,7 @@ def __init__( self.output_original_text = output_original_text self.ceil_to_power_2 = ceil_to_power_2 self.get_attention_mask_from_fusion = get_attention_mask_from_fusion + self.sanity_check_dist_workers = sanity_check_dist_workers if special_tokens is None: self.special_tokens = { @@ -196,6 +199,7 @@ def _build_samples_mapping(self): binary_head=False, index_mapping_dir=self.index_mapping_dir, samples_mapping=osm, + sanity_check_dist_workers=self.sanity_check_dist_workers, ) else: self.samples_mapping = None diff --git a/nemo/lightning/data.py b/nemo/lightning/data.py index 0f30dfe22851..ea7d91b37214 100644 --- a/nemo/lightning/data.py +++ b/nemo/lightning/data.py @@ -139,6 +139,8 @@ def add_megatron_sampler( dataloader_type: Literal["single", "cyclic", "batch"] = "single", drop_last: bool = True, pad_samples_to_global_batch_size: bool = False, + rank: int = 0, + world_size: int = 1, # data_sharding: bool = False ) -> DataLoader: """ @@ -172,9 +174,6 @@ def add_megatron_sampler( Returns: DataLoader: A new DataLoader instance with the configured Megatron sampler. 
""" - - from megatron.core import parallel_state - if dataloader_type == 'single': batch_sampler = MegatronPretrainingSampler( total_samples=len(dataloader.dataset), @@ -182,8 +181,8 @@ def add_megatron_sampler( micro_batch_size=micro_batch_size, global_batch_size=global_batch_size, rampup_batch_size=rampup_batch_size, - data_parallel_rank=parallel_state.get_data_parallel_rank(), - data_parallel_size=parallel_state.get_data_parallel_world_size(), + data_parallel_rank=rank, + data_parallel_size=world_size, drop_last=drop_last, pad_samples_to_global_batch_size=pad_samples_to_global_batch_size, ) @@ -192,8 +191,8 @@ def add_megatron_sampler( total_samples=len(dataloader.dataset), consumed_samples=consumed_samples, micro_batch_size=micro_batch_size, - data_parallel_rank=parallel_state.get_data_parallel_rank(), - data_parallel_size=parallel_state.get_data_parallel_world_size(), + data_parallel_rank=rank, + data_parallel_size=world_size, drop_last=drop_last, # data_sharding=data_sharding ) @@ -207,8 +206,8 @@ def add_megatron_sampler( consumed_samples=consumed_samples, micro_batch_size=micro_batch_size, global_batch_size=global_batch_size, - data_parallel_rank=parallel_state.get_data_parallel_rank(), - data_parallel_size=parallel_state.get_data_parallel_world_size(), + data_parallel_rank=rank, + data_parallel_size=world_size, drop_last=drop_last, pad_samples_to_global_batch_size=not drop_last, ) diff --git a/nemo/lightning/pytorch/optim/__init__.py b/nemo/lightning/pytorch/optim/__init__.py index 1572e95e136a..db40e5c48c1b 100644 --- a/nemo/lightning/pytorch/optim/__init__.py +++ b/nemo/lightning/pytorch/optim/__init__.py @@ -28,6 +28,7 @@ WarmupPolicyScheduler, ) from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule +from nemo.lightning.pytorch.optim.pytorch import PytorchOptimizerModule __all__ = [ "OptimizerModule", @@ -45,4 +46,5 @@ "PolynomialDecayAnnealingScheduler", "PolynomialHoldDecayAnnealingScheduler", "CosineAnnealingScheduler", + "PytorchOptimizerModule", ] diff --git a/nemo/lightning/pytorch/optim/pytorch.py b/nemo/lightning/pytorch/optim/pytorch.py new file mode 100644 index 000000000000..6600fc0cf0a4 --- /dev/null +++ b/nemo/lightning/pytorch/optim/pytorch.py @@ -0,0 +1,132 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, List, Optional + +import pytorch_lightning as pl +from torch.optim import Optimizer + +from nemo.lightning.megatron_parallel import MegatronParallel +from nemo.lightning.pytorch.optim.base import LRSchedulerModule, OptimizerModule + + +def _param_does_not_have_wd(param_name, param): + return 'bias' in param_name + + +class PytorchOptimizerModule(OptimizerModule): + """A OptimizerModule for pytorch optimizers. + + Attributes: + config (OptimizerConfig): Configuration for the optimizer. + no_weight_decay_cond (Optional[Callable]): Condition for no weight decay. + scale_lr_cond (Optional[Callable]): Condition for scaling learning rate. 
lr_mult (float): Learning rate multiplier. + + Example:: + + config = OptimizerConfig(...) + lr_scheduler = MyLRSchedulerModule(...) + optimizer_module = PytorchOptimizerModule(config, lr_scheduler) + + Methods: + setup(model): Sets up the optimizer. + optimizers(model): Defines the optimizers. + """ + + def __init__( + self, + optim_cls, + config: dict = {'lr': 3e-4}, + lr_scheduler: Optional[LRSchedulerModule] = None, + no_weight_decay_cond: Optional[Callable] = _param_does_not_have_wd, + scale_lr_cond: Optional[Callable] = None, + lr_mult: float = 1.0, + ): + """Initializes the PytorchOptimizerModule. + + Args: + config (OptimizerConfig): Configuration for the optimizer. + lr_scheduler (Optional[LRSchedulerModule]): The learning rate scheduler module. + no_weight_decay_cond (Optional[Callable]): Condition for no weight decay. + scale_lr_cond (Optional[Callable]): Condition for scaling learning rate. + lr_mult (float): Learning rate multiplier. + """ + + super().__init__(lr_scheduler=lr_scheduler) + self.optim_cls = optim_cls + self.config = config + self.no_weight_decay_cond = no_weight_decay_cond + self.scale_lr_cond = scale_lr_cond + self.lr_mult = lr_mult + self.optim_cls = optim_cls + + def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule"): + # Noop + pass + + def optimizers(self, model) -> List[Optimizer]: + """Defines the optimizers. + + Args: + model (nn.Module): The model for which the optimizers are being defined. + + Returns: + List[Optimizer]: The list of optimizers. + + Raises: + ValueError: If the model is an instance of MegatronParallel. + """ + + if isinstance(model, MegatronParallel): + raise ValueError("Model cannot be an instance of MegatronParallel") + + params_with_wd, params_without_wd = [], [] + if self.no_weight_decay_cond is not None: + for name, param in model.named_parameters(): + if self.no_weight_decay_cond(name, param): + params_without_wd.append(param) + else: + params_with_wd.append(param) + else: + params_with_wd = list(model.parameters()) + + optimizers = [] + if len(params_with_wd) > 0: + optimizers.append( + self.optim_cls( + params_with_wd, + **self.config, + ) + ) + + if len(params_without_wd) > 0: + kwargs = dict(self.config) + kwargs['weight_decay'] = 0 + optimizers.append( + self.optim_cls( + params_without_wd, + **kwargs, + ) + ) + # kwargs is a copy, so self.config keeps its original weight_decay value + + assert len(optimizers) > 0, "Expected at least one optimizer with params" + return optimizers + + def finalize_model_grads(self, *args, **kwargs): + # Noop + pass diff --git a/nemo/lightning/pytorch/plugins/data_sampler.py b/nemo/lightning/pytorch/plugins/data_sampler.py index 55bafce5f71e..52ba9e3220ac 100644 --- a/nemo/lightning/pytorch/plugins/data_sampler.py +++ b/nemo/lightning/pytorch/plugins/data_sampler.py @@ -65,9 +65,14 @@ def setup(self, global_rank: int) -> None: setup_microbatch_calculator(global_rank, self.micro_batch_size, self.global_batch_size, self.rampup_batch_size) def transform_dataloader(self, dataloader: DataLoader, consumed_samples: int = 0) -> DataLoader: + from megatron.core import parallel_state + from nemo.lightning.data import add_megatron_sampler mode = getattr(dataloader, 'mode', 'train') + + data_parallel_rank = parallel_state.get_data_parallel_rank() + data_parallel_size = parallel_state.get_data_parallel_world_size() return add_megatron_sampler( dataloader, micro_batch_size=self.micro_batch_size, @@ -76,6 +81,8 @@ def transform_dataloader(self, dataloader: DataLoader,
consumed_samples: int = 0 consumed_samples=self.init_consumed_samples if mode == 'train' else 0, dataloader_type=self.dataloader_type, drop_last=self.drop_last, + rank=data_parallel_rank, + world_size=data_parallel_size, ) def compute_consumed_samples(self, steps_since_resume=0) -> int: diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py index c5195511c522..b045804044ec 100644 --- a/nemo/lightning/pytorch/strategies/megatron_strategy.py +++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py @@ -267,6 +267,8 @@ def __init__( def connect(self, model: pl.LightningModule) -> None: super().connect(model) + assert not hasattr(model, 'is_hf_model'), "Cannot use HfAutoModelForCausalLM with MegatronParallel" + _maybe_mcore_config = _strategy_lib.set_model_parallel_attributes(model, self.parallelism) if _maybe_mcore_config: self._mcore_config = _maybe_mcore_config From 51e9b7cb543773c26f02233a696551acc7aae727 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 23 Oct 2024 11:20:32 +0200 Subject: [PATCH 08/12] ci: Update tests (#10987) * ci: Re-enable `L0_Unit_Tests_GPU_Lightning` Signed-off-by: Oliver Koenig * ci: Disable `L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2` Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig --- .github/workflows/cicd-main.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 345482e9a1a8..55a952c21eb6 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -217,15 +217,14 @@ jobs: SCRIPT: | NEMO_NUMBA_MINVER=0.53 pytest tests/hydra -m "not pleasefixme" --with_downloads - OPTIONAL_L0_Unit_Tests_GPU_Lightning: + L0_Unit_Tests_GPU_Lightning: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_Lightning') || needs.cicd-test-container-setup.outputs.all == 'true' + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Lightning') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | NEMO_NUMBA_MINVER=0.53 pytest tests/lightning -m "not pleasefixme" --with_downloads - IS_OPTIONAL: true L0_Unit_Tests_GPU_Others: needs: [cicd-test-container-setup] @@ -2468,10 +2467,10 @@ jobs: rm -rf examples/nlp/language_modeling/gpt_pretrain_results rm -rf examples/nlp/language_modeling/gpt_index_mappings - L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2: + Optional_L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2') || needs.cicd-test-container-setup.outputs.all == 'true' + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'Optional_L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-2-h100 SCRIPT: | @@ -2578,6 +2577,7 @@ jobs: AFTER_SCRIPT: | rm -rf examples/nlp/language_modeling/gpt_pretrain_results rm -rf examples/nlp/language_modeling/gpt_index_mappings + IS_OPTIONAL: true OPTIONAL_L2_Megatron_GPT_Auto_Configurator_TP1_PP1_MBS124: needs: [cicd-test-container-setup] @@ -4323,7 +4323,7 @@ 
jobs: - L0_Unit_Tests_GPU_TTS #- OPTIONAL_L0_Unit_Tests_GPU_Core - L0_Unit_Tests_GPU_Hydra - #- OPTIONAL_L0_Unit_Tests_GPU_Lightning + - L0_Unit_Tests_GPU_Lightning - L0_Unit_Tests_GPU_Others - L0_Unit_Tests_CPU_ASR @@ -4390,7 +4390,7 @@ jobs: - L2_Megatron_GPT_with_Drop_Optimizer_States_TP2 - L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2 + # - Optional_L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2 #- OPTIONAL_L2_Megatron_GPT_Auto_Configurator_TP1_PP1_MBS124 - L2_Megatron_GPT_Finetuning_PP2 - L2_Megatron_GPT_Finetuning_StarCoder_PP1 From 05f75862169b1dc7f2641c54ebe5ab3f6f8451cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 23 Oct 2024 11:20:54 +0200 Subject: [PATCH 09/12] =?UTF-8?q?[=F0=9F=A4=A0]:=20Howdy=20folks,=20let's?= =?UTF-8?q?=20bump=20`Dockerfile.ci`=20to=20425cdd4=20!=20(#11001)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> --- Dockerfile.ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.ci b/Dockerfile.ci index 09ffe9674e5d..6ef99a35ae82 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -53,7 +53,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG MODELOPT_VERSION=0.17.0 -ARG MCORE_TAG=563d5d1726012e8077895b732d5bc81b6e975e8d +ARG MCORE_TAG=425cdd48d5ef5d360d8033288ff7cb0d378f535f ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c RUN \ From eeb861b89f12ab233a6b9ec5a10d83f29febf94e Mon Sep 17 00:00:00 2001 From: malay-nagda <164242706+malay-nagda@users.noreply.github.com> Date: Wed, 23 Oct 2024 20:45:02 +0530 Subject: [PATCH 10/12] gpt3 175b cli (#10985) * gpt3 175b cli Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda * Apply isort and black reformatting Signed-off-by: malay-nagda --------- Signed-off-by: Malay Nagda Signed-off-by: malay-nagda Signed-off-by: malay-nagda <164242706+malay-nagda@users.noreply.github.com> Co-authored-by: malay-nagda --- nemo/collections/llm/recipes/__init__.py | 2 ++ tests/lightning/test_nemo_run.py | 1 + 2 files changed, 3 insertions(+) diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py index b1fc15aee07c..21994b75f60d 100644 --- a/nemo/collections/llm/recipes/__init__.py +++ b/nemo/collections/llm/recipes/__init__.py @@ -18,6 +18,7 @@ chatglm3_6b, gemma_2b, gemma_7b, + gpt3_175b, hf_auto_model_for_causal_lm, llama3_8b, llama3_8b_16k, @@ -89,6 +90,7 @@ "nemotron4_22b_16k", "nemotron4_22b_64k", "nemotron4_340b", + "gpt3_175b", "adam", "default_log", "default_resume", diff --git a/tests/lightning/test_nemo_run.py b/tests/lightning/test_nemo_run.py index 947930c84847..934eaa853bf0 100644 --- a/tests/lightning/test_nemo_run.py +++ b/tests/lightning/test_nemo_run.py @@ -36,6 +36,7 @@ ("nemotron4_22b_64k", "pretrain_recipe", "nemotron4_22b_64k_pretrain"), ("nemotron4_340b", "pretrain_recipe", "nemotron4_340b_pretrain"), ("nemotron4_340b", "finetune_recipe", "nemotron4_340b_finetune"), + ("gpt3_175b", "pretrain_recipe", "gpt3_175b_pretrain"), ], ) def test_recipes_with_nemo_run(module, recipe, name, tmpdir, monkeypatch): From 
9251d1c5b1c0b58c32dcae76b076e734467596f0 Mon Sep 17 00:00:00 2001 From: Valerie Sarge Date: Wed, 23 Oct 2024 08:19:58 -0700 Subject: [PATCH 11/12] Fix for crash with LoRA + tp_overlap_comm=false + sequence_parallel=true (#10920) * Add fusion defaults for llama2 Signed-off-by: Valerie Sarge * Alter ParallelLinearAdapter condition to account for tp_comm_overlap=false Signed-off-by: Valerie Sarge * Apply isort and black reformatting Signed-off-by: vysarge * Clean up unneeded defaults Signed-off-by: Valerie Sarge * gpt3 175b cli Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda * Apply isort and black reformatting Signed-off-by: vysarge --------- Signed-off-by: Valerie Sarge Signed-off-by: vysarge Signed-off-by: Malay Nagda Signed-off-by: malay-nagda Signed-off-by: Eric Harper Co-authored-by: vysarge Co-authored-by: Malay Nagda Co-authored-by: malay-nagda Co-authored-by: Eric Harper --- nemo/collections/llm/gpt/model/llama.py | 7 +++++++ .../modules/common/megatron/adapters/parallel_adapters.py | 6 +++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py index 0ec13a3d91e8..b48f99e061c9 100644 --- a/nemo/collections/llm/gpt/model/llama.py +++ b/nemo/collections/llm/gpt/model/llama.py @@ -50,6 +50,13 @@ class LlamaConfig(GPTConfig): attention_dropout: float = 0.0 hidden_dropout: float = 0.0 share_embeddings_and_output_weights: bool = False + # Fusions + bias_activation_fusion: bool = True + masked_softmax_fusion: bool = True + persist_layer_norm: bool = True + bias_dropout_fusion: bool = True + apply_rope_fusion: bool = True + cross_entropy_loss_fusion: bool = False @dataclass diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index a547d593d6d7..042dbb95979e 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -256,7 +256,11 @@ def __init__( te_version = packaging.version.Version(version("transformer-engine")) if te_version >= packaging.version.Version("1.5.0dev") and ( - not self.input_is_parallel and getattr(model_parallel_config, "tp_comm_overlap_disable_qkv", False) + not self.input_is_parallel + and ( + not getattr(model_parallel_config, "tp_comm_overlap", False) + or getattr(model_parallel_config, "tp_comm_overlap_disable_qkv", False) + ) ): # TE 1.5 introduces the option `return_layernorm_output_gathered`, so the all gather # in the forward method is not needed, so set self._sequence_parallel to False From ed37d19d51229ad01b6c4e43eeb0f1bd1f3216d3 Mon Sep 17 00:00:00 2001 From: Huiying Date: Wed, 23 Oct 2024 10:02:43 -0700 Subject: [PATCH 12/12] llm.generate fixes (#10983) * fix context path, disable optimizer init, add tp Signed-off-by: HuiyingLi * format Signed-off-by: HuiyingLi * address comments, require user to provide trainer Signed-off-by: HuiyingLi * minor fix Signed-off-by: HuiyingLi * minor fixes Signed-off-by: HuiyingLi --------- Signed-off-by: HuiyingLi --- nemo/collections/llm/api.py | 2 +- nemo/collections/llm/inference/base.py | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 71e006472db9..a9b3d4361f5b 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -436,7 +436,7 @@ def export_ckpt( 
def generate( path: Union[Path, str], prompts: list[str], - trainer: Optional[nl.Trainer] = None, + trainer: nl.Trainer, params_dtype: torch.dtype = torch.bfloat16, max_batch_size: int = 4, random_seed: Optional[int] = None, diff --git a/nemo/collections/llm/inference/base.py b/nemo/collections/llm/inference/base.py index 95da536fde06..0171f1c2dd5c 100644 --- a/nemo/collections/llm/inference/base.py +++ b/nemo/collections/llm/inference/base.py @@ -16,6 +16,7 @@ import nemo.lightning as nl from nemo.lightning import io +from nemo.lightning.ckpt_utils import ckpt_to_context_subdir from nemo.lightning.pytorch.strategies.megatron_strategy import MegatronStrategy from nemo.lightning.pytorch.strategies.utils import RestoreConfig @@ -44,6 +45,7 @@ def _setup_trainer_and_restore_model(path: Path, trainer: nl.Trainer, model: pl. load_optim_state=False, ) trainer.strategy.restore_config = restore_config + trainer.strategy._setup_optimizers = False trainer.ckpt_path = None trainer.strategy.connect(model) if trainer.strategy.launcher is not None: @@ -61,16 +63,22 @@ def _setup_trainer_and_restore_model(path: Path, trainer: nl.Trainer, model: pl. def setup_model_and_tokenizer( path: Path, - trainer: Optional[nl.Trainer] = None, + trainer: nl.Trainer, params_dtype: torch.dtype = torch.bfloat16, inference_batch_times_seqlen_threshold: int = 1000, ) -> tuple[MCoreGPTModel, MCoreTokenizerWrappper]: - model: io.TrainerContext = io.load_context(path=path, subpath="model") - trainer = trainer or io.load_context(path=path, subpath="trainer") + model: io.TrainerContext = io.load_context(path=ckpt_to_context_subdir(path), subpath="model") _setup_trainer_and_restore_model(path=path, trainer=trainer, model=model) # This is to get the MCore model required in GPTInferenceWrapper. - mcore_model = model.module.module.module + mcore_model = model + while mcore_model: + if type(mcore_model) is MCoreGPTModel: + break + mcore_model = getattr(mcore_model, "module", None) + if mcore_model is None or type(mcore_model) is not MCoreGPTModel: + raise ValueError("Exact McoreGPTModel instance not found in the model structure.") + inference_wrapped_model = GPTInferenceWrapper( mcore_model, InferenceWrapperConfig(