From 4be4e8f59ceb70cde23029108e53de168b15a748 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Mon, 21 Oct 2024 09:40:17 -0700 Subject: [PATCH 1/8] respect warnings' filters (#10953) * respect warnings' filters Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa --- nemo/utils/nemo_logging.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/nemo/utils/nemo_logging.py b/nemo/utils/nemo_logging.py index 95e17e5c5f6c..bcc7ad199603 100644 --- a/nemo/utils/nemo_logging.py +++ b/nemo/utils/nemo_logging.py @@ -76,7 +76,7 @@ def __init__(self, capture_warnings=True): self.rank = 0 if is_global_rank_zero() else "UNK" def _define_logger(self, capture_warnings=True): - """ Creates the logger if not already created. Called in init""" + """Creates the logger if not already created. Called in init""" # Use double-checked locking to avoid taking lock unnecessarily. if self._logger is not None: @@ -126,7 +126,7 @@ def record_factory(*args, **kwargs): self._logger.propagate = False def remove_stream_handlers(self): - """ Removes StreamHandler that log to stdout and stderr from the logger.""" + """Removes StreamHandler that log to stdout and stderr from the logger.""" if self._logger is None: raise RuntimeError("Impossible to set handlers if the Logger is not predefined") @@ -236,7 +236,7 @@ def set_verbosity(self, verbosity_level): @contextmanager def patch_stderr_handler(self, stream): - """ Sends messages that should log to stderr to stream instead. Useful for unittests """ + """Sends messages that should log to stderr to stream instead. Useful for unittests""" if self._logger is not None: try: old_stream = self._handlers["stream_stderr"].stream @@ -268,7 +268,7 @@ def patch_stderr_handler(self, stream): @contextmanager def patch_stdout_handler(self, stream): - """ Sends messages that should log to stdout to stream instead. Useful for unittests """ + """Sends messages that should log to stdout to stream instead. Useful for unittests""" if self._logger is not None: try: old_stream = self._handlers["stream_stdout"].stream @@ -339,6 +339,16 @@ def captureWarnings(self, capture): warnings.showwarning = self.old_warnings_showwarning self.old_warnings_showwarning = None + def _warning_is_ignored(self, category): + from warnings import filters + + # Search the filters + for action, msg, cat, mod, ln in filters: + # least-common demoninator if multiple filters for the same class. + if cat == category and action == 'ignore': + return True + return False + def _showwarning(self, message, category, filename, lineno, file=None, line=None): """ Implementation of showwarnings which redirects to logging. @@ -346,6 +356,8 @@ def _showwarning(self, message, category, filename, lineno, file=None, line=None with level logging.WARNING. 
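# ---------------------------------------------------------------------------
# Editor's note -- an illustrative sketch, not part of the patch. NeMo's
# logger captures Python warnings and re-emits them as WARNING log records;
# the hunks above make it honour the standard `warnings` filters: if any
# "ignore" filter is registered for a warning's category, the record is
# dropped (the "least-common denominator" when several filters name the same
# class). `NoisyWarning` below is a hypothetical class used only for the demo.
import warnings

from nemo.utils import logging  # importing NeMo installs the capturing logger

class NoisyWarning(UserWarning):
    pass

# The message-specific "always" filter sits in front of the broad "ignore"
# filter, so the first warning still reaches Logger._showwarning -- which now
# sees the "ignore" entry for NoisyWarning and silently drops it.
warnings.filterwarnings("ignore", category=NoisyWarning)
warnings.filterwarnings("always", message="^please keep", category=NoisyWarning)

warnings.warn("please keep me", NoisyWarning)    # captured, then dropped
warnings.warn("unrelated warning", UserWarning)  # no ignore filter -> logged
# ---------------------------------------------------------------------------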
""" s = warnings.formatwarning(message, category, filename, lineno, line) + if self._warning_is_ignored(category): + return self.warning("%s", s) def _logged_once(self, msg, mode): From 6607e760926b7dd3afc1bcf0050a5df408ab1950 Mon Sep 17 00:00:00 2001 From: Huy Vu <86480512+huvunvidia@users.noreply.github.com> Date: Mon, 21 Oct 2024 14:15:46 -0400 Subject: [PATCH 2/8] Update T5 tokenizer (adding additional tokens to tokenizer config) (#10972) * initial commit * restore t5_pretraining * Apply isort and black reformatting Signed-off-by: huvunvidia --------- Signed-off-by: huvunvidia Co-authored-by: Huy Vu2 Co-authored-by: huvunvidia --- .../common/tokenizers/huggingface/auto_tokenizer.py | 11 ++++++++++- nemo/collections/llm/t5/data/fine_tuning.py | 2 -- nemo/collections/llm/t5/data/pre_training.py | 4 ---- .../collections/nlp/modules/common/tokenizer_utils.py | 9 +++++++-- tests/collections/llm/megatron_t5_finetuning.py | 3 +++ tests/collections/llm/megatron_t5_pretraining.py | 3 +++ 6 files changed, 23 insertions(+), 9 deletions(-) diff --git a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py index 76dca1268c3b..439322b8e810 100644 --- a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py +++ b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py @@ -13,7 +13,7 @@ # limitations under the License. from collections import OrderedDict -from typing import Optional +from typing import List, Optional from transformers import AutoTokenizer as AUTOTOKENIZER @@ -43,6 +43,7 @@ def __init__( sep_token: Optional[str] = None, cls_token: Optional[str] = None, unk_token: Optional[str] = None, + additional_special_tokens: Optional[List] = [], use_fast: Optional[bool] = False, trust_remote_code: Optional[bool] = False, ): @@ -60,6 +61,7 @@ def __init__( sep_token: token used for separating sequences cls_token: class token. Usually equal to bos_token unk_token: token to use for unknown tokens + additional_special_tokens: list of other tokens beside standard special tokens (bos, eos, pad, etc.). For example, sentinel tokens for T5 (, , etc.) 
use_fast: whether to use fast HuggingFace tokenizer """ try: @@ -124,10 +126,17 @@ def __init__( elif self.tokenizer.cls_token is None and self.tokenizer.bos_token: special_tokens_dict["cls_token"] = self.tokenizer.bos_token + # add additional special tokens (not standard special tokens such as bos, eod, sep) + if additional_special_tokens is not None: + special_tokens_dict["additional_special_tokens"] = additional_special_tokens + new_tokens_in_vocab = [] for token in [mask_token, bos_token, eos_token, pad_token, sep_token, cls_token, unk_token]: if token is not None and token not in self.tokenizer.get_vocab(): new_tokens_in_vocab.append(token) + for token in additional_special_tokens: + if token is not None and token not in self.tokenizer.get_vocab(): + new_tokens_in_vocab.append(token) if len(new_tokens_in_vocab) > 0: """ diff --git a/nemo/collections/llm/t5/data/fine_tuning.py b/nemo/collections/llm/t5/data/fine_tuning.py index b1315f7a708a..9326dabe7b84 100644 --- a/nemo/collections/llm/t5/data/fine_tuning.py +++ b/nemo/collections/llm/t5/data/fine_tuning.py @@ -61,8 +61,6 @@ def __init__( from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer self.tokenizer = tokenizer or get_nmt_tokenizer("megatron", "BertWordPieceCase") - additional_tokens = {'additional_special_tokens': [f'' for i in range(100)]} - self.tokenizer.add_special_tokens(additional_tokens) self.memmap_workers = memmap_workers self.num_workers = num_workers diff --git a/nemo/collections/llm/t5/data/pre_training.py b/nemo/collections/llm/t5/data/pre_training.py index 2c73e0b78b11..e6f619972284 100644 --- a/nemo/collections/llm/t5/data/pre_training.py +++ b/nemo/collections/llm/t5/data/pre_training.py @@ -130,10 +130,6 @@ def __init__( # add additional tokens for T5 tokenizer from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer - self.tokenizer = tokenizer or get_nmt_tokenizer("megatron", "BertWordPieceCase") - additional_tokens = {'additional_special_tokens': [f'' for i in range(100)]} - self.tokenizer.add_special_tokens(additional_tokens) - self.data_sampler = MegatronDataSampler( seq_len=self.seq_length, micro_batch_size=micro_batch_size, diff --git a/nemo/collections/nlp/modules/common/tokenizer_utils.py b/nemo/collections/nlp/modules/common/tokenizer_utils.py index 4e6f9e15b839..dfc55a6c9065 100644 --- a/nemo/collections/nlp/modules/common/tokenizer_utils.py +++ b/nemo/collections/nlp/modules/common/tokenizer_utils.py @@ -69,7 +69,8 @@ def get_tokenizer( To see the list of all HuggingFace pretrained models, use: nemo_nlp.modules.common.get_huggingface_pretrained_lm_models_list() tokenizer_model: tokenizer model file of sentencepiece - special_tokens: dict of special tokens + special_tokens: dict of special tokens. 
+ For additional special tokens besides standard special tokens (bos, eos, pad, etc.), such as sentinel tokens for T5 (, , etc.), use key 'additional_special_tokens' vocab_file: path to vocab file use_fast: (only for HuggingFace AutoTokenizer) set to True to use fast HuggingFace tokenizer bpe_dropout: (experimental) BPE dropout tries to corrupt the standard segmentation @@ -224,7 +225,11 @@ def get_nmt_tokenizer( f'Getting Megatron tokenizer for pretrained model name: {model_name}, custom vocab file: {vocab_file}, and merges file: {merges_file}' ) return get_tokenizer( - tokenizer_name=model_name, vocab_file=vocab_file, merges_file=merges_file, chat_template=chat_template + tokenizer_name=model_name, + vocab_file=vocab_file, + merges_file=merges_file, + special_tokens=special_tokens_dict, + chat_template=chat_template, ) elif library == 'tabular': from nemo.collections.common.tokenizers.tabular_tokenizer import TabularTokenizer diff --git a/tests/collections/llm/megatron_t5_finetuning.py b/tests/collections/llm/megatron_t5_finetuning.py index a204e6797926..f54e858cfb43 100644 --- a/tests/collections/llm/megatron_t5_finetuning.py +++ b/tests/collections/llm/megatron_t5_finetuning.py @@ -35,9 +35,12 @@ def get_args(): args = get_args() + special_tokens = {} + special_tokens['additional_special_tokens'] = [f'' for i in range(100)] tokenizer = get_nmt_tokenizer( "megatron", "BertWordPieceCase", + special_tokens=special_tokens, ) data = SquadDataModule( diff --git a/tests/collections/llm/megatron_t5_pretraining.py b/tests/collections/llm/megatron_t5_pretraining.py index 5d8f55a7f26f..a5460be3d154 100644 --- a/tests/collections/llm/megatron_t5_pretraining.py +++ b/tests/collections/llm/megatron_t5_pretraining.py @@ -50,10 +50,13 @@ def get_args(): args = get_args() + special_tokens = {} + special_tokens['additional_special_tokens'] = [f'' for i in range(100)] tokenizer = get_nmt_tokenizer( "megatron", "BertWordPieceCase", vocab_file=args.vocab_path, + special_tokens=special_tokens, ) data = PreTrainingDataModule( paths=args.data_path, From b1cbd06f3c4be1f17cd447e6f054add8b46af923 Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Date: Mon, 21 Oct 2024 21:37:16 +0300 Subject: [PATCH 3/8] Alit/mamba recipe (#10935) * add some mamba recipe * add 130m * add the rest of the recipes * add tokenizer * add tokenizer * minor fix * minor fix * minor fix * minor fix * minor fix * minor fix * minor fix * minor fix * minor fix * minor fix * minor fix * add fixes to ssm for nemorun recipes * add hybrid tokenizer * updating some recipes * Apply isort and black reformatting Signed-off-by: JRD971000 * remove comments * update gbs * fix ckpt resume * fix ckpt resume * fix ckpt resume * update recipes final * Apply isort and black reformatting Signed-off-by: JRD971000 * remove redundant imports * ckpt convertor dtype fix * Apply isort and black reformatting Signed-off-by: JRD971000 --------- Signed-off-by: JRD971000 Signed-off-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Co-authored-by: JRD971000 --- nemo/collections/llm/gpt/model/ssm.py | 14 +- nemo/collections/llm/recipes/__init__.py | 14 + nemo/collections/llm/recipes/mamba2_130m.py | 321 +++++++++++++++++ nemo/collections/llm/recipes/mamba2_1_3b.py | 321 +++++++++++++++++ nemo/collections/llm/recipes/mamba2_2_7b.py | 321 +++++++++++++++++ nemo/collections/llm/recipes/mamba2_370m.py | 321 +++++++++++++++++ nemo/collections/llm/recipes/mamba2_780m.py | 321 +++++++++++++++++ 
nemo/collections/llm/recipes/mamba2_8b.py | 321 +++++++++++++++++ .../llm/recipes/mamba2_hybrid_8b.py | 323 ++++++++++++++++++ nemo/lightning/io/connector.py | 6 +- .../llm/gpt/model/megatron_ssm_finetuning.py | 1 + 11 files changed, 2281 insertions(+), 3 deletions(-) create mode 100644 nemo/collections/llm/recipes/mamba2_130m.py create mode 100644 nemo/collections/llm/recipes/mamba2_1_3b.py create mode 100644 nemo/collections/llm/recipes/mamba2_2_7b.py create mode 100644 nemo/collections/llm/recipes/mamba2_370m.py create mode 100644 nemo/collections/llm/recipes/mamba2_780m.py create mode 100644 nemo/collections/llm/recipes/mamba2_8b.py create mode 100644 nemo/collections/llm/recipes/mamba2_hybrid_8b.py diff --git a/nemo/collections/llm/gpt/model/ssm.py b/nemo/collections/llm/gpt/model/ssm.py index 954fa8bfe9f7..c7228951fa78 100644 --- a/nemo/collections/llm/gpt/model/ssm.py +++ b/nemo/collections/llm/gpt/model/ssm.py @@ -53,6 +53,9 @@ class SSMConfig(TransformerConfig, io.IOMixin): fp16_lm_cross_entropy: bool = False parallel_output: bool = True share_embeddings_and_output_weights: bool = False + params_dtype: torch.dtype = torch.bfloat16 + fp16: bool = False + bf16: bool = True num_layers: int = 2 mamba_ssm_ngroups: int = 8 num_attention_heads: int = 1 @@ -81,6 +84,7 @@ class SSMConfig(TransformerConfig, io.IOMixin): forward_step_fn: Callable = ssm_forward_step data_step_fn: Callable = gpt_data_step + tokenizer_model_path: str = None def configure_model(self, tokenizer) -> "MCoreMambaModel": @@ -127,9 +131,17 @@ def __init__(self, state_dict): def state_dict(self): return self._state_dict + def to(self, dtype): + for k, v in self._state_dict.items(): + if v.dtype != dtype: + logging.warning(f"Converting {k} from {v.dtype} (source model) to {dtype} (target model)") + self._state_dict[k] = v.to(dtype) + source = ModelState(source) target = self.init() - trainer = self.nemo_setup(target) + trainer = self.nemo_setup(target, ckpt_async_save=False) + source.to(self.config.params_dtype) + target.to(self.config.params_dtype) self.convert_state(source, target) self.nemo_save(output_path, trainer) diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py index 7a21633b79ec..47cc4e71448d 100644 --- a/nemo/collections/llm/recipes/__init__.py +++ b/nemo/collections/llm/recipes/__init__.py @@ -21,6 +21,13 @@ llama3_70b_16k, llama3_70b_64k, llama31_405b, + mamba2_1_3b, + mamba2_2_7b, + mamba2_8b, + mamba2_130m, + mamba2_370m, + mamba2_780m, + mamba2_hybrid_8b, mistral_7b, mistral_nemo_12b, mixtral_8x7b, @@ -49,6 +56,13 @@ "llama3_70b_16k", "llama3_70b_64k", "llama31_405b", + "mamba2_130m", + "mamba2_370m", + "mamba2_780m", + "mamba2_1_3b", + "mamba2_2_7b", + "mamba2_8b", + "mamba2_hybrid_8b", "mistral_7b", "mistral_nemo_12b", "mixtral_8x7b", diff --git a/nemo/collections/llm/recipes/mamba2_130m.py b/nemo/collections/llm/recipes/mamba2_130m.py new file mode 100644 index 000000000000..08640604a112 --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_130m.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
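# ---------------------------------------------------------------------------
# Editor's note -- an illustrative sketch, not part of the patch. The ssm.py
# hunk above normalises dtypes during checkpoint import: the wrapped source
# state dict gains a `to(dtype)` method, and both source and target are moved
# to `config.params_dtype` (bf16 by default) before convert_state() copies
# weights, so fp32 PyTorch checkpoints no longer clash with a bf16 target.
# The conversion loop in isolation, with a plain dict standing in for the
# Mamba state dict (the key names are placeholders):
import torch

state = {
    "backbone.embedding.weight": torch.randn(8, 4, dtype=torch.float32),
    "backbone.layers.0.mixer.A_log": torch.randn(4, dtype=torch.float32),
}
target_dtype = torch.bfloat16
for name, tensor in state.items():
    if tensor.dtype != target_dtype:
        print(f"Converting {name} from {tensor.dtype} (source) to {target_dtype} (target)")
        state[name] = tensor.to(target_dtype)
# ---------------------------------------------------------------------------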
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_130m" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='huggingface', + model_name="EleutherAI/gpt-neox-20b", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 130M model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 130M model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_130m ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.BaseMambaConfig130M), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 130M model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_130m ... 
+ + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 130M model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_130M + $ nemo llm pretrain --factory "mamba2_130M(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_130M_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + resume_path: str = None, + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 130M model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. 
+ + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_130m + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_130m_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.BaseMambaConfig130M(), tokenizer=tokenizer()).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.BaseMambaConfig130M()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_1_3b.py b/nemo/collections/llm/recipes/mamba2_1_3b.py new file mode 100644 index 000000000000..58eaf049b059 --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_1_3b.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
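# ---------------------------------------------------------------------------
# Editor's note -- an illustrative sketch, not part of the patch, showing how
# the mamba2_130m recipe defined above can be driven from Python with
# NeMo-Run. The executor choice and the attribute overrides are assumptions
# for the demo; the same recipe is reachable from the CLI via
# `nemo llm pretrain --factory mamba2_130m`.
import nemo_run as run

from nemo.collections.llm.recipes import mamba2_130m

recipe = mamba2_130m.pretrain_recipe(
    name="mamba2_130m_smoke_test",
    num_nodes=1,
    num_gpus_per_node=1,
)
# run.Partial objects can be tweaked attribute-by-attribute before launching.
recipe.trainer.max_steps = 100
recipe.trainer.val_check_interval = 50

run.run(recipe, executor=run.LocalExecutor())
# ---------------------------------------------------------------------------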
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_1_3b" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='huggingface', + model_name="EleutherAI/gpt-neox-20b", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 1.3B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 1.3B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_1_3B ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.BaseMambaConfig1_3B), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 1.3B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_1_3b ... 
+ + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 1.3B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_1_3b + $ nemo llm pretrain --factory "mamba2_1_3b(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_1_3b_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + resume_path: str = None, + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 1.3B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. 
+ + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_1_3b + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_1_3b_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.BaseMambaConfig1_3B(), tokenizer=tokenizer()).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.BaseMambaConfig1_3B()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_2_7b.py b/nemo/collections/llm/recipes/mamba2_2_7b.py new file mode 100644 index 000000000000..5cb37c6a02a5 --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_2_7b.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
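# ---------------------------------------------------------------------------
# Editor's note -- an illustrative sketch, not part of the patch, showing the
# fine-tuning entry point of the recipes above (mamba2_1_3b here). The
# checkpoint path is a placeholder: the PyTorch checkpoint must first be
# converted to NeMo format with import_ckpt() as described in the docstring,
# which caches it under /root/.cache/nemo/models/. Only peft_scheme='none'
# (full fine-tuning) is currently accepted by these recipes.
import nemo_run as run

from nemo.collections.llm.recipes import mamba2_1_3b

recipe = mamba2_1_3b.finetune_recipe(
    name="mamba2_1_3b_squad",
    resume_path="/root/.cache/nemo/models/your_pytorch_state_dict_file",  # placeholder
    tokenizer_model=None,  # None -> the EleutherAI/gpt-neox-20b HF tokenizer from tokenizer()
    num_nodes=1,
    num_gpus_per_node=1,
    gbs=8,
    mbs=1,
    peft_scheme="none",
)
run.run(recipe, executor=run.LocalExecutor())
# ---------------------------------------------------------------------------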
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_2_7b" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='huggingface', + model_name="EleutherAI/gpt-neox-20b", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 2.7B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 2.7B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_2_7B ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.BaseMambaConfig2_7B), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 2.7B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_2_7b ... 
+ + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 2.7B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_2_7b + $ nemo llm pretrain --factory "mamba2_2_7b(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_2_7b_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + resume_path: str = None, + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 2.7B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. 
+ + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_2_7b + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_2_7b_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.BaseMambaConfig2_7B(), tokenizer=tokenizer()).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.BaseMambaConfig2_7B()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_370m.py b/nemo/collections/llm/recipes/mamba2_370m.py new file mode 100644 index 000000000000..bb8bddc4045a --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_370m.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_370m" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='huggingface', + model_name="EleutherAI/gpt-neox-20b", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 370M model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 370M model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_370m ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.BaseMambaConfig370M), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 370M model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_370m ... 
+ + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 370M model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_370M + $ nemo llm pretrain --factory "mamba2_370M(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_370M_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + resume_path: str = None, + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 370M model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. 
+ + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_370m + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_370m_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.BaseMambaConfig370M(), tokenizer=tokenizer()).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.BaseMambaConfig370M()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_780m.py b/nemo/collections/llm/recipes/mamba2_780m.py new file mode 100644 index 000000000000..2f6ab6717ae1 --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_780m.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_780m" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='huggingface', + model_name="EleutherAI/gpt-neox-20b", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 780M model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 780M model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_780m ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.BaseMambaConfig780M), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 780M model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_780m ... 
+ + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 780M model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_780M + $ nemo llm pretrain --factory "mamba2_780M(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_780M_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + resume_path: str = None, + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 780M model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. 
+ + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_780m + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_780m_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.BaseMambaConfig780M(), tokenizer=tokenizer()).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.BaseMambaConfig780M()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_8b.py b/nemo/collections/llm/recipes/mamba2_8b.py new file mode 100644 index 000000000000..58883deba732 --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_8b.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils.exp_manager import TimingCallback + +NAME = "mamba2_8b" + + +@run.cli.factory(name=NAME) +def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + + return run.Config( + get_nmt_tokenizer, + library='megatron', + model_name="GPTSentencePieceTokenizer", + tokenizer_model=tokenizer_model, + use_fast=True, + ) + + +@run.cli.factory(name=NAME) +def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mamba2 8B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mamba2 Hybrid 8B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mamba2_8b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config( + llm.GPTModel, config=run.Config(llm.NVIDIAMambaConfig8B), tokenizer=tokenizer(tokenizer_model=tokenizer_model) + ) + + +def trainer( + tensor_parallelism: int = 8, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mamba2 Hybrid 8B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_8b ... 
+ + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 Hybrid 8B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_8b + $ nemo llm pretrain --factory "mamba2_8b(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_8b_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + resume_path, + tokenizer_model, + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 8B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. 
+ + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_8b + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_8b_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.NVIDIAMambaConfig8B(), tokenizer=tokenizer(tokenizer_model=tokenizer_model)).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.NVIDIAMambaConfig8B()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=8, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 8 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/collections/llm/recipes/mamba2_hybrid_8b.py b/nemo/collections/llm/recipes/mamba2_hybrid_8b.py new file mode 100644 index 000000000000..eff37da46fca --- /dev/null +++ b/nemo/collections/llm/recipes/mamba2_hybrid_8b.py @@ -0,0 +1,323 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Optional
+
+import nemo_run as run
+import pytorch_lightning as pl
+import torch
+from megatron.core.distributed import DistributedDataParallelConfig
+from pytorch_lightning.callbacks.callback import Callback
+
+from nemo import lightning as nl
+from nemo.collections import llm
+from nemo.collections.llm.api import finetune, pretrain
+from nemo.collections.llm.gpt.data.mock import MockDataModule
+from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
+from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
+from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed
+from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
+from nemo.utils.exp_manager import TimingCallback
+
+NAME = "mamba2_hybrid_8b"
+
+
+@run.cli.factory(name=NAME)
+def tokenizer(tokenizer_model: str = None) -> run.Config[pl.LightningModule]:
+
+    return run.Config(
+        get_nmt_tokenizer,
+        library='megatron',
+        model_name="GPTSentencePieceTokenizer",
+        tokenizer_model=tokenizer_model,
+        use_fast=True,
+    )
+
+
+@run.cli.factory(name=NAME)
+def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]:
+    """
+    Factory function to create a Mamba2 Hybrid 8B model configuration.
+
+    Returns:
+        run.Config[pl.LightningModule]: Configuration for the Mamba2 Hybrid 8B model.
+
+    Examples:
+        CLI usage:
+            $ nemo llm pretrain model=mamba2_hybrid_8b ...
+
+        Python API usage:
+            >>> model_config = model()
+            >>> print(model_config)
+    """
+    return run.Config(
+        llm.GPTModel,
+        config=run.Config(llm.NVIDIAMambaHybridConfig8B),
+        tokenizer=tokenizer(tokenizer_model=tokenizer_model),
+    )
+
+
+def trainer(
+    tensor_parallelism: int = 8,
+    pipeline_parallelism: int = 1,
+    pipeline_parallelism_type: Optional[torch.dtype] = None,
+    virtual_pipeline_parallelism: Optional[int] = None,
+    context_parallelism: int = 1,
+    sequence_parallelism: bool = False,
+    num_nodes: int = 1,
+    num_gpus_per_node: int = 8,
+    max_steps: int = 1168251,
+    callbacks: Optional[list[run.Config[Callback]]] = None,
+) -> run.Config[nl.Trainer]:
+    """
+    Configure the NeMo Lightning Trainer for Mamba2 Hybrid 8B model.
+
+    This function sets up the distributed training strategy and other training parameters.
+
+    Args:
+        tensor_parallelism (int): Degree of tensor model parallelism.
+        pipeline_parallelism (int): Degree of pipeline model parallelism.
+        pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism.
+        virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism.
+        context_parallelism (int): Degree of context parallelism.
+        sequence_parallelism (bool): Whether to use sequence parallelism.
+        num_nodes (int): Number of compute nodes to use.
+        num_gpus_per_node (int): Number of GPUs per node.
+        max_steps (int): Maximum number of training steps.
+        callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations.
+
+    Returns:
+        run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer.
+ + Examples: + CLI usage: + $ nemo llm pretrain trainer=mamba2_hybrid_8b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=False, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + tokenizer_model: str = None, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn=pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Mamba2 Hybrid 8B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mamba2_hybrid_8b + $ nemo llm pretrain --factory "mamba2_hybrid_8b(num_nodes=1, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mamba2_hybrid_8b_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config( + MockDataModule, + seq_length=4096, + global_batch_size=8, + micro_batch_size=1, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + resume_path, + tokenizer_model, + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + gbs: int = 8, + mbs: int = 1, + peft_scheme: Optional[str] = 'none', +) -> run.Partial: + """ + Create a fine-tuning recipe for Mamba2 Hybrid 8B model. 
+ + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + resume_path (str): Path to the NeMo checkpoint (refer to notes below + on how to convert a pytorch checkpoint to NeMo) + tokenizer_model (str): Path to tokenizer model (defaults to None) + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mamba2_hybrid_8b + + Python API usage: + >>> recipe = finetune_recipe(name="mamba2_hybrid_8b_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + For converting an SSM pytorch checkpoint, use the following line of python code: + + llm.GPTModel(llm.NVIDIAMambaHybridConfig8B(), tokenizer=tokenizer(tokenizer_model=tokenizer_model)).import_ckpt( + path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file", + model_config=llm.NVIDIAMambaHybridConfig8B()) + This line will cache the nemo checkpoint to following directory: + /root/.cache/nemo/models/your_pytorch_state_dict_file + + """ + nemo_resume = run.Config( + nl.AutoResume, + restore_config=run.Config(nl.RestoreConfig, path=resume_path), + ) + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=8, + pipeline_model_parallel_size=1, + gradient_as_bucket_view=True, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ckpt_async_save=False, + ) + checkpoint_callback = run.Config( + nl.ModelCheckpoint, + every_n_train_steps=10, + dirpath=dir, + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + devices=num_gpus_per_node, + limit_test_batches=10, + limit_val_batches=10, + log_every_n_steps=20, + max_steps=100, + num_nodes=num_nodes, + plugins=run.Config( + nl.MegatronMixedPrecision, + precision="bf16-mixed", + params_dtype=torch.bfloat16, + ), + callbacks=[checkpoint_callback], + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=20, + ) + recipe = run.Partial( + llm.finetune, + model=model(tokenizer_model=tokenizer_model), + trainer=trainer, + data=run.Config( + llm.SquadDataModule, + seq_length=2048, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer(tokenizer_model=tokenizer_model), + ), + log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=0, warmup_steps=50), + resume=nemo_resume, + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 8 + recipe.optim.config.lr = 5e-6 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + return recipe diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py index 38fbda42c67d..e7ba67b277f8 100644 --- a/nemo/lightning/io/connector.py +++ b/nemo/lightning/io/connector.py @@ -134,7 +134,9 @@ class ModelConnector(Connector, Generic[SourceT, TargetT]): Loads a model from the specified path, optionally using a CPU-focused strategy, and returns the model and trainer. 
""" - def nemo_setup(self, model: pl.LightningModule, trainer: Optional[pl.Trainer] = None) -> pl.Trainer: + def nemo_setup( + self, model: pl.LightningModule, trainer: Optional[pl.Trainer] = None, *args, **kwargs + ) -> pl.Trainer: """ Sets up the model and trainer using a specified strategy, preparing it for training or inference. @@ -150,7 +152,7 @@ def nemo_setup(self, model: pl.LightningModule, trainer: Optional[pl.Trainer] = _trainer = trainer or Trainer( devices=1, accelerator="cpu", - strategy=MegatronStrategy(ckpt_save_optimizer=False, always_save_context=True), + strategy=MegatronStrategy(ckpt_save_optimizer=False, always_save_context=True, *args, **kwargs), ) # Note: set trainer to fitting state to avoid the following code path. Feel free to refactor if we no longer # need to avoid this: diff --git a/tests/collections/llm/gpt/model/megatron_ssm_finetuning.py b/tests/collections/llm/gpt/model/megatron_ssm_finetuning.py index 67174974f9a3..e0b9862f23e1 100644 --- a/tests/collections/llm/gpt/model/megatron_ssm_finetuning.py +++ b/tests/collections/llm/gpt/model/megatron_ssm_finetuning.py @@ -59,6 +59,7 @@ def get_args(): strategy=nl.MegatronStrategy( ckpt_load_optimizer=False, ckpt_save_optimizer=False, + ckpt_async_save=False, tensor_model_parallel_size=1, ), plugins=nl.MegatronMixedPrecision( From b39e679ba9991269d712bd473ebbcf74520e9c20 Mon Sep 17 00:00:00 2001 From: Youngeun Kwon Date: Mon, 21 Oct 2024 14:42:24 -0700 Subject: [PATCH 4/8] Long context performance doc hot fix (#10946) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * long context perf Signed-off-by: Youngeun Kwon * update the long context perf Signed-off-by: Youngeun Kwon * Akoumparouli/mcore microbatch calculator fix (#10780) * move tests/lightning/{,_}io Signed-off-by: Alexandros Koumparoulis * add microbatch calculator context manager Signed-off-by: Alexandros Koumparoulis * use microbatch calculator context manager Signed-off-by: Alexandros Koumparoulis * add on_load_checkpoint test to ValidateModelRestoration; use ctx manager to reconfigure microbatch calculator; update save/restore path; add cleanup step at the end Signed-off-by: Alexandros Koumparoulis * remove unused var Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa Signed-off-by: Youngeun Kwon * remove 8x3b recipes (#10764) * remove 8x3b recipes Signed-off-by: Alexandros Koumparoulis * remove 8x3b from test_nemo_run Signed-off-by: Alexandros Koumparoulis * rm from __init__ Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: Youngeun Kwon * change the figure file name Signed-off-by: Youngeun Kwon * Accommodating the reviewer's comment Signed-off-by: Youngeun Kwon * update the y-axis title Signed-off-by: Youngeun Kwon * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to 3f90b98 ! 
(#10789) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> Signed-off-by: Youngeun Kwon * Add ModelOpt transformer model pruning example for Llama models, default to llama3.1-8b-base (#10294) * Add ModelOpt transformer model pruning example for Llama3 model Signed-off-by: Shengliang Xu * Apply isort and black reformatting Signed-off-by: shengliangxu Signed-off-by: Shengliang Xu * examples code is at wrong dir, move them Signed-off-by: Shengliang Xu * changes as suggested in comment remove some logging and unused config code, update example model to llama3.1 Signed-off-by: Shengliang Xu * Add pruning of hidden_size into example Signed-off-by: Shengliang Xu * Apply isort and black reformatting Signed-off-by: shengliangxu Signed-off-by: Shengliang Xu * Update examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Add pruning test to cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> * Update cicd-main.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --------- Signed-off-by: Shengliang Xu Signed-off-by: shengliangxu Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Co-authored-by: shengliangxu Co-authored-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Signed-off-by: Youngeun Kwon * Update mamba.rst after dist ckpt addition (#10800) Signed-off-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Signed-off-by: Youngeun Kwon * fix chunked infer (#10581) Signed-off-by: stevehuang52 Signed-off-by: Youngeun Kwon * fix state transform (#10728) Signed-off-by: Chen Cui Signed-off-by: Youngeun Kwon * use ckpt_to_weights_subdir in restore (#10786) * use ckpt_to_weights_subdir in restore Signed-off-by: Alexandros Koumparoulis * make ckpt_to_{weight,context}_subdir idempotent Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa Signed-off-by: Youngeun Kwon * Mixtral set seq_length=4k (#10704) * enable SP & set seq_lenght=4k Signed-off-by: Alexandros Koumparoulis * update test expected values Signed-off-by: Alexandros Koumparoulis * 8x22b 4k Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: Youngeun Kwon * Fix for crashes with tensorboard_logger=false and VP + LoRA (#10792) * Fix for crashes with tensorboard_logger=false and virtual pipeline parallel + LoRA Signed-off-by: Valerie Sarge * Apply isort and black reformatting Signed-off-by: vysarge --------- Signed-off-by: Valerie Sarge Signed-off-by: vysarge Co-authored-by: vysarge Signed-off-by: Youngeun Kwon * Disable checkpoint conversion inside AutoResume (#10645) * Disable checkpoint conversion inside AutoResume Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Update resume 
docstrings Signed-off-by: Hemil Desai * fix Signed-off-by: Hemil Desai * add default finetuning recipe and refactor llama3 8b recipe Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * address comment Signed-off-by: Chen Cui * refactor other recipes Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * remove 8x3b finetuning recipe for now because HF version not available Signed-off-by: Chen Cui * add copyright header Signed-off-by: Chen Cui * adjust unit tests based on recipe fixes Signed-off-by: Chen Cui * fix failed unit test Signed-off-by: Chen Cui --------- Signed-off-by: Hemil Desai Signed-off-by: hemildesai Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: hemildesai Co-authored-by: Chen Cui Co-authored-by: cuichenx Signed-off-by: Youngeun Kwon * replace png file to github assets Signed-off-by: Youngeun Kwon * change image url to github release Signed-off-by: Youngeun Kwon * hot fix on table style Signed-off-by: Youngeun Kwon --------- Signed-off-by: Youngeun Kwon Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Signed-off-by: Shengliang Xu Signed-off-by: shengliangxu Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Signed-off-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Signed-off-by: stevehuang52 Signed-off-by: Chen Cui Signed-off-by: Valerie Sarge Signed-off-by: vysarge Signed-off-by: Hemil Desai Signed-off-by: hemildesai Signed-off-by: cuichenx Co-authored-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Co-authored-by: akoumpa Co-authored-by: oliver könig Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> Co-authored-by: Shengliang Xu <106840466+shengliangxu@users.noreply.github.com> Co-authored-by: shengliangxu Co-authored-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Co-authored-by: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Co-authored-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: Chen Cui Co-authored-by: Valerie Sarge Co-authored-by: vysarge Co-authored-by: Hemil Desai Co-authored-by: hemildesai Co-authored-by: cuichenx --- .../performance/performance_long_sequence.md | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/docs/source/performance/performance_long_sequence.md b/docs/source/performance/performance_long_sequence.md index 9dc9c6c52be3..d9f26dcf0d61 100644 --- a/docs/source/performance/performance_long_sequence.md +++ b/docs/source/performance/performance_long_sequence.md @@ -7,27 +7,6 @@ - Container: [NeMo24.03.01.framework](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags) - System: DGX-H100 - - From c7a539a6cb4e7cb59b51eab67dad627862c2c9f9 Mon Sep 17 00:00:00 2001 From: malay-nagda <164242706+malay-nagda@users.noreply.github.com> Date: Tue, 22 Oct 2024 03:20:21 +0530 Subject: [PATCH 5/8] Performance mode (#10926) * llama3 performance mode Signed-off-by: Malay Nagda * llama3 performance mode tests Signed-off-by: Malay Nagda * mixtral performance mode Signed-off-by: Malay Nagda * remove unused Signed-off-by: Malay Nagda * nemotron perf mode Signed-off-by: Malay Nagda * 405b, 174b perf mode Signed-off-by: Malay Nagda * perf mode comment Signed-off-by: Malay Nagda * Apply isort and black reformatting Signed-off-by: malay-nagda --------- Signed-off-by: Malay Nagda 
Signed-off-by: malay-nagda <164242706+malay-nagda@users.noreply.github.com> Signed-off-by: malay-nagda Co-authored-by: malay-nagda --- nemo/collections/llm/recipes/gpt3_175b.py | 52 ++++++++----------- nemo/collections/llm/recipes/llama31_405b.py | 52 ++++++++----------- nemo/collections/llm/recipes/llama3_70b.py | 51 +++++++++--------- nemo/collections/llm/recipes/llama3_8b.py | 43 ++++++--------- nemo/collections/llm/recipes/mixtral_8x22b.py | 50 ++++++++---------- nemo/collections/llm/recipes/mixtral_8x7b.py | 50 ++++++++---------- nemo/collections/llm/recipes/nemotron3_8b.py | 36 +++++-------- nemo/collections/llm/recipes/nemotron4_15b.py | 37 +++++-------- nemo/collections/llm/recipes/nemotron4_22b.py | 45 ++++++---------- .../collections/llm/recipes/nemotron4_340b.py | 45 ++++++---------- .../llm/recipes/test_llama3_70b.py | 6 +-- .../collections/llm/recipes/test_llama3_8b.py | 6 +-- 12 files changed, 193 insertions(+), 280 deletions(-) diff --git a/nemo/collections/llm/recipes/gpt3_175b.py b/nemo/collections/llm/recipes/gpt3_175b.py index 7e016154aa3e..1abe8a218e82 100644 --- a/nemo/collections/llm/recipes/gpt3_175b.py +++ b/nemo/collections/llm/recipes/gpt3_175b.py @@ -142,7 +142,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for GPT3 175B model. @@ -155,6 +160,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -172,7 +178,7 @@ def pretrain_recipe( Note: This recipe is optimized for the large 175B model and requires significant computational resources. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -186,49 +192,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 1, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for GPT3 175B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. 
- Examples: - CLI usage: - $ nemo llm pretrain --factory "gpt3_175b.pretrain_recipe_performance(num_nodes=64, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="gpt3_175b_perf", num_nodes=64) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. - # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. recipe.trainer.callbacks.append( run.Config( diff --git a/nemo/collections/llm/recipes/llama31_405b.py b/nemo/collections/llm/recipes/llama31_405b.py index 45efedc3cbd6..055e9a06fcba 100644 --- a/nemo/collections/llm/recipes/llama31_405b.py +++ b/nemo/collections/llm/recipes/llama31_405b.py @@ -144,7 +144,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Llama3.1 405B model. @@ -157,6 +162,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -174,7 +180,7 @@ def pretrain_recipe( Note: This recipe is optimized for the large 405B model and requires significant computational resources. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -188,49 +194,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 1, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Llama3.1 405B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. 
It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - CLI usage: - $ nemo llm pretrain --factory "llama31_405b.pretrain_recipe_performance(num_nodes=4, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="llama31_405b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. - # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. recipe.trainer.callbacks.append( run.Config( diff --git a/nemo/collections/llm/recipes/llama3_70b.py b/nemo/collections/llm/recipes/llama3_70b.py index ffd4a833885e..b283c68b222b 100644 --- a/nemo/collections/llm/recipes/llama3_70b.py +++ b/nemo/collections/llm/recipes/llama3_70b.py @@ -13,7 +13,7 @@ # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -142,7 +142,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 4, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Llama3 70B model. @@ -155,6 +160,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -172,7 +178,8 @@ def pretrain_recipe( Note: This recipe is optimized for the large 70B model and requires significant computational resources. 
""" - return run.Partial( + + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -186,45 +193,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 4, num_gpus_per_node: int = 8, fn=pretrain -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Llama3 70B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - CLI usage: - $ nemo llm pretrain --factory "llama3_70b.pretrain_recipe_performance(num_nodes=4, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="llama3_70b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. - # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. 
recipe.trainer.callbacks.append( run.Config( diff --git a/nemo/collections/llm/recipes/llama3_8b.py b/nemo/collections/llm/recipes/llama3_8b.py index dd162ed29914..269eb7865dcf 100644 --- a/nemo/collections/llm/recipes/llama3_8b.py +++ b/nemo/collections/llm/recipes/llama3_8b.py @@ -143,7 +143,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Llama3 8B model. @@ -156,6 +161,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -174,7 +180,7 @@ def pretrain_recipe( For more details on pre-training LLMs with NeMo, see the pre-training guide in the `examples/llm/pretrain/` directory. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -188,44 +194,29 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 1, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Llama3 8B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - $ nemo llm pretrain --factory llama3_8b_optimized - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="llama3_8b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - recipe.trainer.callbacks.append( run.Config( MegatronCommOverlapCallback, diff --git a/nemo/collections/llm/recipes/mixtral_8x22b.py b/nemo/collections/llm/recipes/mixtral_8x22b.py index f023eae01440..1bfef9be5582 100644 --- a/nemo/collections/llm/recipes/mixtral_8x22b.py +++ b/nemo/collections/llm/recipes/mixtral_8x22b.py @@ -13,7 +13,7 @@ # limitations under the License. 
-from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -145,7 +145,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 16, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 16, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Mixtral 8x22B model. @@ -158,6 +163,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -172,7 +178,7 @@ def pretrain_recipe( >>> recipe = pretrain_recipe(name="mixtral_pretrain", num_nodes=16) >>> print(recipe) """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -184,45 +190,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 8, num_gpus_per_node: int = 8, fn=pretrain -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Mixtral 8x22B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - CLI usage: - $ nemo llm pretrain --factory "mixtral_8x22b.pretrain_recipe_performance(num_nodes=8, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="mixtral_8x22b_perf", num_nodes=8) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. 
- # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. recipe.trainer.callbacks.extend( [ diff --git a/nemo/collections/llm/recipes/mixtral_8x7b.py b/nemo/collections/llm/recipes/mixtral_8x7b.py index e80be03e3217..8e39e73aab76 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b.py @@ -13,7 +13,7 @@ # limitations under the License. -from typing import Optional +from typing import Callable, Optional import nemo_run as run import pytorch_lightning as pl @@ -142,7 +142,12 @@ def trainer( @run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 8, num_gpus_per_node: int = 8, fn=pretrain + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 8, + num_gpus_per_node: int = 8, + performance_mode: bool = False, + fn: Callable = pretrain, ) -> run.Partial: """ Create a pre-training recipe for Mixtral 8x7B model. @@ -155,6 +160,7 @@ def pretrain_recipe( name (str): Name of the pre-training run. num_nodes (int): Number of compute nodes to use. num_gpus_per_node (int): Number of GPUs per node. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -169,7 +175,7 @@ def pretrain_recipe( >>> recipe = pretrain_recipe(name="mixtral_8x7b_pretrain", num_nodes=8) >>> print(recipe) """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=trainer( @@ -181,45 +187,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 8, num_gpus_per_node: int = 8, fn=pretrain -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Mixtral 8x7B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. 
- Examples: - CLI usage: - $ nemo llm pretrain --factory "mixtral_8x3b.pretrain_recipe_performance(num_nodes=8, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="mixtral_8x7b_perf", num_nodes=8) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. - # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. recipe.trainer.callbacks.extend( [ diff --git a/nemo/collections/llm/recipes/nemotron3_8b.py b/nemo/collections/llm/recipes/nemotron3_8b.py index 928f0d177947..7dcebe17f872 100644 --- a/nemo/collections/llm/recipes/nemotron3_8b.py +++ b/nemo/collections/llm/recipes/nemotron3_8b.py @@ -83,6 +83,7 @@ def pretrain_recipe( constant_steps=0, min_lr=3.0e-5, max_lr=3e-4, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -118,6 +119,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -135,7 +137,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -174,43 +176,29 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 1, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Nemotron3 8B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. 
- fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - $ nemo llm pretrain --factory nemotron3_8b_optimized - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="nemotron3_8b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) recipe.trainer.callbacks.append( run.Config( diff --git a/nemo/collections/llm/recipes/nemotron4_15b.py b/nemo/collections/llm/recipes/nemotron4_15b.py index 9f184a92d94b..16ae7b2b1e79 100644 --- a/nemo/collections/llm/recipes/nemotron4_15b.py +++ b/nemo/collections/llm/recipes/nemotron4_15b.py @@ -80,6 +80,7 @@ def pretrain_recipe( constant_steps=0, min_lr=4.5e-5, max_lr=4.5e-5, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -115,6 +116,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -132,7 +134,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -171,44 +173,29 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 8, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Nemotron4 15B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - $ nemo llm pretrain --factory nemotron4_15b_optimized - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="nemotron4_15b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. 
""" - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - recipe.trainer.callbacks.append( run.Config( MegatronCommOverlapCallback, diff --git a/nemo/collections/llm/recipes/nemotron4_22b.py b/nemo/collections/llm/recipes/nemotron4_22b.py index 4fb697c006fc..a20afedfea56 100644 --- a/nemo/collections/llm/recipes/nemotron4_22b.py +++ b/nemo/collections/llm/recipes/nemotron4_22b.py @@ -80,6 +80,7 @@ def pretrain_recipe( constant_steps=0, min_lr=1e-5, max_lr=1e-4, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -115,6 +116,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -132,7 +134,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -171,48 +173,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 8, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Nemotron4 22B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - $ nemo llm pretrain --factory nemotron4_22b_optimized - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="nemotron4_22b_perf", num_nodes=4) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. 
- # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. recipe.trainer.callbacks.append( run.Config( diff --git a/nemo/collections/llm/recipes/nemotron4_340b.py b/nemo/collections/llm/recipes/nemotron4_340b.py index cc9c7995c9e4..8268b2a87791 100644 --- a/nemo/collections/llm/recipes/nemotron4_340b.py +++ b/nemo/collections/llm/recipes/nemotron4_340b.py @@ -83,6 +83,7 @@ def pretrain_recipe( constant_steps=0, min_lr=1.0e-5, max_lr=1.0e-4, + performance_mode: bool = False, # Training function fn=pretrain, ) -> run.Partial: @@ -118,6 +119,7 @@ def pretrain_recipe( constant_steps (int): Number of constant steps. min_lr (float): Minimum learning rate. max_lr (float): Maximum learning rate. + performance_mode (bool): If true, enables optimizations for maximum performance. fn (Callable): The pre-training function to use. Returns: @@ -135,7 +137,7 @@ def pretrain_recipe( Note: This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset. """ - return run.Partial( + recipe = run.Partial( fn, model=model(), trainer=nemotron_trainer( @@ -174,48 +176,35 @@ def pretrain_recipe( resume=default_resume(), ) + if performance_mode: + recipe = pretrain_performance_optimizations(recipe) -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 16, - num_gpus_per_node: int = 8, - fn: Callable = pretrain, -) -> run.Partial: + return recipe + + +def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial: """ Create a performance-optimized pre-training recipe for Nemotron4 340B model. - This recipe enables performance optimizations that may not be suitable for all use cases. + This method enables performance optimizations that may not be suitable for all use cases. It builds upon the standard pre-training recipe and adds additional performance enhancements. Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. + recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added Returns: run.Partial: Partial configuration for performance-optimized pre-training. - Examples: - $ nemo llm pretrain --factory nemotron4_340b_optimized - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="nemotron4_340b_perf", num_nodes=16) - >>> print(recipe) - Note: - Use this recipe with caution and only when you need maximum performance. + Use this method with caution and only when you need maximum performance. It may not be suitable for all hardware configurations or use cases. 
""" - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback - # They are added here for user's knowledge - # overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step. - # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed + # 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically + # by MegatronCommOverlapCallback. They are added here for user's knowledge. + # overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step. + # align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else + # each PP stage launches independently as needed. recipe.trainer.callbacks.append( run.Config( diff --git a/tests/collections/llm/recipes/test_llama3_70b.py b/tests/collections/llm/recipes/test_llama3_70b.py index cc77ec921de7..d47b674b7b70 100644 --- a/tests/collections/llm/recipes/test_llama3_70b.py +++ b/tests/collections/llm/recipes/test_llama3_70b.py @@ -79,10 +79,8 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_pretrain_recipe_performance(self, recipe_module): - recipe = recipe_module.pretrain_recipe_performance( - name="test_perf", dir="/tmp", num_nodes=4, num_gpus_per_node=8 - ) + def test_pretrain_performance_optimizations(self, recipe_module): + recipe = recipe_module.pretrain_recipe(performance_mode=True) assert any( isinstance(cb, run.Config) and cb.__fn_or_cls__ == MegatronCommOverlapCallback for cb in recipe.trainer.callbacks diff --git a/tests/collections/llm/recipes/test_llama3_8b.py b/tests/collections/llm/recipes/test_llama3_8b.py index df4f05eec2ae..88fab6d6325a 100644 --- a/tests/collections/llm/recipes/test_llama3_8b.py +++ b/tests/collections/llm/recipes/test_llama3_8b.py @@ -90,10 +90,8 @@ def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_ assert recipe.trainer.num_nodes == num_nodes assert recipe.trainer.devices == num_gpus_per_node - def test_pretrain_recipe_performance(self, recipe_module): - recipe = recipe_module.pretrain_recipe_performance( - name="test_perf", dir="/tmp", num_nodes=1, num_gpus_per_node=8 - ) + def test_pretrain_performance_optimizations(self, recipe_module): + recipe = recipe_module.pretrain_recipe(performance_mode=True) assert any(cb.__fn_or_cls__.__name__ == "MegatronCommOverlapCallback" for cb in recipe.trainer.callbacks) def test_trainer_parallelism_options(self, recipe_module): From 47f2446a01128e783e5cc9ac8b2058a081c7474f Mon Sep 17 00:00:00 2001 From: Mingyuan Ma <111467530+Victor49152@users.noreply.github.com> Date: Mon, 21 Oct 2024 17:24:02 -0700 Subject: [PATCH 6/8] Add flux inference pipeline (#10752) * Vae added and matched flux checkpoint Signed-off-by: mingyuanm * Flux model added. 
Signed-off-by: mingyuanm * Copying FlowMatchEulerScheduler over Signed-off-by: mingyuanm * WIP: Start to test the pipeline forward pass Signed-off-by: mingyuanm * Vae added and matched flux checkpoint Signed-off-by: mingyuanm * Inference pipeline runs with offloading function Signed-off-by: mingyuanm * Start to test image generation Signed-off-by: mingyuanm * Decoding with VAE part has been verified. Still need to check the denoising loop. Signed-off-by: mingyuanm * The inference pipeline is verified. Signed-off-by: mingyuanm * Add arg parsers and refactoring Signed-off-by: mingyuanm * Tested on multi batch sizes and prompts. Signed-off-by: mingyuanm * Add headers Signed-off-by: mingyuanm * Apply isort and black reformatting Signed-off-by: Victor49152 * Renaming Signed-off-by: mingyuanm * Move shceduler to sampler folder Signed-off-by: mingyuanm * Merging folders. Signed-off-by: mingyuanm * Apply isort and black reformatting Signed-off-by: Victor49152 * Tested after path changing. Signed-off-by: mingyuanm * Apply isort and black reformatting Signed-off-by: Victor49152 * Move MMDIT block to NeMo Signed-off-by: mingyuanm * Apply isort and black reformatting Signed-off-by: Victor49152 * Add joint attention and single attention to NeMo Signed-off-by: mingyuanm * Apply isort and black reformatting Signed-off-by: Victor49152 * Joint attention updated Signed-off-by: mingyuanm * Apply isort and black reformatting Signed-off-by: Victor49152 * Remove redundant importing Signed-off-by: mingyuanm * Refactor to inherit megatron module Signed-off-by: mingyuanm * Apply isort and black reformatting Signed-off-by: Victor49152 --------- Signed-off-by: mingyuanm Signed-off-by: Victor49152 Co-authored-by: Victor49152 --- .../diffusion/encoders/__init__.py | 13 + .../diffusion/encoders/conditioner.py | 199 ++++++++ nemo/collections/diffusion/flux_infer.py | 113 +++++ .../diffusion/models/dit/dit_attention.py | 428 ++++++++++++++++++ .../diffusion/models/dit/dit_layer_spec.py | 357 ++++++++++++++- .../diffusion/models/flux/__init__.py | 13 + .../diffusion/models/flux/layers.py | 173 +++++++ .../diffusion/models/flux/model.py | 156 +++++++ .../diffusion/models/flux/pipeline.py | 342 ++++++++++++++ .../sampler/flow_matching/__init__.py | 13 + .../flow_match_euler_discrete.py | 284 ++++++++++++ nemo/collections/diffusion/utils/__init__.py | 13 + .../diffusion/utils/flux_ckpt_converter.py | 206 +++++++++ .../diffusion/utils/flux_pipeline_utils.py | 76 ++++ .../diffusion/utils/mcore_parallel_utils.py | 80 ++++ nemo/collections/diffusion/vae/autoencoder.py | 334 ++++++++++++++ nemo/collections/diffusion/vae/blocks.py | 180 ++++++++ 17 files changed, 2971 insertions(+), 9 deletions(-) create mode 100644 nemo/collections/diffusion/encoders/__init__.py create mode 100644 nemo/collections/diffusion/encoders/conditioner.py create mode 100644 nemo/collections/diffusion/flux_infer.py create mode 100644 nemo/collections/diffusion/models/dit/dit_attention.py create mode 100644 nemo/collections/diffusion/models/flux/__init__.py create mode 100644 nemo/collections/diffusion/models/flux/layers.py create mode 100644 nemo/collections/diffusion/models/flux/model.py create mode 100644 nemo/collections/diffusion/models/flux/pipeline.py create mode 100644 nemo/collections/diffusion/sampler/flow_matching/__init__.py create mode 100644 nemo/collections/diffusion/sampler/flow_matching/flow_match_euler_discrete.py create mode 100644 nemo/collections/diffusion/utils/__init__.py create mode 100644 
nemo/collections/diffusion/utils/flux_ckpt_converter.py create mode 100644 nemo/collections/diffusion/utils/flux_pipeline_utils.py create mode 100644 nemo/collections/diffusion/utils/mcore_parallel_utils.py create mode 100644 nemo/collections/diffusion/vae/autoencoder.py create mode 100644 nemo/collections/diffusion/vae/blocks.py diff --git a/nemo/collections/diffusion/encoders/__init__.py b/nemo/collections/diffusion/encoders/__init__.py new file mode 100644 index 000000000000..9e3250071955 --- /dev/null +++ b/nemo/collections/diffusion/encoders/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/diffusion/encoders/conditioner.py b/nemo/collections/diffusion/encoders/conditioner.py new file mode 100644 index 000000000000..2bfb008c5d84 --- /dev/null +++ b/nemo/collections/diffusion/encoders/conditioner.py @@ -0,0 +1,199 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Union + +import torch +import torch.nn as nn +from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5Tokenizer + + +class AbstractEmbModel(nn.Module): + def __init__(self, enable_lora_finetune=False, target_block=[], target_module=[]): + super().__init__() + self._is_trainable = None + self._ucg_rate = None + self._input_key = None + + self.TARGET_BLOCK = target_block + self.TARGET_MODULE = target_module + if enable_lora_finetune: + self.lora_layers = [] + + @property + def is_trainable(self) -> bool: + return self._is_trainable + + @property + def ucg_rate(self) -> Union[float, torch.Tensor]: + return self._ucg_rate + + @property + def input_key(self) -> str: + return self._input_key + + @is_trainable.setter + def is_trainable(self, value: bool): + self._is_trainable = value + + @ucg_rate.setter + def ucg_rate(self, value: Union[float, torch.Tensor]): + self._ucg_rate = value + + @input_key.setter + def input_key(self, value: str): + self._input_key = value + + @is_trainable.deleter + def is_trainable(self): + del self._is_trainable + + @ucg_rate.deleter + def ucg_rate(self): + del self._ucg_rate + + @input_key.deleter + def input_key(self): + del self._input_key + + def encode(self, *args, **kwargs): + raise NotImplementedError + + def _enable_lora(self, lora_model): + for module_name, module in lora_model.named_modules(): + if module.__class__.__name__ in self.TARGET_BLOCK: + tmp = {} + for sub_name, sub_module in module.named_modules(): + if sub_module.__class__.__name__ in self.TARGET_MODULE: + if hasattr(sub_module, "input_size") and hasattr( + sub_module, "output_size" + ): # for megatron ParallelLinear + lora = LoraWrapper(sub_module, sub_module.input_size, sub_module.output_size) + else: # for nn.Linear + lora = LoraWrapper(sub_module, sub_module.in_features, sub_module.out_features) + self.lora_layers.append(lora) + if sub_name not in tmp.keys(): + tmp.update({sub_name: lora}) + else: + print(f"Duplicate subnames are found in module {module_name}") + for sub_name, lora_layer in tmp.items(): + lora_name = f'{sub_name}_lora' + module.add_module(lora_name, lora_layer) + + +class FrozenCLIPEmbedder(AbstractEmbModel): + """Uses the CLIP transformer encoder for text (from Hugging Face)""" + + LAYERS = ["last", "pooled", "hidden"] + + def __init__( + self, + version="openai/clip-vit-large-patch14", + device="cuda", + max_length=77, + enable_lora_finetune=False, + layer="last", + layer_idx=None, + always_return_pooled=False, + dtype=torch.float, + ): + super().__init__(enable_lora_finetune, target_block=["CLIPAttention", "CLIPMLP"], target_module=["Linear"]) + self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") + self.transformer = CLIPTextModel.from_pretrained(version, torch_dtype=dtype).to(device) + self.device = device + self.max_length = max_length + self.freeze() + if enable_lora_finetune: + self._enable_lora(self.transformer) + print(f"CLIP transformer encoder add {len(self.lora_layers)} lora layers.") + + self.layer = layer + self.layer_idx = layer_idx + self.return_pooled = always_return_pooled + if layer == "hidden": + assert layer_idx is not None + assert 0 <= abs(layer_idx) <= 12 + + def freeze(self): + self.transformer = self.transformer.eval() + for param in self.parameters(): + param.requires_grad = False + + def forward(self, text, max_sequence_length=None): + batch_encoding = self.tokenizer( + text, + truncation=True, + max_length=max_sequence_length if max_sequence_length else self.max_length, + 
return_length=True, + return_overflowing_tokens=False, + padding="max_length", + return_tensors="pt", + ) + tokens = batch_encoding["input_ids"].to(self.transformer.device, non_blocking=True) + outputs = self.transformer(input_ids=tokens, output_hidden_states=(self.layer == "hidden")) + + if self.layer == "last": + z = outputs.last_hidden_state + elif self.layer == "pooled": + z = outputs.pooler_output[:, None, :] + else: + z = outputs.hidden_states[self.layer_idx] + + # Pad the seq length to multiple of 8 + seq_len = (z.shape[1] + 8 - 1) // 8 * 8 + z = torch.nn.functional.pad(z, (0, 0, 0, seq_len - z.shape[1]), value=0.0) + if self.return_pooled: + return z, outputs.pooler_output + return z + + def encode(self, text): + return self(text) + + +class FrozenT5Embedder(AbstractEmbModel): + def __init__( + self, + version="google/t5-v1_1-xxl", + max_length=512, + device="cuda", + dtype=torch.float, + ): + super().__init__() + self.tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-xxl", max_length=max_length) + self.transformer = T5EncoderModel.from_pretrained(version, torch_dtype=dtype).to(device) + self.max_length = max_length + self.freeze() + self.device = device + self.dtype = dtype + + def freeze(self): + self.transformer = self.transformer.eval() + for param in self.parameters(): + param.requires_grad = False + + def forward(self, text, max_sequence_length=None): + batch_encoding = self.tokenizer( + text, + truncation=True, + max_length=max_sequence_length if max_sequence_length else self.max_length, + return_length=False, + return_overflowing_tokens=False, + padding="max_length", + return_tensors="pt", + ) + + tokens = batch_encoding["input_ids"].to(self.transformer.device, non_blocking=True) + outputs = self.transformer(input_ids=tokens, output_hidden_states=None) + + return outputs.last_hidden_state diff --git a/nemo/collections/diffusion/flux_infer.py b/nemo/collections/diffusion/flux_infer.py new file mode 100644 index 000000000000..f914dbf50258 --- /dev/null +++ b/nemo/collections/diffusion/flux_infer.py @@ -0,0 +1,113 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
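A rough usage sketch for the two frozen text encoders defined in conditioner.py above; the Hugging Face checkpoint identifiers, device, and dtype are illustrative defaults rather than anything prescribed by this patch:

    # Rough usage sketch for FrozenCLIPEmbedder / FrozenT5Embedder; checkpoint names,
    # device and dtype are illustrative.
    import torch
    from nemo.collections.diffusion.encoders.conditioner import FrozenCLIPEmbedder, FrozenT5Embedder

    clip = FrozenCLIPEmbedder(
        version="openai/clip-vit-large-patch14", device="cuda",
        always_return_pooled=True, dtype=torch.bfloat16,
    )
    t5 = FrozenT5Embedder(version="google/t5-v1_1-xxl", device="cuda", dtype=torch.bfloat16)

    prompts = ["A cat holding a sign that says hello world"]
    clip_hidden, clip_pooled = clip(prompts)            # hidden states (seq padded to a multiple of 8) plus pooled output
    t5_hidden = t5(prompts, max_sequence_length=256)    # T5 last_hidden_state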
+ +import argparse + +import torch + +from nemo.collections.diffusion.models.flux.pipeline import FluxInferencePipeline +from nemo.collections.diffusion.utils.flux_pipeline_utils import configs +from nemo.collections.diffusion.utils.mcore_parallel_utils import Utils + + +def parse_args(): + parser = argparse.ArgumentParser( + description="The flux inference pipeline is utilizing megatron core transformer.\nPlease prepare the necessary checkpoints for flux model on local disk in order to use this script" + ) + + parser.add_argument("--flux_ckpt", type=str, default="", help="Path to Flux transformer checkpoint(s)") + parser.add_argument("--vae_ckpt", type=str, default="/ckpts/ae.safetensors", help="Path to \'ae.safetensors\'") + parser.add_argument( + "--clip_version", + type=str, + default='/ckpts/text_encoder', + help="Clip version, provide either ckpt dir or clip version like openai/clip-vit-large-patch14", + ) + parser.add_argument( + "--t5_version", + type=str, + default='/ckpts/text_encoder_2', + help="Clip version, provide either ckpt dir or clip version like google/t5-v1_1-xxl", + ) + parser.add_argument( + "--do_convert_from_hf", + action='store_true', + default=False, + help="Must be true if provided checkpoint is not already converted to NeMo version", + ) + parser.add_argument( + "--save_converted_model", + action="store_true", + default=False, + help="Whether to save the converted NeMo transformer checkpoint for Flux", + ) + parser.add_argument( + "--version", + type=str, + default='dev', + choices=['dev', 'schnell'], + help="Must align with the checkpoint provided.", + ) + parser.add_argument("--height", type=int, default=1024, help="Image height.") + parser.add_argument("--width", type=int, default=1024, help="Image width.") + parser.add_argument("--inference_steps", type=int, default=10, help="Number of inference steps to run.") + parser.add_argument( + "--num_images_per_prompt", type=int, default=1, help="Number of images to generate for each prompt." + ) + parser.add_argument("--guidance", type=float, default=0.0, help="Guidance scale.") + parser.add_argument( + "--offload", action='store_true', default=False, help="Offload modules to cpu after being called." 
+ ) + parser.add_argument( + "--prompts", + type=str, + default="A cat holding a sign that says hello world", + help="Inference prompts, use \',\' to separate if multiple prompts are provided.", + ) + parser.add_argument("--bf16", action='store_true', default=False, help="Use bf16 in inference.") + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_args() + print('Initializing model parallel config') + Utils.initialize_distributed(1, 1, 1) + + print('Initializing flux inference pipeline') + params = configs[args.version] + params.vae_params.ckpt = args.vae_ckpt + params.clip_params['version'] = args.clip_version + params.t5_params['version'] = args.t5_version + pipe = FluxInferencePipeline(params) + + print('Loading transformer weights') + pipe.load_from_pretrained( + args.flux_ckpt, + do_convert_from_hf=args.do_convert_from_hf, + save_converted_model=args.save_converted_model, + ) + dtype = torch.bfloat16 if args.bf16 else torch.float32 + text = args.prompts.split(',') + pipe( + text, + max_sequence_length=256, + height=args.height, + width=args.width, + num_inference_steps=args.inference_steps, + num_images_per_prompt=args.num_images_per_prompt, + offload=args.offload, + guidance_scale=args.guidance, + dtype=dtype, + ) diff --git a/nemo/collections/diffusion/models/dit/dit_attention.py b/nemo/collections/diffusion/models/dit/dit_attention.py new file mode 100644 index 000000000000..9e60b11dd1c6 --- /dev/null +++ b/nemo/collections/diffusion/models/dit/dit_attention.py @@ -0,0 +1,428 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +from dataclasses import dataclass +from typing import Union + +import torch +from megatron.core.models.common.embeddings.rotary_pos_embedding import apply_rotary_pos_emb +from megatron.core.transformer.attention import Attention, SelfAttention +from megatron.core.transformer.custom_layers.transformer_engine import SplitAlongDim +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig + + +@dataclass +class JointSelfAttentionSubmodules: + linear_qkv: Union[ModuleSpec, type] = None + added_linear_qkv: Union[ModuleSpec, type] = None + core_attention: Union[ModuleSpec, type] = None + linear_proj: Union[ModuleSpec, type] = None + q_layernorm: Union[ModuleSpec, type] = None + k_layernorm: Union[ModuleSpec, type] = None + added_q_layernorm: Union[ModuleSpec, type] = None + added_k_layernorm: Union[ModuleSpec, type] = None + + +class JointSelfAttention(Attention): + """Joint Self-attention layer class + + Used for MMDIT-like transformer block. 
+ """ + + def __init__( + self, + config: TransformerConfig, + submodules: JointSelfAttentionSubmodules, + layer_number: int, + attn_mask_type=AttnMaskType.padding, + context_pre_only: bool = False, + ): + super().__init__( + config=config, + submodules=submodules, + layer_number=layer_number, + attn_mask_type=attn_mask_type, + attention_type="self", + ) + + self.linear_qkv = build_module( + submodules.linear_qkv, + self.config.hidden_size, + self.query_projection_size + 2 * self.kv_projection_size, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=self.config.add_bias_linear or self.config.add_qkv_bias, + skip_bias_add=False, + is_expert=False, + tp_comm_buffer_name='qkv', + ) + + if submodules.added_linear_qkv is not None: + self.added_linear_qkv = build_module( + submodules.added_linear_qkv, + self.config.hidden_size, + self.query_projection_size + 2 * self.kv_projection_size, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=self.config.add_qkv_bias, + skip_bias_add=False, + is_expert=False, + tp_comm_buffer_name='qkv', + ) + + if not context_pre_only: + self.added_linear_proj = build_module( + submodules.linear_proj, + self.query_projection_size, + self.config.hidden_size, + config=self.config, + init_method=self.config.output_layer_init_method, + bias=self.config.add_bias_linear, + input_is_parallel=True, + skip_bias_add=True, + is_expert=False, + tp_comm_buffer_name='proj', + ) + + if submodules.q_layernorm is not None: + self.q_layernorm = build_module( + submodules.q_layernorm, + hidden_size=self.hidden_size_per_attention_head, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + else: + self.q_layernorm = None + + if submodules.k_layernorm is not None: + self.k_layernorm = build_module( + submodules.k_layernorm, + hidden_size=self.hidden_size_per_attention_head, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + else: + self.k_layernorm = None + + if submodules.added_q_layernorm is not None: + self.added_q_layernorm = build_module( + submodules.added_q_layernorm, + hidden_size=self.hidden_size_per_attention_head, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + else: + self.added_q_layernorm = None + + if submodules.added_k_layernorm is not None: + self.added_k_layernorm = build_module( + submodules.added_k_layernorm, + hidden_size=self.hidden_size_per_attention_head, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + else: + self.added_k_layernorm = None + + def _split_qkv(self, mixed_qkv): + # [sq, b, hp] --> [sq, b, ng, (np/ng + 2) * hn] + new_tensor_shape = mixed_qkv.size()[:-1] + ( + self.num_query_groups_per_partition, + ( + (self.num_attention_heads_per_partition // self.num_query_groups_per_partition + 2) + * self.hidden_size_per_attention_head + ), + ) + mixed_qkv = mixed_qkv.view(*new_tensor_shape) + + split_arg_list = [ + ( + self.num_attention_heads_per_partition + // self.num_query_groups_per_partition + * self.hidden_size_per_attention_head + ), + self.hidden_size_per_attention_head, + self.hidden_size_per_attention_head, + ] + + if SplitAlongDim is not None: + + # [sq, b, ng, (np/ng + 2) * hn] --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn] + (query, key, value) = SplitAlongDim( + mixed_qkv, + 3, + split_arg_list, + ) + else: + + # [sq, b, ng, (np/ng + 2) * hn] --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn] + (query, key, value) = torch.split( + mixed_qkv, + split_arg_list, + dim=3, + ) + + # [sq, b, 
ng, np/ng * hn] -> [sq, b, np, hn] + query = query.reshape(query.size(0), query.size(1), -1, self.hidden_size_per_attention_head) + return query, key, value + + def get_query_key_value_tensors(self, hidden_states, key_value_states=None): + """ + Derives `query`, `key` and `value` tensors from `hidden_states`. + """ + # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)] + mixed_qkv, _ = self.linear_qkv(hidden_states) + + query, key, value = self._split_qkv(mixed_qkv) + + if self.config.test_mode: + self.run_realtime_tests() + + if self.q_layernorm is not None: + query = self.q_layernorm(query) + + if self.k_layernorm is not None: + key = self.k_layernorm(key) + + return query, key, value + + def get_added_query_key_value_tensors(self, added_hidden_states, key_value_states=None): + """ + Derives `query`, `key` and `value` tensors from `hidden_states`. + """ + # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)] + mixed_qkv, _ = self.added_linear_qkv(added_hidden_states) + + query, key, value = self._split_qkv(mixed_qkv) + + if self.config.test_mode: + self.run_realtime_tests() + + if self.added_q_layernorm is not None: + query = self.added_q_layernorm(query) + + if self.added_k_layernorm is not None: + key = self.added_k_layernorm(key) + + return query, key, value + + def forward( + self, + hidden_states, + attention_mask, + key_value_states=None, + inference_params=None, + rotary_pos_emb=None, + packed_seq_params=None, + additional_hidden_states=None, + ): + # hidden_states: [sq, b, h] + + # For self attention we just duplicate the rotary_pos_emb if it isn't already + if rotary_pos_emb is not None and not isinstance(rotary_pos_emb, tuple): + rotary_pos_emb = (rotary_pos_emb,) * 2 + + # ===================== + # Query, Key, and Value + # ===================== + # Get the query, key and value tensors based on the type of attention - + # self or cross attn. + + query, key, value = self.get_query_key_value_tensors(hidden_states) + added_query, added_key, added_value = self.get_added_query_key_value_tensors(additional_hidden_states) + + query = torch.cat([added_query, query], dim=0) + key = torch.cat([added_key, key], dim=0) + value = torch.cat([added_value, value], dim=0) + + # =================================================== + # Adjust key, value, and rotary_pos_emb for inference + # =================================================== + key, value, rotary_pos_emb, attn_mask_type = self._adjust_key_value_for_inference( + inference_params, key, value, rotary_pos_emb + ) + + if packed_seq_params is not None: + query = query.squeeze(1) + key = key.squeeze(1) + value = value.squeeze(1) + + # ================================================ + # relative positional embedding (rotary embedding) + # ================================================ + if rotary_pos_emb is not None: + q_pos_emb, k_pos_emb = rotary_pos_emb + + if packed_seq_params is not None: + cu_seqlens_q = packed_seq_params.cu_seqlens_q + cu_seqlens_kv = packed_seq_params.cu_seqlens_kv + else: + cu_seqlens_q = cu_seqlens_kv = None + query = apply_rotary_pos_emb( + query, + q_pos_emb, + config=self.config, + cu_seqlens=cu_seqlens_q, + ) + key = apply_rotary_pos_emb( + key, + k_pos_emb, + config=self.config, + cu_seqlens=cu_seqlens_kv, + ) + + # TODO, can apply positional embedding to value_layer so it has + # absolute positional embedding. 
+ # otherwise, only relative positional embedding takes effect + # value_layer = apply_rotary_pos_emb(value_layer, k_pos_emb) + + # ================================== + # core attention computation + # ================================== + if self.checkpoint_core_attention and self.training: + core_attn_out = self._checkpointed_attention_forward( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + packed_seq_params=packed_seq_params, + ) + else: + core_attn_out = self.core_attention( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + packed_seq_params=packed_seq_params, + ) + + if packed_seq_params is not None: + # reshape to same output shape as unpacked case + # (t, np, hn) -> (t, b=1, h=np*hn) + # t is the pack size = sum (sq_i) + # note that batch is a dummy dimension in the packed case + core_attn_out = core_attn_out.reshape(core_attn_out.size(0), 1, -1) + + # ================= + # Output. [sq, b, h] + # ================= + encoder_attention_output = core_attn_out[: additional_hidden_states.shape[0], :, :] + attention_output = core_attn_out[additional_hidden_states.shape[0] :, :, :] + + output, bias = self.linear_proj(attention_output) + encoder_output, encoder_bias = self.added_linear_proj(encoder_attention_output) + + output = output + bias + encoder_output = encoder_output + encoder_bias + + return output, encoder_output + + +class FluxSingleAttention(SelfAttention): + """Self-attention layer class + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def forward( + self, + hidden_states, + attention_mask, + key_value_states=None, + inference_params=None, + rotary_pos_emb=None, + packed_seq_params=None, + ): + # hidden_states: [sq, b, h] + + # For self attention we just duplicate the rotary_pos_emb if it isn't already + if rotary_pos_emb is not None and not isinstance(rotary_pos_emb, tuple): + rotary_pos_emb = (rotary_pos_emb,) * 2 + + # ===================== + # Query, Key, and Value + # ===================== + # Get the query, key and value tensors based on the type of attention - + # self or cross attn. 
+ query, key, value = self.get_query_key_value_tensors(hidden_states, key_value_states) + # print(f'megatron q before ln: {query.transpose(0, 1).contiguous()}, {query.transpose(0, 1).contiguous().shape}') + # print(f'megatron k before ln: {key.transpose(0, 1).contiguous()}, {key.transpose(0, 1).contiguous().shape}') + # print(f'megatron v before ln: {value.transpose(0, 1).contiguous()}, {value.transpose(0, 1).contiguous().shape}') + + # =================================================== + # Adjust key, value, and rotary_pos_emb for inference + # =================================================== + key, value, rotary_pos_emb, attn_mask_type = self._adjust_key_value_for_inference( + inference_params, key, value, rotary_pos_emb + ) + + if packed_seq_params is not None: + query = query.squeeze(1) + key = key.squeeze(1) + value = value.squeeze(1) + + # ================================================ + # relative positional embedding (rotary embedding) + # ================================================ + if rotary_pos_emb is not None: + q_pos_emb, k_pos_emb = rotary_pos_emb + + if packed_seq_params is not None: + cu_seqlens_q = packed_seq_params.cu_seqlens_q + cu_seqlens_kv = packed_seq_params.cu_seqlens_kv + else: + cu_seqlens_q = cu_seqlens_kv = None + query = apply_rotary_pos_emb( + query, + q_pos_emb, + config=self.config, + cu_seqlens=cu_seqlens_q, + ) + key = apply_rotary_pos_emb( + key, + k_pos_emb, + config=self.config, + cu_seqlens=cu_seqlens_kv, + ) + + # TODO, can apply positional embedding to value_layer so it has + # absolute positional embedding. + # otherwise, only relative positional embedding takes effect + # value_layer = apply_rotary_pos_emb(value_layer, k_pos_emb) + + # ================================== + # core attention computation + # ================================== + + if self.checkpoint_core_attention and self.training: + core_attn_out = self._checkpointed_attention_forward( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + packed_seq_params=packed_seq_params, + ) + else: + core_attn_out = self.core_attention( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + packed_seq_params=packed_seq_params, + ) + + if packed_seq_params is not None: + # reshape to same output shape as unpacked case + # (t, np, hn) -> (t, b=1, h=np*hn) + # t is the pack size = sum (sq_i) + # note that batch is a dummy dimension in the packed case + core_attn_out = core_attn_out.reshape(core_attn_out.size(0), 1, -1) + + return core_attn_out diff --git a/nemo/collections/diffusion/models/dit/dit_layer_spec.py b/nemo/collections/diffusion/models/dit/dit_layer_spec.py index 672dcff3ba00..cb7c520493f0 100644 --- a/nemo/collections/diffusion/models/dit/dit_layer_spec.py +++ b/nemo/collections/diffusion/models/dit/dit_layer_spec.py @@ -42,6 +42,12 @@ from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules from megatron.core.utils import make_viewless_tensor +from nemo.collections.diffusion.models.dit.dit_attention import ( + FluxSingleAttention, + JointSelfAttention, + JointSelfAttentionSubmodules, +) + @dataclass class DiTWithAdaLNSubmodules(TransformerLayerSubmodules): @@ -75,7 +81,14 @@ class AdaLN(MegatronModule): Adaptive Layer Normalization Module for DiT. 
""" - def __init__(self, config: TransformerConfig, n_adaln_chunks=9, norm=nn.LayerNorm): + def __init__( + self, + config: TransformerConfig, + n_adaln_chunks=9, + norm=nn.LayerNorm, + modulation_bias=False, + use_second_norm=False, + ): super().__init__(config) if norm == TENorm: self.ln = norm(config, config.hidden_size, config.layernorm_epsilon) @@ -83,8 +96,11 @@ def __init__(self, config: TransformerConfig, n_adaln_chunks=9, norm=nn.LayerNor self.ln = norm(config.hidden_size, elementwise_affine=False, eps=self.config.layernorm_epsilon) self.n_adaln_chunks = n_adaln_chunks self.adaLN_modulation = nn.Sequential( - nn.SiLU(), nn.Linear(config.hidden_size, self.n_adaln_chunks * config.hidden_size, bias=False) + nn.SiLU(), nn.Linear(config.hidden_size, self.n_adaln_chunks * config.hidden_size, bias=modulation_bias) ) + self.use_second_norm = use_second_norm + if self.use_second_norm: + self.ln2 = nn.LayerNorm(config.hidden_size, elementwise_affine=False, eps=1e-6) nn.init.constant_(self.adaLN_modulation[-1].weight, 0) setattr(self.adaLN_modulation[-1].weight, "sequence_parallel", config.sequence_parallel) @@ -92,29 +108,59 @@ def __init__(self, config: TransformerConfig, n_adaln_chunks=9, norm=nn.LayerNor def forward(self, timestep_emb): return self.adaLN_modulation(timestep_emb).chunk(self.n_adaln_chunks, dim=-1) - @jit_fuser + # @jit_fuser def modulate(self, x, shift, scale): return x * (1 + scale) + shift - @jit_fuser + # @jit_fuser def scale_add(self, residual, x, gate): return residual + gate * x - @jit_fuser - def modulated_layernorm(self, x, shift, scale): + # @jit_fuser + def modulated_layernorm(self, x, shift, scale, layernorm_idx=0): + if self.use_second_norm and layernorm_idx == 1: + layernorm = self.ln2 + else: + layernorm = self.ln # Optional Input Layer norm - input_layernorm_output = self.ln(x).type_as(x) + input_layernorm_output = layernorm(x).type_as(x) # DiT block specific return self.modulate(input_layernorm_output, shift, scale) # @jit_fuser - def scaled_modulated_layernorm(self, residual, x, gate, shift, scale): + def scaled_modulated_layernorm(self, residual, x, gate, shift, scale, layernorm_idx=0): hidden_states = self.scale_add(residual, x, gate) - shifted_pre_mlp_layernorm_output = self.modulated_layernorm(hidden_states, shift, scale) + shifted_pre_mlp_layernorm_output = self.modulated_layernorm(hidden_states, shift, scale, layernorm_idx) return hidden_states, shifted_pre_mlp_layernorm_output +class AdaLNContinuous(MegatronModule): + def __init__( + self, + config: TransformerConfig, + conditioning_embedding_dim: int, + modulation_bias: bool = True, + norm_type: str = "layer_norm", + ): + super().__init__(config) + self.adaLN_modulation = nn.Sequential( + nn.SiLU(), nn.Linear(conditioning_embedding_dim, config.hidden_size * 2, bias=modulation_bias) + ) + if norm_type == "layer_norm": + self.norm = nn.LayerNorm(config.hidden_size, elementwise_affine=False, eps=1e-6, bias=modulation_bias) + elif norm_type == "rms_norm": + self.norm = RMSNorm(config.hidden_size, eps=1e-6) + else: + raise ValueError("Unknown normalization type {}".format(norm_type)) + + def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor: + emb = self.adaLN_modulation(conditioning_embedding) + scale, shift = torch.chunk(emb, 2, dim=1) + x = self.norm(x) * (1 + scale) + shift + return x + + class STDiTLayerWithAdaLN(TransformerLayer): """A single transformer layer. 
@@ -407,6 +453,225 @@ def forward( return output, context +class DiTLayer(TransformerLayer): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + + Original DiT layer implementation from [https://arxiv.org/pdf/2212.09748]. + """ + + def __init__( + self, + config: TransformerConfig, + submodules: TransformerLayerSubmodules, + layer_number: int = 1, + mlp_ratio: int = 4, + n_adaln_chunks: int = 6, + modulation_bias: bool = True, + ): + # Modify the mlp layer hidden_size of a dit layer according to mlp_ratio + config.ffn_hidden_size = int(mlp_ratio * config.hidden_size) + super().__init__(config=config, submodules=submodules, layer_number=layer_number) + + self.adaLN = AdaLN( + config=config, n_adaln_chunks=n_adaln_chunks, modulation_bias=modulation_bias, use_second_norm=True + ) + + def forward( + self, + hidden_states, + attention_mask, + context=None, + context_mask=None, + rotary_pos_emb=None, + inference_params=None, + packed_seq_params=None, + ): + # passing in conditioning information via attention mask here + c = attention_mask + + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN(c) + + shifted_input_layernorm_output = self.adaLN.modulated_layernorm( + hidden_states, shift=shift_msa, scale=scale_msa, layernorm_idx=0 + ) + + x, bias = self.self_attention(shifted_input_layernorm_output, attention_mask=None) + + hidden_states = self.adaLN.scale_add(hidden_states, x=(x + bias), gate=gate_msa) + + residual = hidden_states + + shited_pre_mlp_layernorm_output = self.adaLN.modulated_layernorm( + hidden_states, shift=shift_mlp, scale=scale_mlp, layernorm_idx=1 + ) + + x, bias = self.mlp(shited_pre_mlp_layernorm_output) + + hidden_states = self.adaLN.scale_add(residual, x=(x + bias), gate=gate_mlp) + + return hidden_states, context + + +class MMDiTLayer(TransformerLayer): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + + MMDiT layer implementation from [https://arxiv.org/pdf/2403.03206]. + """ + + def __init__( + self, + config: TransformerConfig, + submodules: TransformerLayerSubmodules, + layer_number: int = 1, + context_pre_only: bool = False, + ): + + hidden_size = config.hidden_size + super().__init__(config=config, submodules=submodules, layer_number=layer_number) + + self.adaln = AdaLN(config, modulation_bias=True, n_adaln_chunks=6, use_second_norm=True) + + self.context_pre_only = context_pre_only + context_norm_type = "ada_norm_continous" if context_pre_only else "ada_norm_zero" + + if context_norm_type == "ada_norm_continous": + self.adaln_context = AdaLNContinous(config, hidden_size, modulation_bias=True, norm_type="layer_norm") + elif context_norm_type == "ada_norm_zero": + self.adaln_context = AdaLN(config, modulation_bias=True, n_adaln_chunks=6, use_second_norm=True) + else: + raise ValueError( + f"Unknown context_norm_type: {context_norm_type}, currently only support `ada_norm_continous`, `ada_norm_zero`" + ) + # Override Cross Attention to disable CP. + # Disable TP Comm overlap as well. Not disabling will attempt re-use of buffer size same as Q and lead to incorrect tensor shapes. 
+ cp_override_config = copy.deepcopy(config) + cp_override_config.context_parallel_size = 1 + cp_override_config.tp_comm_overlap = False + + if not context_pre_only: + self.context_mlp = build_module( + submodules.mlp, + config=cp_override_config, + ) + else: + self.context_mlp = None + + def forward( + self, + hidden_states, + encoder_hidden_states, + attention_mask=None, + context=None, + context_mask=None, + rotary_pos_emb=None, + inference_params=None, + packed_seq_params=None, + emb=None, + ): + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaln(emb) + + norm_hidden_states = self.adaln.modulated_layernorm( + hidden_states, shift=shift_msa, scale=scale_msa, layernorm_idx=0 + ) + if self.context_pre_only: + norm_encoder_hidden_states = self.adaln_context(encoder_hidden_states, emb) + else: + c_shift_msa, c_scale_msa, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.adaln_context(emb) + norm_encoder_hidden_states = self.adaln_context.modulated_layernorm( + encoder_hidden_states, shift=c_shift_msa, scale=c_scale_msa, layernorm_idx=0 + ) + + attention_output, encoder_attention_output = self.self_attention( + norm_hidden_states, + attention_mask=attention_mask, + key_value_states=None, + additional_hidden_states=norm_encoder_hidden_states, + rotary_pos_emb=rotary_pos_emb, + ) + hidden_states = self.adaln.scale_add(hidden_states, x=attention_output, gate=gate_msa) + norm_hidden_states = self.adaln.modulated_layernorm( + hidden_states, shift=shift_mlp, scale=scale_mlp, layernorm_idx=1 + ) + + mlp_output, mlp_output_bias = self.mlp(norm_hidden_states) + hidden_states = self.adaln.scale_add(hidden_states, x=(mlp_output + mlp_output_bias), gate=gate_mlp) + + if self.context_pre_only: + encoder_hidden_states = None + else: + encoder_hidden_states = self.adaln_context.scale_add( + encoder_hidden_states, x=encoder_attention_output, gate=c_gate_msa + ) + norm_encoder_hidden_states = self.adaln_context.modulated_layernorm( + encoder_hidden_states, shift=c_shift_mlp, scale=c_scale_mlp, layernorm_idx=1 + ) + + context_mlp_output, context_mlp_output_bias = self.context_mlp(norm_encoder_hidden_states) + encoder_hidden_states = self.adaln.scale_add( + encoder_hidden_states, x=(context_mlp_output + context_mlp_output_bias), gate=c_gate_mlp + ) + + return hidden_states, encoder_hidden_states + + +class FluxSingleTransformerBlock(TransformerLayer): + def __init__( + self, + config: TransformerConfig, + submodules: TransformerLayerSubmodules, + layer_number: int = 1, + mlp_ratio: int = 4, + n_adaln_chunks: int = 3, + modulation_bias: bool = True, + ): + super().__init__(config=config, submodules=submodules, layer_number=layer_number) + hidden_size = config.hidden_size + self.adaln = AdaLN( + config=config, n_adaln_chunks=n_adaln_chunks, modulation_bias=modulation_bias, use_second_norm=False + ) + self.mlp_hidden_dim = int(hidden_size * mlp_ratio) + self.proj_in = nn.Linear(hidden_size, self.mlp_hidden_dim) + self.activation = nn.GELU(approximate="tanh") + self.proj_out = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size) + + def forward( + self, + hidden_states, + attention_mask=None, + context=None, + context_mask=None, + rotary_pos_emb=None, + inference_params=None, + packed_seq_params=None, + emb=None, + ): + residual = hidden_states + + shift, scale, gate = self.adaln(emb) + + norm_hidden_states = self.adaln.modulated_layernorm(hidden_states, shift=shift, scale=scale) + + mlp_hidden_states = self.activation(self.proj_in(norm_hidden_states)) + + attention_output = 
self.self_attention( + norm_hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb + ) + + hidden_states = torch.cat((attention_output, mlp_hidden_states), dim=2) + + hidden_states = self.proj_out(hidden_states) + + hidden_states = self.adaln.scale_add(residual, x=hidden_states, gate=gate) + + return hidden_states + + def get_stdit_adaln_block_with_transformer_engine_spec() -> ModuleSpec: params = {"attn_mask_type": AttnMaskType.padding} return ModuleSpec( @@ -530,3 +795,77 @@ def get_official_dit_adaln_block_with_transformer_engine_spec() -> ModuleSpec: ), ), ) + + +def get_mm_dit_block_with_transformer_engine_spec() -> ModuleSpec: + + return ModuleSpec( + module=MMDiTLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=JointSelfAttention, + params={"attn_mask_type": AttnMaskType.no_mask}, + submodules=JointSelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + added_linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), + ), + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TEColumnParallelLinear, + linear_fc2=TERowParallelLinear, + ), + ), + ), + ) + + +def get_flux_single_transformer_engine_spec() -> ModuleSpec: + return ModuleSpec( + module=FluxSingleTransformerBlock, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=FluxSingleAttention, + params={"attn_mask_type": AttnMaskType.no_mask}, + submodules=SelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + q_layernorm=RMSNorm, + k_layernorm=RMSNorm, + linear_proj=IdentityOp, + ), + ), + ), + ) + + +def get_flux_double_transformer_engine_spec() -> ModuleSpec: + return ModuleSpec( + module=MMDiTLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=JointSelfAttention, + params={"attn_mask_type": AttnMaskType.no_mask}, + submodules=JointSelfAttentionSubmodules( + q_layernorm=RMSNorm, + k_layernorm=RMSNorm, + added_q_layernorm=RMSNorm, + added_k_layernorm=RMSNorm, + linear_qkv=TEColumnParallelLinear, + added_linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), + ), + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TEColumnParallelLinear, + linear_fc2=TERowParallelLinear, + ), + ), + ), + ) diff --git a/nemo/collections/diffusion/models/flux/__init__.py b/nemo/collections/diffusion/models/flux/__init__.py new file mode 100644 index 000000000000..9e3250071955 --- /dev/null +++ b/nemo/collections/diffusion/models/flux/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
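For orientation, the adaLN-Zero pattern shared by `DiTLayer`, `MMDiTLayer`, and `FluxSingleTransformerBlock` above boils down to shift/scale/gate tensors derived from a conditioning embedding. Below is a minimal, self-contained sketch of that modulation; `TinyAdaLN` is illustrative only and is not the Megatron `AdaLN` helper used by these layers (it omits the second norm and only consumes the attention-branch chunks).

```python
import torch
from torch import nn


class TinyAdaLN(nn.Module):
    """Minimal adaLN-Zero sketch: a conditioning vector produces shift/scale/gate
    chunks that modulate a LayerNorm'ed hidden state and gate the residual add."""

    def __init__(self, hidden_size: int, n_chunks: int = 6):
        super().__init__()
        self.norm = nn.LayerNorm(hidden_size, elementwise_affine=False)
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(), nn.Linear(hidden_size, n_chunks * hidden_size, bias=True)
        )
        self.n_chunks = n_chunks

    def forward(self, x: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
        # cond: [batch, hidden]; x: [seq, batch, hidden] (Megatron's s, b, h layout).
        # Only the first three chunks (the attention branch) are used in this sketch.
        shift, scale, gate, *_ = self.adaLN_modulation(cond).chunk(self.n_chunks, dim=-1)
        modulated = self.norm(x) * (1 + scale) + shift  # modulated layernorm
        return x + gate * modulated                     # gated residual add


x, cond = torch.randn(16, 2, 64), torch.randn(2, 64)  # tokens, conditioning embedding
print(TinyAdaLN(64)(x, cond).shape)                    # torch.Size([16, 2, 64])
```

In the layers above, the first three of the six chunks modulate the self-attention branch and the last three modulate the MLP branch, with `modulated_layernorm` and `scale_add` provided by the shared `AdaLN` module.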
diff --git a/nemo/collections/diffusion/models/flux/layers.py b/nemo/collections/diffusion/models/flux/layers.py new file mode 100644 index 000000000000..222a9a1d67ae --- /dev/null +++ b/nemo/collections/diffusion/models/flux/layers.py @@ -0,0 +1,173 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import torch +from torch import Tensor, nn + + +def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor: + """ + Different from the original ROPE used for flux. + Megatron attention takes the out product and calculate sin/cos inside, so we only need to get the freqs here + in the shape of [seq, ..., dim] + """ + assert dim % 2 == 0, "The dimension must be even." + + scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim + omega = 1.0 / (theta**scale) + + out = torch.einsum("...n,d->...nd", pos, omega) + + return out.float() + + +class EmbedND(nn.Module): + def __init__(self, dim: int, theta: int, axes_dim: list[int]): + super().__init__() + self.dim = dim + self.theta = theta + self.axes_dim = axes_dim + + def forward(self, ids: torch.Tensor) -> torch.Tensor: + n_axes = ids.shape[-1] + emb = torch.cat( + [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)], + dim=-1, + ) + emb = emb.unsqueeze(1).permute(2, 0, 1, 3) + return torch.stack([emb, emb], dim=-1).reshape(*emb.shape[:-1], -1) + + +class MLPEmbedder(nn.Module): + def __init__(self, in_dim: int, hidden_dim: int): + super().__init__() + self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True) + self.silu = nn.SiLU() + self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True) + + def forward(self, x: Tensor) -> Tensor: + return self.out_layer(self.silu(self.in_layer(x))) + + +def get_timestep_embedding( + timesteps: torch.Tensor, + embedding_dim: int, + flip_sin_to_cos: bool = True, + downscale_freq_shift: float = 0, + scale: float = 1, + max_period: int = 10000, +): + """ + This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings. + + Args + timesteps (torch.Tensor): + a 1-D Tensor of N indices, one per batch element. These may be fractional. + embedding_dim (int): + the dimension of the output. + flip_sin_to_cos (bool): + Whether the embedding order should be `cos, sin` (if True) or `sin, cos` (if False) + downscale_freq_shift (float): + Controls the delta between frequencies between dimensions + scale (float): + Scaling factor applied to the embeddings. + max_period (int): + Controls the maximum frequency of the embeddings + Returns + torch.Tensor: an [N x dim] Tensor of positional embeddings. 
+ """ + assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array" + + half_dim = embedding_dim // 2 + exponent = -math.log(max_period) * torch.arange( + start=0, end=half_dim, dtype=torch.float32, device=timesteps.device + ) + exponent = exponent / (half_dim - downscale_freq_shift) + + emb = torch.exp(exponent) + emb = timesteps[:, None].float() * emb[None, :] + + # scale embeddings + emb = scale * emb + + # concat sine and cosine embeddings + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1) + + # flip sine and cosine embeddings + if flip_sin_to_cos: + emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1) + + # zero pad + if embedding_dim % 2 == 1: + emb = torch.nn.functional.pad(emb, (0, 1, 0, 0)) + return emb + + +class Timesteps(nn.Module): + def __init__( + self, + embedding_dim: int, + flip_sin_to_cos: bool = True, + downscale_freq_shift: float = 0, + scale: float = 1, + max_period: int = 10000, + ): + super().__init__() + self.embedding_dim = embedding_dim + self.flip_sin_to_cos = flip_sin_to_cos + self.downscale_freq_shift = downscale_freq_shift + self.scale = scale + self.max_period = max_period + + def forward(self, timesteps: torch.Tensor) -> torch.Tensor: + t_emb = get_timestep_embedding( + timesteps, + self.embedding_dim, + flip_sin_to_cos=self.flip_sin_to_cos, + downscale_freq_shift=self.downscale_freq_shift, + scale=self.scale, + max_period=self.max_period, + ) + return t_emb + + +class TimeStepEmbedder(nn.Module): + def __init__( + self, + embedding_dim: int, + hidden_dim: int, + flip_sin_to_cos: bool = True, + downscale_freq_shift: float = 0, + scale: float = 1, + max_period: int = 10000, + ): + + super().__init__() + + self.time_proj = Timesteps( + embedding_dim=embedding_dim, + flip_sin_to_cos=flip_sin_to_cos, + downscale_freq_shift=downscale_freq_shift, + scale=scale, + max_period=max_period, + ) + self.time_embedder = MLPEmbedder(in_dim=embedding_dim, hidden_dim=hidden_dim) + + def forward(self, timesteps: torch.Tensor) -> torch.Tensor: + timesteps_proj = self.time_proj(timesteps) + timesteps_emb = self.time_embedder(timesteps_proj) + + return timesteps_emb diff --git a/nemo/collections/diffusion/models/flux/model.py b/nemo/collections/diffusion/models/flux/model.py new file mode 100644 index 000000000000..4d42c80a75a1 --- /dev/null +++ b/nemo/collections/diffusion/models/flux/model.py @@ -0,0 +1,156 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
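Before moving on to the transformer model defined below, here is a quick shape sketch for the helpers in `layers.py` above. The imports assume this patch is installed; the input values are arbitrary and only chosen to show the output shapes.

```python
import torch

# Assumes the modules added by this patch are importable.
from nemo.collections.diffusion.models.flux.layers import get_timestep_embedding, rope

# RoPE frequencies for 8 positions of a 16-dim axis: einsum("...n,d->...nd") -> [8, 8]
freqs = rope(torch.arange(8, dtype=torch.float64), dim=16, theta=10000)
print(freqs.shape)  # torch.Size([8, 8])

# Sinusoidal embedding for a batch of 4 (possibly fractional) timesteps -> [4, 256]
t_emb = get_timestep_embedding(torch.tensor([0.0, 0.5, 10.0, 999.0]), embedding_dim=256)
print(t_emb.shape)  # torch.Size([4, 256])
```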
+ +from dataclasses import dataclass +from typing import Callable + +import torch +from megatron.core.models.common.vision_module.vision_module import VisionModule +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.utils import openai_gelu +from torch import nn + +from nemo.collections.diffusion.models.dit.dit_layer_spec import ( + AdaLNContinuous, + FluxSingleTransformerBlock, + MMDiTLayer, + get_flux_double_transformer_engine_spec, + get_flux_single_transformer_engine_spec, +) +from nemo.collections.diffusion.models.flux.layers import EmbedND, MLPEmbedder, TimeStepEmbedder + + +@dataclass +class FluxParams: + num_joint_layers: int = 19 + num_single_layers: int = 38 + hidden_size: int = 3072 + num_attention_heads: int = 24 + activation_func: Callable = openai_gelu + add_qkv_bias: bool = True + ffn_hidden_size: int = 16384 + in_channels: int = 64 + context_dim: int = 4096 + model_channels: int = 256 + patch_size: int = 1 + guidance_embed: bool = False + vec_in_dim: int = 768 + + +class Flux(VisionModule): + def __init__(self, config: FluxParams): + + self.out_channels = config.in_channels + self.hidden_size = config.hidden_size + self.num_attention_heads = config.num_attention_heads + self.patch_size = config.patch_size + self.in_channels = config.in_channels + self.guidance_embed = config.guidance_embed + transformer_config = TransformerConfig( + num_layers=1, + hidden_size=self.hidden_size, + num_attention_heads=self.num_attention_heads, + use_cpu_initialization=True, + activation_func=config.activation_func, + hidden_dropout=0, + attention_dropout=0, + layernorm_epsilon=1e-6, + add_qkv_bias=config.add_qkv_bias, + rotary_interleaved=True, + ) + super().__init__(transformer_config) + + self.pos_embed = EmbedND(dim=self.hidden_size, theta=10000, axes_dim=[16, 56, 56]) + self.img_embed = nn.Linear(config.in_channels, self.hidden_size) + self.txt_embed = nn.Linear(config.context_dim, self.hidden_size) + self.timestep_embedding = TimeStepEmbedder(config.model_channels, self.hidden_size) + self.vector_embedding = MLPEmbedder(in_dim=config.vec_in_dim, hidden_dim=self.hidden_size) + if config.guidance_embed: + self.guidance_embedding = ( + MLPEmbedder(in_dim=config.model_channels, hidden_dim=self.hidden_size) + if config.guidance_embed + else nn.Identity() + ) + + self.double_blocks = nn.ModuleList( + [ + MMDiTLayer( + config=transformer_config, + submodules=get_flux_double_transformer_engine_spec().submodules, + layer_number=i, + context_pre_only=False, + ) + for i in range(config.num_joint_layers) + ] + ) + + self.single_blocks = nn.ModuleList( + [ + FluxSingleTransformerBlock( + config=transformer_config, + submodules=get_flux_single_transformer_engine_spec().submodules, + layer_number=i, + ) + for i in range(config.num_single_layers) + ] + ) + + self.norm_out = AdaLNContinuous(config=transformer_config, conditioning_embedding_dim=self.hidden_size) + self.proj_out = nn.Linear(self.hidden_size, self.patch_size * self.patch_size * self.out_channels, bias=True) + + def forward( + self, + img: torch.Tensor, + txt: torch.Tensor = None, + y: torch.Tensor = None, + timesteps: torch.LongTensor = None, + img_ids: torch.Tensor = None, + txt_ids: torch.Tensor = None, + guidance: torch.Tensor = None, + ): + hidden_states = self.img_embed(img) + encoder_hidden_states = self.txt_embed(txt) + + timesteps = timesteps.to(img.dtype) * 1000 + vec_emb = self.timestep_embedding(timesteps) + + if guidance is not None: + vec_emb = vec_emb + 
self.guidance_embedding(self.timestep_embedding.time_proj(guidance * 1000)) + vec_emb = vec_emb + self.vector_embedding(y) + + ids = torch.cat((txt_ids, img_ids), dim=1) + rotary_pos_emb = self.pos_embed(ids) + for id_block, block in enumerate(self.double_blocks): + hidden_states, encoder_hidden_states = block( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + rotary_pos_emb=rotary_pos_emb, + emb=vec_emb, + ) + + hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=0) + + for id_block, block in enumerate(self.single_blocks): + hidden_states = block( + hidden_states=hidden_states, + rotary_pos_emb=rotary_pos_emb, + emb=vec_emb, + ) + + hidden_states = hidden_states[encoder_hidden_states.shape[0] :, ...] + + hidden_states = self.norm_out(hidden_states, vec_emb) + output = self.proj_out(hidden_states) + + return output diff --git a/nemo/collections/diffusion/models/flux/pipeline.py b/nemo/collections/diffusion/models/flux/pipeline.py new file mode 100644 index 000000000000..e460f8f115bd --- /dev/null +++ b/nemo/collections/diffusion/models/flux/pipeline.py @@ -0,0 +1,342 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import List, Optional, Union + +import numpy as np +import torch +from PIL import Image +from safetensors.torch import load_file as load_safetensors +from safetensors.torch import save_file as save_safetensors +from torch import nn +from tqdm import tqdm + +from nemo.collections.diffusion.encoders.conditioner import FrozenCLIPEmbedder, FrozenT5Embedder +from nemo.collections.diffusion.models.flux.model import Flux, FluxParams +from nemo.collections.diffusion.sampler.flow_matching.flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler +from nemo.collections.diffusion.utils.flux_ckpt_converter import flux_transformer_converter +from nemo.collections.diffusion.utils.flux_pipeline_utils import FluxModelParams +from nemo.collections.diffusion.vae.autoencoder import AutoEncoder + + +class FluxInferencePipeline(nn.Module): + def __init__(self, params: FluxModelParams): + super().__init__() + self.device = params.device + params.clip_params['device'] = self.device + params.t5_params['device'] = self.device + + self.vae = AutoEncoder(params.vae_params).to(self.device).eval() + self.clip_encoder = FrozenCLIPEmbedder(**params.clip_params) + self.t5_encoder = FrozenT5Embedder(**params.t5_params) + self.transformer = Flux(params.flux_params).to(self.device).eval() + self.vae_scale_factor = 2 ** (len(self.vae.params.ch_mult)) + self.scheduler = FlowMatchEulerDiscreteScheduler(**params.scheduler_params) + self.params = params + + def load_from_pretrained(self, ckpt_path, do_convert_from_hf=True, save_converted_model=None): + if do_convert_from_hf: + ckpt = flux_transformer_converter(ckpt_path, self.transformer.config) + if save_converted_model: + save_path = os.path.join(ckpt_path, 'nemo_flux_transformer.safetensors') + save_safetensors(ckpt, 
save_path) + print(f'saving converted transformer checkpoint to {save_path}') + else: + ckpt = load_safetensors(ckpt_path) + missing, unexpected = self.transformer.load_state_dict(ckpt, strict=False) + missing = [ + k for k in missing if not k.endswith('_extra_state') + ] # These keys are mcore specific and should not affect the model performance + if len(missing) > 0: + print( + f"The folloing keys are missing during checkpoint loading, please check the ckpt provided or the image quality may be compromised.\n {missing}" + ) + print(f"Found unexepected keys: \n {unexpected}") + + def encoder_prompt( + self, + prompt: Union[str, List[str]], + num_images_per_prompt: int = 1, + prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + max_sequence_length: int = 512, + device: Optional[torch.device] = 'cuda', + dtype: Optional[torch.dtype] = torch.float, + ): + if prompt is not None: + batch_size = len(prompt) + elif prompt_embeds is not None: + batch_size = prompt_embeds.shape[0] + else: + raise ValueError("Either prompt or prompt_embeds must be provided.") + if device == 'cuda' and self.t5_encoder.device != device: + self.t5_encoder.to(device) + if prompt_embeds is None: + prompt_embeds = self.t5_encoder(prompt, max_sequence_length=max_sequence_length) + seq_len = prompt_embeds.shape[1] + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1).to(dtype=dtype) + + if device == 'cuda' and self.clip_encoder.device != device: + self.clip_encoder.to(device) + if pooled_prompt_embeds is None: + _, pooled_prompt_embeds = self.clip_encoder(prompt) + + pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1) + pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1).to(dtype=dtype) + + dtype = dtype if dtype is not None else self.t5_encoder.dtype + text_ids = torch.zeros(batch_size, prompt_embeds.shape[1], 3).to(device=device, dtype=dtype) + text_ids = text_ids.repeat(num_images_per_prompt, 1, 1) + + return prompt_embeds.transpose(0, 1), pooled_prompt_embeds, text_ids + + @staticmethod + def _prepare_latent_image_ids(batch_size: int, height: int, width: int, device: torch.device, dtype: torch.dtype): + latent_image_ids = torch.zeros(height // 2, width // 2, 3) + latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None] + latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :] + + latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape + + latent_image_ids = latent_image_ids[None, :].repeat(batch_size, 1, 1, 1) + latent_image_ids = latent_image_ids.reshape( + batch_size, latent_image_id_height * latent_image_id_width, latent_image_id_channels + ) + + return latent_image_ids.to(device=device, dtype=dtype) + + @staticmethod + def _pack_latents(latents, batch_size, num_channels_latents, height, width): + latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2) + latents = latents.permute(0, 2, 4, 1, 3, 5) + latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4) + + return latents + + @staticmethod + def _unpack_latents(latents, height, width, vae_scale_factor): + batch_size, num_patches, channels = latents.shape + + height = height // vae_scale_factor + width = width // vae_scale_factor + + latents = latents.view(batch_size, 
height, width, channels // 4, 2, 2) + latents = latents.permute(0, 3, 1, 4, 2, 5) + + latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2) + + return latents + + @staticmethod + def _calculate_shift( + image_seq_len, + base_seq_len: int = 256, + max_seq_len: int = 4096, + base_shift: float = 0.5, + max_shift: float = 1.16, + ): + m = (max_shift - base_shift) / (max_seq_len - base_seq_len) + b = base_shift - m * base_seq_len + mu = image_seq_len * m + b + return mu + + def prepare_latents( + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + device, + generator, + latents=None, + ): + height = 2 * int(height) // self.vae_scale_factor + width = 2 * int(width) // self.vae_scale_factor + + shape = (batch_size, num_channels_latents, height, width) + + if latents is not None: + latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype) + return latents.to(device=device, dtype=dtype), latent_image_ids + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + latents = FluxInferencePipeline._generate_rand_latents(shape, generator=generator, device=device, dtype=dtype) + latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width) + + latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype) + + return latents.transpose(0, 1), latent_image_ids + + @staticmethod + def _generate_rand_latents( + shape, + generator, + device, + dtype, + ): + if isinstance(generator, list): + shape = (1,) + shape[1:] + latents = [ + torch.randn(shape, generator=generator[i], device=device, dtype=dtype, layout=layout) + for i in range(batch_size) + ] + latents = torch.cat(latents, dim=0).to(device=device) + else: + latents = torch.randn(shape, generator=generator, device=device, dtype=dtype) + + return latents + + @staticmethod + def numpy_to_pil(images): + """ + Convert a numpy image or a batch of images to a PIL image. + """ + if images.ndim == 3: + images = images[None, ...] 
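+        # Map [0, 1] floats to uint8 before constructing PIL images.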
+ images = (images * 255).round().astype("uint8") + pil_images = [Image.fromarray(image) for image in images] + + return pil_images + + @staticmethod + def torch_to_numpy(images): + numpy_images = images.float().cpu().permute(0, 2, 3, 1).numpy() + return numpy_images + + @staticmethod + def denormalize(image): + return (image / 2 + 0.5).clamp(0, 1) + + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = 512, + width: Optional[int] = 512, + num_inference_steps: int = 28, + timesteps: Optional[List[int]] = None, + guidance_scale: float = 7.0, + num_images_per_prompt: Optional[int] = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + max_sequence_length: int = 512, + device: torch.device = 'cuda', + dtype: torch.dtype = torch.float32, + save_to_disk: bool = True, + offload: bool = True, + ): + assert device == 'cuda', 'Transformer blocks in Mcore must run on cuda devices' + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + prompt = [prompt] + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + elif prompt_embeds is not None and isinstance(prompt_embeds, torch.FloatTensor): + batch_size = prompt_embeds.shape[0] + else: + raise ValueError("Either prompt or prompt_embeds must be provided.") + + ## get text prompt embeddings + prompt_embeds, pooled_prompt_embeds, text_ids = self.encoder_prompt( + prompt=prompt, + prompt_embeds=prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + device=device, + dtype=dtype, + ) + if offload: + self.t5_encoder.to('cpu') + self.clip_encoder.to('cpu') + torch.cuda.empty_cache() + + ## prepare image latents + num_channels_latents = self.transformer.in_channels // 4 + latents, latent_image_ids = self.prepare_latents( + batch_size * num_images_per_prompt, num_channels_latents, height, width, dtype, device, generator, latents + ) + # prepare timesteps + sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) + image_seq_len = latents.shape[0] + + mu = FluxInferencePipeline._calculate_shift( + image_seq_len, + self.scheduler.base_image_seq_len, + self.scheduler.max_image_seq_len, + self.scheduler.base_shift, + self.scheduler.max_shift, + ) + + self.scheduler.set_timesteps(sigmas=sigmas, device=device, mu=mu) + timesteps = self.scheduler.timesteps + + if device == 'cuda' and device != self.device: + self.transformer.to(device) + with torch.no_grad(): + for i, t in tqdm(enumerate(timesteps)): + timestep = t.expand(latents.shape[1]).to(device=latents.device, dtype=latents.dtype) + if self.transformer.guidance_embed: + guidance = torch.tensor([guidance_scale], device=device).expand(latents.shape[1]) + else: + guidance = None + with torch.autocast(device_type='cuda', dtype=latents.dtype): + pred = self.transformer( + img=latents, + txt=prompt_embeds, + y=pooled_prompt_embeds, + timesteps=timestep / 1000, + img_ids=latent_image_ids, + txt_ids=text_ids, + guidance=guidance, + ) + latents = self.scheduler.step(pred, t, latents)[0] + if offload: + self.transformer.to('cpu') + torch.cuda.empty_cache() + + if output_type == "latent": + return latents.transpose(0, 1) + elif output_type == "pil": + latents = self._unpack_latents(latents.transpose(0, 1), 
height, width, self.vae_scale_factor) + latents = (latents / self.vae.params.scale_factor) + self.vae.params.shift_factor + if device == 'cuda' and device != self.device: + self.vae.to(device) + with torch.autocast(device_type='cuda', dtype=latents.dtype): + image = self.vae.decode(latents) + if offload: + self.vae.to('cpu') + torch.cuda.empty_cache() + image = FluxInferencePipeline.denormalize(image) + image = FluxInferencePipeline.torch_to_numpy(image) + image = FluxInferencePipeline.numpy_to_pil(image) + if save_to_disk: + print('Saving to disk') + assert len(image) == int(len(prompt) * num_images_per_prompt) + prompt = [p[:40] + f'_{idx}' for p in prompt for idx in range(num_images_per_prompt)] + for file_name, image in zip(prompt, image): + image.save(f'{file_name}.png') + + return image diff --git a/nemo/collections/diffusion/sampler/flow_matching/__init__.py b/nemo/collections/diffusion/sampler/flow_matching/__init__.py new file mode 100644 index 000000000000..d9155f923f18 --- /dev/null +++ b/nemo/collections/diffusion/sampler/flow_matching/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/diffusion/sampler/flow_matching/flow_match_euler_discrete.py b/nemo/collections/diffusion/sampler/flow_matching/flow_match_euler_discrete.py new file mode 100644 index 000000000000..5bde6b0d1dc1 --- /dev/null +++ b/nemo/collections/diffusion/sampler/flow_matching/flow_match_euler_discrete.py @@ -0,0 +1,284 @@ +# Copyright 2024 Stability AI, Katherine Crowson and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from abc import ABC +from typing import List, Optional, Tuple, Union + + +import numpy as np +import torch + + +class FlowMatchEulerDiscreteScheduler(ABC): + """ + Euler scheduler. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + timestep_spacing (`str`, defaults to `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + shift (`float`, defaults to 1.0): + The shift value for the timestep schedule. 
+ """ + + _compatibles = [] + order = 1 + + def __init__( + self, + num_train_timesteps: int = 1000, + shift: float = 1.0, + use_dynamic_shifting=False, + base_shift: Optional[float] = 0.5, + max_shift: Optional[float] = 1.15, + base_image_seq_len: Optional[int] = 256, + max_image_seq_len: Optional[int] = 4096, + ): + timesteps = np.linspace(1, num_train_timesteps, num_train_timesteps, dtype=np.float32)[::-1].copy() + timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32) + + sigmas = timesteps / num_train_timesteps + if not use_dynamic_shifting: + # when use_dynamic_shifting is True, we apply the timestep shifting on the fly based on the image resolution + sigmas = shift * sigmas / (1 + (shift - 1) * sigmas) + + self.timesteps = sigmas * num_train_timesteps + + self._step_index = None + self._begin_index = None + + self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication + self.sigma_min = self.sigmas[-1].item() + self.sigma_max = self.sigmas[0].item() + + self.base_shift = base_shift + self.max_shift = max_shift + self.base_image_seq_len = base_image_seq_len + self.max_image_seq_len = max_image_seq_len + self.use_dynamic_shifting = use_dynamic_shifting + self.num_train_timesteps = num_train_timesteps + self.shift = shift + + @property + def step_index(self): + """ + The index counter for current timestep. It will increase 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def scale_noise( + self, + sample: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + noise: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + """ + Forward process in flow-matching + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. 
+ """ + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to(device=sample.device, dtype=sample.dtype) + + if sample.device.type == "mps" and torch.is_floating_point(timestep): + # mps does not support float64 + schedule_timesteps = self.timesteps.to(sample.device, dtype=torch.float32) + timestep = timestep.to(sample.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(sample.device) + timestep = timestep.to(sample.device) + + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timestep] + elif self.step_index is not None: + # add_noise is called after first denoising step (for inpainting) + step_indices = [self.step_index] * timestep.shape[0] + else: + # add noise is called before first denoising step to create initial latent(img2img) + step_indices = [self.begin_index] * timestep.shape[0] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(sample.shape): + sigma = sigma.unsqueeze(-1) + + sample = sigma * noise + (1.0 - sigma) * sample + + return sample + + def _sigma_to_t(self, sigma): + return sigma * self.num_train_timesteps + + def time_shift(self, mu: float, sigma: float, t: torch.Tensor): + return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma) + + def set_timesteps( + self, + num_inference_steps: int = None, + device: Union[str, torch.device] = None, + sigmas: Optional[List[float]] = None, + mu: Optional[float] = None, + ): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + """ + + if self.use_dynamic_shifting and mu is None: + raise ValueError(" you have a pass a value for `mu` when `use_dynamic_shifting` is set to be `True`") + + if sigmas is None: + self.num_inference_steps = num_inference_steps + timesteps = np.linspace( + self._sigma_to_t(self.sigma_max), self._sigma_to_t(self.sigma_min), num_inference_steps + ) + + sigmas = timesteps / self.num_train_timesteps + + if self.use_dynamic_shifting: + sigmas = self.time_shift(mu, 1.0, sigmas) + else: + sigmas = self.shift * sigmas / (1 + (self.shift - 1) * sigmas) + sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device) + timesteps = sigmas * self.num_train_timesteps + + self.timesteps = timesteps.to(device=device) + self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)]) + + self._step_index = None + self._begin_index = None + + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() + + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + def step( + self, + model_output: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + sample: torch.FloatTensor, + s_churn: float = 0.0, + s_tmin: float = 0.0, + s_tmax: float = float("inf"), + s_noise: float = 1.0, + generator: Optional[torch.Generator] = None, + ) -> Tuple: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`float`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + s_churn (`float`): + s_tmin (`float`): + s_tmax (`float`): + s_noise (`float`, defaults to 1.0): + Scaling factor for noise added to the sample. + generator (`torch.Generator`, *optional*): + A random number generator. + + Returns: + A tuple is returned where the first element is the sample tensor. + """ + + if ( + isinstance(timestep, int) + or isinstance(timestep, torch.IntTensor) + or isinstance(timestep, torch.LongTensor) + ): + raise ValueError( + ( + "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" + " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass" + " one of the `scheduler.timesteps` as a timestep." + ), + ) + + if self.step_index is None: + self._init_step_index(timestep) + + # Upcast to avoid precision issues when computing prev_sample + sample = sample.to(torch.float32) + + sigma = self.sigmas[self.step_index] + sigma_next = self.sigmas[self.step_index + 1] + prev_sample = sample + (sigma_next - sigma) * model_output + + # Cast sample back to model compatible dtype + prev_sample = prev_sample.to(model_output.dtype) + + # upon completion increase step index by one + self._step_index += 1 + + return (prev_sample,) + + def __len__(self): + return self.num_train_timesteps diff --git a/nemo/collections/diffusion/utils/__init__.py b/nemo/collections/diffusion/utils/__init__.py new file mode 100644 index 000000000000..9e3250071955 --- /dev/null +++ b/nemo/collections/diffusion/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/diffusion/utils/flux_ckpt_converter.py b/nemo/collections/diffusion/utils/flux_ckpt_converter.py new file mode 100644 index 000000000000..444a77bfad68 --- /dev/null +++ b/nemo/collections/diffusion/utils/flux_ckpt_converter.py @@ -0,0 +1,206 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import torch +from safetensors.torch import load_file as load_safetensors + + +def _import_qkv_bias(transformer_config, qb, kb, vb): + + head_num = transformer_config.num_attention_heads + num_query_groups = transformer_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = transformer_config.hidden_size + head_num = transformer_config.num_attention_heads + head_size = hidden_size // head_num + + new_q_bias_tensor_shape = (head_num, head_size) + new_kv_bias_tensor_shape = (num_query_groups, head_size) + + qb = qb.view(*new_q_bias_tensor_shape) + kb = kb.view(*new_kv_bias_tensor_shape) + vb = vb.view(*new_kv_bias_tensor_shape) + + qkv_bias_l = [] + for i in range(num_query_groups): + qkv_bias_l.append(qb[i * heads_per_group : (i + 1) * heads_per_group, :]) + qkv_bias_l.append(kb[i : i + 1, :]) + qkv_bias_l.append(vb[i : i + 1, :]) + + qkv_bias = torch.cat(qkv_bias_l) + qkv_bias = qkv_bias.reshape([head_size * (head_num + 2 * num_query_groups)]) + + return qkv_bias + + +def _import_qkv(transformer_config, q, k, v): + + head_num = transformer_config.num_attention_heads + num_query_groups = transformer_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = transformer_config.hidden_size + head_num = transformer_config.num_attention_heads + head_size = hidden_size // head_num + + old_tensor_shape = q.size() + new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:] + new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:] + + q = q.view(*new_q_tensor_shape) + k = k.view(*new_kv_tensor_shape) + v = v.view(*new_kv_tensor_shape) + + qkv_weights_l = [] + for i in range(num_query_groups): + qkv_weights_l.append(q[i * heads_per_group : (i + 1) * heads_per_group, :, :]) + qkv_weights_l.append(k[i : i + 1, :, :]) + qkv_weights_l.append(v[i : i + 1, :, :]) + qkv_weights = torch.cat(qkv_weights_l) + assert qkv_weights.ndim == 3, qkv_weights.shape + assert qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups, qkv_weights.shape + assert qkv_weights.shape[1] == head_size, qkv_weights.shape + assert qkv_weights.shape[2] == old_tensor_shape[1], qkv_weights.shape + + qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) + + return qkv_weights + + +key_mapping = { + 'double_blocks': { + 'norm1.linear.weight': 'adaln.adaLN_modulation.1.weight', + 'norm1.linear.bias': 'adaln.adaLN_modulation.1.bias', + 'norm1_context.linear.weight': 'adaln_context.adaLN_modulation.1.weight', + 'norm1_context.linear.bias': 'adaln_context.adaLN_modulation.1.bias', + 'attn.norm_q.weight': 'self_attention.q_layernorm.weight', + 'attn.norm_k.weight': 'self_attention.k_layernorm.weight', + 'attn.norm_added_q.weight': 'self_attention.added_q_layernorm.weight', + 'attn.norm_added_k.weight': 'self_attention.added_k_layernorm.weight', + 'attn.to_out.0.weight': 'self_attention.linear_proj.weight', + 
'attn.to_out.0.bias': 'self_attention.linear_proj.bias', + 'attn.to_add_out.weight': 'self_attention.added_linear_proj.weight', + 'attn.to_add_out.bias': 'self_attention.added_linear_proj.bias', + 'ff.net.0.proj.weight': 'mlp.linear_fc1.weight', + 'ff.net.0.proj.bias': 'mlp.linear_fc1.bias', + 'ff.net.2.weight': 'mlp.linear_fc2.weight', + 'ff.net.2.bias': 'mlp.linear_fc2.bias', + 'ff_context.net.0.proj.weight': 'context_mlp.linear_fc1.weight', + 'ff_context.net.0.proj.bias': 'context_mlp.linear_fc1.bias', + 'ff_context.net.2.weight': 'context_mlp.linear_fc2.weight', + 'ff_context.net.2.bias': 'context_mlp.linear_fc2.bias', + }, + 'single_blocks': { + 'norm.linear.weight': 'adaln.adaLN_modulation.1.weight', + 'norm.linear.bias': 'adaln.adaLN_modulation.1.bias', + 'proj_mlp.weight': 'proj_in.weight', + 'proj_mlp.bias': 'proj_in.bias', + 'proj_out.weight': 'proj_out.weight', + 'proj_out.bias': 'proj_out.bias', + 'attn.norm_q.weight': 'self_attention.q_layernorm.weight', + 'attn.norm_k.weight': 'self_attention.k_layernorm.weight', + }, + 'norm_out.linear.bias': 'norm_out.adaLN_modulation.1.bias', + 'norm_out.linear.weight': 'norm_out.adaLN_modulation.1.weight', + 'proj_out.bias': 'proj_out.bias', + 'proj_out.weight': 'proj_out.weight', + 'time_text_embed.guidance_embedder.linear_1.bias': 'guidance_embedding.in_layer.bias', + 'time_text_embed.guidance_embedder.linear_1.weight': 'guidance_embedding.in_layer.weight', + 'time_text_embed.guidance_embedder.linear_2.bias': 'guidance_embedding.out_layer.bias', + 'time_text_embed.guidance_embedder.linear_2.weight': 'guidance_embedding.out_layer.weight', + 'x_embedder.bias': 'img_embed.bias', + 'x_embedder.weight': 'img_embed.weight', + 'time_text_embed.timestep_embedder.linear_1.bias': 'timestep_embedding.time_embedder.in_layer.bias', + 'time_text_embed.timestep_embedder.linear_1.weight': 'timestep_embedding.time_embedder.in_layer.weight', + 'time_text_embed.timestep_embedder.linear_2.bias': 'timestep_embedding.time_embedder.out_layer.bias', + 'time_text_embed.timestep_embedder.linear_2.weight': 'timestep_embedding.time_embedder.out_layer.weight', + 'context_embedder.bias': 'txt_embed.bias', + 'context_embedder.weight': 'txt_embed.weight', + 'time_text_embed.text_embedder.linear_1.bias': 'vector_embedding.in_layer.bias', + 'time_text_embed.text_embedder.linear_1.weight': 'vector_embedding.in_layer.weight', + 'time_text_embed.text_embedder.linear_2.bias': 'vector_embedding.out_layer.bias', + 'time_text_embed.text_embedder.linear_2.weight': 'vector_embedding.out_layer.weight', +} + + +def flux_transformer_converter(ckpt_path=None, transformer_config=None): + diffuser_state_dict = {} + if os.path.isdir(ckpt_path): + files = os.listdir(ckpt_path) + for file in files: + if file.endswith('.safetensors'): + loaded_dict = load_safetensors(os.path.join(ckpt_path, file)) + diffuser_state_dict.update(loaded_dict) + elif os.path.isfile(ckpt_path): + diffuser_state_dict = load_safetensors(ckpt_path) + else: + raise FileNotFoundError("Please provide a valid ckpt path.") + new_state_dict = {} + num_single_blocks = 0 + num_double_blocks = 0 + for key, value in diffuser_state_dict.items(): + if 'attn.to_q' in key or 'attn.to_k' in key or 'attn.to_v' in key: + continue + if 'attn.add_q_proj' in key or 'attn.add_k_proj' in key or 'attn.add_v_proj' in key: + continue + if key.startswith('transformer_blocks'): + temp = key.split('.') + idx, k = temp[1], '.'.join(temp[2:]) + num_double_blocks = max(int(idx), num_double_blocks) + new_key = '.'.join(['double_blocks', idx, 
key_mapping['double_blocks'][k]]) + elif key.startswith('single_transformer_blocks'): + temp = key.split('.') + idx, k = temp[1], '.'.join(temp[2:]) + num_single_blocks = max(int(idx), num_single_blocks) + new_key = '.'.join(['single_blocks', idx, key_mapping['single_blocks'][k]]) + else: + new_key = key_mapping[key] + new_state_dict[new_key] = value + + for i in range(num_double_blocks + 1): + new_key = f'double_blocks.{str(i)}.self_attention.linear_qkv.weight' + qk, kk, vk = [f'transformer_blocks.{str(i)}.attn.to_{n}.weight' for n in ('q', 'k', 'v')] + new_state_dict[new_key] = _import_qkv( + transformer_config, diffuser_state_dict[qk], diffuser_state_dict[kk], diffuser_state_dict[vk] + ) + new_key = f'double_blocks.{str(i)}.self_attention.linear_qkv.bias' + qk, kk, vk = [f'transformer_blocks.{str(i)}.attn.to_{n}.bias' for n in ('q', 'k', 'v')] + new_state_dict[new_key] = _import_qkv_bias( + transformer_config, diffuser_state_dict[qk], diffuser_state_dict[kk], diffuser_state_dict[vk] + ) + new_key = f'double_blocks.{str(i)}.self_attention.added_linear_qkv.weight' + qk, kk, vk = [f'transformer_blocks.{str(i)}.attn.add_{n}_proj.weight' for n in ('q', 'k', 'v')] + new_state_dict[new_key] = _import_qkv( + transformer_config, diffuser_state_dict[qk], diffuser_state_dict[kk], diffuser_state_dict[vk] + ) + new_key = f'double_blocks.{str(i)}.self_attention.added_linear_qkv.bias' + qk, kk, vk = [f'transformer_blocks.{str(i)}.attn.add_{n}_proj.bias' for n in ('q', 'k', 'v')] + new_state_dict[new_key] = _import_qkv_bias( + transformer_config, diffuser_state_dict[qk], diffuser_state_dict[kk], diffuser_state_dict[vk] + ) + + for i in range(num_single_blocks + 1): + new_key = f'single_blocks.{str(i)}.self_attention.linear_qkv.weight' + qk, kk, vk = [f'single_transformer_blocks.{str(i)}.attn.to_{n}.weight' for n in ('q', 'k', 'v')] + new_state_dict[new_key] = _import_qkv( + transformer_config, diffuser_state_dict[qk], diffuser_state_dict[kk], diffuser_state_dict[vk] + ) + new_key = f'single_blocks.{str(i)}.self_attention.linear_qkv.bias' + qk, kk, vk = [f'single_transformer_blocks.{str(i)}.attn.to_{n}.bias' for n in ('q', 'k', 'v')] + new_state_dict[new_key] = _import_qkv_bias( + transformer_config, diffuser_state_dict[qk], diffuser_state_dict[kk], diffuser_state_dict[vk] + ) + + return new_state_dict diff --git a/nemo/collections/diffusion/utils/flux_pipeline_utils.py b/nemo/collections/diffusion/utils/flux_pipeline_utils.py new file mode 100644 index 000000000000..77dcfa58450f --- /dev/null +++ b/nemo/collections/diffusion/utils/flux_pipeline_utils.py @@ -0,0 +1,76 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
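A hypothetical invocation of the converter above. The checkpoint path is a placeholder, and the config only needs to carry the attention geometry read by `_import_qkv` / `_import_qkv_bias` (with `num_query_groups` defaulting to `num_attention_heads`, the fused QKV layout is plain multi-head attention).

```python
from megatron.core.transformer.transformer_config import TransformerConfig

from nemo.collections.diffusion.utils.flux_ckpt_converter import flux_transformer_converter

# Geometry matching the "dev" Flux transformer defined elsewhere in this patch.
config = TransformerConfig(
    num_layers=1, hidden_size=3072, num_attention_heads=24, use_cpu_initialization=True
)
state_dict = flux_transformer_converter("/path/to/hf/flux/transformer", config)
print(f"{len(state_dict)} tensors ready for Flux(...).load_state_dict(..., strict=False)")
```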
+ +from dataclasses import dataclass + +import torch +from megatron.core.transformer.utils import openai_gelu + +from nemo.collections.diffusion.models.flux.model import FluxParams +from nemo.collections.diffusion.vae.autoencoder import AutoEncoderParams + + +@dataclass +class FluxModelParams: + flux_params: FluxParams + vae_params: AutoEncoderParams + clip_params: dict | None + t5_params: dict | None + scheduler_params: dict | None + device: str | torch.device + + +configs = { + "dev": FluxModelParams( + flux_params=FluxParams( + num_joint_layers=19, + num_single_layers=38, + hidden_size=3072, + num_attention_heads=24, + activation_func=openai_gelu, + add_qkv_bias=True, + ffn_hidden_size=16384, + in_channels=64, + context_dim=4096, + model_channels=256, + patch_size=1, + guidance_embed=True, + vec_in_dim=768, + ), + vae_params=AutoEncoderParams( + ch_mult=[1, 2, 4, 4], + attn_resolutions=[], + resolution=256, + in_channels=3, + ch=128, + out_ch=3, + num_res_blocks=2, + z_channels=16, + scale_factor=0.3611, + shift_factor=0.1159, + ckpt=None, + ), + clip_params={ + 'max_length': 77, + 'always_return_pooled': True, + }, + t5_params={ + 'max_length': 512, + }, + scheduler_params={ + 'num_train_timesteps': 1000, + }, + device='cpu', + ) +} diff --git a/nemo/collections/diffusion/utils/mcore_parallel_utils.py b/nemo/collections/diffusion/utils/mcore_parallel_utils.py new file mode 100644 index 000000000000..0b9bdec97464 --- /dev/null +++ b/nemo/collections/diffusion/utils/mcore_parallel_utils.py @@ -0,0 +1,80 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
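End to end, the pieces above are meant to be wired together roughly as follows. This is a sketch rather than a tested recipe: the transformer checkpoint path is a placeholder, the VAE / CLIP / T5 weights referenced by the "dev" config still need to be available locally, and everything runs on a single CUDA device.

```python
import torch

from nemo.collections.diffusion.models.flux.pipeline import FluxInferencePipeline
from nemo.collections.diffusion.utils.flux_pipeline_utils import configs

params = configs["dev"]
params.device = "cuda"  # the pipeline asserts CUDA for the Mcore transformer blocks

pipe = FluxInferencePipeline(params)
pipe.load_from_pretrained("/path/to/flux/transformer", do_convert_from_hf=True)

images = pipe(
    ["a watercolor fox in the snow"],
    height=512,
    width=512,
    num_inference_steps=28,
    num_images_per_prompt=1,
    dtype=torch.bfloat16,
    save_to_disk=False,
)
```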
+ +""" +Megatron Model Parallel Initialization +""" + +import os + +import megatron.core.parallel_state as ps +import torch + + +class Utils: + world_size = torch.cuda.device_count() + # rank = int(os.environ["LOCAL_RANK"]) + rank = 0 + + @staticmethod + def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1, context_parallel_size=1): + ps.destroy_model_parallel() + + # Torch setup for distributed training + rank = int(os.environ['LOCAL_RANK']) + world_size = 1 # torch.cuda.device_count() + torch.cuda.set_device(rank) + torch.distributed.init_process_group(world_size=world_size, rank=rank) + + # Megatron core distributed training initialization + ps.initialize_model_parallel( + tensor_model_parallel_size, pipeline_model_parallel_size, context_parallel_size=context_parallel_size + ) + + @staticmethod + def set_world_size(world_size=None, rank=None): + Utils.world_size = torch.cuda.device_count() if world_size is None else world_size + if torch.distributed.is_initialized() and Utils.world_size != torch.distributed.get_world_size(): + torch.distributed.destroy_process_group() + + if rank is None: + # Utils.rank = int(os.environ["LOCAL_RANK"]) + Utils.rank = 0 + if Utils.rank >= Utils.world_size: + Utils.rank = -1 + else: + Utils.rank = rank + + @staticmethod + def destroy_model_parallel(): + ps.destroy_model_parallel() + torch.distributed.barrier() + + @staticmethod + def initialize_model_parallel( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + virtual_pipeline_model_parallel_size=None, + pipeline_model_parallel_split_rank=None, + **kwargs, + ): + ps.destroy_model_parallel() + Utils.initialize_distributed() + ps.initialize_model_parallel( + tensor_model_parallel_size, + pipeline_model_parallel_size, + virtual_pipeline_model_parallel_size, + pipeline_model_parallel_split_rank, + **kwargs, + ) diff --git a/nemo/collections/diffusion/vae/autoencoder.py b/nemo/collections/diffusion/vae/autoencoder.py new file mode 100644 index 000000000000..b356d74baac1 --- /dev/null +++ b/nemo/collections/diffusion/vae/autoencoder.py @@ -0,0 +1,334 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
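A minimal single-process setup sketch for the `Utils` helper above. It assumes the usual `torch.distributed` environment variables (`MASTER_ADDR`, `MASTER_PORT`, `LOCAL_RANK`, etc.) are already set, for example by `torchrun`.

```python
from nemo.collections.diffusion.utils.mcore_parallel_utils import Utils

# Initialize torch.distributed plus Megatron-Core parallel state (all sizes 1).
Utils.initialize_distributed(
    tensor_model_parallel_size=1,
    pipeline_model_parallel_size=1,
    context_parallel_size=1,
)

# ... build Flux / FluxInferencePipeline and run inference here ...

Utils.destroy_model_parallel()  # tear down parallel groups when done
```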
+ +from dataclasses import dataclass + +import numpy as np +import torch +from torch import Tensor, nn + +from nemo.collections.diffusion.vae.blocks import AttnBlock, Downsample, Normalize, ResnetBlock, Upsample, make_attn + + +@dataclass +class AutoEncoderParams: + ch_mult: list[int] + attn_resolutions: list[int] + resolution: int = 256 + in_channels: int = 3 + ch: int = 128 + out_ch: int = 3 + num_res_blocks: int = 2 + z_channels: int = 16 + scale_factor: float = 0.3611 + shift_factor: float = 0.1159 + attn_type: str = 'vanilla' + double_z: bool = True + dropout: float = 0.0 + ckpt: str = None + + +def nonlinearity(x): + # swish + return torch.nn.functional.silu(x) + + +class Encoder(nn.Module): + def __init__( + self, + *, + ch: int, + out_ch: int, + ch_mult: list[int], + num_res_blocks: int, + attn_resolutions: list[int], + in_channels: int, + resolution: int, + z_channels: int, + dropout=0.0, + resamp_with_conv=True, + double_z=True, + use_linear_attn=False, + attn_type="vanilla", + ): + super().__init__() + if use_linear_attn: + attn_type = "linear" + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + + # downsampling + self.conv_in = torch.nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1) + + curr_res = resolution + in_ch_mult = (1,) + tuple(ch_mult) + self.in_ch_mult = in_ch_mult + self.down = nn.ModuleList() + for i_level in range(self.num_resolutions): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = ch * in_ch_mult[i_level] + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks): + block.append( + ResnetBlock( + in_channels=block_in, out_channels=block_out, temb_channels=self.temb_ch, dropout=dropout + ) + ) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + down = nn.Module() + down.block = block + down.attn = attn + if i_level != self.num_resolutions - 1: + down.downsample = Downsample(block_in, resamp_with_conv) + curr_res = curr_res // 2 + self.down.append(down) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock( + in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout + ) + self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) + self.mid.block_2 = ResnetBlock( + in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout + ) + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d( + block_in, 2 * z_channels if double_z else z_channels, kernel_size=3, stride=1, padding=1 + ) + + def forward(self, x): + # timestep embedding + temb = None + + # downsampling + hs = [self.conv_in(x)] + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h = self.down[i_level].block[i_block](hs[-1], temb) + if len(self.down[i_level].attn) > 0: + h = self.down[i_level].attn[i_block](h) + hs.append(h) + if i_level != self.num_resolutions - 1: + hs.append(self.down[i_level].downsample(hs[-1])) + + # middle + h = hs[-1] + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # end + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + return h + + +class Decoder(nn.Module): + def __init__( + self, + *, + ch: int, + out_ch: int, + ch_mult: list[int], + num_res_blocks: int, + attn_resolutions: list[int], + in_channels: int, + resolution: int, + z_channels: 
int, + dropout=0.0, + resamp_with_conv=True, + give_pre_end=False, + tanh_out=False, + use_linear_attn=False, + attn_type="vanilla", + **ignorekwargs, + ): + super().__init__() + if use_linear_attn: + attn_type = "linear" + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + self.give_pre_end = give_pre_end + self.tanh_out = tanh_out + + # compute in_ch_mult, block_in and curr_res at lowest res + in_ch_mult = (1,) + tuple(ch_mult) + block_in = ch * ch_mult[self.num_resolutions - 1] + curr_res = resolution // 2 ** (self.num_resolutions - 1) + self.z_shape = (1, z_channels, curr_res, curr_res) + print("Working with z of shape {} = {} dimensions.".format(self.z_shape, np.prod(self.z_shape))) + + # z to block_in + self.conv_in = torch.nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock( + in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout + ) + self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) + self.mid.block_2 = ResnetBlock( + in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout + ) + + # upsampling + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetBlock( + in_channels=block_in, out_channels=block_out, temb_channels=self.temb_ch, dropout=dropout + ) + ) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + up = nn.Module() + up.block = block + up.attn = attn + if i_level != 0: + up.upsample = Upsample(block_in, resamp_with_conv) + curr_res = curr_res * 2 + self.up.insert(0, up) # prepend to get consistent order + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1) + + def forward(self, z): + # assert z.shape[1:] == self.z_shape[1:] + self.last_z_shape = z.shape + + # timestep embedding + temb = None + + # z to block_in + h = self.conv_in(z) + + # middle + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # upsampling + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = self.up[i_level].block[i_block](h, temb) + if len(self.up[i_level].attn) > 0: + h = self.up[i_level].attn[i_block](h) + if i_level != 0: + h = self.up[i_level].upsample(h) + + # end + if self.give_pre_end: + return h + + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + if self.tanh_out: + h = torch.tanh(h) + return h + + +class DiagonalGaussian(nn.Module): + def __init__(self, sample: bool = True, chunk_dim: int = 1): + super().__init__() + self.sample = sample + self.chunk_dim = chunk_dim + + def forward(self, z: Tensor) -> Tensor: + mean, logvar = torch.chunk(z, 2, dim=self.chunk_dim) + if self.sample: + std = torch.exp(0.5 * logvar) + return mean + std * torch.randn_like(mean) + else: + return mean + + +class AutoEncoder(nn.Module): + def __init__(self, params: AutoEncoderParams): + super().__init__() + self.encoder = Encoder( + resolution=params.resolution, + in_channels=params.in_channels, + ch=params.ch, + ch_mult=params.ch_mult, + num_res_blocks=params.num_res_blocks, + 
z_channels=params.z_channels, + double_z=params.double_z, + attn_type=params.attn_type, + dropout=params.dropout, + out_ch=params.out_ch, + attn_resolutions=params.attn_resolutions, + ) + self.decoder = Decoder( + resolution=params.resolution, + in_channels=params.in_channels, + ch=params.ch, + out_ch=params.out_ch, + ch_mult=params.ch_mult, + num_res_blocks=params.num_res_blocks, + z_channels=params.z_channels, + double_z=params.double_z, + attn_type=params.attn_type, + dropout=params.dropout, + attn_resolutions=params.attn_resolutions, + ) + self.reg = DiagonalGaussian() + + self.scale_factor = params.scale_factor + self.shift_factor = params.shift_factor + self.params = params + + if params.ckpt is not None: + self.load_from_checkpoint(params.ckpt) + + def encode(self, x: Tensor) -> Tensor: + z = self.reg(self.encoder(x)) + z = self.scale_factor * (z - self.shift_factor) + return z + + def decode(self, z: Tensor) -> Tensor: + z = z / self.scale_factor + self.shift_factor + return self.decoder(z) + + def forward(self, x: Tensor) -> Tensor: + return self.decode(self.encode(x)) + + def load_from_checkpoint(self, ckpt_path): + from safetensors.torch import load_file as load_sft + + state_dict = load_sft(ckpt_path) + missing, unexpected = self.load_state_dict(state_dict) + if len(missing) > 0: + logger.warning(f"Following keys are missing from checkpoint loaded: {missing}") diff --git a/nemo/collections/diffusion/vae/blocks.py b/nemo/collections/diffusion/vae/blocks.py new file mode 100644 index 000000000000..ad38a7a463cf --- /dev/null +++ b/nemo/collections/diffusion/vae/blocks.py @@ -0,0 +1,180 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
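A hedged round-trip sketch for the AutoEncoder defined above. The ch_mult and attn_resolutions values are illustrative (not taken from the patch), and a CUDA device plus Apex's fused GroupNorm, which Normalize in blocks.py below relies on, are assumed to be available:

import torch

from nemo.collections.diffusion.vae.autoencoder import AutoEncoder, AutoEncoderParams

params = AutoEncoderParams(ch_mult=[1, 2, 4, 4], attn_resolutions=[])
vae = AutoEncoder(params).cuda().eval()

x = torch.randn(1, 3, params.resolution, params.resolution, device="cuda")  # a batch of RGB images
with torch.no_grad():
    z = vae.encode(x)      # DiagonalGaussian sample, then scale_factor / shift_factor applied
    x_rec = vae.decode(z)  # the scaling is inverted before the Decoder runs

# With these settings the latent is (1, 16, 32, 32) and the reconstruction is (1, 3, 256, 256).
print(z.shape, x_rec.shape)
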
+ +import torch +from einops import rearrange +from torch import Tensor, nn + +try: + from apex.contrib.group_norm import GroupNorm + + OPT_GROUP_NORM = True +except Exception: + print('Fused optimized group norm has not been installed.') + OPT_GROUP_NORM = False + + +def Normalize(in_channels, num_groups=32, act=""): + return GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True, act=act) + + +class ResnetBlock(nn.Module): + def __init__(self, in_channels, out_channels=None, conv_shortcut=False, dropout=0.0, temb_channels=0): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + + self.norm1 = Normalize(in_channels, act="silu") + self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels, act="silu") + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + else: + self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, x, temb): + h = x + h = self.norm1(h) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = self.norm2(h) + h = self.dropout(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class Upsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.with_conv = with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + # Cast to float32 to as 'upsample_nearest2d_out_frame' op does not support bfloat16 + # TODO(yuya): Remove this cast once the issue is fixed in PyTorch + # https://github.com/pytorch/pytorch/issues/86679 + dtype = x.dtype + if dtype == torch.bfloat16: + x = x.to(torch.float32) + x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest") + if dtype == torch.bfloat16: + x = x.to(dtype) + if self.with_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.with_conv = with_conv + if self.with_conv: + # no asymmetric padding in torch conv, must do it ourselves + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0) + + def forward(self, x): + if self.with_conv: + pad = (0, 1, 0, 1) + x = torch.nn.functional.pad(x, pad, mode="constant", value=0) + x = self.conv(x) + else: + x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2) + return x + + +class AttnBlock(nn.Module): + def __init__(self, in_channels: int): + super().__init__() + self.in_channels = in_channels + + self.norm = Normalize(in_channels, act="silu") + + self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1) + self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1) + self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1) + self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1) + + 
def attention(self, h_: Tensor) -> Tensor: + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + b, c, h, w = q.shape + q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous() + k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous() + v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous() + h_ = nn.functional.scaled_dot_product_attention(q, k, v) + + return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b) + + def forward(self, x: Tensor) -> Tensor: + return x + self.proj_out(self.attention(x)) + + +class LinearAttention(nn.Module): + def __init__(self, dim, heads=4, dim_head=32): + super().__init__() + self.heads = heads + hidden_dim = dim_head * heads + self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False) + self.to_out = nn.Conv2d(hidden_dim, dim, 1) + + def forward(self, x): + b, c, h, w = x.shape + qkv = self.to_qkv(x) + q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', heads=self.heads, qkv=3) + k = k.softmax(dim=-1) + context = torch.einsum('bhdn,bhen->bhde', k, v) + out = torch.einsum('bhde,bhdn->bhen', context, q) + out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', heads=self.heads, h=h, w=w) + return self.to_out(out) + + +class LinAttnBlock(LinearAttention): + """ + to match AttnBlock usage + """ + + def __init__(self, in_channels): + super().__init__(dim=in_channels, heads=1, dim_head=in_channels) + + +def make_attn(in_channels, attn_type="vanilla"): + assert attn_type in ["vanilla", "linear", "none"], f'attn_type {attn_type} unknown' + print(f"making attention of type '{attn_type}' with {in_channels} in_channels") + if attn_type == "vanilla": + return AttnBlock(in_channels) + elif attn_type == "none": + return nn.Identity(in_channels) + else: + return LinAttnBlock(in_channels) From f37d1691087c10fa3dfa9ebf88dc89a7e1f52692 Mon Sep 17 00:00:00 2001 From: BoxiangW <45734921+BoxiangW@users.noreply.github.com> Date: Tue, 22 Oct 2024 00:49:58 -0700 Subject: [PATCH 7/8] Add assertion for always save nemo add model parallel size (#10690) * Add assertion for always save nemo add model parallel size Signed-off-by: Boxiang Wang * Add assertions Signed-off-by: Boxiang Wang * Fix typo Signed-off-by: Boxiang Wang * Apply isort and black reformatting Signed-off-by: BoxiangW * Revert nemo_model_checkpoint.py changes Signed-off-by: Boxiang Wang * Add test Signed-off-by: Boxiang Wang * Fix typo * Fix test bug Signed-off-by: Boxiang Wang * Fix test Signed-off-by: Boxiang Wang --------- Signed-off-by: Boxiang Wang Signed-off-by: BoxiangW Co-authored-by: BoxiangW --- nemo/utils/exp_manager.py | 10 +++++ tests/core/test_exp_manager.py | 72 ++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index 3d4b7189f56e..2bfb40e89e15 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -1169,6 +1169,16 @@ def configure_checkpointing( params.filename = f'{name}--{{{params.monitor}:.4f}}-{{epoch}}' if params.prefix is None: params.prefix = name + if params.always_save_nemo: + app_state = AppState() + if (app_state.tensor_model_parallel_size is not None and app_state.tensor_model_parallel_size > 1) or (app_state.pipeline_model_parallel_size is not None and app_state.pipeline_model_parallel_size > 1) or (app_state.context_parallel_size is not None and app_state.context_parallel_size > 1): + raise LoggerMisconfigurationError( + "always_save_nemo is set to True, please ensure that model parallel is not used." 
+ f"tensor_model_parallel_size: {app_state.tensor_model_parallel_size}," + f"pipeline_model_parallel_size: {app_state.pipeline_model_parallel_size}," + f"context_parallel_size: {app_state.context_parallel_size}," + ) + NeMoModelCheckpoint.CHECKPOINT_NAME_LAST = params.filename + '-last' logging.debug(params.dirpath) diff --git a/tests/core/test_exp_manager.py b/tests/core/test_exp_manager.py index a0b69927ecc0..fa2eeae9b538 100644 --- a/tests/core/test_exp_manager.py +++ b/tests/core/test_exp_manager.py @@ -29,6 +29,7 @@ from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.constants import NEMO_ENV_VARNAME_VERSION from nemo.core.classes import ModelPT +from nemo.utils.app_state import AppState from nemo.utils.callbacks import NeMoModelCheckpoint from nemo.utils.exp_manager import ( CheckpointMisconfigurationError, @@ -1097,3 +1098,74 @@ def test_doesnt_silently_start_from_scratch_dist(self, tmp_path): restored_trainer, {"resume_if_exists": True, "resume_ignore_no_checkpoint": True, "explicit_log_dir": str(test_dir)}, ) + + @pytest.mark.unit + def test_save_nemo_not_comp_with_model_parallel(self, tmp_path): + """ + Ensure that always_save_nemo is not compatible with model parallelism. + """ + + test_dir = tmp_path / "test" + + with pytest.raises(LoggerMisconfigurationError): + appstate = AppState() + appstate.tensor_model_parallel_size = 2 + appstate.pipeline_model_parallel_size = 1 + appstate.context_parallel_size = 1 + test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=1) + exp_manager( + test_trainer, + { + "checkpoint_callback_params": { + "always_save_nemo": True, + }, + "explicit_log_dir": str(test_dir), + } + ) + + with pytest.raises(LoggerMisconfigurationError): + appstate = AppState() + appstate.tensor_model_parallel_size = 1 + appstate.pipeline_model_parallel_size = 2 + appstate.context_parallel_size = 1 + test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=1) + exp_manager( + test_trainer, + { + "checkpoint_callback_params": { + "always_save_nemo": True, + }, + "explicit_log_dir": str(test_dir), + }, + ) + + with pytest.raises(LoggerMisconfigurationError): + appstate = AppState() + appstate.tensor_model_parallel_size = 1 + appstate.pipeline_model_parallel_size = 1 + appstate.context_parallel_size = 2 + test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=1) + exp_manager( + test_trainer, + { + "checkpoint_callback_params": { + "always_save_nemo": True, + }, + "explicit_log_dir": str(test_dir), + }, + ) + + appstate = AppState() + appstate.tensor_model_parallesl_size = 1 + appstate.pipeline_model_parallel_size = 1 + appstate.context_parallel_size = 1 + test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False, max_epochs=1) + exp_manager( + test_trainer, + { + "checkpoint_callback_params": { + "always_save_nemo": True, + }, + "explicit_log_dir": str(test_dir), + }, + ) From bc4bce71d01234f568c1327f0848001d86143b3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 22 Oct 2024 12:12:05 +0200 Subject: [PATCH 8/8] =?UTF-8?q?[=F0=9F=A4=A0]:=20Howdy=20folks,=20let's=20?= =?UTF-8?q?bump=20`Dockerfile.ci`=20to=20563d5d1=20!=20(#10979)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: pablo-garay 
<7166088+pablo-garay@users.noreply.github.com> --- Dockerfile.ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.ci b/Dockerfile.ci index f01025873628..09ffe9674e5d 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -53,7 +53,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG MODELOPT_VERSION=0.17.0 -ARG MCORE_TAG=db7d37b54ef96e35f7afc56e29fffb60f5c957b9 +ARG MCORE_TAG=563d5d1726012e8077895b732d5bc81b6e975e8d ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c RUN \
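
As a closing illustration of the always_save_nemo guard added to configure_checkpointing above, here is a minimal sketch of a configuration the guard now rejects. The AppState fields mirror the new unit test; the Lightning import alias and the log directory are assumptions:

import pytorch_lightning as pl  # assumption: Lightning is available under this alias

from nemo.utils.app_state import AppState
from nemo.utils.exp_manager import LoggerMisconfigurationError, exp_manager

# AppState is a singleton, so the parallel sizes set here are what configure_checkpointing later reads.
app_state = AppState()
app_state.tensor_model_parallel_size = 2   # any tensor/pipeline/context parallel size > 1 trips the guard
app_state.pipeline_model_parallel_size = 1
app_state.context_parallel_size = 1

trainer = pl.Trainer(accelerator="cpu", enable_checkpointing=False, logger=False, max_epochs=1)
try:
    exp_manager(
        trainer,
        {
            "checkpoint_callback_params": {"always_save_nemo": True},
            "explicit_log_dir": "/tmp/always_save_nemo_demo",  # illustrative path
        },
    )
except LoggerMisconfigurationError as err:
    print(f"rejected as expected: {err}")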