Commit

Merge branch 'main' into refactor--multi-node-save_best_model-only-when-needed
athitten authored Aug 30, 2024
2 parents 6704a4c + 0ba9979 commit ffea861
Showing 27 changed files with 958 additions and 472 deletions.
502 changes: 298 additions & 204 deletions .github/workflows/cicd-main.yml

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion Dockerfile.ci
@@ -34,7 +34,7 @@ WORKDIR /workspace
# Install NeMo requirements
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG MODELOPT_VERSION=0.15.0
ARG MCORE_TAG=34e607ef41cf1c0ed481a678df9c76952d0ec00c
ARG MCORE_TAG=9ab31cbd6265f83640008801e1c3efbf80892cea
ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \
--mount=type=bind,source=requirements,target=requirements \
1 change: 1 addition & 0 deletions docs/source/nlp/nemo_megatron/intro.rst
@@ -20,6 +20,7 @@ To learn more about using NeMo to train Large Language Models at scale, please r
peft/landing_page
positional_embeddings
mcore_customization
reset_learning_rate
rampup_batch_size


30 changes: 30 additions & 0 deletions docs/source/nlp/nemo_megatron/reset_learning_rate.rst
@@ -0,0 +1,30 @@
.. _reset_learning_rate:

Reset Learning Rate
-------------------

The reset learning rate feature lets you reset the learning rate of an existing checkpoint to its initial value (either 0 or ``optim.min_lr``, depending on the warmup steps) when performing continual pretraining.

Parameters
----------

* ``reset_lr`` (boolean): Enables resetting the learning rate to its initial value. This feature is supported only with the distributed optimizer and ``megatron_amp_O2``.
* ``reset_lr_steps`` (boolean): Enables adjusting the learning rate schedule's ``max_steps`` and ``decay_steps`` by subtracting the number of steps already completed at the checkpoint.

Use Cases
---------

1. ``reset_lr=True, reset_lr_steps=False``
   Use this when pretraining from an existing checkpoint "from scratch" on a different dataset. The learning rate is reset to its initial value, so the model trains on the new dataset with the same learning rate dynamics as a run started from scratch.

2. ``reset_lr=True, reset_lr_steps=True``
   Use this when continuing training from an existing checkpoint with the same configuration. The learning rate is reset to its initial value, and the ``max_steps`` and ``decay_steps`` of the learning rate schedule are recalculated by subtracting the number of steps already completed at the checkpoint. Specifically:

   * ``max_steps`` is recalculated as ``max_steps -= completed_steps``.
   * ``decay_steps`` is recalculated as ``decay_steps -= completed_steps``.

   This ensures that the learning rate reaches ``min_lr`` by the end of training without changing ``trainer.max_steps``:

.. image:: https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/asset-post-reset-learning-rate-example.png
:alt:
:width: 1080px

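To make the recalculation concrete, here is a minimal Python sketch of the arithmetic described above; the step counts are illustrative example values, not NeMo defaults.

# Illustration only: how reset_lr_steps rescales the schedule for a checkpoint
# that has already completed 100k of 300k steps (example values, not defaults).
completed_steps = 100_000   # steps already done at the checkpoint
max_steps = 300_000         # trainer.max_steps, left unchanged
decay_steps = 250_000       # original scheduler decay_steps

# With reset_lr=True and reset_lr_steps=True the scheduler is rebuilt with:
sched_max_steps = max_steps - completed_steps      # 200_000
sched_decay_steps = decay_steps - completed_steps  # 150_000
print(sched_max_steps, sched_decay_steps)          # 200000 150000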

@@ -35,14 +35,17 @@ model:
name: ${model.data_name}
split: 'train.clean.360'
streaming: ${model.streaming}
trust_remote_code: true
- path: ${model.data_path}
name: ${model.data_name}
split: 'train.clean.100'
streaming: ${model.streaming}
trust_remote_code: true
- path: ${model.data_path}
name: ${model.data_name}
split: 'train.other.500'
streaming: ${model.streaming}
trust_remote_code: true

sample_rate: ${model.sample_rate}
batch_size: 16 # you may increase batch_size if your memory allows
@@ -65,6 +68,7 @@ model:
name: ${model.data_name}
split: 'validation.other'
streaming: ${model.streaming}
trust_remote_code: true

sample_rate: ${model.sample_rate}
batch_size: 8
@@ -87,10 +91,12 @@ model:
name: ${model.data_name}
split: 'test.other'
streaming: ${model.streaming}
trust_remote_code: true
- path: ${model.data_path}
name: ${model.data_name}
split: 'test.clean'
streaming: ${model.streaming}
trust_remote_code: true

sample_rate: ${model.sample_rate}
batch_size: 8
@@ -175,7 +175,7 @@ model:
fsdp_sharded_checkpoint: False # Store and load FSDP shared checkpoint.

# Distributed checkpoint setup
dist_ckpt_format: 'zarr' # Set to 'torch_dist' to use PyTorch distributed checkpoint format.
dist_ckpt_format: 'torch_dist' # Set to 'torch_dist' to use PyTorch distributed checkpoint format.
dist_ckpt_load_on_device: True # whether to load checkpoint weights directly on GPU or to CPU
dist_ckpt_parallel_save: True # if true, each worker will write its own part of the dist checkpoint
dist_ckpt_parallel_save_within_dp: False # if true, save will be parallelized only within a DP group (whole world otherwise), which might slightly reduce the save overhead
3 changes: 1 addition & 2 deletions nemo/collections/asr/parts/utils/transcribe_utils.py
@@ -535,9 +535,8 @@ def transcribe_partial_audio(
if isinstance(asr_model, EncDecHybridRNNTCTCModel) and decoder_type == "ctc":
logits = asr_model.ctc_decoder(encoder_output=logits)

logits = logits.cpu()

if logprobs:
logits = logits.cpu()
logits = logits.numpy()
# dump log probs per file
for idx in range(logits.shape[0]):
10 changes: 9 additions & 1 deletion nemo/collections/llm/gpt/data/pre_training.py
@@ -54,6 +54,7 @@ class PreTrainingDataModule(pl.LightningDataModule, IOMixin):
split (str): A string of 3 comma-separated integers denoting how much of the distribution
to allocate to train, validation, and test sets, respectively. Unused if ``paths`` is a dict.
index_mapping_dir (Optional[str]): Path to a directory to write index mapping files.
num_dataset_builder_threads (int): The number of threads to use for dataset building.
"""

def __init__(
@@ -73,6 +74,7 @@ def __init__(
seed: int = 1234,
split: str = "900,50,50",
index_mapping_dir: Optional[str] = None,
num_dataset_builder_threads: int = 1,
) -> None:
super().__init__()
if not isinstance(paths, (list, tuple, dict)):
@@ -110,6 +112,7 @@ def __init__(
self.seed = seed
self.split = split
self.index_mapping_dir = index_mapping_dir
self.num_dataset_builder_threads = num_dataset_builder_threads
self.init_global_step = 0

from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
@@ -139,7 +142,11 @@ def setup(self, stage: str = "") -> None:
num_val_samples = int(eval_iters * self.data_sampler.global_batch_size)
num_test_samples = int(test_iters * self.data_sampler.global_batch_size)

if self.trainer.limit_val_batches <= 1.0 and isinstance(self.trainer.limit_val_batches, float):
if (
self.trainer.limit_val_batches > 0.0
and self.trainer.limit_val_batches <= 1.0
and isinstance(self.trainer.limit_val_batches, float)
):
assert "blend" not in self.build_kwargs, (
"When using a single data distribution, limit_val_batches <= 1.0 is not supported. If you'd "
"like to run with a fractional value of limit_val_batches, please pass in separate datasets for "
@@ -214,6 +221,7 @@ def gpt_dataset_config(self) -> "GPTDatasetConfig":
reset_position_ids=self.reset_position_ids,
reset_attention_mask=self.reset_attention_mask,
eod_mask_loss=self.eod_mask_loss,
num_dataset_builder_threads=self.num_dataset_builder_threads,
**self.build_kwargs,
)

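For reference, a hedged sketch of constructing the data module with the new option. Only arguments visible in this diff are used; the dataset path and cache directory are placeholders, not values from the commit.

from nemo.collections.llm.gpt.data.pre_training import PreTrainingDataModule

datamodule = PreTrainingDataModule(
    paths=["/data/my_corpus_text_document"],   # placeholder dataset prefix
    split="900,50,50",                         # train/validation/test proportions
    seed=1234,
    index_mapping_dir="/data/index_cache",     # placeholder cache dir
    num_dataset_builder_threads=8,             # new arg: threads used for dataset building
)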
36 changes: 36 additions & 0 deletions nemo/collections/multimodal/data/neva/conversation.py
@@ -49,6 +49,7 @@ class SeparatorStyle(Enum):
LLAMA_3 = auto()
MISTRAL = auto()
NVGPT = auto()
YI34b = auto()


@dataclasses.dataclass
@@ -155,7 +156,31 @@ def get_prompt(self):
ret += wrap_user(message) + self.sep
else:
ret += wrap_assistant(message) + (self.sep if message else "")
elif self.sep_style == SeparatorStyle.YI34b:
"""
{{ if .System }}<|im_start|>system
{{ .System }}<|im_end|>
{{ end }}{{ if .Prompt }}<|im_start|>user
{{ .Prompt }}<|im_end|>
{{ end }}<|im_start|>assistant
{{ .Response }}<|im_end|>
"""
wrap_sys = lambda msg: f"<|im_start|>system\n{msg}<|im_end|>"
wrap_user = lambda msg: f"<|im_start|>user\n{msg.strip()}<|im_end|>"
wrap_assistant = lambda msg: f"<|im_start|>assistant\n{msg}<|im_end|>"

ret = wrap_sys(self.system) if len(self.system) > 0 else ""
for i, (role, message) in enumerate(messages):
if i == 0:
assert message, "first message should not be none"
assert role == self.roles[0], "first message should come from user"
if type(message) is tuple:
message, _, _ = message
elif i % 2 == 0:
ret += wrap_user(message) + self.sep
else:
ret += wrap_assistant(message) + (self.sep if message else "")
ret = ret.strip()
elif self.sep_style == SeparatorStyle.PLAIN:
seps = [self.sep, self.sep2]
ret = self.system
@@ -322,6 +347,16 @@ def dict(self):
sep2=f"{DEFAULT_SYSTEM_TOKEN}System\n",
)

conv_yi_34b = Conversation(
system="",
roles=('user', 'assistant'),
version="1.5",
messages=(),
offset=0,
sep_style=SeparatorStyle.YI34b,
sep="\n",
)

conv_vicuna_v0 = Conversation(
system="A chat between a curious human and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the human's questions.",
@@ -490,6 +525,7 @@ def dict(self):
"nv_steerlm": conv_nvgpt,
"nv_dpo": conv_nv_dpo,
"mistral": conv_mistral,
"yi_34b": conv_yi_34b,
}

if __name__ == "__main__":
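As a rough illustration of what the new template emits, the sketch below builds a prompt with the same copy/append_message/get_prompt calls used elsewhere in this commit; the example messages and the expected output are assumptions based on the template above.

from nemo.collections.multimodal.data.neva import conversation as conversation_lib

conv = conversation_lib.conv_yi_34b.copy()
conv.append_message(conv.roles[0], "Describe the image.")  # user turn
conv.append_message(conv.roles[1], "")                     # empty assistant turn, i.e. a generation prompt
print(conv.get_prompt())
# Expected shape (assuming an empty system message):
# <|im_start|>user\nDescribe the image.<|im_end|>\n<|im_start|>assistant\n<|im_end|>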
104 changes: 104 additions & 0 deletions nemo/collections/multimodal/data/neva/neva_dataset.py
@@ -598,6 +598,104 @@ def preprocess_llama_2(
)


def preprocess_yi_34b(
sources: dict,
tokenizer,
cfg,
) -> Dict:
"""
Preprocess sources for Yi-1.5 34b model configuration.
The function applies prompt templates and tokenizes the conversations according to the Yi-1.5 34b model specifications.
It involves special handling of tokens, masking of labels, and adjustments based on configuration settings.
This template works with the following tokenizer configs:
- model.tokenizer.library='huggingface'
- model.tokenizer.type='01-ai/Yi-1.5-34B'
- model.tokenizer.additional_special_tokens='{additional_special_tokens: ["<extra_id_0>", "<extra_id_1>", "<extra_id_2>", "<extra_id_3>", "<extra_id_4>", "<extra_id_5>"]}'
At inference time, add end string to stop sampling:
- inference.end_strings='["<|im_end|>"]'
Parameters:
- sources (dict): A dictionary of sources containing conversations to be processed.
- tokenizer: The tokenizer to be used for processing the text.
- cfg: Configuration settings for preprocessing, including context length and additional tokens.
Returns:
- Dict: A dictionary containing tokenized and labeled data suitable for the Yi-1.5 34B model.
This includes tokens, labels, and any special processing as defined in the configuration.
"""

"""<|im_start|>user\n{prompt.strip()}<|im_end|>\n<|im_start|>assistant\n"""

conv = conversation_lib.conv_yi_34b.copy()

# apply prompt templates
conversations = []
for i, source in enumerate(sources):
source = source["conversations"]
strip_end_for_inference = False

for i, turn in enumerate(source):

if i % 2 == 1:
turn["from"] = conv.roles[1]
value = turn["value"]

conv.append_message(turn['from'], value)
if not turn["value"]:
strip_end_for_inference = True
else:
turn["from"] = conv.roles[0]
conv.append_message(turn["from"], turn["value"])
context = conv.get_prompt()
if strip_end_for_inference and context.endswith("\n<|im_end|>"):
context = context[: -len("\n<|im_end|>")] + "\n"
conversations.append(context)

add_extra_token = cfg.get("add_extra_token")

tokens = tokenize(
texts=conversations,
tokenizer=tokenizer,
context_length=cfg.get("context_length"),
add_extra_token=add_extra_token,
)
labels = tokens.clone().detach()

round_sep = "<|im_start|>user\n"
sep = "<|im_start|>assistant\n"
for conversation, target in zip(conversations, labels):
rounds = conversation.split(round_sep)
rounds = [round_sep.join(rounds[:2])] + [(round_sep + x) for x in rounds[2:]]
assert len(conversation) == sum(map(len, rounds))
cur_len = 0
for i, rou in enumerate(rounds):
if rou == "":
break
parts = rou.split(sep)
if len(parts) != 2:
break
instruction_len = len(tokenizer.text_to_ids(parts[0] + sep))
round_len = len(tokenizer.text_to_ids(rou))
target[cur_len : cur_len + instruction_len] = IGNORE_INDEX

cur_len += round_len
target[cur_len:] = IGNORE_INDEX

if add_extra_token:
tokens = tokens[:, :-1].contiguous()
labels = labels[:, 1:].contiguous()
else:
labels = torch.roll(labels, shifts=-1, dims=-1)
labels[:, -1] = IGNORE_INDEX

return dict(
tokens=tokens,
labels=labels,
)


def preprocess_v1(
sources: dict,
tokenizer,
@@ -1160,6 +1258,12 @@ def expand2square(pil_img, background_color):
self.tokenizer,
self.multimodal_cfg,
)
elif self.conv_template == "yi_34b":
data_dict = preprocess_yi_34b(
sources,
self.tokenizer,
self.multimodal_cfg,
)
else:
raise ValueError(f"Conversation template `{self.conv_template}` is not supported in Neva now.")

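To illustrate the masking scheme preprocess_yi_34b applies (everything up to and including each assistant header is ignored in the loss), here is a self-contained sketch of the same round-splitting logic on a toy conversation string; it works on plain strings instead of token ids.

round_sep = "<|im_start|>user\n"
sep = "<|im_start|>assistant\n"
conversation = (
    "<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\nHello!<|im_end|>\n"
    "<|im_start|>user\nBye<|im_end|>\n<|im_start|>assistant\nGoodbye!<|im_end|>"
)

# Same splitting as in preprocess_yi_34b: the first round keeps any prefix,
# later rounds get the user header re-attached.
rounds = conversation.split(round_sep)
rounds = [round_sep.join(rounds[:2])] + [round_sep + x for x in rounds[2:]]

for rou in rounds:
    instruction, response = rou.split(sep)
    print(repr(instruction + sep), "-> tokens masked with IGNORE_INDEX")
    print(repr(response), "-> tokens kept as labels")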
23 changes: 23 additions & 0 deletions nemo/collections/nlp/modules/common/text_generation_strategy.py
@@ -426,6 +426,7 @@ def neva_process_prompts(prompt, tokenizer, multimodal_cfg, num_media_latents, c
preprocess_nv_dpo,
preprocess_nvgpt,
preprocess_v1,
preprocess_yi_34b,
)

list_data_dict = []
@@ -486,6 +487,28 @@ def neva_process_prompts(prompt, tokenizer, multimodal_cfg, num_media_latents, c
copy.deepcopy(list_data_dict), multimodal_cfg, num_media_latents
) # HARDCODED FOR NOW
data_dict = preprocess_llama_2(sources, tokenizer, multimodal_cfg)
elif multimodal_cfg["conv_template"] == "yi_34b":
record = {
'conversations': [
{
'from': 'human',
'value': prompt,
},
{
'from': 'gpt',
'value': '',
},
],
}
for turn in record['conversations']:
if turn.get('value') is not None:
turn['value'] = re.sub('<image>', f'{DEFAULT_IMAGE_TOKEN}\n', turn['value'])
list_data_dict.append(record)
sources = preprocess_multimodal(
copy.deepcopy(list_data_dict), multimodal_cfg, num_media_latents
) # HARDCODED FOR NOW
data_dict = preprocess_yi_34b(sources, tokenizer, multimodal_cfg)

elif multimodal_cfg["conv_template"] == "llama_3":
record = {
'conversations': [
3 changes: 3 additions & 0 deletions nemo/collections/nlp/modules/common/text_generation_utils.py
@@ -288,6 +288,9 @@ def megatron_neva_generate(model, prompt_dict_list, length_params, sampling_para
elif conv_template == "llama_3":
clean_response = clean_response.rsplit("assistant<|end_header_id|>\n\n", 1)[-1]
clean_response = re.sub(r"(<\|eot_id\|>)+$", "", clean_response)
elif conv_template == "yi_34b":
clean_response = clean_response.split("<|im_start|>assistant\n")[-1]
clean_response = clean_response.strip("<|im_end|>")
elif conv_template == "v1":
clean_response = clean_response.rsplit("ASSISTANT: ", 1)[-1]

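A quick, self-contained illustration of the new yi_34b cleanup path on a toy response string; note that str.strip removes any run of the listed characters rather than the literal "<|im_end|>" substring, which is sufficient here because the response ends with that marker.

raw = "<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\nHello there!<|im_end|>"
clean = raw.split("<|im_start|>assistant\n")[-1]
clean = clean.strip("<|im_end|>")  # strips trailing characters from the set <, |, i, m, _, e, n, d, >
print(clean)                       # "Hello there!"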
4 changes: 2 additions & 2 deletions nemo/core/optim/mcore_optim.py
@@ -70,9 +70,9 @@ def step(self, closure):
loss = closure()

# return unused update_successful, grad_norm, num_zeros_in_grad
self.mcore_optimizer.step()
_, grad_norm, num_zeros_in_grad = self.mcore_optimizer.step()

return loss
return loss, grad_norm, num_zeros_in_grad

# Promote state so it can be retrieved or set via
# "optimizer_instance.state"