NVIDIA · suiyoubi · Oct 30, 2024 · Oct 25, 2024 · Oct 25, 2024 · Oct 25, 2024
diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py
@@ -46,6 +46,11 @@
     CodeLlamaConfig13B,
     CodeLlamaConfig34B,
     CodeLlamaConfig70B,
+    Gemma2Config,
+    Gemma2Config2B,
+    Gemma2Config9B,
+    Gemma2Config27B,
+    Gemma2Model,
     GemmaConfig,
     GemmaConfig2B,
     GemmaConfig7B,
@@ -165,6 +170,11 @@
     "CodeGemmaConfig2B",
     "CodeGemmaConfig7B",
     "GemmaModel",
+    "Gemma2Model",
+    "Gemma2Config9B",
+    "Gemma2Config",
+    "Gemma2Config27B",
+    "Gemma2Config2B",
     "Baichuan2Config",
     "Baichuan2Config7B",
     "Baichuan2Model",

diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py
@@ -37,6 +37,13 @@
     GemmaConfig7B,
     GemmaModel,
 )
+from nemo.collections.llm.gpt.model.gemma2 import (
+    Gemma2Config,
+    Gemma2Config2B,
+    Gemma2Config9B,
+    Gemma2Config27B,
+    Gemma2Model,
+)
 from nemo.collections.llm.gpt.model.hf_auto_model_for_causal_lm import HfAutoModelForCausalLM
 from nemo.collections.llm.gpt.model.llama import (
     CodeLlamaConfig7B,
@@ -142,6 +149,11 @@
     "CodeGemmaConfig2B",
     "CodeGemmaConfig7B",
     "GemmaModel",
+    "Gemma2Config",
+    "Gemma2Config27B",
+    "Gemma2Config2B",
+    "Gemma2Config9B",
+    "Gemma2Model",
     "LlamaModel",
     "Baichuan2Config",
     "Baichuan2Config7B",

diff --git a/nemo/collections/llm/gpt/model/gemma.py b/nemo/collections/llm/gpt/model/gemma.py
@@ -17,6 +17,7 @@
 from typing import TYPE_CHECKING, Annotated, Callable, Optional
 
 import torch
+from megatron.core import parallel_state
 from torch import nn
 
 from nemo.collections.llm.fn.activation import openai_gelu
@@ -95,7 +96,8 @@ def configure_model(self):
         from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import EmbeddingScalingMixin
 
         super().configure_model()
-        extend_instance(self.module.embedding, EmbeddingScalingMixin)
+        if parallel_state.is_pipeline_first_stage():
+            extend_instance(self.module.embedding, EmbeddingScalingMixin)
 
 
 @io.model_importer(GemmaModel, "hf")
@@ -160,7 +162,7 @@ def make_vocab_size_divisible_by(vocab_size):
             rotary_base=source.rope_theta,
             gated_linear_unit=True,
             make_vocab_size_divisible_by=make_vocab_size_divisible_by(source.vocab_size),
-            share_embeddings_and_output_weights=False,
+            share_embeddings_and_output_weights=True,
             fp16=(dtype_from_hf(source) == torch.float16),
             bf16=(dtype_from_hf(source) == torch.bfloat16),
             params_dtype=dtype_from_hf(source),