Add more recipes (#10957)

* add recipes

  Signed-off-by: Chen Cui <[email protected]>

* adjust finetuning recipe

  Signed-off-by: Chen Cui <[email protected]>

* Apply isort and black reformatting

  Signed-off-by: cuichenx <[email protected]>

---------

Signed-off-by: Chen Cui <[email protected]>
Signed-off-by: cuichenx <[email protected]>
Co-authored-by: cuichenx <[email protected]>
Signed-off-by: Chen Cui <[email protected]>
NVIDIA · Oct 25, 2024 · 6a8cefc
1 parent b4dd2bd
commit 6a8cefc
Show file tree

Hide file tree

Showing 4 changed files with 16 additions and 6 deletions.
diff --git a/nemo/collections/llm/gpt/model/baichuan.py b/nemo/collections/llm/gpt/model/baichuan.py
@@ -209,7 +209,7 @@ def _import_qkv(ctx: io.TransformCTX, qkv_weights):
     q = qkv_weights[0].squeeze().view(*new_q_tensor_shape)
     k = qkv_weights[1].squeeze().view(*new_kv_tensor_shape)
     v = qkv_weights[2].squeeze().view(*new_kv_tensor_shape)
-    qkv_weights = torch.empty((0, head_size) + old_tensor_shape[1:])
+    qkv_weights = torch.empty((0, head_size) + old_tensor_shape[1:]).type_as(qkv_weights)
     for i in range(num_query_groups):
         qkv_weights = torch.cat((qkv_weights, q[i * heads_per_group : (i + 1) * heads_per_group, :, :]))
         qkv_weights = torch.cat((qkv_weights, k[i : i + 1, :, :]))

diff --git a/nemo/collections/llm/gpt/model/chatglm.py b/nemo/collections/llm/gpt/model/chatglm.py
@@ -215,7 +215,7 @@ def _import_qkv_weight(ctx: io.TransformCTX, hf_qkv_weights):
     k = k.view(*new_kv_tensor_shape)
     v = v.view(*new_kv_tensor_shape)
 
-    qkv_weights = torch.empty((0, head_size, old_tensor_shape[1]))
+    qkv_weights = torch.empty((0, head_size, old_tensor_shape[1])).type_as(hf_qkv_weights)
     for i in range(num_query_groups):
         qkv_weights = torch.cat((qkv_weights, q[i * heads_per_group : (i + 1) * heads_per_group, :, :]))
         qkv_weights = torch.cat((qkv_weights, k[i : i + 1, :, :]))
@@ -245,7 +245,7 @@ def _import_qkv_bias(ctx: io.TransformCTX, hf_qkv_bias):
     q = q.view(*new_q_tensor_shape)
     k = k.view(*new_kv_tensor_shape)
     v = v.view(*new_kv_tensor_shape)
-    qkv_bias = torch.empty((0, head_size))
+    qkv_bias = torch.empty((0, head_size)).type_as(hf_qkv_bias)
     for i in range(num_query_groups):
         qkv_bias = torch.cat((qkv_bias, q[i * heads_per_group : (i + 1) * heads_per_group, :]))
         qkv_bias = torch.cat((qkv_bias, k[i : i + 1, :]))

diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py
@@ -14,6 +14,10 @@
 
 
 from nemo.collections.llm.recipes import (
+    baichuan2_7b,
+    chatglm3_6b,
+    gemma_2b,
+    gemma_7b,
     llama3_8b,
     llama3_8b_16k,
     llama3_8b_64k,
@@ -40,6 +44,10 @@
 from nemo.collections.llm.recipes.optim import adam
 
 __all__ = [
+    "baichuan2_7b",
+    "chatglm3_6b",
+    "gemma_2b",
+    "gemma_7b",
     "llama3_8b",
     "llama3_8b_16k",
     "llama3_8b_64k",

diff --git a/nemo/collections/llm/recipes/optim/adam.py b/nemo/collections/llm/recipes/optim/adam.py
@@ -25,6 +25,8 @@ def distributed_fused_adam_with_cosine_annealing(
     precision: str = "bf16-mixed",  # or "16-mixed"
     warmup_steps: int = 2000,
     constant_steps: int = 0,
+    adam_beta1: float = 0.9,
+    adam_beta2: float = 0.95,
     max_lr: float = 1e-4,
     min_lr: Optional[float] = None,
     clip_grad: float = 1.0,
@@ -37,14 +39,14 @@ def distributed_fused_adam_with_cosine_annealing(
         weight_decay=0.1,
         bf16=precision == "bf16-mixed",
         fp16=precision == "16-mixed",
-        adam_beta1=0.9,
-        adam_beta2=0.95,
+        adam_beta1=adam_beta1,
+        adam_beta2=adam_beta2,
         adam_eps=1e-5,
         use_distributed_optimizer=True,
         clip_grad=clip_grad,
     )
 
-    min_lr = min_lr or (0.1 * max_lr)
+    min_lr = min_lr if min_lr is not None else (0.1 * max_lr)
     sched = run.Config(
         CosineAnnealingScheduler,
         warmup_steps=warmup_steps,