From 003a57d65f41d477ff8d8e3243fd0afb69287ba1 Mon Sep 17 00:00:00 2001
From: Chen Cui
Date: Tue, 5 Mar 2024 13:52:05 -0800
Subject: [PATCH 1/2] change rope fusion default

Signed-off-by: Chen Cui
---
 .../nlp/models/language_modeling/megatron_base_model.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
index 8aa1ecf26240..0cdfde17221f 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
@@ -471,7 +471,8 @@ def build_transformer_config(self) -> TransformerConfig:

         bias_dropout_fusion = self.cfg.get('bias_dropout_add_fusion', True)

-        apply_rope_fusion = self.cfg.get('apply_rope_fusion', True)
+        # @chcui default rope fusion to false until #8590 is closed.
+        apply_rope_fusion = self.cfg.get('apply_rope_fusion', False)

         # TODO: need to check if recompute APIs are matching up properly
         recompute_granularity = self.cfg.get('activations_checkpoint_granularity', None)

From 9fef8d45b74ce6c4c9300eb7866ffa08035df637 Mon Sep 17 00:00:00 2001
From: Chen Cui
Date: Tue, 5 Mar 2024 14:51:34 -0800
Subject: [PATCH 2/2] add key to config files

Signed-off-by: Chen Cui
---
 examples/nlp/language_modeling/conf/megatron_falcon_config.yaml | 1 +
 examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 1 +
 examples/nlp/language_modeling/conf/megatron_llama_config.yaml | 1 +
 .../nlp/language_modeling/conf/megatron_starcoder_config.yaml | 1 +
 4 files changed, 4 insertions(+)

diff --git a/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml b/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml
index 0a4b1adc099a..8905abaf3ac2 100644
--- a/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml
@@ -113,6 +113,7 @@ model:
   bias_dropout_add_fusion: False # Use a kernel that fuses the bias addition, dropout and residual connection addition.
   masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask.
   get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages.
+  apply_rope_fusion: False # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope

   # Miscellaneous

diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
index aaa00df2e006..c9f8b8952d5e 100755
--- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
@@ -129,6 +129,7 @@ model:
   bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
   masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask.
   get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages.
+  apply_rope_fusion: False # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope

   # Miscellaneous

diff --git a/examples/nlp/language_modeling/conf/megatron_llama_config.yaml b/examples/nlp/language_modeling/conf/megatron_llama_config.yaml
index 7ba5dd07b781..965b511fc7e7 100644
--- a/examples/nlp/language_modeling/conf/megatron_llama_config.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_llama_config.yaml
@@ -112,6 +112,7 @@ model:
   bias_dropout_add_fusion: False # Use a kernel that fuses the bias addition, dropout and residual connection addition.
   masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask.
   get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages.
+  apply_rope_fusion: False # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope

   # Miscellaneous

diff --git a/examples/nlp/language_modeling/conf/megatron_starcoder_config.yaml b/examples/nlp/language_modeling/conf/megatron_starcoder_config.yaml
index dd23aa369bdd..355e575a6d59 100644
--- a/examples/nlp/language_modeling/conf/megatron_starcoder_config.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_starcoder_config.yaml
@@ -117,6 +117,7 @@ model:
   bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
   masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask.
   get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages.
+  apply_rope_fusion: False # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope

   # Miscellaneous
   seed: 1234
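
Note (not part of the patch series): a minimal sketch of how the new default resolves, assuming OmegaConf, which NeMo model configs are built on. The helper name resolve_apply_rope_fusion is hypothetical and only mirrors the self.cfg.get('apply_rope_fusion', False) call changed in PATCH 1/2.

from omegaconf import OmegaConf

def resolve_apply_rope_fusion(cfg) -> bool:
    # Mirrors megatron_base_model.py after this patch: a missing key now falls back to False.
    return cfg.get('apply_rope_fusion', False)

# Older config without the key: RoPE fusion is now disabled by default.
legacy_cfg = OmegaConf.create({'position_embedding_type': 'rope'})
assert not resolve_apply_rope_fusion(legacy_cfg)

# Config that opts back in explicitly by setting apply_rope_fusion under model:,
# the same key PATCH 2/2 adds (with False) to the example YAML files.
opt_in_cfg = OmegaConf.create({'position_embedding_type': 'rope', 'apply_rope_fusion': True})
assert resolve_apply_rope_fusion(opt_in_cfg)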