From 003a57d65f41d477ff8d8e3243fd0afb69287ba1 Mon Sep 17 00:00:00 2001
From: Chen Cui
Date: Tue, 5 Mar 2024 13:52:05 -0800
Subject: [PATCH 1/2] change rope fusion default

Signed-off-by: Chen Cui
---
 .../nlp/models/language_modeling/megatron_base_model.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
index 8aa1ecf26240..0cdfde17221f 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
@@ -471,7 +471,8 @@ def build_transformer_config(self) -> TransformerConfig:

         bias_dropout_fusion = self.cfg.get('bias_dropout_add_fusion', True)

-        apply_rope_fusion = self.cfg.get('apply_rope_fusion', True)
+        # @chcui default rope fusion to false until #8590 is closed.
+        apply_rope_fusion = self.cfg.get('apply_rope_fusion', False)

         # TODO: need to check if recompute APIs are matching up properly
         recompute_granularity = self.cfg.get('activations_checkpoint_granularity', None)

From 9fef8d45b74ce6c4c9300eb7866ffa08035df637 Mon Sep 17 00:00:00 2001
From: Chen Cui
Date: Tue, 5 Mar 2024 14:51:34 -0800
Subject: [PATCH 2/2] add key to config files

Signed-off-by: Chen Cui
---
 examples/nlp/language_modeling/conf/megatron_falcon_config.yaml | 1 +
 examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 1 +
 examples/nlp/language_modeling/conf/megatron_llama_config.yaml | 1 +
 .../nlp/language_modeling/conf/megatron_starcoder_config.yaml | 1 +
 4 files changed, 4 insertions(+)

diff --git a/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml b/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml
index 0a4b1adc099a..8905abaf3ac2 100644
--- a/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_falcon_config.yaml
@@ -113,6 +113,7 @@ model:
   bias_dropout_add_fusion: False # Use a kernel that fuses the bias addition, dropout and residual connection addition.
   masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask.
   get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages.
+  apply_rope_fusion: False # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope

   # Miscellaneous

diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
index aaa00df2e006..c9f8b8952d5e 100755
--- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
@@ -129,6 +129,7 @@ model:
   bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
   masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask.
   get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages.
+  apply_rope_fusion: False # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope

   # Miscellaneous

diff --git a/examples/nlp/language_modeling/conf/megatron_llama_config.yaml b/examples/nlp/language_modeling/conf/megatron_llama_config.yaml
index 7ba5dd07b781..965b511fc7e7 100644
--- a/examples/nlp/language_modeling/conf/megatron_llama_config.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_llama_config.yaml
@@ -112,6 +112,7 @@ model:
   bias_dropout_add_fusion: False # Use a kernel that fuses the bias addition, dropout and residual connection addition.
   masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask.
   get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages.
+  apply_rope_fusion: False # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope

   # Miscellaneous

diff --git a/examples/nlp/language_modeling/conf/megatron_starcoder_config.yaml b/examples/nlp/language_modeling/conf/megatron_starcoder_config.yaml
index dd23aa369bdd..355e575a6d59 100644
--- a/examples/nlp/language_modeling/conf/megatron_starcoder_config.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_starcoder_config.yaml
@@ -117,6 +117,7 @@ model:
   bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
   masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask.
   get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages.
+  apply_rope_fusion: False # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope

   # Miscellaneous
   seed: 1234
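
Note (not part of the patch series): a minimal sketch of how the new default resolves, assuming OmegaConf, which NeMo model configs are built on. The helper name resolve_apply_rope_fusion is hypothetical and only mirrors the self.cfg.get('apply_rope_fusion', False) call changed in PATCH 1/2.

from omegaconf import OmegaConf

def resolve_apply_rope_fusion(cfg) -> bool:
    # Mirrors megatron_base_model.py after this patch: a missing key now falls back to False.
    return cfg.get('apply_rope_fusion', False)

# Older config without the key: RoPE fusion is now disabled by default.
legacy_cfg = OmegaConf.create({'position_embedding_type': 'rope'})
assert not resolve_apply_rope_fusion(legacy_cfg)

# Config that opts back in explicitly by setting apply_rope_fusion under model:,
# the same key PATCH 2/2 adds (with False) to the example YAML files.
opt_in_cfg = OmegaConf.create({'position_embedding_type': 'rope', 'apply_rope_fusion': True})
assert resolve_apply_rope_fusion(opt_in_cfg)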