From 0df7e959b5491176019bdce19896ab58fcb2122a Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Thu, 6 Jun 2024 10:55:41 -0700
Subject: [PATCH] disable overlap for qkv (#9079)

* disable overlap for qkv (#9072)

* disable overlap for qkv

Signed-off-by: Rachit Garg

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Rachit Garg
Co-authored-by: Rachit Garg
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* Apply isort and black reformatting

Signed-off-by: michal2409

---------

Signed-off-by: Rachit Garg
Signed-off-by: michal2409
Signed-off-by: Michal Futrega
Co-authored-by: Rachit Garg
Co-authored-by: Rachit Garg
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Michal Futrega
Co-authored-by: michal2409
Co-authored-by: Eric Harper
Signed-off-by: Marc Romeyn
---
 .../common/megatron/adapters/mcore_mixins.py | 32 +++++++++++++++----
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py
index fe9e900f4ad0..a85c155cc0a8 100644
--- a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py
+++ b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py
@@ -80,7 +80,7 @@ def mcore_register_adapters(self):
         if (
             self.config.sequence_parallel
             and hasattr(self.linear_qkv, "return_layernorm_output_gathered")
-            and not self.config.tp_comm_overlap
+            and not self.linear_qkv.ub_overlap_ag
         ):
             # for LoRA SP, TE v1.5 can return layernorm output gathered so there is no need
             # to perform the redundant gather in the adapter module, unless TP communication
@@ -142,11 +142,19 @@ def get_query_key_value_tensors(self, hidden_states, key_value_states=None):
         if SplitAlongDim is not None:
 
             # [sq, b, ng, (np/ng + 2) * hn] --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn]
-            (query, key, value) = SplitAlongDim(mixed_qkv, 3, split_arg_list,)
+            (query, key, value) = SplitAlongDim(
+                mixed_qkv,
+                3,
+                split_arg_list,
+            )
         else:
 
             # [sq, b, ng, (np/ng + 2) * hn] --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn]
-            (query, key, value) = torch.split(mixed_qkv, split_arg_list, dim=3,)
+            (query, key, value) = torch.split(
+                mixed_qkv,
+                split_arg_list,
+                dim=3,
+            )
 
         # [sq, b, ng, np/ng * hn] -> [sq, b, np, hn]
         query = query.reshape(query.size(0), query.size(1), -1, self.hidden_size_per_attention_head)
@@ -231,11 +239,21 @@ def forward(
 
         if self.checkpoint_core_attention:
             core_attn_out = self._checkpointed_attention_forward(
-                query, key, value, attention_mask, attn_mask_type=attn_mask_type, packed_seq_params=packed_seq_params,
+                query,
+                key,
+                value,
+                attention_mask,
+                attn_mask_type=attn_mask_type,
+                packed_seq_params=packed_seq_params,
             )
         else:
             core_attn_out = self.core_attention(
-                query, key, value, attention_mask, attn_mask_type=attn_mask_type, packed_seq_params=packed_seq_params,
+                query,
+                key,
+                value,
+                attention_mask,
+                attn_mask_type=attn_mask_type,
+                packed_seq_params=packed_seq_params,
             )
 
         if packed_seq_params is not None:
@@ -316,7 +334,9 @@ def forward(self, hidden_states):
                     intermediate_parallel = bias_gelu_impl(intermediate_parallel, bias_parallel)
             elif self.activation_func == F.silu and self.config.gated_linear_unit:
                 intermediate_parallel = bias_swiglu_impl(
-                    intermediate_parallel, bias_parallel, self.config.activation_func_fp8_input_store,
+                    intermediate_parallel,
+                    bias_parallel,
+                    self.config.activation_func_fp8_input_store,
                 )
             else: