fixed CP=1 case

Signed-off-by: Lifu Zhang <[email protected]>
tomlifu · Nov 9, 2024 · 02bccd7 · 02bccd7
1 parent d3e9354
commit 02bccd7
Showing 1 changed file with 8 additions and 8 deletions.
diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py
@@ -650,14 +650,14 @@ def collate_fn(self, batch):
                 cu_seqlens[-1].append(max_length)
 
             for i in range(len(item['seq_boundaries']) - 1):
-                # since the data is prepadded with tokenizer's eos_id, we can find out the index of all the eos_id
-                eos_idx = np.where(
-                    np.array(item['input_ids'][item['seq_boundaries'][i] : item['seq_boundaries'][i + 1] - 1])
-                    == self.tokenizer.eos_id
-                )
-
-                # The second eos_id index marks the length of the original unpadded sequence
-                seqlen_unpadded = eos_idx[0][0] + 1
+                current_seq = item['input_ids'][item['seq_boundaries'][i] : item['seq_boundaries'][i + 1] - 1]
+
+                # Since the data may be prepadded with the tokenizer's eos_id, find the indices of all eos_id tokens.
+                eos_idx = np.where(np.array(current_seq) == self.tokenizer.eos_id)
+
+                # The second eos_id index (the first eos_id index plus one) marks the length of the original unpadded
+                # sequence if the sequence is prepadded for cp_size > 1. Otherwise, there is no extra padding.
+                seqlen_unpadded = eos_idx[0][0] + 1 if eos_idx[0].any() else len(current_seq)
                 cu_seqlens_unpadded[-1].append(cu_seqlens_unpadded[-1][-1] + seqlen_unpadded)
 
             # if extra paddings are added in the packed sequence, they can't be counted as