Skip to content

Commit

Permalink
Fix data packing issues: off-by-one in histogram loop bound (include sequences of length truncate_seq_len) and use a default of 1 when context_parallel_size is absent from the model config
Browse files Browse the repository at this point in the history
Signed-off-by: root <[email protected]>
  • Loading branch information
root committed Nov 22, 2024
1 parent 463a478 commit 12de6bb
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 2 deletions.
2 changes: 1 addition & 1 deletion nemo/utils/sequence_packing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def create_hist(dataset: np.array, truncate_seq_len: int):
logging.debug(counts)

histogram = []
for seq_len in range(truncate_seq_len):
for seq_len in range(truncate_seq_len + 1):
histogram.append(len(sequences[seq_len]))

return sequences, histogram
Expand Down
2 changes: 1 addition & 1 deletion scripts/nlp_language_modeling/prepare_packed_ft_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def tokenize_dataset(cfg: 'DictConfig'):
# are identical to normal SFT training
data_cfg = cfg.model.data.train_ds
pad_seq_length_to_mult = 16
cp_size = cfg.model.context_parallel_size
cp_size = cfg.model.get("context_parallel_size", 1)

# if context parallel is used, each individual data length in one packed dataset sample
# needs to be a multiple of (cp_size * 2): https://github.com/NVIDIA/TransformerEngine/pull/641
Expand Down

0 comments on commit 12de6bb

Please sign in to comment.