Skip to content

Commit

Permalink
Fix data packing issues: off-by-one in histogram loop bound (include sequences of length truncate_seq_len) and use a default of 1 when context_parallel_size is absent from the model config
Browse files Browse the repository at this point in the history
Signed-off-by: root <[email protected]>
  • Loading branch information
root committed Nov 22, 2024
1 parent 463a478 commit 12de6bb
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 2 deletions.
2 changes: 1 addition & 1 deletion nemo/utils/sequence_packing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def create_hist(dataset: np.array, truncate_seq_len: int):
logging.debug(counts)

histogram = []
for seq_len in range(truncate_seq_len):
for seq_len in range(truncate_seq_len + 1):
histogram.append(len(sequences[seq_len]))

return sequences, histogram
Expand Down
2 changes: 1 addition & 1 deletion scripts/nlp_language_modeling/prepare_packed_ft_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def tokenize_dataset(cfg: 'DictConfig'):
# are identical to normal SFT training
data_cfg = cfg.model.data.train_ds
pad_seq_length_to_mult = 16
cp_size = cfg.model.context_parallel_size
cp_size = cfg.model.get("context_parallel_size", 1)

# if context parallel is used, each individual data length in one packed dataset sample
# needs to be a multiple of (cp_size * 2): https://github.com/NVIDIA/TransformerEngine/pull/641
Expand Down

0 comments on commit 12de6bb

Please sign in to comment.