diff --git a/nemo/utils/sequence_packing_utils.py b/nemo/utils/sequence_packing_utils.py
index 39035f81dab7..2ca03ce44b67 100644
--- a/nemo/utils/sequence_packing_utils.py
+++ b/nemo/utils/sequence_packing_utils.py
@@ -129,7 +129,7 @@ def create_hist(dataset: np.array, truncate_seq_len: int):
     logging.debug(counts)
 
     histogram = []
-    for seq_len in range(truncate_seq_len):
+    for seq_len in range(truncate_seq_len + 1):
         histogram.append(len(sequences[seq_len]))
 
     return sequences, histogram
diff --git a/scripts/nlp_language_modeling/prepare_packed_ft_dataset.py b/scripts/nlp_language_modeling/prepare_packed_ft_dataset.py
index 3852525f0030..42c50fbc2ace 100644
--- a/scripts/nlp_language_modeling/prepare_packed_ft_dataset.py
+++ b/scripts/nlp_language_modeling/prepare_packed_ft_dataset.py
@@ -89,7 +89,7 @@ def tokenize_dataset(cfg: 'DictConfig'):
     # are identical to normal SFT training
     data_cfg = cfg.model.data.train_ds
     pad_seq_length_to_mult = 16
-    cp_size = cfg.model.context_parallel_size
+    cp_size = cfg.model.get("context_parallel_size", 1)
 
     # if context parallel is used, each individual data length in one packed dataset sample
     # needs to be a multiple of (cp_size * 2): https://github.com/NVIDIA/TransformerEngine/pull/641
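
A minimal sketch (not part of the patch) of why both changes matter: the histogram in `create_hist` must cover lengths 0 through `truncate_seq_len` inclusive, so iterating `range(truncate_seq_len + 1)` keeps sequences whose length equals the limit, and `DictConfig.get` lets configs that never set `context_parallel_size` fall back to 1. The sequence lengths and config contents below are made up for illustration.

```python
from collections import defaultdict

from omegaconf import OmegaConf

# Hypothetical sequence lengths; 512 equals the packed sequence length.
seq_lens = [100, 256, 512, 512]
truncate_seq_len = 512

# Bucket sequences by length, mirroring what create_hist does.
sequences = defaultdict(list)
for length in seq_lens:
    sequences[length].append(length)

# range(truncate_seq_len) would stop at 511 and silently drop the two
# sequences of length exactly 512; the +1 keeps them in the histogram.
histogram = [len(sequences[seq_len]) for seq_len in range(truncate_seq_len + 1)]
assert sum(histogram) == len(seq_lens)

# A config that never sets context_parallel_size no longer errors out:
# DictConfig.get() falls back to 1 (i.e. no context parallelism).
cfg = OmegaConf.create({"model": {"data": {}}})
cp_size = cfg.model.get("context_parallel_size", 1)
assert cp_size == 1
```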