diff --git a/nemo/utils/sequence_packing_utils.py b/nemo/utils/sequence_packing_utils.py
index 39035f81dab7..2ca03ce44b67 100644
--- a/nemo/utils/sequence_packing_utils.py
+++ b/nemo/utils/sequence_packing_utils.py
@@ -129,7 +129,7 @@ def create_hist(dataset: np.array, truncate_seq_len: int):
     logging.debug(counts)
 
     histogram = []
-    for seq_len in range(truncate_seq_len):
+    for seq_len in range(truncate_seq_len + 1):
         histogram.append(len(sequences[seq_len]))
 
     return sequences, histogram
diff --git a/scripts/nlp_language_modeling/prepare_packed_ft_dataset.py b/scripts/nlp_language_modeling/prepare_packed_ft_dataset.py
index 3852525f0030..42c50fbc2ace 100644
--- a/scripts/nlp_language_modeling/prepare_packed_ft_dataset.py
+++ b/scripts/nlp_language_modeling/prepare_packed_ft_dataset.py
@@ -89,7 +89,7 @@ def tokenize_dataset(cfg: 'DictConfig'):
     # are identical to normal SFT training
     data_cfg = cfg.model.data.train_ds
     pad_seq_length_to_mult = 16
-    cp_size = cfg.model.context_parallel_size
+    cp_size = cfg.model.get("context_parallel_size", 1)
 
     # if context parallel is used, each individual data length in one packed dataset sample
     # needs to be a multiple of (cp_size * 2): https://github.com/NVIDIA/TransformerEngine/pull/641
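
A minimal sketch (not part of the patch) of why both changes matter: the histogram in `create_hist` must cover lengths 0 through `truncate_seq_len` inclusive, so iterating `range(truncate_seq_len + 1)` keeps sequences whose length equals the limit, and `DictConfig.get` lets configs that never set `context_parallel_size` fall back to 1. The sequence lengths and config contents below are made up for illustration.

```python
from collections import defaultdict

from omegaconf import OmegaConf

# Hypothetical sequence lengths; 512 equals the packed sequence length.
seq_lens = [100, 256, 512, 512]
truncate_seq_len = 512

# Bucket sequences by length, mirroring what create_hist does.
sequences = defaultdict(list)
for length in seq_lens:
    sequences[length].append(length)

# range(truncate_seq_len) would stop at 511 and silently drop the two
# sequences of length exactly 512; the +1 keeps them in the histogram.
histogram = [len(sequences[seq_len]) for seq_len in range(truncate_seq_len + 1)]
assert sum(histogram) == len(seq_lens)

# A config that never sets context_parallel_size no longer errors out:
# DictConfig.get() falls back to 1 (i.e. no context parallelism).
cfg = OmegaConf.create({"model": {"data": {}}})
cp_size = cfg.model.get("context_parallel_size", 1)
assert cp_size == 1
```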