From 5352a60402872a0193745df044eeb6ca1ba65ea7 Mon Sep 17 00:00:00 2001
From: dimapihtar
Date: Wed, 15 May 2024 13:15:31 -0700
Subject: [PATCH] add num_layers & pp == 0 assertion

Signed-off-by: dimapihtar
---
 .../nlp/models/language_modeling/megatron_gpt_model.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index 3660a5145b10..cd15437c1431 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -322,6 +322,12 @@ def __init__(self, cfg: DictConfig, trainer: Trainer):
                 'Expert parallelism is currently not supporting Apex distributed optimizer, use Mcore distributed optimizer instead'
             )
 
+        if self.cfg.get('num_layers', 12) % self.cfg.get('pipeline_model_parallel_size', 1) != 0:
+            raise ValueError(
+                f"num_layers ({self.cfg.get('num_layers', 12)}) should be divisible by "
+                f"pipeline_model_parallel_size ({self.cfg.get('pipeline_model_parallel_size', 1)})"
+            )
+
         self.transformer_engine = cfg.get('transformer_engine', False)
         if self.megatron_amp_O2 and not self.transformer_engine:
             logging.warning('megatron_amp_O2 is enabled but transformer-engine is not.')
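
Reviewer note: the hunk above rejects configs where the transformer layers cannot be split evenly across pipeline stages. Below is a minimal, standalone sketch of the same logic, not part of the patch; the helper name validate_layer_partitioning and the plain-dict cfg are illustrative stand-ins for the Hydra/OmegaConf config, and the defaults (12 layers, pipeline size 1) mirror the cfg.get() fallbacks used in the patch.

# Illustrative sketch (not part of the patch): each pipeline stage must own the
# same number of transformer layers, so num_layers must divide evenly by
# pipeline_model_parallel_size.
def validate_layer_partitioning(cfg: dict) -> None:
    num_layers = cfg.get('num_layers', 12)
    pp_size = cfg.get('pipeline_model_parallel_size', 1)
    if num_layers % pp_size != 0:
        raise ValueError(
            f"num_layers ({num_layers}) should be divisible by "
            f"pipeline_model_parallel_size ({pp_size})"
        )


validate_layer_partitioning({'num_layers': 24, 'pipeline_model_parallel_size': 4})  # ok: 6 layers per stage
try:
    validate_layer_partitioning({'num_layers': 24, 'pipeline_model_parallel_size': 5})
except ValueError as err:
    print(err)  # num_layers (24) should be divisible by pipeline_model_parallel_size (5)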