From 33445e5a0980a83d99644c8192ac9a73dda9587e Mon Sep 17 00:00:00 2001 From: Dennis Date: Tue, 26 Nov 2024 09:42:24 -0800 Subject: [PATCH] Update. --- .../nlp/modules/common/megatron/megatron_init.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_init.py b/nemo/collections/nlp/modules/common/megatron/megatron_init.py index 73dbb8404187..6cf51be947d3 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_init.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_init.py @@ -254,6 +254,7 @@ def fake_initialize_model_parallel( pipeline_model_parallel_split_rank_=None, virtual_pipeline_model_parallel_size_=None, expert_model_parallel_size_=1, + expert_tensor_parallel_size_=None, context_parallel_size_=1, encoder_tensor_model_parallel_size_=0, encoder_pipeline_model_parallel_size_=0, @@ -349,7 +350,7 @@ def fake_initialize_model_parallel( decoder_rank_generator = RankGenerator( tp=tensor_model_parallel_size, - ep=expert_model_parallel_size_, + ep=1, dp=data_parallel_size, pp=pipeline_model_parallel_size, cp=context_parallel_size, @@ -357,10 +358,10 @@ def fake_initialize_model_parallel( rank_offset=encoder_world_size, ) # Build expert rank generator - if expert_tensor_parallel_size is None: - expert_tensor_parallel_size = tensor_model_parallel_size + if expert_tensor_parallel_size_ is None: + expert_tensor_parallel_size_ = tensor_model_parallel_size expert_tensor_model_pipeline_parallel_size = ( - expert_tensor_parallel_size * expert_model_parallel_size * pipeline_model_parallel_size + expert_tensor_parallel_size_ * expert_model_parallel_size_ * pipeline_model_parallel_size ) expert_data_parallel_size = decoder_world_size // expert_tensor_model_pipeline_parallel_size if decoder_world_size % expert_tensor_model_pipeline_parallel_size != 0: @@ -370,12 +371,12 @@ def fake_initialize_model_parallel( # TODO: support expert specific ordering expert_decoder_rank_generator = RankGenerator( - tp=expert_tensor_parallel_size, - ep=expert_model_parallel_size, + tp=expert_tensor_parallel_size_, + ep=expert_model_parallel_size_, dp=expert_data_parallel_size, pp=pipeline_model_parallel_size, cp=1, - order=order, + order='tp-pp-dp' if use_tp_pp_dp_mapping else 'tp-cp-ep-dp-pp', rank_offset=encoder_world_size, ) @@ -391,6 +392,7 @@ def generator_wrapper(group_type, is_expert=False, **kwargs): in addition to the default decoder, we essentially instantiate two `RankGenerator` classes to construct the parallelism for each module separately, and we then have to stitch them together for the right groups. For now, this means pp and tp-pp.""" + from itertools import cycle if is_expert: d_ranks = expert_decoder_rank_generator.get_ranks(group_type, **kwargs) else: