From ed32115ab3e090aa8ce6822e0a6bff6d294dfbe0 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Mon, 12 Feb 2024 15:06:38 -0700
Subject: [PATCH] Fixes for MoE parameter passing & use of AutoTokenizer/Model
 for mistral. (#8272) (#8342)

Signed-off-by: Alexandros Koumparoulis
Co-authored-by: akoumpa <153118171+akoumpa@users.noreply.github.com>
---
 .../language_modeling/megatron_gpt_model.py   |  8 +++----
 .../convert_hf_mistral_7b_to_nemo.py          | 22 +++++++------------
 .../convert_hf_mixtral_to_nemo.py             |  9 +++++---
 .../convert_starcoder_hf_to_nemo.py           |  1 +
 4 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index d01d1b4ec2a06..3bdc1182dda31 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -1701,7 +1701,7 @@ def build_transformer_config(self) -> TransformerConfig:
             'fp8': fp8,
             'tp_comm_overlap': ub_tp_comm_overlap,
             # MoE related
-            'num_experts': self.cfg.get('num_experts', None),
+            'num_moe_experts': self.cfg.get('num_moe_experts', None),
             'moe_router_load_balancing_type': self.cfg.get('moe_router_load_balancing_type', 'aux_loss'),
             'moe_router_topk': self.cfg.get('moe_router_topk', 2),
             'moe_grouped_gemm': self.cfg.get('moe_grouped_gemm', False),
@@ -1712,11 +1712,11 @@ def build_transformer_config(self) -> TransformerConfig:
             'moe_input_jitter_eps': self.cfg.get('moe_input_jitter_eps', None),
             'moe_token_dropping': self.cfg.get('moe_token_dropping', False),  # TODO: Support token dropping.
         }
-        if model_specific_configs['num_experts'] is not None:
+        if model_specific_configs['num_moe_experts'] is not None:
             assert mcore_supports_moe(), 'Megatron-core >= v0.5.0 is required for MoE'
         elif not mcore_supports_moe():
-            if 'num_experts' in model_specific_configs:
-                del model_specific_configs['num_experts']
+            if 'num_moe_experts' in model_specific_configs:
+                del model_specific_configs['num_moe_experts']
             moe_keys = list(filter(lambda x: x.startswith('moe_'), model_specific_configs.keys()))
             for k in moe_keys:
                 del model_specific_configs[k]
diff --git a/scripts/nlp_language_modeling/convert_hf_mistral_7b_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_mistral_7b_to_nemo.py
index 89bf6cc27088b..b8deebbf0f3e8 100644
--- a/scripts/nlp_language_modeling/convert_hf_mistral_7b_to_nemo.py
+++ b/scripts/nlp_language_modeling/convert_hf_mistral_7b_to_nemo.py
@@ -32,7 +32,7 @@
 from omegaconf import OmegaConf
 from pytorch_lightning.core.saving import _load_state as ptl_load_state
 from pytorch_lightning.trainer.trainer import Trainer
-from sentencepiece import SentencePieceProcessor
+from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
 from nemo.collections.nlp.parts.nlp_overrides import (
@@ -127,23 +127,17 @@ def load_config(mistral_config, tokenizer_path):
     return nemo_config
 
 
-def load_mistral_ckpt(dir):
-    params_file = os.path.join(dir, 'config.json')
+def load_mistral_ckpt(in_dir):
+    params_file = os.path.join(in_dir, 'config.json')
     assert os.path.exists(params_file)
     with open(params_file, 'r') as fp:
         model_args = json.load(fp)
 
-    ckpt = OrderedDict()
-    ckpt['state_dict'] = OrderedDict()
-    for i in range(2):
-        ckpt_file = f'pytorch_model-0000{i+1}-of-00002.bin'
-        ckpt_path = os.path.join(dir, ckpt_file)
-        assert os.path.exists(ckpt_path)
-        ckpt.update(torch.load(ckpt_path))
-    tokenizer_file = os.path.join(dir, 'tokenizer.model')
-    assert os.path.exists(tokenizer_file)
-    tokenizer = SentencePieceProcessor(model_file=tokenizer_file)
-    assert tokenizer.get_piece_size() == model_args['vocab_size']
+    model = AutoModelForCausalLM.from_pretrained(in_dir)
+    ckpt = model.state_dict()
+
+    tokenizer = AutoTokenizer.from_pretrained(in_dir)
+    assert tokenizer.vocab_size == model_args['vocab_size']
     return model_args, ckpt, tokenizer
 
 
diff --git a/scripts/nlp_language_modeling/convert_hf_mixtral_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_mixtral_to_nemo.py
index bed49cf89be70..2f5bc24fae064 100644
--- a/scripts/nlp_language_modeling/convert_hf_mixtral_to_nemo.py
+++ b/scripts/nlp_language_modeling/convert_hf_mixtral_to_nemo.py
@@ -81,6 +81,9 @@ def load_model(cls, checkpoint, strict, **kwargs):
 
     # register the artifacts
     cfg = checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY]
+    assert os.path.exists(
+        cfg.tokenizer.model
+    ), f"Expected cfg.tokenizer.model {cfg.tokenizer.model} to be present"
     if cfg.tokenizer.model is not None:
         model.register_artifact("tokenizer.tokenizer_model", cfg.tokenizer.model)
     if cfg.tokenizer.vocab_file is not None:
@@ -110,8 +113,8 @@ def load_config(mixtral_config, tokenizer_path):
     if 'num_key_value_heads' in mixtral_config:
         nemo_config.num_query_groups = mixtral_config['num_key_value_heads']
 
-    nemo_config.num_experts = int(mixtral_config['num_local_experts'])
-    assert nemo_config.num_experts > 0, "num_experts must be greater than zero."
+    nemo_config.num_moe_experts = int(mixtral_config['num_local_experts'])
+    assert nemo_config.num_moe_experts > 0, "num_experts must be greater than zero."
     nemo_config.moe_router_topk = int(mixtral_config['num_experts_per_tok'])
     assert nemo_config.moe_router_topk > 0, "moe_router_topk must be greater than zero."
     nemo_config.use_cpu_initialization = True
@@ -266,7 +269,7 @@ def convert(args):
             raise Exception("not implemented")
         checkpoint['state_dict'][moe_gate_name] = param_to_weights(moe_gate)
         # Handle experts
-        for i in range(nemo_config.num_experts):
+        for i in range(nemo_config.num_moe_experts):
             gate_proj = ckpt[f'model.layers.{l}.block_sparse_moe.experts.{i}.w1.weight']
             up_proj = ckpt[f'model.layers.{l}.block_sparse_moe.experts.{i}.w3.weight']
             if mcore_gpt:
diff --git a/scripts/nlp_language_modeling/convert_starcoder_hf_to_nemo.py b/scripts/nlp_language_modeling/convert_starcoder_hf_to_nemo.py
index 887d431f03495..6cb0fa4c8b9f1 100644
--- a/scripts/nlp_language_modeling/convert_starcoder_hf_to_nemo.py
+++ b/scripts/nlp_language_modeling/convert_starcoder_hf_to_nemo.py
@@ -194,6 +194,7 @@ def get_new_key(old_key):
     convert_dict = convert_state_dict(state_dict_hf, amp=omega_cfg.megatron_amp_O2)
 
     logging.info("Creating Megatron model...")
+    omega_cfg.cpu_offloading_num_layers = 0
     model = load_state_dict_helper(MegatronGPTModel, omega_cfg, trainer, convert_dict)
     logging.info(f"Created model:\n{model}")
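
Illustration (not part of the patch): a minimal sketch of the Hugging Face loading path that convert_hf_mistral_7b_to_nemo.py switches to with this change. The checkpoint directory below is a placeholder; the real script reads config.json from the same directory and passes the resulting state dict and tokenizer on to the NeMo conversion logic.

# Sketch only: mirrors the new load_mistral_ckpt() flow using AutoModelForCausalLM/AutoTokenizer.
import json
import os

from transformers import AutoModelForCausalLM, AutoTokenizer

in_dir = "/path/to/mistral-7b-hf"  # placeholder checkpoint directory

# Read the HF config the same way the script does.
with open(os.path.join(in_dir, "config.json"), "r") as fp:
    model_args = json.load(fp)

# Let AutoModelForCausalLM load all shards instead of iterating over .bin files by hand.
model = AutoModelForCausalLM.from_pretrained(in_dir)
ckpt = model.state_dict()

# AutoTokenizer replaces the direct SentencePieceProcessor usage.
tokenizer = AutoTokenizer.from_pretrained(in_dir)
assert tokenizer.vocab_size == model_args["vocab_size"]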