forked from swiss-ai/nanotron
* Converters ready
* Added xglm transformers implementation

Co-authored-by: Negar Foroutan <[email protected]>
1 parent 328b8c2 · commit 3bce1f4
Showing 6 changed files with 1,596 additions and 1 deletion.
examples/xglm/convert_ntmoe2hf.py
@@ -0,0 +1,140 @@
""" | ||
Converts a nanotron moe model to HF format | ||
Command: | ||
torchrun --nproc-per-node=1 convert_nt2hf.py --checkpoint-path=nanotron_weights --save-path=hf_weights | ||
""" | ||
|
||
import warnings
from argparse import ArgumentParser
from pathlib import Path
from typing import Optional

import torch
from transformers import AutoTokenizer
from tqdm import tqdm

from nanotron.config.models_config import GPT3MoEConfig
from nanotron.models.gpt3_moe import GPT3MoEForTraining, GPT3MoEBlock
from nanotron.models.moe import dMoE, SparseMLP, LearnedRouter

from examples.xglm.convert_dense2moe import create_nt_moe_model
from examples.xglm.convert_nt2hf import convert_attention
from examples.xglm.convert_utils import convert_generic
from examples.xglm.transformers_impl.xglm_model import XGLMForCausalLM, XGLMDecoderLayer, XGLMmoeConfig, XGLMSparseMoeBlock, XGLMMLP
from examples.xglm.transformers_impl.gating import BasicGate


def convert_config(config: GPT3MoEConfig) -> XGLMmoeConfig:
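    """Translate a nanotron GPT3MoEConfig into the HF-style XGLMmoeConfig.

    Settings the XGLM implementation cannot represent (mismatched embd/resid dropout,
    a non-1e-5 layer-norm epsilon, a nonzero z-loss weight) only trigger warnings;
    GLU experts are rejected outright.
    """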
    if config.embd_pdrop != config.resid_pdrop:
        warnings.warn(
            f"nanotron.embd_pdrop = {config.embd_pdrop} does not match "
            f"nanotron.resid_pdrop = {config.resid_pdrop}. "
            "The XGLM implementation needs these two values to be equal "
            "for correct conversion."
        )
    if config.layer_norm_epsilon != 1e-5:
        warnings.warn(f"nanotron.layer_norm_epsilon must be 1e-5, not {config.layer_norm_epsilon}")
    if config.moe_z_loss_weight != 0:
        warnings.warn("The transformers implementation does not support z-loss")
    assert not config.moe_glu, "The transformers implementation does not support GLU MLP layers"

    return XGLMmoeConfig(
        # Regular XGLM config.
        activation_function=config.activation_function,
        attention_dropout=config.attn_pdrop,
        dropout=config.embd_pdrop,
        eos_token_id=config.eos_token_id,
        d_model=config.hidden_size,
        ffn_dim=config.intermediate_size,
        max_position_embeddings=config.max_position_embeddings,
        attention_heads=config.num_attention_heads,
        num_layers=config.num_hidden_layers,
        vocab_size=config.vocab_size,
        decoder_start_token_id=config.position_embedding_offset,
        activation_dropout=config.act_pdrop,
        scale_embedding=config.scale_embedding,
        # MoE specifics.
        num_local_experts=config.moe_num_experts,
        num_experts_per_tok=config.num_experts_per_tok,
        gate_type="linear",
        gate_depth=1,
        router_aux_loss_coef=config.moe_loss_weight,
    )


def convert_mlp(mlp_hf: XGLMMLP, mlp_nt: SparseMLP):
    convert_generic(mlp_hf.fc1, mlp_nt.w1.module)
    convert_generic(mlp_hf.fc2, mlp_nt.w2.module)


def convert_gate(gate_hf: BasicGate, gate_nt: LearnedRouter):
    convert_generic(gate_hf.gate, gate_nt.layer)


def convert_ff(ff_hf: XGLMSparseMoeBlock, ff_nt: dMoE):
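    """Copy router and expert weights from a nanotron dMoE block into an XGLMSparseMoeBlock.

    nanotron keeps the weights of all experts concatenated in a single parameter
    (one intermediate_size-wide slice per expert, with a transposed layout when there is
    more than one expert, as the assertions below encode), so HF expert ``i`` receives the
    slice ``[i * intermediate_size : (i + 1) * intermediate_size]``.
    """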
    convert_gate(ff_hf.gate, ff_nt.gate)
    int_size = ff_nt.config.intermediate_size
    if len(ff_hf.experts) == 1:
        assert ff_nt.experts.mlp.w1.module.weight.shape == (int_size*len(ff_hf.experts), ff_nt.config.hidden_size)
        assert ff_nt.experts.mlp.w2.module.weight.shape == (ff_nt.config.hidden_size, int_size*len(ff_hf.experts))
    else:
        assert ff_nt.experts.mlp.w1.module.weight.T.shape == (int_size*len(ff_hf.experts), ff_nt.config.hidden_size)
        assert ff_nt.experts.mlp.w2.module.weight.shape == (int_size*len(ff_hf.experts), ff_nt.config.hidden_size)

    for i, expert_hf in enumerate(ff_hf.experts):
        i0 = i*int_size
        i1 = (i + 1)*int_size
        with torch.no_grad():
            if len(ff_hf.experts) == 1:
                expert_hf.fc1.weight.copy_(ff_nt.experts.mlp.w1.module.weight[i0:i1, :].clone())
                expert_hf.fc2.weight.copy_(ff_nt.experts.mlp.w2.module.weight[:, i0:i1].clone())
            else:
                expert_hf.fc1.weight.copy_(ff_nt.experts.mlp.w1.module.weight.T[i0:i1, :].clone())
                expert_hf.fc2.weight.copy_(ff_nt.experts.mlp.w2.module.weight[i0:i1, :].T.clone())


def convert_decoder(block_hf: XGLMDecoderLayer, block_nt: GPT3MoEBlock):
    convert_generic(block_hf.self_attn_layer_norm, block_nt.ln_1)
    convert_attention(block_hf.self_attn, block_nt.attn)
    convert_generic(block_hf.final_layer_norm, block_nt.ln_2)
    convert_ff(block_hf.block_sparse_moe, block_nt.ff)


def convert(model_hf: XGLMForCausalLM, model_nt: GPT3MoEForTraining):
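    """Copy every weight of the nanotron GPT3MoE model into the HF XGLM MoE model:
    token embeddings, each decoder layer, the final layer norm and the LM head."""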
    convert_generic(model_hf.model.embed_tokens, model_nt.model.token_embeddings.pp_block.token_embedding)
    for layer_hf, layer_nt in tqdm(
        zip(model_hf.model.layers, model_nt.model.decoder),
        desc="Converting layers",
        total=model_nt.config.num_hidden_layers,
    ):
        convert_decoder(layer_hf, layer_nt.pp_block)
    convert_generic(model_hf.model.layer_norm, model_nt.model.final_layer_norm.pp_block)
    convert_generic(model_hf.lm_head, model_nt.model.lm_head.pp_block)


def main(checkpoint_path: Path, save_path: Path, tokenizer_name: Optional[str]):
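    """Load the nanotron MoE checkpoint, build an empty HF model from the converted config,
    copy the weights across and save the result (plus the tokenizer, if one is given)."""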
    # Load nanotron model.
    model_nt = create_nt_moe_model(checkpoint_path=checkpoint_path)

    # Init huggingface model.
    model_config_hf = convert_config(model_nt.config)
    model_hf = XGLMForCausalLM._from_config(model_config_hf)

    # Copy weights, initialize tokenizer and save model.
    if tokenizer_name is not None:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        tokenizer.save_pretrained(save_path)
    convert(model_hf, model_nt)
    print("Saving...")
    model_hf.save_pretrained(save_path)
    print(f"Model saved to {save_path}")


if __name__ == "__main__":
    parser = ArgumentParser(description="Convert nanotron MoE weights to HF format")
    parser.add_argument(
        "--checkpoint-path", type=Path, default="checkpoints/xglm-7.5B", help="Path to the nanotron checkpoint"
    )
    parser.add_argument(
        "--save-path", type=Path, default="facebook/xglm-7.5B", help="Path to save the huggingface model"
    )
    parser.add_argument("--tokenizer-name", type=str, default="facebook/xglm-7.5B")
    args = parser.parse_args()
    main(args.checkpoint_path, args.save_path, args.tokenizer_name)
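

# Example invocation (a sketch: the script path, checkpoint path, save path and tokenizer name
# below are placeholders, and loading the result back assumes the custom XGLMForCausalLM keeps
# the standard `from_pretrained` API of transformers):
#
#   torchrun --nproc-per-node=1 examples/xglm/convert_ntmoe2hf.py \
#       --checkpoint-path=checkpoints/xglm-moe --save-path=hf_weights --tokenizer-name=facebook/xglm-564M
#
#   >>> from examples.xglm.transformers_impl.xglm_model import XGLMForCausalLM
#   >>> model = XGLMForCausalLM.from_pretrained("hf_weights")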
@@ -0,0 +1,182 @@
import torch
import pytest

import nanotron
from nanotron.config.parallelism_config import ParallelismArgs
from nanotron.config.models_config import GPT3MoEConfig
from nanotron.parallel import ParallelContext
from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer
from nanotron.trainer import mark_tied_parameters
from nanotron.models.gpt3_moe import GPT3MoEBlock, GPT3MoEForTraining
from nanotron.models.moe import LearnedRouter, dMoE

from tests.helpers.utils import init_distributed

from examples.xglm.convert_ntmoe2hf import convert_config, convert_gate, convert_ff, convert
from examples.xglm.tests.test_implementation import almost_close
from examples.xglm.transformers_impl.xglm_model import XGLMSparseMoeBlock, XGLMForCausalLM
from examples.xglm.transformers_impl.gating import BasicGate


MAX_SEQUENCE_LENGTH = 2048
TEST_SEQUENCE_LENGTH = 128  # With very long sequences, precision errors grow regardless of implementation correctness.
# TEST_SEQUENCE_LENGTH = MAX_SEQUENCE_LENGTH
BATCH_SIZE = 4
HIDDEN_SIZE = 1024
# DTYPE = torch.bfloat16
DTYPE = torch.float32
TEXT = "Hello. This is a relatively long text. I will use this text to test the conversion scripts. Let's finish this text soon because I don't have much more to say. Final note:"

CONFIG = GPT3MoEConfig(
    attn_pdrop=0.0,
    embd_pdrop=0.0,
    resid_pdrop=0.0,
    act_pdrop=0.0,
    eos_token_id=2,
    hidden_size=HIDDEN_SIZE,
    intermediate_size=4096,
    layer_norm_epsilon=1e-05,
    max_position_embeddings=MAX_SEQUENCE_LENGTH,
    num_attention_heads=16,
    num_hidden_layers=24,
    scale_attn_weights=True,
    vocab_size=256008,
    sinusoidal_position_embedding=True,
    position_embedding_offset=2,
    use_spda=DTYPE is not torch.bfloat16,
    # vvv MoE vvv
    is_moe=True,
    moe_num_experts=8,
    num_experts_per_tok=2,
    moe_loss_weight=0.01,
    moe_z_loss_weight=0.0,
    moe_glu=False,
)
PARALLEL_CONFIG = ParallelismArgs(dp=1, pp=1, tp=1, expert_parallel_size=1)  # expert_parallel_size could also be CONFIG.moe_num_experts
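
# The tests below compare nanotron's GPT3MoE modules against the converted HF-style XGLM MoE
# modules at three levels: the learned router on its own, a sparse-MoE feed-forward block for
# several (num_experts, num_experts_per_tok) combinations, and the complete causal LM.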


@pytest.fixture
def hidden_states() -> torch.Tensor:
    return torch.randn(TEST_SEQUENCE_LENGTH, BATCH_SIZE, HIDDEN_SIZE, dtype=DTYPE)


@pytest.fixture
def input_mask() -> torch.Tensor:
    return torch.ones(BATCH_SIZE, TEST_SEQUENCE_LENGTH, dtype=torch.bool)


@pytest.fixture
def input_ids() -> torch.Tensor:
    return torch.randint(0, CONFIG.vocab_size, (BATCH_SIZE, TEST_SEQUENCE_LENGTH))


def _test_nt2hf_gate(parallel_context: ParallelContext, hidden_states: torch.Tensor):
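    """After convert_gate, the nanotron LearnedRouter and the HF BasicGate should produce the same
    router logits; the two implementations only differ in layout (sequence-major vs. batch-major)."""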
    hidden_states = hidden_states.cuda()

    config_hf = convert_config(CONFIG)
    gate_nt = LearnedRouter(CONFIG).cuda().to(DTYPE)
    gate_hf = BasicGate(config_hf).cuda().to(DTYPE)
    convert_gate(gate_hf, gate_nt)

    router_logits_nt, _, _ = gate_nt(hidden_states.view(-1, HIDDEN_SIZE))
    router_logits_hf = gate_hf(hidden_states.permute(1, 0, 2).reshape(-1, HIDDEN_SIZE), "")

    router_logits_nt = router_logits_nt.view(TEST_SEQUENCE_LENGTH, BATCH_SIZE, -1)
    router_logits_hf = router_logits_hf.view(BATCH_SIZE, TEST_SEQUENCE_LENGTH, -1).permute(1, 0, 2)

    assert router_logits_nt.size() == router_logits_hf.size()
    torch.testing.assert_close(router_logits_nt, router_logits_hf)


def test_nt2hf_gate(hidden_states: torch.Tensor):
    init_distributed(tp=1, dp=1, pp=1)(_test_nt2hf_gate)(hidden_states=hidden_states)


def _test_nt2hf_ff(
    parallel_context: ParallelContext,
    hidden_states: torch.Tensor,
    num_experts: int,
    num_experts_per_tok: int,
):
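    """Build a nanotron dMoE block and an XGLMSparseMoeBlock with the given expert counts,
    convert the weights, and check that the two blocks produce matching outputs within the
    loose tolerance of almost_close (rather than exact equality)."""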
    hidden_states = hidden_states.cuda()

    config = {**vars(CONFIG)}
    config.update({"moe_num_experts": num_experts, "num_experts_per_tok": num_experts_per_tok})
    config = GPT3MoEConfig(**config)
    config_hf = convert_config(config)
    ff_nt = dMoE(config, parallel_context, PARALLEL_CONFIG).cuda().to(DTYPE)
    ff_hf = XGLMSparseMoeBlock(config_hf).cuda().to(DTYPE)
    convert_ff(ff_hf, ff_nt)

    out_nt = ff_nt(hidden_states)["hidden_states"]
    out_hf, _ = ff_hf(hidden_states.permute(1, 0, 2).contiguous(), "")
    out_hf = out_hf.permute(1, 0, 2)

    assert out_nt.size() == out_hf.size()
    almost_close(out_nt, out_hf, max_far=0.05, far_atol=0.003)


@pytest.mark.parametrize("num_experts,num_experts_per_tok", [(1, 1), (2, 1), (4, 1), (4, 2), (8, 1), (8, 2), (8, 4)])
def test_nt2hf_ff(hidden_states: torch.Tensor, num_experts: int, num_experts_per_tok: int):
    init_distributed(tp=1, dp=1, pp=1)(_test_nt2hf_ff)(
        hidden_states=hidden_states, num_experts=num_experts, num_experts_per_tok=num_experts_per_tok
    )


def _test_nt2hf_model(parallel_context: ParallelContext, input_ids: torch.Tensor, input_mask: torch.Tensor):
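    """Build the full nanotron GPT3MoE model, convert it into the HF XGLM MoE model, run both on
    the same inputs and return both logit tensors (in nanotron's sequence-major layout)."""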
    random_states = nanotron.random.RandomStates({"tp_synced": nanotron.random.get_current_random_state()})
    input_ids = input_ids.cuda()
    input_mask = input_mask.cuda()

    # Unfortunately, we can't use float64 with the huggingface XGLM implementation.
    new_dtype = torch.float32 if DTYPE == torch.float64 else DTYPE

    # Get nanotron model.
    config_nt = GPT3MoEConfig(**vars(CONFIG))
    if new_dtype not in {torch.bfloat16, torch.float16}:
        config_nt.use_spda = True
    model_nt = nanotron.models.build_model(
        model_builder=lambda: GPT3MoEForTraining(
            config=config_nt,
            parallel_context=parallel_context,
            parallel_config=None,
            random_states=random_states,
        ),
        parallel_context=parallel_context,
        dtype=new_dtype,
        device="cuda",
    ).eval()
    mark_tied_parameters(model=model_nt, parallel_context=parallel_context)

    # Create empty model_hf and make conversion.
    model_hf = XGLMForCausalLM(convert_config(config_nt)).cuda().to(new_dtype).eval()
    convert(model_hf, model_nt)

    # The nanotron forward pass expects these auxiliary loss buffers. input_ids is always a
    # regular tensor here, so plain zero tensors are enough (no TensorPointer handling needed).
    aux_losses = {
        "load_balancing_loss": torch.zeros(1, device=input_ids.device),
        "z_loss": torch.zeros(1, device=input_ids.device),
    }

    # Get outputs and assert.
    with torch.no_grad():
        out_nt = model_nt.model(input_ids, input_mask, aux_losses)["sharded_logits"].to(new_dtype)
        del model_nt
        torch.cuda.empty_cache()
        out_hf = model_hf(input_ids=input_ids, attention_mask=input_mask, output_router_logits=False).logits.permute(1, 0, 2)
        del model_hf
        torch.cuda.empty_cache()
    assert out_nt.size() == out_hf.size(), f"{out_nt.size()}, {out_hf.size()}"
    return out_nt.cpu(), out_hf.cpu()


def _test_nt2hf_dummy_xglm(parallel_context: ParallelContext, input_ids: torch.Tensor, input_mask: torch.Tensor):
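    """End-to-end check on random token ids: the converted HF model should reproduce the nanotron
    logits, with at most ~1% of positions allowed to deviate beyond the absolute tolerance."""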
    out_nt, out_hf = _test_nt2hf_model(parallel_context, input_ids, input_mask)
    # We allow fewer than 1% of the logits to differ, but those that do can differ by a lot.
    almost_close(out_nt, out_hf, max_far=0.01, far_atol=2.0)
    # torch.testing.assert_close(out_nt.bfloat16(), out_hf.bfloat16())


def test_nt2hf_dummy_xglm(input_ids: torch.Tensor, input_mask: torch.Tensor):
    init_distributed(tp=1, dp=1, pp=1)(_test_nt2hf_dummy_xglm)(input_ids=input_ids, input_mask=input_mask)