
Commit 8e3d195

langmoe
haeggee committed Sep 3, 2024
1 parent f31a1a3 commit 8e3d195
Showing 4 changed files with 623 additions and 8 deletions.
95 changes: 94 additions & 1 deletion src/nanotron/config/models_config.py
@@ -262,4 +262,97 @@ def hidden_act(self):
        return self.activation_function


NanotronConfigs = LlamaConfig | Starcoder2Config | GPT3Config | GPT3MoEConfig
@dataclass
class GPT3LangMoEConfig:
    """Configuration for a GPT3 __MoE__ model with language aware gating"""

    activation_function: str = "gelu"
    attn_pdrop: float = 0.1
    embd_pdrop: float = 0.1
    eos_token_id: int = 49152
    hidden_size: int = 2048
    intermediate_size: Optional[int] = None
    layer_norm_epsilon: float = 1e-05
    max_position_embeddings: int = 4096
    num_attention_heads: int = 16
    num_hidden_layers: int = 24
    resid_pdrop: float = 0.1
    scale_attention_softmax_in_fp32: bool = True
    scale_attn_weights: bool = True
    vocab_size: int = 49280
    sinusoidal_position_embedding: bool = True
    position_embedding_offset: int = 2
    use_spda: bool = False
    act_pdrop: float = 0.0
    scale_embedding: bool = True
    # MoE specific
    is_moe: bool = True
    moe_num_experts: int = 1
    num_experts_per_tok: int = 1
    moe_loss_weight: float = 0.01
    moe_z_loss_weight: float = 0.001
    moe_glu: bool = False

    # Language aware gating
    num_languages: int = 100
    language_embedding_size: int = 128

    def as_gpt3(self) -> GPT3Config:
        config = dict(**vars(self))

        # Moe
        del config["is_moe"]
        del config["moe_num_experts"]
        del config["num_experts_per_tok"]
        del config["moe_loss_weight"]
        del config["moe_z_loss_weight"]
        del config["moe_glu"]

        # language aware gating
        del config["num_languages"]
        del config["language_embedding_size"]

        if "_is_using_mup" in config:
            del config["_is_using_mup"]
        return GPT3Config(**config)

    def as_starcoder2(self) -> Starcoder2Config:
        # same as gpt3 conversion above
        config = dict(**vars(self))
        del config["sinusoidal_position_embedding"]
        del config["use_spda"]
        del config["position_embedding_offset"]
        del config["act_pdrop"]
        del config["scale_embedding"]

        # Moe
        del config["is_moe"]
        del config["moe_num_experts"]
        del config["num_experts_per_tok"]
        del config["moe_loss_weight"]
        del config["moe_z_loss_weight"]
        del config["moe_glu"]

        # language aware gating
        del config["num_languages"]
        del config["language_embedding_size"]

        if "_is_using_mup" in config:
            del config["_is_using_mup"]
        return Starcoder2Config(
            grouped_query=True,
            num_kv_heads=self.num_attention_heads,
            use_rotary_embeddings=False,
            **config,
        )

    @property
    def n_inner(self):
        return self.intermediate_size

    @property
    def hidden_act(self):
        return self.activation_function


NanotronConfigs = LlamaConfig | Starcoder2Config | GPT3Config | GPT3MoEConfig | GPT3LangMoEConfig
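
For reference, a minimal usage sketch (not part of this commit; the field values are illustrative and the import path is inferred from the file location above): instantiate the language-aware MoE config, then convert it with as_gpt3() or as_starcoder2(), which strip the MoE and language-gating fields before building the dense base config.

# Minimal sketch, assuming the import path below; values are illustrative.
from nanotron.config.models_config import GPT3LangMoEConfig

config = GPT3LangMoEConfig(
    hidden_size=2048,
    num_hidden_layers=24,
    moe_num_experts=8,         # route tokens over 8 experts
    num_experts_per_tok=2,     # top-2 gating
    num_languages=100,         # language-aware gating: one entry per language
    language_embedding_size=128,
)

# as_gpt3() copies the fields, deletes the MoE / language-gating keys,
# and constructs a plain GPT3Config from the remainder.
gpt3_config = config.as_gpt3()
assert not hasattr(gpt3_config, "moe_num_experts")

# as_starcoder2() additionally drops the GPT3-only fields and enables
# grouped-query attention with num_kv_heads == num_attention_heads.
sc2_config = config.as_starcoder2()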
