Add Bert HF checkpoint converter (#8088)
* Add Bert HF checkpoint converter

Signed-off-by: yaoyu-33 <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Reformat

Signed-off-by: yaoyu-33 <[email protected]>

* Add BERT ONNX export

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Add NeMo BERT to HF BERT script

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Clean code

Signed-off-by: yaoyu-33 <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update argument names

Signed-off-by: yaoyu-33 <[email protected]>

* Update build_transformer_config in Bert

Signed-off-by: yaoyu-33 <[email protected]>

---------

Signed-off-by: yaoyu-33 <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Bobby Chen <[email protected]>
3 people authored Jan 31, 2024
1 parent 7b2415a commit 85d8756
Showing 7 changed files with 745 additions and 4 deletions.
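For orientation only (this is not the converter script added by this commit): an HF-to-NeMo checkpoint converter essentially remaps parameter names between the two state-dict layouts and re-saves the weights. A minimal hypothetical sketch, with made-up key patterns standing in for the real ones:

import re
from typing import Dict

def rename_hf_bert_keys(hf_state_dict: Dict[str, object]) -> Dict[str, object]:
    """Rename Hugging Face-style BERT keys to a Megatron-style layout (illustrative only)."""
    rules = [
        # (HF pattern, Megatron-style replacement) -- assumed names, not NeMo's real ones
        (r"^bert\.embeddings\.word_embeddings\.weight$",
         "language_model.embedding.word_embeddings.weight"),
        (r"^bert\.encoder\.layer\.(\d+)\.attention\.output\.dense\.(weight|bias)$",
         r"language_model.encoder.layers.\1.self_attention.dense.\2"),
    ]
    converted = {}
    for name, tensor in hf_state_dict.items():
        new_name = name
        for pattern, replacement in rules:
            new_name = re.sub(pattern, replacement, new_name)
        converted[new_name] = tensor
    return converted

A real converter typically also has to deal with details such as fused QKV weights and vocabulary padding (cf. make_vocab_size_divisible_by in the config below), which this sketch ignores.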
8 changes: 6 additions & 2 deletions examples/nlp/language_modeling/conf/megatron_bert_config.yaml
@@ -2,7 +2,7 @@ name: megatron_bert
restore_from_path: null # used when starting from a .nemo file

trainer:
devices: 2
devices: 1
num_nodes: 1
accelerator: gpu
precision: 16
@@ -56,15 +56,19 @@ model:
hidden_size: 768
ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size.
num_attention_heads: 12
skip_head: False
transformer_block_type: post_ln
init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.')
hidden_dropout: 0.1 # Dropout probability for hidden state transformer.
kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
normalization: layernorm
layernorm_epsilon: 1e-5
make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.
pre_process: True # add embedding
post_process: True # add pooler
bert_binary_head: True # BERT binary head
megatron_legacy: False

tokenizer:
library: 'megatron'
@@ -128,7 +132,7 @@ model:
# - /raid/data/pile/my-gpt3_00_text_document
# - .5
# - /raid/data/pile/my-gpt3_01_text_document
data_prefix: ???
data_prefix: [1.0, /path/to/data]
index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix
data_impl: mmap
splits_string: 900,50,50
@@ -65,6 +65,9 @@ def bert_extended_attention_mask(attention_mask):
# [b, 1, s, s]
extended_attention_mask = attention_mask_bss.unsqueeze(1)

# HF Masking is equivalent to the one below
# extended_attention_mask = (attention_mask.unsqueeze(1) * torch.ones_like(attention_mask).unsqueeze(2)).unsqueeze(1)

# Convert attention mask to binary:
extended_attention_mask = extended_attention_mask < 0.5
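A quick standalone check of the equivalence noted in the comment above (a sketch, assuming a 0/1 padding mask of shape [batch, seq] as in the surrounding code):

import torch

attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])  # [b, s], 1 = real token

# NeMo-style: outer product masks both query rows and key columns.
nemo_mask = (attention_mask.unsqueeze(1) * attention_mask.unsqueeze(2)).unsqueeze(1) < 0.5

# HF-style (the commented-out variant): only key columns are masked.
hf_mask = (attention_mask.unsqueeze(1) * torch.ones_like(attention_mask).unsqueeze(2)).unsqueeze(1) < 0.5

# The two agree on every query position that is itself a real token, which is all
# that matters once padded positions are excluded from the loss.
valid = attention_mask.bool()
for b in range(attention_mask.size(0)):
    assert torch.equal(nemo_mask[b, 0][valid[b]], hf_mask[b, 0][valid[b]])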

@@ -182,12 +185,15 @@ def __init__(
activations_checkpoint_num_layers=1,
activations_checkpoint_layers_per_pipeline=None,
layernorm_epsilon=1e-5,
normalization='layernorm',
transformer_block_type='pre_ln',
masked_softmax_fusion=False,
bias_gelu_fusion=True,
bias_dropout_add_fusion=True,
openai_gelu=False,
onnx_safe=False,
add_binary_head=True,
skip_head=False,
megatron_legacy=False,
sequence_parallel=False,
position_embedding_type='learned_absolute',
@@ -229,6 +235,8 @@ def __init__(
activations_checkpoint_num_layers=activations_checkpoint_num_layers,
activations_checkpoint_layers_per_pipeline=activations_checkpoint_layers_per_pipeline,
layernorm_epsilon=layernorm_epsilon,
normalization=normalization,
transformer_block_type=transformer_block_type,
masked_softmax_fusion=masked_softmax_fusion,
bias_activation_fusion=bias_gelu_fusion,
bias_dropout_add_fusion=bias_dropout_add_fusion,
@@ -242,6 +250,8 @@
init_method=init_method_normal(init_method_std), vocab_size=vocab_size, hidden_size=hidden_size
)

if skip_head:
self.post_process = False
if self.post_process:
self.lm_head = BertLMHead(
config,
@@ -184,10 +184,13 @@ def model_provider_func(self, pre_process, post_process):
),
layernorm_epsilon=cfg.get('layernorm_epsilon', 1e-5),
masked_softmax_fusion=cfg.get('masked_softmax_fusion', True),
normalization=cfg.get('normalization', 'layernorm'),
transformer_block_type=cfg.get('transformer_block_type', 'pre_ln'),
bias_gelu_fusion=cfg.get('bias_gelu_fusion', True),
bias_dropout_add_fusion=cfg.get("bias_dropout_add_fusion", True),
onnx_safe=cfg.get('onnx_safe', False),
add_binary_head=cfg.bert_binary_head,
skip_head=cfg.get('skip_head', False),
megatron_legacy=cfg.get('megatron_legacy', False),
position_embedding_type=self.cfg.get("position_embedding_type", "learned_absolute"),
)
@@ -1034,5 +1037,65 @@ def build_transformer_config(self) -> TransformerConfig:
"""
activation = self.cfg.get('activation', 'gelu')
assert activation == 'gelu', "Only gelu activation is supported for BERT at the moment."

normalization = self.cfg.get('normalization', 'layernorm')

layernorm_zero_centered_gamma = self.cfg.get('normalization', 'layernorm') == 'layernorm1p'
if normalization == 'layernorm':
normalization = 'LayerNorm'
elif normalization == 'rmsnorm':
normalization = 'RMSNorm'
elif normalization == 'layernorm1p':
normalization = 'LayerNorm'
layernorm_zero_centered_gamma = True
else:
logging.warning(
f"The normalization type: {normalization} might not be supported in megatron core."
f"Supported types are LayerNorm and RMSNorm."
)

# any configs that are not in the nemo model config will be added here
model_specific_configs = {
'layernorm_zero_centered_gamma': layernorm_zero_centered_gamma,
'normalization': normalization,
}

transformer_config = super().build_transformer_config()

for key, value in model_specific_configs.items():
setattr(transformer_config, key, value)

# pass mcore customization configs directly to mcore
mcore_customization_config_dict = self.cfg.get('mcore_customization_config', {})
for key, value in mcore_customization_config_dict.items():
setattr(transformer_config, key, value)

return transformer_config
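For readability, the normalization handling above amounts to a small lookup from NeMo config names to Megatron-Core settings; the helper below is an illustrative restatement, not code from this commit:

def map_normalization(nemo_name: str):
    """Return (mcore_normalization, layernorm_zero_centered_gamma) for a NeMo normalization name."""
    table = {
        'layernorm':   ('LayerNorm', False),
        'rmsnorm':     ('RMSNorm', False),
        'layernorm1p': ('LayerNorm', True),  # LayerNorm with zero-centered gamma
    }
    # Unknown names pass through unchanged, mirroring the warning branch above.
    return table.get(nemo_name, (nemo_name, False))

assert map_normalization('layernorm1p') == ('LayerNorm', True)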


class MegatronBertTextEmbeddingModel(MegatronBertModel):
"""
Megatron Bert Text Embedding.
Model returns [batch, hidden] shape
"""

def average_pool(self, last_hidden_states, attention_mask):
last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

def forward(
self,
input_ids,
attention_mask,
token_type_ids,
lm_labels=None,
checkpoint_activations_all_layers=None,
model=None,
):
outputs = super().forward(
input_ids, attention_mask, token_type_ids, lm_labels, checkpoint_activations_all_layers, model
)
embeddings = self.average_pool(outputs[0], attention_mask)
embeddings = F.normalize(embeddings, p=2, dim=1)

return embeddings
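A standalone sketch of the pooling this class performs, using dummy tensors in place of the model's forward output:

import torch
import torch.nn.functional as F

last_hidden_states = torch.randn(2, 4, 8)       # [batch, seq, hidden]
attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])   # 1 = real token, 0 = padding

# Zero out padded positions, then average over the real tokens only.
masked = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
embeddings = masked.sum(dim=1) / attention_mask.sum(dim=1)[..., None]  # [batch, hidden]
embeddings = F.normalize(embeddings, p=2, dim=1)                       # unit-length embeddings
print(embeddings.shape)                                                # torch.Size([2, 8])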
27 changes: 25 additions & 2 deletions nemo/collections/nlp/modules/common/megatron/transformer.py
@@ -625,7 +625,6 @@ def forward(
)

output = bias_dropout_add_func(mlp_output, mlp_bias, residual, self.hidden_dropout)
# print(f"Layer: {self.layer_number} MLP + Dropout + Residual checksum {output.sum()}")

if self.transformer_block_type == 'post_ln':
output = self.post_attention_layernorm(output)
@@ -1158,6 +1157,27 @@ def build_layer(layer_number):
offset = parallel_state.get_pipeline_model_parallel_rank() * self.num_layers

self.layers = torch.nn.ModuleList([build_layer(i + 1 + offset) for i in range(self.num_layers)])
if self.pre_process and self.transformer_block_type == 'post_ln':
# Final layer norm before output.
if normalization == 'layernorm':
self.initial_layernorm = get_layer_norm(
hidden_size, layernorm_epsilon, persist_layer_norm, sequence_parallel=config.sequence_parallel
)

elif normalization == 'layernorm1p':
self.initial_layernorm = LayerNorm1P(
hidden_size, layernorm_epsilon, sequence_parallel_enabled=config.sequence_parallel
)
elif normalization == 'low_precision_layernorm':
self.initial_layernorm = LPLayerNorm(hidden_size, layernorm_epsilon)
else:
self.initial_layernorm = MixedFusedRMSNorm(hidden_size, layernorm_epsilon)
# for architectures such as MPT, there is no bias term even on the layernorms
# this code allows us to remove the bias terms from the layernorm module
# so that we can support MPT. However, certain apex-based LNs don't support
# removing bias, so we also have to check for that
if not bias and normalization not in ['layernorm', 'layernorm1p']:
remove_bias_from_layernorm(self.initial_layernorm)

if self.post_process and self.transformer_block_type != 'post_ln':
# Final layer norm before output.
@@ -1435,7 +1455,10 @@ def forward(
'get_key_value does not work with ' 'activation checkpointing'
)

if not self.pre_process:
if self.pre_process:
if self.transformer_block_type == 'post_ln':
hidden_states = self.initial_layernorm(hidden_states)
else:
# See set_input_tensor()
hidden_states = self.input_tensor
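For context on why a post_ln stack needs the extra initial_layernorm applied above: in post-LN transformers (classic BERT) each sub-layer normalizes after its residual add, and the embeddings themselves are layer-normalized before entering the first layer, whereas a pre-LN stack normalizes inside each block and adds one final LayerNorm at the output instead. A toy comparison of the two orderings (not NeMo code):

import torch
import torch.nn as nn

hidden = torch.randn(2, 4, 8)        # toy activations
ln = nn.LayerNorm(8)
sublayer = nn.Linear(8, 8)           # stand-in for attention or MLP

post_ln_out = ln(hidden + sublayer(hidden))   # post_ln: sublayer -> residual add -> LayerNorm
pre_ln_out = hidden + sublayer(ln(hidden))    # pre_ln:  LayerNorm -> sublayer -> residual add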

(diffs for the remaining changed files are not rendered on this page)
