diff --git a/scripts/convert_bert_from_huggingface_to_uer.py b/scripts/convert_bert_from_huggingface_to_uer.py index 5deb6282..22a280af 100644 --- a/scripts/convert_bert_from_huggingface_to_uer.py +++ b/scripts/convert_bert_from_huggingface_to_uer.py @@ -72,8 +72,8 @@ def main(): output_model["target.sp.linear_2.bias"] = input_model["cls.seq_relationship.bias"] output_model["target.mlm.linear_1.weight"] = input_model["cls.predictions.transform.dense.weight"] output_model["target.mlm.linear_1.bias"] = input_model["cls.predictions.transform.dense.bias"] - output_model["target.layer_norm.gamma"] = input_model["cls.predictions.transform.LayerNorm.weight"] - output_model["target.layer_norm.beta"] = input_model["cls.predictions.transform.LayerNorm.bias"] + output_model["target.mlm.layer_norm.gamma"] = input_model["cls.predictions.transform.LayerNorm.weight"] + output_model["target.mlm.layer_norm.beta"] = input_model["cls.predictions.transform.LayerNorm.bias"] output_model["target.mlm.linear_2.weight"] = input_model["cls.predictions.decoder.weight"] output_model["target.mlm.linear_2.bias"] = input_model["cls.predictions.bias"] diff --git a/scripts/convert_bert_from_original_tf_to_uer.py b/scripts/convert_bert_from_original_tf_to_uer.py index fe332503..17c2eddd 100644 --- a/scripts/convert_bert_from_original_tf_to_uer.py +++ b/scripts/convert_bert_from_original_tf_to_uer.py @@ -93,8 +93,8 @@ def main(): output_model["target.sp.linear_2.bias"] = input_model["cls/seq_relationship/output_bias"] output_model["target.mlm.linear_1.weight"] = input_model["cls/predictions/transform/dense/kernel"] output_model["target.mlm.linear_1.bias"] = input_model["cls/predictions/transform/dense/bias"] - output_model["target.layer_norm.gamma"] = input_model["cls/predictions/transform/LayerNorm/gamma"] - output_model["target.layer_norm.beta"] = input_model["cls/predictions/transform/LayerNorm/beta"] + output_model["target.mlm.layer_norm.gamma"] = input_model["cls/predictions/transform/LayerNorm/gamma"] + output_model["target.mlm.layer_norm.beta"] = input_model["cls/predictions/transform/LayerNorm/beta"] output_model["target.mlm.linear_2.weight"] = input_model["bert/embeddings/word_embeddings"] output_model["target.mlm.linear_2.bias"] = input_model["cls/predictions/output_bias"] diff --git a/scripts/convert_bert_from_uer_to_huggingface.py b/scripts/convert_bert_from_uer_to_huggingface.py index 1805d904..b7f2a92c 100644 --- a/scripts/convert_bert_from_uer_to_huggingface.py +++ b/scripts/convert_bert_from_uer_to_huggingface.py @@ -70,8 +70,8 @@ def main(): output_model["cls.seq_relationship.bias"] = input_model["target.sp.linear_2.bias"] output_model["cls.predictions.transform.dense.weight"] = input_model["target.mlm.linear_1.weight"] output_model["cls.predictions.transform.dense.bias"] = input_model["target.mlm.linear_1.bias"] - output_model["cls.predictions.transform.LayerNorm.weight"] = input_model["target.layer_norm.gamma"] - output_model["cls.predictions.transform.LayerNorm.bias"] = input_model["target.layer_norm.beta"] + output_model["cls.predictions.transform.LayerNorm.weight"] = input_model["target.mlm.layer_norm.gamma"] + output_model["cls.predictions.transform.LayerNorm.bias"] = input_model["target.mlm.layer_norm.beta"] output_model["cls.predictions.decoder.weight"] = input_model["target.mlm.linear_2.weight"] output_model["cls.predictions.bias"] = input_model["target.mlm.linear_2.bias"] diff --git a/scripts/convert_bert_from_uer_to_original_tf.py b/scripts/convert_bert_from_uer_to_original_tf.py index 78bb698c..e010d76b 100644 --- a/scripts/convert_bert_from_uer_to_original_tf.py +++ b/scripts/convert_bert_from_uer_to_original_tf.py @@ -86,8 +86,8 @@ def main(): output_model["cls/seq_relationship/output_bias"] = input_model["target.sp.linear_2.bias"] output_model["cls/predictions/transform/dense/kernel"] = input_model["target.mlm.linear_1.weight"] output_model["cls/predictions/transform/dense/bias"] = input_model["target.mlm.linear_1.bias"] - output_model["cls/predictions/transform/LayerNorm/gamma"] = input_model["target.layer_norm.gamma"] - output_model["cls/predictions/transform/LayerNorm/beta"] = input_model["target.layer_norm.beta"] + output_model["cls/predictions/transform/LayerNorm/gamma"] = input_model["target.mlm.layer_norm.gamma"] + output_model["cls/predictions/transform/LayerNorm/beta"] = input_model["target.mlm.layer_norm.beta"] output_model["cls/predictions/output_bias"] = input_model["target.mlm.linear_2.bias"] tf_vars = [] diff --git a/scripts/convert_t5_from_uer_to_huggingface.py b/scripts/convert_t5_from_uer_to_huggingface.py index 699c796f..002e175d 100644 --- a/scripts/convert_t5_from_uer_to_huggingface.py +++ b/scripts/convert_t5_from_uer_to_huggingface.py @@ -34,69 +34,69 @@ input_model["target.lm.output_layer.weight"] for i in range(args.layers_num): - output_model["encoder.transformer." + str(i) + ".self_attn.linear_layers.0.weight"] = \ - input_model["encoder.block." + str(i) + ".layer.0.SelfAttention.q.weight"] - output_model["encoder.transformer." + str(i) + ".self_attn.linear_layers.1.weight"] = \ - input_model["encoder.block." + str(i) + ".layer.0.SelfAttention.k.weight"] - output_model["encoder.transformer." + str(i) + ".self_attn.linear_layers.2.weight"] = \ - input_model["encoder.block." + str(i) + ".layer.0.SelfAttention.v.weight"] - output_model["encoder.transformer." + str(i) + ".self_attn.final_linear.weight"] = \ - input_model["encoder.block." + str(i) + ".layer.0.SelfAttention.o.weight"] - output_model["encoder.transformer." + str(i) + ".layer_norm_1.weight"] = \ - input_model["encoder.block." + str(i) + ".layer.0.layer_norm.weight"] + output_model["encoder.block." + str(i) + ".layer.0.SelfAttention.q.weight"] = \ + input_model["encoder.transformer." + str(i) + ".self_attn.linear_layers.0.weight"] + output_model["encoder.block." + str(i) + ".layer.0.SelfAttention.k.weight"] = \ + input_model["encoder.transformer." + str(i) + ".self_attn.linear_layers.1.weight"] + output_model["encoder.block." + str(i) + ".layer.0.SelfAttention.v.weight"] = \ + input_model["encoder.transformer." + str(i) + ".self_attn.linear_layers.2.weight"] + output_model["encoder.block." + str(i) + ".layer.0.SelfAttention.o.weight"] = \ + input_model["encoder.transformer." + str(i) + ".self_attn.final_linear.weight"] + output_model["encoder.block." + str(i) + ".layer.0.layer_norm.weight"] = \ + input_model["encoder.transformer." + str(i) + ".layer_norm_1.weight"] if args.type == "t5-v1_1": - output_model["encoder.transformer." + str(i) + ".feed_forward.linear_gate.weight"] = \ - input_model["encoder.block." + str(i) + ".layer.1.DenseReluDense.wi_0.weight"] - output_model["encoder.transformer." + str(i) + ".feed_forward.linear_1.weight"] = \ - input_model["encoder.block." + str(i) + ".layer.1.DenseReluDense.wi_1.weight"] - output_model["encoder.transformer." + str(i) + ".feed_forward.linear_2.weight"] = \ - input_model["encoder.block." + str(i) + ".layer.1.DenseReluDense.wo.weight"] + output_model["encoder.block." + str(i) + ".layer.1.DenseReluDense.wi_0.weight"] = \ + input_model["encoder.transformer." + str(i) + ".feed_forward.linear_gate.weight"] + output_model["encoder.block." + str(i) + ".layer.1.DenseReluDense.wi_1.weight"] = \ + input_model["encoder.transformer." + str(i) + ".feed_forward.linear_1.weight"] + output_model["encoder.block." + str(i) + ".layer.1.DenseReluDense.wo.weight"] = \ + input_model["encoder.transformer." + str(i) + ".feed_forward.linear_2.weight"] else: - output_model["encoder.transformer." + str(i) + ".feed_forward.linear_1.weight"] = \ - input_model["encoder.block." + str(i) + ".layer.1.DenseReluDense.wi.weight"] - output_model["encoder.transformer." + str(i) + ".feed_forward.linear_2.weight"] = \ - input_model["encoder.block." + str(i) + ".layer.1.DenseReluDense.wo.weight"] - output_model["encoder.transformer." + str(i) + ".layer_norm_2.weight"] = \ - input_model["encoder.block." + str(i) + ".layer.1.layer_norm.weight"] + output_model["encoder.block." + str(i) + ".layer.1.DenseReluDense.wi.weight"] = \ + input_model["encoder.transformer." + str(i) + ".feed_forward.linear_1.weight"] + output_model["encoder.block." + str(i) + ".layer.1.DenseReluDense.wo.weight"] = \ + input_model["encoder.transformer." + str(i) + ".feed_forward.linear_2.weight"] + output_model["encoder.block." + str(i) + ".layer.1.layer_norm.weight"] = \ + input_model["encoder.transformer." + str(i) + ".layer_norm_2.weight"] for i in range(args.decoder_layers_num): - output_model["decoder.transformer_decoder." + str(i) + ".self_attn.linear_layers.0.weight"] = \ - input_model["decoder.block." + str(i) + ".layer.0.SelfAttention.q.weight"] - output_model["decoder.transformer_decoder." + str(i) + ".self_attn.linear_layers.1.weight"] = \ - input_model["decoder.block." + str(i) + ".layer.0.SelfAttention.k.weight"] - output_model["decoder.transformer_decoder." + str(i) + ".self_attn.linear_layers.2.weight"] = \ - input_model["decoder.block." + str(i) + ".layer.0.SelfAttention.v.weight"] - output_model["decoder.transformer_decoder." + str(i) + ".self_attn.final_linear.weight"] = \ - input_model["decoder.block." + str(i) + ".layer.0.SelfAttention.o.weight"] - output_model["decoder.transformer_decoder." + str(i) + ".layer_norm_1.weight"] = \ - input_model["decoder.block." + str(i) + ".layer.0.layer_norm.weight"] + output_model["decoder.block." + str(i) + ".layer.0.SelfAttention.q.weight"] = \ + input_model["decoder.transformer_decoder." + str(i) + ".self_attn.linear_layers.0.weight"] + output_model["decoder.block." + str(i) + ".layer.0.SelfAttention.k.weight"] = \ + input_model["decoder.transformer_decoder." + str(i) + ".self_attn.linear_layers.1.weight"] + output_model["decoder.block." + str(i) + ".layer.0.SelfAttention.v.weight"] = \ + input_model["decoder.transformer_decoder." + str(i) + ".self_attn.linear_layers.2.weight"] + output_model["decoder.block." + str(i) + ".layer.0.SelfAttention.o.weight"] = \ + input_model["decoder.transformer_decoder." + str(i) + ".self_attn.final_linear.weight"] + output_model["decoder.block." + str(i) + ".layer.0.layer_norm.weight"] = \ + input_model["decoder.transformer_decoder." + str(i) + ".layer_norm_1.weight"] - output_model["decoder.transformer_decoder." + str(i) + ".context_attn.linear_layers.0.weight"] = \ - input_model["decoder.block." + str(i) + ".layer.1.EncDecAttention.q.weight"] - output_model["decoder.transformer_decoder." + str(i) + ".context_attn.linear_layers.1.weight"] = \ - input_model["decoder.block." + str(i) + ".layer.1.EncDecAttention.k.weight"] - output_model["decoder.transformer_decoder." + str(i) + ".context_attn.linear_layers.2.weight"] = \ - input_model["decoder.block." + str(i) + ".layer.1.EncDecAttention.v.weight"] - output_model["decoder.transformer_decoder." + str(i) + ".context_attn.final_linear.weight"] = \ - input_model["decoder.block." + str(i) + ".layer.1.EncDecAttention.o.weight"] - output_model["decoder.transformer_decoder." + str(i) + ".layer_norm_2.weight"] = \ - input_model["decoder.block." + str(i) + ".layer.1.layer_norm.weight"] + output_model["decoder.block." + str(i) + ".layer.1.EncDecAttention.q.weight"] = \ + input_model["decoder.transformer_decoder." + str(i) + ".context_attn.linear_layers.0.weight"] + output_model["decoder.block." + str(i) + ".layer.1.EncDecAttention.k.weight"] = \ + input_model["decoder.transformer_decoder." + str(i) + ".context_attn.linear_layers.1.weight"] + output_model["decoder.block." + str(i) + ".layer.1.EncDecAttention.v.weight"] = \ + input_model["decoder.transformer_decoder." + str(i) + ".context_attn.linear_layers.2.weight"] + output_model["decoder.block." + str(i) + ".layer.1.EncDecAttention.o.weight"] = \ + input_model["decoder.transformer_decoder." + str(i) + ".context_attn.final_linear.weight"] + output_model["decoder.block." + str(i) + ".layer.1.layer_norm.weight"] = \ + input_model["decoder.transformer_decoder." + str(i) + ".layer_norm_2.weight"] if args.type == "t5-v1_1": - output_model["decoder.transformer_decoder." + str(i) + ".feed_forward.linear_gate.weight"] = \ - input_model["decoder.block." + str(i) + ".layer.2.DenseReluDense.wi_0.weight"] - output_model["decoder.transformer_decoder." + str(i) + ".feed_forward.linear_1.weight"] = \ - input_model["decoder.block." + str(i) + ".layer.2.DenseReluDense.wi_1.weight"] - output_model["decoder.transformer_decoder." + str(i) + ".feed_forward.linear_2.weight"] = \ - input_model["decoder.block." + str(i) + ".layer.2.DenseReluDense.wo.weight"] + output_model["decoder.block." + str(i) + ".layer.2.DenseReluDense.wi_0.weight"] = \ + input_model["decoder.transformer_decoder." + str(i) + ".feed_forward.linear_gate.weight"] + output_model["decoder.block." + str(i) + ".layer.2.DenseReluDense.wi_1.weight"] = \ + input_model["decoder.transformer_decoder." + str(i) + ".feed_forward.linear_1.weight"] + output_model["decoder.block." + str(i) + ".layer.2.DenseReluDense.wo.weight"] = \ + input_model["decoder.transformer_decoder." + str(i) + ".feed_forward.linear_2.weight"] else: - output_model["decoder.transformer_decoder." + str(i) + ".feed_forward.linear_1.weight"] = \ - input_model["decoder.block." + str(i) + ".layer.2.DenseReluDense.wi.weight"] - output_model["decoder.transformer_decoder." + str(i) + ".feed_forward.linear_2.weight"] = \ - input_model["decoder.block." + str(i) + ".layer.2.DenseReluDense.wo.weight"] - output_model["decoder.transformer_decoder." + str(i) + ".layer_norm_3.weight"] = \ - input_model["decoder.block." + str(i) + ".layer.2.layer_norm.weight"] + output_model["decoder.block." + str(i) + ".layer.2.DenseReluDense.wi.weight"] = \ + input_model["decoder.transformer_decoder." + str(i) + ".feed_forward.linear_1.weight"] + output_model["decoder.block." + str(i) + ".layer.2.DenseReluDense.wo.weight"] = \ + input_model["decoder.transformer_decoder." + str(i) + ".feed_forward.linear_2.weight"] + output_model["decoder.block." + str(i) + ".layer.2.layer_norm.weight"] = \ + input_model["decoder.transformer_decoder." + str(i) + ".layer_norm_3.weight"] output_model["encoder.final_layer_norm.weight"] = \ input_model["encoder.layer_norm.weight"] diff --git a/scripts/convert_xlmroberta_from_huggingface_to_uer.py b/scripts/convert_xlmroberta_from_huggingface_to_uer.py index 476cbb33..8038440e 100644 --- a/scripts/convert_xlmroberta_from_huggingface_to_uer.py +++ b/scripts/convert_xlmroberta_from_huggingface_to_uer.py @@ -67,9 +67,9 @@ input_model["lm_head.dense.weight"] output_model["target.mlm.linear_1.bias"] = \ input_model["lm_head.dense.bias"] -output_model["target.layer_norm.gamma"] = \ +output_model["target.mlm.layer_norm.gamma"] = \ input_model["lm_head.layer_norm.weight"] -output_model["target.layer_norm.beta"] = \ +output_model["target.mlm.layer_norm.beta"] = \ input_model["lm_head.layer_norm.bias"] output_model["target.mlm.linear_2.weight"] = \ input_model["lm_head.decoder.weight"] diff --git a/scripts/convert_xlmroberta_from_uer_to_huggingface.py b/scripts/convert_xlmroberta_from_uer_to_huggingface.py index 3ac8560b..7b7ef5a8 100644 --- a/scripts/convert_xlmroberta_from_uer_to_huggingface.py +++ b/scripts/convert_xlmroberta_from_uer_to_huggingface.py @@ -68,9 +68,9 @@ output_model["lm_head.dense.bias"] = \ input_model["target.mlm.linear_1.bias"] output_model["lm_head.layer_norm.weight"] = \ - input_model["target.layer_norm.gamma"] + input_model["target.mlm.layer_norm.gamma"] output_model["lm_head.layer_norm.bias"] = \ - input_model["target.layer_norm.beta"] + input_model["target.mlm.layer_norm.beta"] output_model["lm_head.decoder.weight"] = \ input_model["target.mlm.linear_2.weight"] output_model["lm_head.decoder.bias"] = \