From ec54d819e9b238581e55f2bf4d17b7bc3c67b2f6 Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Thu, 27 Jan 2022 08:38:49 +0900 Subject: [PATCH 01/20] Add tensor parallelism related mappings --- .../utils/model_parallel_utils.py | 148 ++++++++++++++++++ 1 file changed, 148 insertions(+) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index abddd6c60faccf..3df7eeade1757d 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -15,6 +15,154 @@ from math import ceil +from transformers import BertPreTrainedModel, GPT2PreTrainedModel, TransfoXLPreTrainedModel + + +# ie. nn.Linear(3 * dim, dim) +# only defined the models that have fused attention layer. +FUSED_ATTENTION_MAPPING = { + GPT2PreTrainedModel: {"attn.c_attn": 3, "crossattention.c_attn": 2}, + TransfoXLPreTrainedModel: {"qkv_net": 3}, +} + +# ie. nn.Linear(out_dim, in_dim) or Conv1D() +# only defined the models that have reversed parameters. +REVERSED_PARAM_MAPPING = { + GPT2PreTrainedModel: ["attn", "crossattention", "mlp"], + TransfoXLPreTrainedModel: ["qkv_net"], +} + +# All the mapping for tensor parallelism +TENSOR_PARALLEL_MAPPING = { + BertPreTrainedModel: { + "column_parallel": ["query", "key", "value", "intermediate.dense"], + "row_parallel": ["output.dense"], + "update_attrs": ["num_attention_heads", "all_head_size"], + }, + GPT2PreTrainedModel: { + "column_parallel": ["c_attn", "q_attn", "c_fc"], + "row_parallel": ["c_proj"], + "update_attrs": ["embed_dim", "split_size", "num_heads"], + }, +} + + +def get_mapping(model, mapping): + """ + Helper function to find mapping by model + + Args: + model (PreTrainedModel): model object + mapping (Dict): mapping object + + Returns: + Any: mapping object + + Examples: + >>> lm_head_model = GPT2LMHeadModel.from_pretrained("gpt2") + >>> get_map(lm_head_model, TENSOR_PARALLEL_MAPPING) + { + "column_parallel": ["c_attn", "q_attn", "c_fc"], + "row_parallel": ["c_proj"], + "update_attrs": ["embed_dim", "split_size", "num_heads"], + } + >>> get_map(lm_head_model, FUSED_ATTENTION_MAPPING) + {"attn.c_attn": 3, "crossattention.c_attn": 2} + + >>> seq_clf_model = GPT2ForSequenceClassification.from_pretrained("gpt2") + >>> get_map(seq_clf_model, TENSOR_PARALLEL_MAPPING) + { + "column_parallel": ["c_attn", "q_attn", "c_fc"], + "row_parallel": ["c_proj"], + "update_attrs": ["embed_dim", "split_size", "num_heads"], + } + >>> get_map(seq_clf_model, FUSED_ATTENTION_MAPPING) + {"attn.c_attn": 3, "crossattention.c_attn": 2} + + """ + for pretrained_model_cls, value in mapping.items(): + if isinstance(model, pretrained_model_cls): + return value + + return None + + +def get_tensor_parallel_mapping(model): + """ + Get tensor parallel mapping by model + + Args: + model (PreTrainedModel): model object + + Returns: + Dict: tensor parallel mapping + + Examples: + >>> lm_head_model = GPT2LMHeadModel.from_pretrained("gpt2") + >>> get_tensor_parallel_mapping(lm_head_model) + { + "column_parallel": ["c_attn", "q_attn", "c_fc"], + "row_parallel": ["c_proj"], + "update_attrs": ["embed_dim", "split_size", "num_heads"], + } + """ + return get_mapping(model, TENSOR_PARALLEL_MAPPING) + + +def is_reversed_param(model, param_name): + """ + Check reversed parameter or not. + + Args: + model (PreTrainedModel): model object + param_name (str): the name of parameter (e.g. 
'transformer.h.0.attn...') + + Notes: + ``Conv1D`` of GPT2 and ``qkv_net`` of TransfoXL have reversed parameters + + Returns: + bool: whether reversed parameter or not. + + Examples: + >>> is_reversed_param(model, 'transformer.h.0.attn.c_attn') + True + >>> is_reversed_param(model, 'transformer.wte') + False + """ + mapping = get_mapping(model, REVERSED_PARAM_MAPPING) + + if mapping is not None: + return any([i in param_name for i in mapping]) + + return False + + +def get_fusion_degree(model, param_name): + """ + Get fused attention layer degree + + Args: + model (PreTrainedModel): model object + param_name (str): the name of parameter (e.g. 'transformer.h.0.attn...') + + Notes: + The `c_attn` layer in the self-attention layer of GPT2 is size of (dim * 3, dim). + In this case, the fusion degree is 3. + + The `c_attn` layer in the cross-attention attention layer of GPT2 is size of (dim * 2, dim). + In this case, the fusion degree is 2. + + Returns: + int: the fusion degree + """ + mapping = get_mapping(model, FUSED_ATTENTION_MAPPING) + + if mapping is not None: + for key, degree in mapping.items(): + if key in param_name: + return degree + return 1 + def assert_device_map(device_map, num_blocks): blocks = list(range(0, num_blocks)) From e79ff071eba3a224774b8393176dda8c87106a2d Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Thu, 27 Jan 2022 08:39:34 +0900 Subject: [PATCH 02/20] Modify 'transformers' to '..' --- src/transformers/utils/model_parallel_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index 3df7eeade1757d..199c0f0fc02936 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -15,7 +15,7 @@ from math import ceil -from transformers import BertPreTrainedModel, GPT2PreTrainedModel, TransfoXLPreTrainedModel +from .. import BertPreTrainedModel, GPT2PreTrainedModel, TransfoXLPreTrainedModel # ie. nn.Linear(3 * dim, dim) From 0d50c471cf21789940285c8288f85022cb1bd0ad Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Thu, 27 Jan 2022 08:55:42 +0900 Subject: [PATCH 03/20] Fix circular import problem --- .../utils/model_parallel_utils.py | 148 ------------------ .../utils/tensor_parallel_utils.py | 147 +++++++++++++++++ 2 files changed, 147 insertions(+), 148 deletions(-) create mode 100644 src/transformers/utils/tensor_parallel_utils.py diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index 199c0f0fc02936..abddd6c60faccf 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -15,154 +15,6 @@ from math import ceil -from .. import BertPreTrainedModel, GPT2PreTrainedModel, TransfoXLPreTrainedModel - - -# ie. nn.Linear(3 * dim, dim) -# only defined the models that have fused attention layer. -FUSED_ATTENTION_MAPPING = { - GPT2PreTrainedModel: {"attn.c_attn": 3, "crossattention.c_attn": 2}, - TransfoXLPreTrainedModel: {"qkv_net": 3}, -} - -# ie. nn.Linear(out_dim, in_dim) or Conv1D() -# only defined the models that have reversed parameters. 
-REVERSED_PARAM_MAPPING = { - GPT2PreTrainedModel: ["attn", "crossattention", "mlp"], - TransfoXLPreTrainedModel: ["qkv_net"], -} - -# All the mapping for tensor parallelism -TENSOR_PARALLEL_MAPPING = { - BertPreTrainedModel: { - "column_parallel": ["query", "key", "value", "intermediate.dense"], - "row_parallel": ["output.dense"], - "update_attrs": ["num_attention_heads", "all_head_size"], - }, - GPT2PreTrainedModel: { - "column_parallel": ["c_attn", "q_attn", "c_fc"], - "row_parallel": ["c_proj"], - "update_attrs": ["embed_dim", "split_size", "num_heads"], - }, -} - - -def get_mapping(model, mapping): - """ - Helper function to find mapping by model - - Args: - model (PreTrainedModel): model object - mapping (Dict): mapping object - - Returns: - Any: mapping object - - Examples: - >>> lm_head_model = GPT2LMHeadModel.from_pretrained("gpt2") - >>> get_map(lm_head_model, TENSOR_PARALLEL_MAPPING) - { - "column_parallel": ["c_attn", "q_attn", "c_fc"], - "row_parallel": ["c_proj"], - "update_attrs": ["embed_dim", "split_size", "num_heads"], - } - >>> get_map(lm_head_model, FUSED_ATTENTION_MAPPING) - {"attn.c_attn": 3, "crossattention.c_attn": 2} - - >>> seq_clf_model = GPT2ForSequenceClassification.from_pretrained("gpt2") - >>> get_map(seq_clf_model, TENSOR_PARALLEL_MAPPING) - { - "column_parallel": ["c_attn", "q_attn", "c_fc"], - "row_parallel": ["c_proj"], - "update_attrs": ["embed_dim", "split_size", "num_heads"], - } - >>> get_map(seq_clf_model, FUSED_ATTENTION_MAPPING) - {"attn.c_attn": 3, "crossattention.c_attn": 2} - - """ - for pretrained_model_cls, value in mapping.items(): - if isinstance(model, pretrained_model_cls): - return value - - return None - - -def get_tensor_parallel_mapping(model): - """ - Get tensor parallel mapping by model - - Args: - model (PreTrainedModel): model object - - Returns: - Dict: tensor parallel mapping - - Examples: - >>> lm_head_model = GPT2LMHeadModel.from_pretrained("gpt2") - >>> get_tensor_parallel_mapping(lm_head_model) - { - "column_parallel": ["c_attn", "q_attn", "c_fc"], - "row_parallel": ["c_proj"], - "update_attrs": ["embed_dim", "split_size", "num_heads"], - } - """ - return get_mapping(model, TENSOR_PARALLEL_MAPPING) - - -def is_reversed_param(model, param_name): - """ - Check reversed parameter or not. - - Args: - model (PreTrainedModel): model object - param_name (str): the name of parameter (e.g. 'transformer.h.0.attn...') - - Notes: - ``Conv1D`` of GPT2 and ``qkv_net`` of TransfoXL have reversed parameters - - Returns: - bool: whether reversed parameter or not. - - Examples: - >>> is_reversed_param(model, 'transformer.h.0.attn.c_attn') - True - >>> is_reversed_param(model, 'transformer.wte') - False - """ - mapping = get_mapping(model, REVERSED_PARAM_MAPPING) - - if mapping is not None: - return any([i in param_name for i in mapping]) - - return False - - -def get_fusion_degree(model, param_name): - """ - Get fused attention layer degree - - Args: - model (PreTrainedModel): model object - param_name (str): the name of parameter (e.g. 'transformer.h.0.attn...') - - Notes: - The `c_attn` layer in the self-attention layer of GPT2 is size of (dim * 3, dim). - In this case, the fusion degree is 3. - - The `c_attn` layer in the cross-attention attention layer of GPT2 is size of (dim * 2, dim). - In this case, the fusion degree is 2. 
- - Returns: - int: the fusion degree - """ - mapping = get_mapping(model, FUSED_ATTENTION_MAPPING) - - if mapping is not None: - for key, degree in mapping.items(): - if key in param_name: - return degree - return 1 - def assert_device_map(device_map, num_blocks): blocks = list(range(0, num_blocks)) diff --git a/src/transformers/utils/tensor_parallel_utils.py b/src/transformers/utils/tensor_parallel_utils.py new file mode 100644 index 00000000000000..a53725ca19ae8d --- /dev/null +++ b/src/transformers/utils/tensor_parallel_utils.py @@ -0,0 +1,147 @@ +from .. import BertPreTrainedModel, GPT2PreTrainedModel, TransfoXLPreTrainedModel + + +# ie. nn.Linear(3 * dim, dim) +# only defined the models that have fused attention layer. +FUSED_ATTENTION_MAPPING = { + GPT2PreTrainedModel: {"attn.c_attn": 3, "crossattention.c_attn": 2}, + TransfoXLPreTrainedModel: {"qkv_net": 3}, +} + +# ie. nn.Linear(out_dim, in_dim) or Conv1D() +# only defined the models that have reversed parameters. +REVERSED_PARAM_MAPPING = { + GPT2PreTrainedModel: ["attn", "crossattention", "mlp"], + TransfoXLPreTrainedModel: ["qkv_net"], +} + +# All the mapping for tensor parallelism +TENSOR_PARALLEL_MAPPING = { + BertPreTrainedModel: { + "column_parallel": ["query", "key", "value", "intermediate.dense"], + "row_parallel": ["output.dense"], + "update_attrs": ["num_attention_heads", "all_head_size"], + }, + GPT2PreTrainedModel: { + "column_parallel": ["c_attn", "q_attn", "c_fc"], + "row_parallel": ["c_proj"], + "update_attrs": ["embed_dim", "split_size", "num_heads"], + }, +} + + +def get_mapping(model, mapping): + """ + Helper function to find mapping by model + + Args: + model (PreTrainedModel): model object + mapping (Dict): mapping object + + Returns: + Any: mapping object + + Examples: + >>> lm_head_model = GPT2LMHeadModel.from_pretrained("gpt2") + >>> get_map(lm_head_model, TENSOR_PARALLEL_MAPPING) + { + "column_parallel": ["c_attn", "q_attn", "c_fc"], + "row_parallel": ["c_proj"], + "update_attrs": ["embed_dim", "split_size", "num_heads"], + } + >>> get_map(lm_head_model, FUSED_ATTENTION_MAPPING) + {"attn.c_attn": 3, "crossattention.c_attn": 2} + + >>> seq_clf_model = GPT2ForSequenceClassification.from_pretrained("gpt2") + >>> get_map(seq_clf_model, TENSOR_PARALLEL_MAPPING) + { + "column_parallel": ["c_attn", "q_attn", "c_fc"], + "row_parallel": ["c_proj"], + "update_attrs": ["embed_dim", "split_size", "num_heads"], + } + >>> get_map(seq_clf_model, FUSED_ATTENTION_MAPPING) + {"attn.c_attn": 3, "crossattention.c_attn": 2} + + """ + for pretrained_model_cls, value in mapping.items(): + if isinstance(model, pretrained_model_cls): + return value + + return None + + +def get_tensor_parallel_mapping(model): + """ + Get tensor parallel mapping by model + + Args: + model (PreTrainedModel): model object + + Returns: + Dict: tensor parallel mapping + + Examples: + >>> lm_head_model = GPT2LMHeadModel.from_pretrained("gpt2") + >>> get_tensor_parallel_mapping(lm_head_model) + { + "column_parallel": ["c_attn", "q_attn", "c_fc"], + "row_parallel": ["c_proj"], + "update_attrs": ["embed_dim", "split_size", "num_heads"], + } + """ + return get_mapping(model, TENSOR_PARALLEL_MAPPING) + + +def is_reversed_param(model, param_name): + """ + Check reversed parameter or not. + + Args: + model (PreTrainedModel): model object + param_name (str): the name of parameter (e.g. 'transformer.h.0.attn...') + + Notes: + ``Conv1D`` of GPT2 and ``qkv_net`` of TransfoXL have reversed parameters + + Returns: + bool: whether reversed parameter or not. 
+ + Examples: + >>> is_reversed_param(model, 'transformer.h.0.attn.c_attn') + True + >>> is_reversed_param(model, 'transformer.wte') + False + """ + mapping = get_mapping(model, REVERSED_PARAM_MAPPING) + + if mapping is not None: + return any([i in param_name for i in mapping]) + + return False + + +def get_fusion_degree(model, param_name): + """ + Get fused attention layer degree + + Args: + model (PreTrainedModel): model object + param_name (str): the name of parameter (e.g. 'transformer.h.0.attn...') + + Notes: + The `c_attn` layer in the self-attention layer of GPT2 is size of (dim * 3, dim). + In this case, the fusion degree is 3. + + The `c_attn` layer in the cross-attention attention layer of GPT2 is size of (dim * 2, dim). + In this case, the fusion degree is 2. + + Returns: + int: the fusion degree + """ + mapping = get_mapping(model, FUSED_ATTENTION_MAPPING) + + if mapping is not None: + for key, degree in mapping.items(): + if key in param_name: + return degree + return 1 From 04089dc577b1da3bc9436ff4fe02fe35c72e33a0 Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Thu, 27 Jan 2022 22:12:17 +0900 Subject: [PATCH 04/20] Add T5 --- .../utils/tensor_parallel_utils.py | 58 +++++++++++++------ 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/src/transformers/utils/tensor_parallel_utils.py b/src/transformers/utils/tensor_parallel_utils.py index a53725ca19ae8d..6360e0a8a7ab7b 100644 --- a/src/transformers/utils/tensor_parallel_utils.py +++ b/src/transformers/utils/tensor_parallel_utils.py @@ -1,32 +1,52 @@ -from .. import BertPreTrainedModel, GPT2PreTrainedModel, TransfoXLPreTrainedModel +from .. import BertPreTrainedModel, GPT2PreTrainedModel, T5PreTrainedModel, TransfoXLPreTrainedModel -# ie. nn.Linear(3 * dim, dim) -# only defined the models that have fused attention layer. -FUSED_ATTENTION_MAPPING = { - GPT2PreTrainedModel: {"attn.c_attn": 3, "crossattention.c_attn": 2}, - TransfoXLPreTrainedModel: {"qkv_net": 3}, -} +""" +All the mapping for tensor parallelism. +This mapping is following the follow format. -# ie. nn.Linear(out_dim, in_dim) or Conv1D() -# only defined the models that have reversed parameters. -REVERSED_PARAM_MAPPING = { - GPT2PreTrainedModel: ["attn", "crossattention", "mlp"], - TransfoXLPreTrainedModel: ["qkv_net"], +TENSOR_PARALLEL_MAPPING = { + PreTrainedModel class: { + "col": list of column parallel parameters, + "row": list of row parallel parameters, + "update": list of attributes to be updated, + "col_no_replacement": list of column parallel parameters without module replacement (Optional) + "row_no_replacement": list of row parallel parameters without module replacement (Optional), + ... + could be added more to avoid exceptions. 
+ } } -# All the mapping for tensor parallelism +""" TENSOR_PARALLEL_MAPPING = { BertPreTrainedModel: { - "column_parallel": ["query", "key", "value", "intermediate.dense"], - "row_parallel": ["output.dense"], - "update_attrs": ["num_attention_heads", "all_head_size"], + "col": ["query", "key", "value", "intermediate.dense"], + "row": ["output.dense"], + "update": ["num_attention_heads", "all_head_size"], }, GPT2PreTrainedModel: { - "column_parallel": ["c_attn", "q_attn", "c_fc"], - "row_parallel": ["c_proj"], - "update_attrs": ["embed_dim", "split_size", "num_heads"], + "col": ["c_attn", "q_attn", "c_fc"], + "row": ["c_proj"], + "update": ["embed_dim", "split_size", "num_heads"], }, + T5PreTrainedModel: { + "col": ["Attention.q", "Attention.k", "Attention.v", "DenseReluDense.wi"], + "row": ["Attention.o", "DenseReluDense.wo"], + "row_no_replacement": ["relative_attention_bias"], + "update": ["d_model", "n_heads", "inner_dim"], + }, +} + +# Optional: fused attention layers like nn.Linear(3 * dim, dim). +FUSED_ATTENTION_MAPPING = { + GPT2PreTrainedModel: {"attn.c_attn": 3, "crossattention.c_attn": 2}, + TransfoXLPreTrainedModel: {"qkv_net": 3}, +} + +# Optional: reversed parameters like nn.Linear(out_dim, in_dim) or Conv1D(). +REVERSED_PARAM_MAPPING = { + GPT2PreTrainedModel: ["attn", "crossattention", "mlp"], + TransfoXLPreTrainedModel: ["qkv_net"], } From 7353d3ee52d261150c464c5e5555cb36b964afd9 Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Fri, 28 Jan 2022 23:34:21 +0900 Subject: [PATCH 05/20] Add copy mapping --- .../utils/tensor_parallel_utils.py | 69 +++++++++++++------ 1 file changed, 49 insertions(+), 20 deletions(-) diff --git a/src/transformers/utils/tensor_parallel_utils.py b/src/transformers/utils/tensor_parallel_utils.py index 6360e0a8a7ab7b..b6773c745f6fe1 100644 --- a/src/transformers/utils/tensor_parallel_utils.py +++ b/src/transformers/utils/tensor_parallel_utils.py @@ -1,4 +1,24 @@ -from .. import BertPreTrainedModel, GPT2PreTrainedModel, T5PreTrainedModel, TransfoXLPreTrainedModel +# coding=utf-8 +# Copyright 2021 TUNiB Inc and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from transformers import ( + BertPreTrainedModel, + GPT2PreTrainedModel, + TransfoXLPreTrainedModel, + T5PreTrainedModel, + RobertaPreTrainedModel, +) """ @@ -10,14 +30,22 @@ "col": list of column parallel parameters, "row": list of row parallel parameters, "update": list of attributes to be updated, - "col_no_replacement": list of column parallel parameters without module replacement (Optional) - "row_no_replacement": list of row parallel parameters without module replacement (Optional), + "col_no_replacement": list of column parallel parameters without module replacement (opt) + "row_no_replacement": list of row parallel parameters without module replacement (opt), ... could be added more to avoid exceptions. 
} } +Or if a model A has the same map with the other model B, define like: + +TENSOR_PARALLEL_MAPPING = { + PreTrainedModel class A: PreTrainedModel class B +} + +Then, call ``copy_mapping(PreTrainedModel class A)``. """ + TENSOR_PARALLEL_MAPPING = { BertPreTrainedModel: { "col": ["query", "key", "value", "intermediate.dense"], @@ -35,15 +63,27 @@ "row_no_replacement": ["relative_attention_bias"], "update": ["d_model", "n_heads", "inner_dim"], }, + RobertaPreTrainedModel: BertPreTrainedModel, } -# Optional: fused attention layers like nn.Linear(3 * dim, dim). + +def copy_mapping(model_cls): + TENSOR_PARALLEL_MAPPING[model_cls] = TENSOR_PARALLEL_MAPPING[ + TENSOR_PARALLEL_MAPPING[model_cls] + ] + + +# Copy the same mapping. +copy_mapping(RobertaPreTrainedModel) + + +# ie. nn.Linear(3 * dim, dim) (opt) FUSED_ATTENTION_MAPPING = { GPT2PreTrainedModel: {"attn.c_attn": 3, "crossattention.c_attn": 2}, TransfoXLPreTrainedModel: {"qkv_net": 3}, } -# Optional: reversed parameters like nn.Linear(out_dim, in_dim) or Conv1D(). +# ie. nn.Linear(out_dim, in_dim) or Conv1D() (opt) REVERSED_PARAM_MAPPING = { GPT2PreTrainedModel: ["attn", "crossattention", "mlp"], TransfoXLPreTrainedModel: ["qkv_net"], @@ -52,11 +92,11 @@ def get_mapping(model, mapping): """ - Helper function to find mapping by model + Helper function to find Args: model (PreTrainedModel): model object - mapping (Dict): mapping object + mapping (Dict): map object Returns: Any: mapping object @@ -115,22 +155,14 @@ def get_tensor_parallel_mapping(model): def is_reversed_param(model, param_name): """ Check reversed parameter or not. + e.g. ``Conv1D`` of GPT2 and ``qkv_net`` of TransfoXL have reversed parameters Args: model (PreTrainedModel): model object param_name (str): the name of parameter (e.g. 'transformer.h.0.attn...') - Notes: - ``Conv1D`` of GPT2 and ``qkv_net`` of TransfoXL have reversed parameters - Returns: bool: whether reversed parameter or not. - - Examples: - >>> is_reversed_param(model, 'transformer.h.0.attn.c_attn') - True - >>> is_reversed_param(model, 'transformer.wte') - False """ mapping = get_mapping(model, REVERSED_PARAM_MAPPING) @@ -149,12 +181,9 @@ def get_fusion_degree(model, param_name): param_name (str): the name of parameter (e.g. 'transformer.h.0.attn...') Notes: - The `c_attn` layer in the self-attention layer of GPT2 is size of (dim * 3, dim). + The `c_attn` layer that has size of (dim * 3, dim) in GPT2. In this case, the fusion degree is 3. - The `c_attn` layer in the cross-attention attention layer of GPT2 is size of (dim * 2, dim). - In this case, the fusion degree is 2. 
- Returns: int: the fusion degree """ From bec6a8465e6bdd8129fcf8de7d488dcbfccadcd3 Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Fri, 28 Jan 2022 23:37:04 +0900 Subject: [PATCH 06/20] Style code --- src/transformers/utils/tensor_parallel_utils.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/transformers/utils/tensor_parallel_utils.py b/src/transformers/utils/tensor_parallel_utils.py index b6773c745f6fe1..bbfc61057b7db4 100644 --- a/src/transformers/utils/tensor_parallel_utils.py +++ b/src/transformers/utils/tensor_parallel_utils.py @@ -15,9 +15,9 @@ from transformers import ( BertPreTrainedModel, GPT2PreTrainedModel, - TransfoXLPreTrainedModel, - T5PreTrainedModel, RobertaPreTrainedModel, + T5PreTrainedModel, + TransfoXLPreTrainedModel, ) @@ -68,9 +68,7 @@ def copy_mapping(model_cls): - TENSOR_PARALLEL_MAPPING[model_cls] = TENSOR_PARALLEL_MAPPING[ - TENSOR_PARALLEL_MAPPING[model_cls] - ] + TENSOR_PARALLEL_MAPPING[model_cls] = TENSOR_PARALLEL_MAPPING[TENSOR_PARALLEL_MAPPING[model_cls]] # Copy the same mapping. From dc7e74f77c3476b935b83a1537a9921bef1b134d Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Fri, 28 Jan 2022 23:40:01 +0900 Subject: [PATCH 07/20] remove transformers dependancy --- src/transformers/utils/tensor_parallel_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/utils/tensor_parallel_utils.py b/src/transformers/utils/tensor_parallel_utils.py index bbfc61057b7db4..936348ab3c4e73 100644 --- a/src/transformers/utils/tensor_parallel_utils.py +++ b/src/transformers/utils/tensor_parallel_utils.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from transformers import ( +from .. import ( BertPreTrainedModel, GPT2PreTrainedModel, RobertaPreTrainedModel, From 6086772f3386defc7b6a3b4686eeede1f3fd718a Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Sat, 29 Jan 2022 18:13:39 +0900 Subject: [PATCH 08/20] Modify structure of tp mapping --- .../utils/model_parallel_utils.py | 267 +++++++++++++++++- .../utils/tensor_parallel_utils.py | 194 ------------- 2 files changed, 266 insertions(+), 195 deletions(-) delete mode 100644 src/transformers/utils/tensor_parallel_utils.py diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index abddd6c60faccf..54729fcb04d25d 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -12,10 +12,275 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import copy +import importlib from math import ceil +class TPInfo(object): + """ + A class to describe tensor parallelization information. + + Args: + name (Tuple[str]): the name of parameter + fuse (int): the degree of fusion + parallel (bool): parallelizable param or not + reverse (bool): reversed param or not + """ + + def __init__( + self, + *name, + fuse: bool = False, + parallel: bool = True, + reverse: bool = True, + # nn.Linear stores data reversely. 
+ # nn.Linear(in, out) -> Parameter(out, int) + ): + self.name = name + self.fuse = fuse + self.reverse = reverse + self.parallel = parallel + + def __str__(self): + return f"{self.__class__.__qualname__}({self.name})" + + def __repr__(self): + return self.__str__() + + +Col = type("COLUMN", (TPInfo,), {"code": "COLUMN"}) +Row = type("ROW", (TPInfo,), {"code": "ROW"}) +Update = type("UPDATE", (TPInfo,), {"code": "UPDATE", "parallel": False}) + + +class TPMapping(object): + __MAPPING__ = dict( + Albert=[ + Col("query", "key", "value", "ffn"), + Row("dense", "ffn_output"), + Update("num_attention_heads", "all_head_size"), + ], + Bart=[ + Col("q_proj", "k_proj", "v_proj", "fc1"), + Row("out_proj", "fc2"), + Update("embed_dim", "num_heads"), + ], + Bert=[ + Col("query", "key", "value", "intermediate.dense"), + Row("output.dense"), + Update("num_attention_heads", "all_head_size"), + ], + T5=[ + Col("Attention.q", "Attention.k", "Attention.v"), + Col("relative_attention_bias", reverse=False), + Row("DenseReluDense.wi", "Attention.o", "DenseReluDense.wo"), + Update("d_model", "n_heads", "inner_dim"), + ], + GPT2=[ + Col("c_attn", reverse=False, fuse=True), + Col("q_attn", reverse=False), + Row("c_proj", reverse=False), + Update("embed_dim", "split_size", "num_heads"), + ], + Electra=[ + Col("query", "key", "value", "intermediate.dense"), + Row("output.dense"), + Update("num_attention_heads", "all_head_size"), + ], + Roberta=[ + Col("query", "key", "value", "intermediate.dense"), + Row("output.dense"), + Update("num_attention_heads", "all_head_size"), + ], + ) + + def __init__(self): + cache_tp_mapping = {} + + for cls_name, mapping in self.__MAPPING__.items(): + cls = self._load_class_by_model_name(cls_name) + cache_tp_mapping[cls] = [] + + for elem in mapping: + for name in elem.name: + copy_elem = copy.deepcopy(elem) + copy_elem.name = name + cache_tp_mapping[cls].append(copy_elem) + + self.__MAPPING__ = {cls: {} for cls in cache_tp_mapping} + # clear exist mapping rather than making new mapping dict + + for cls, mapping in cache_tp_mapping.items(): + for elem in mapping: + if elem.code in self.__MAPPING__[cls]: + self.__MAPPING__[cls][elem.code].append(elem) + else: + self.__MAPPING__[cls][elem.code] = [elem] + + @staticmethod + def _load_class_by_model_name(model_name): + """ + Load base class obj by class name + Args: + model_name (str): model name (e.g. Bert, GPT2, T5, ...) + + Returns: + class: XXXPreTrainedModel + """ + transformers = importlib.import_module("transformers") + cls = getattr(transformers, f"{model_name}PreTrainedModel", None) + if cls is None: + cls = getattr(transformers, f"{model_name}PretrainedModel", None) + assert cls is not None, f"Can not import the model named {cls}." + return cls + + def get_mapping(self, model): + """ + Get mapping by model obj + + Args: + model (PreTrainedModel): model object (e.g. 
BertForSequenceClassification) + + Returns: + dict: mapping by model + """ + for cls, mapping in self.__MAPPING__.items(): + if isinstance(model, cls): + return mapping + return None + + def column_parallel_params(self, model): + """ + Get list of column parallel param elements + + Args: + model (PreTrainedModel): model obj + + Returns: + List[COLUMN]: list of column parallel param elements + """ + mapping = self.get_mapping(model) + if mapping is not None: + return mapping["COL"] + + def row_parallel_params(self, model): + """ + Get list of row parallel param elements + + Args: + model (PreTrainedModel): model obj + + Returns: + List[ROW]: list of row parallel param elements + """ + mapping = self.get_mapping(model) + if mapping is not None: + return mapping["ROW"] + + def update_attrs(self, model): + """ + Get list of update attribute elements + + Args: + model (PreTrainedModel): model obj + + Returns: + List[UPDATE]: list of update attribute elements + """ + mapping = self.get_mapping(model) + if mapping is not None: + return mapping["UPDATE"] + + def search(self, model, param_name): + """ + Get element by parameter name + + Args: + model (PreTrainedModel): model obj + + Returns: + TPInfo: element by parameter name + """ + mapping = self.get_mapping(model) + count_contain_elem_in_param = False + param_split = param_name.split(".") + + for code, elem in mapping.items(): + elem_split = elem.name.split(".") + for _elem_split in elem_split: + if _elem_split in param_split: + count_contain_elem_in_param += 1 + if count_contain_elem_in_param == len(elem_split): + return elem + + return None + + def is_fused_param(self, model, param_name): + """ + Check whether the param is fused or not + + Args: + model (PreTrainedModel): model obj + param_name (str): name of parameter + + Returns: + bool: whether the param is fused or not + """ + elem = self.search(model, param_name) + if elem is not None: + return elem.fuse + + def get_fusion_degree(self, model, param_name, module): + """ + Get fusion degree + + Args: + model (PreTrainedModel): model obj + param_name (str): name of parameter + module (nn.Module): module that has `weight` parameter + + Returns: + int: fusion degree of module + """ + if self.is_fused_param(model, param_name) and hasattr(module, "weight"): + bigger = max(module.weight.size(0), module.weight.size(1)) + smaller = min(module.weight.size(0), module.weight.size(1)) + return bigger // smaller + return 1 + + def is_reversed_param(self, model, param_name): + """ + Check whether the parameter is reversed or not + + Args: + model (PreTrainedModel): model obj + param_name (str): name of parameter + + Returns: + bool: whether the param is reversed or not + """ + elem = self.search(model, param_name) + if elem is not None: + return elem.reverse + + def is_parallelizable_param(self, model, param_name): + """ + Check whether the parameter is parallelizable or not + + Args: + model (PreTrainedModel): model obj + param_name (str): name of parameter + + Returns: + bool: whether the param is parallelizable or not + """ + + elem = self.search(model, param_name) + if elem is not None: + return elem.parallel + + def assert_device_map(device_map, num_blocks): blocks = list(range(0, num_blocks)) diff --git a/src/transformers/utils/tensor_parallel_utils.py b/src/transformers/utils/tensor_parallel_utils.py deleted file mode 100644 index 936348ab3c4e73..00000000000000 --- a/src/transformers/utils/tensor_parallel_utils.py +++ /dev/null @@ -1,194 +0,0 @@ -# coding=utf-8 -# Copyright 2021 TUNiB Inc and The 
HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from .. import ( - BertPreTrainedModel, - GPT2PreTrainedModel, - RobertaPreTrainedModel, - T5PreTrainedModel, - TransfoXLPreTrainedModel, -) - - -""" -All the mapping for tensor parallelism. -This mapping is following the follow format. - -TENSOR_PARALLEL_MAPPING = { - PreTrainedModel class: { - "col": list of column parallel parameters, - "row": list of row parallel parameters, - "update": list of attributes to be updated, - "col_no_replacement": list of column parallel parameters without module replacement (opt) - "row_no_replacement": list of row parallel parameters without module replacement (opt), - ... - could be added more to avoid exceptions. - } -} - -Or if a model A has the same map with the other model B, define like: - -TENSOR_PARALLEL_MAPPING = { - PreTrainedModel class A: PreTrainedModel class B -} - -Then, call ``copy_mapping(PreTrainedModel class A)``. -""" - -TENSOR_PARALLEL_MAPPING = { - BertPreTrainedModel: { - "col": ["query", "key", "value", "intermediate.dense"], - "row": ["output.dense"], - "update": ["num_attention_heads", "all_head_size"], - }, - GPT2PreTrainedModel: { - "col": ["c_attn", "q_attn", "c_fc"], - "row": ["c_proj"], - "update": ["embed_dim", "split_size", "num_heads"], - }, - T5PreTrainedModel: { - "col": ["Attention.q", "Attention.k", "Attention.v", "DenseReluDense.wi"], - "row": ["Attention.o", "DenseReluDense.wo"], - "row_no_replacement": ["relative_attention_bias"], - "update": ["d_model", "n_heads", "inner_dim"], - }, - RobertaPreTrainedModel: BertPreTrainedModel, -} - - -def copy_mapping(model_cls): - TENSOR_PARALLEL_MAPPING[model_cls] = TENSOR_PARALLEL_MAPPING[TENSOR_PARALLEL_MAPPING[model_cls]] - - -# Copy the same mapping. -copy_mapping(RobertaPreTrainedModel) - - -# ie. nn.Linear(3 * dim, dim) (opt) -FUSED_ATTENTION_MAPPING = { - GPT2PreTrainedModel: {"attn.c_attn": 3, "crossattention.c_attn": 2}, - TransfoXLPreTrainedModel: {"qkv_net": 3}, -} - -# ie. 
nn.Linear(out_dim, in_dim) or Conv1D() (opt) -REVERSED_PARAM_MAPPING = { - GPT2PreTrainedModel: ["attn", "crossattention", "mlp"], - TransfoXLPreTrainedModel: ["qkv_net"], -} - - -def get_mapping(model, mapping): - """ - Helper function to find - - Args: - model (PreTrainedModel): model object - mapping (Dict): map object - - Returns: - Any: mapping object - - Examples: - >>> lm_head_model = GPT2LMHeadModel.from_pretrained("gpt2") - >>> get_map(lm_head_model, TENSOR_PARALLEL_MAPPING) - { - "column_parallel": ["c_attn", "q_attn", "c_fc"], - "row_parallel": ["c_proj"], - "update_attrs": ["embed_dim", "split_size", "num_heads"], - } - >>> get_map(lm_head_model, FUSED_ATTENTION_MAPPING) - {"attn.c_attn": 3, "crossattention.c_attn": 2} - - >>> seq_clf_model = GPT2ForSequenceClassification.from_pretrained("gpt2") - >>> get_map(seq_clf_model, TENSOR_PARALLEL_MAPPING) - { - "column_parallel": ["c_attn", "q_attn", "c_fc"], - "row_parallel": ["c_proj"], - "update_attrs": ["embed_dim", "split_size", "num_heads"], - } - >>> get_map(seq_clf_model, FUSED_ATTENTION_MAPPING) - {"attn.c_attn": 3, "crossattention.c_attn": 2} - - """ - for pretrained_model_cls, value in mapping.items(): - if isinstance(model, pretrained_model_cls): - return value - - return None - - -def get_tensor_parallel_mapping(model): - """ - Get tensor parallel mapping by model - - Args: - model (PreTrainedModel): model object - - Returns: - Dict: tensor parallel mapping - - Examples: - >>> lm_head_model = GPT2LMHeadModel.from_pretrained("gpt2") - >>> get_tensor_parallel_mapping(lm_head_model) - { - "column_parallel": ["c_attn", "q_attn", "c_fc"], - "row_parallel": ["c_proj"], - "update_attrs": ["embed_dim", "split_size", "num_heads"], - } - """ - return get_mapping(model, TENSOR_PARALLEL_MAPPING) - - -def is_reversed_param(model, param_name): - """ - Check reversed parameter or not. - e.g. ``Conv1D`` of GPT2 and ``qkv_net`` of TransfoXL have reversed parameters - - Args: - model (PreTrainedModel): model object - param_name (str): the name of parameter (e.g. 'transformer.h.0.attn...') - - Returns: - bool: whether reversed parameter or not. - """ - mapping = get_mapping(model, REVERSED_PARAM_MAPPING) - - if mapping is not None: - return any([i in param_name for i in mapping]) - - return False - - -def get_fusion_degree(model, param_name): - """ - Get fused attention layer degree - - Args: - model (PreTrainedModel): model object - param_name (str): the name of parameter (e.g. 'transformer.h.0.attn...') - - Notes: - The `c_attn` layer that has size of (dim * 3, dim) in GPT2. - In this case, the fusion degree is 3. - - Returns: - int: the fusion degree - """ - mapping = get_mapping(model, FUSED_ATTENTION_MAPPING) - - if mapping is not None: - for key, degree in mapping.items(): - if key in param_name: - return degree - return 1 From 320a6e0382315d88f852fc733af768d773a36ed8 Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Sat, 29 Jan 2022 19:20:44 +0900 Subject: [PATCH 09/20] remove all the docstring --- .../utils/model_parallel_utils.py | 105 ------------------ 1 file changed, 105 deletions(-) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index 54729fcb04d25d..a1eae461353068 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -18,16 +18,6 @@ class TPInfo(object): - """ - A class to describe tensor parallelization information. 
- - Args: - name (Tuple[str]): the name of parameter - fuse (int): the degree of fusion - parallel (bool): parallelizable param or not - reverse (bool): reversed param or not - """ - def __init__( self, *name, @@ -120,14 +110,6 @@ def __init__(self): @staticmethod def _load_class_by_model_name(model_name): - """ - Load base class obj by class name - Args: - model_name (str): model name (e.g. Bert, GPT2, T5, ...) - - Returns: - class: XXXPreTrainedModel - """ transformers = importlib.import_module("transformers") cls = getattr(transformers, f"{model_name}PreTrainedModel", None) if cls is None: @@ -136,72 +118,27 @@ def _load_class_by_model_name(model_name): return cls def get_mapping(self, model): - """ - Get mapping by model obj - - Args: - model (PreTrainedModel): model object (e.g. BertForSequenceClassification) - - Returns: - dict: mapping by model - """ for cls, mapping in self.__MAPPING__.items(): if isinstance(model, cls): return mapping return None def column_parallel_params(self, model): - """ - Get list of column parallel param elements - - Args: - model (PreTrainedModel): model obj - - Returns: - List[COLUMN]: list of column parallel param elements - """ mapping = self.get_mapping(model) if mapping is not None: return mapping["COL"] def row_parallel_params(self, model): - """ - Get list of row parallel param elements - - Args: - model (PreTrainedModel): model obj - - Returns: - List[ROW]: list of row parallel param elements - """ mapping = self.get_mapping(model) if mapping is not None: return mapping["ROW"] def update_attrs(self, model): - """ - Get list of update attribute elements - - Args: - model (PreTrainedModel): model obj - - Returns: - List[UPDATE]: list of update attribute elements - """ mapping = self.get_mapping(model) if mapping is not None: return mapping["UPDATE"] def search(self, model, param_name): - """ - Get element by parameter name - - Args: - model (PreTrainedModel): model obj - - Returns: - TPInfo: element by parameter name - """ mapping = self.get_mapping(model) count_contain_elem_in_param = False param_split = param_name.split(".") @@ -217,32 +154,11 @@ def search(self, model, param_name): return None def is_fused_param(self, model, param_name): - """ - Check whether the param is fused or not - - Args: - model (PreTrainedModel): model obj - param_name (str): name of parameter - - Returns: - bool: whether the param is fused or not - """ elem = self.search(model, param_name) if elem is not None: return elem.fuse def get_fusion_degree(self, model, param_name, module): - """ - Get fusion degree - - Args: - model (PreTrainedModel): model obj - param_name (str): name of parameter - module (nn.Module): module that has `weight` parameter - - Returns: - int: fusion degree of module - """ if self.is_fused_param(model, param_name) and hasattr(module, "weight"): bigger = max(module.weight.size(0), module.weight.size(1)) smaller = min(module.weight.size(0), module.weight.size(1)) @@ -250,32 +166,11 @@ def get_fusion_degree(self, model, param_name, module): return 1 def is_reversed_param(self, model, param_name): - """ - Check whether the parameter is reversed or not - - Args: - model (PreTrainedModel): model obj - param_name (str): name of parameter - - Returns: - bool: whether the param is reversed or not - """ elem = self.search(model, param_name) if elem is not None: return elem.reverse def is_parallelizable_param(self, model, param_name): - """ - Check whether the parameter is parallelizable or not - - Args: - model (PreTrainedModel): model obj - param_name (str): 
name of parameter - - Returns: - bool: whether the param is parallelizable or not - """ - elem = self.search(model, param_name) if elem is not None: return elem.parallel From ded2fd70bff43a34bb0799718fc1709101f9a936 Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Sat, 29 Jan 2022 19:47:18 +0900 Subject: [PATCH 10/20] Add code --- src/transformers/utils/model_parallel_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index a1eae461353068..9c1032d3536c3f 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -31,6 +31,7 @@ def __init__( self.fuse = fuse self.reverse = reverse self.parallel = parallel + self.code = None def __str__(self): return f"{self.__class__.__qualname__}({self.name})" From 5e9b0ef6e895e978a4f74d4bac540f76867a52d7 Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Sat, 29 Jan 2022 20:52:47 +0900 Subject: [PATCH 11/20] Fix bug --- .../utils/model_parallel_utils.py | 51 ++++++++++--------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index 9c1032d3536c3f..52062c05ee1823 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -16,22 +16,17 @@ import importlib from math import ceil - class TPInfo(object): + def __init__( self, *name, fuse: bool = False, - parallel: bool = True, - reverse: bool = True, - # nn.Linear stores data reversely. - # nn.Linear(in, out) -> Parameter(out, int) + reverse: bool = False, ): self.name = name self.fuse = fuse self.reverse = reverse - self.parallel = parallel - self.code = None def __str__(self): return f"{self.__class__.__qualname__}({self.name})" @@ -40,9 +35,9 @@ def __repr__(self): return self.__str__() -Col = type("COLUMN", (TPInfo,), {"code": "COLUMN"}) -Row = type("ROW", (TPInfo,), {"code": "ROW"}) -Update = type("UPDATE", (TPInfo,), {"code": "UPDATE", "parallel": False}) +Col = type("COLUMN", (TPInfo,), {"code": "Col"}) +Row = type("ROW", (TPInfo,), {"code": "Row"}) +Update = type("UPDATE", (TPInfo,), {"code": "Update", "parallel": False}) class TPMapping(object): @@ -64,14 +59,14 @@ class TPMapping(object): ], T5=[ Col("Attention.q", "Attention.k", "Attention.v"), - Col("relative_attention_bias", reverse=False), + Col("relative_attention_bias", reverse=True), Row("DenseReluDense.wi", "Attention.o", "DenseReluDense.wo"), Update("d_model", "n_heads", "inner_dim"), ], GPT2=[ - Col("c_attn", reverse=False, fuse=True), - Col("q_attn", reverse=False), - Row("c_proj", reverse=False), + Col("c_attn", reverse=True, fuse=True), + Col("q_attn", reverse=True), + Row("c_proj", reverse=True), Update("embed_dim", "split_size", "num_heads"), ], Electra=[ @@ -127,27 +122,33 @@ def get_mapping(self, model): def column_parallel_params(self, model): mapping = self.get_mapping(model) if mapping is not None: - return mapping["COL"] + return mapping["Col"] def row_parallel_params(self, model): mapping = self.get_mapping(model) if mapping is not None: - return mapping["ROW"] + return mapping["Row"] def update_attrs(self, model): mapping = self.get_mapping(model) if mapping is not None: - return mapping["UPDATE"] + return mapping["Update"] def search(self, model, param_name): mapping = self.get_mapping(model) - count_contain_elem_in_param = False + count_contain_elem_in_param = 0 param_split = param_name.split(".") + 
first_check = [] + + for code, elems in mapping.items(): + for elem in elems: + if elem.name in param_name: + first_check.append(elem) - for code, elem in mapping.items(): + for elem in first_check: elem_split = elem.name.split(".") - for _elem_split in elem_split: - if _elem_split in param_split: + for split in elem_split: + if split in param_split: count_contain_elem_in_param += 1 if count_contain_elem_in_param == len(elem_split): return elem @@ -171,11 +172,15 @@ def is_reversed_param(self, model, param_name): if elem is not None: return elem.reverse - def is_parallelizable_param(self, model, param_name): + def is_column_parallel(self, model, param_name): elem = self.search(model, param_name) if elem is not None: - return elem.parallel + return elem.code == "Col" + def is_row_parallel(self, model, param_name): + elem = self.search(model, param_name) + if elem is not None: + return elem.code == "Row" def assert_device_map(device_map, num_blocks): blocks = list(range(0, num_blocks)) From c671435fafb0c5c80c8cc72af7d6564e086b8830 Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Sat, 29 Jan 2022 20:56:44 +0900 Subject: [PATCH 12/20] Style code --- src/transformers/utils/model_parallel_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index 52062c05ee1823..a0773f2bd588ee 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -16,8 +16,8 @@ import importlib from math import ceil -class TPInfo(object): +class TPInfo(object): def __init__( self, *name, @@ -182,6 +182,7 @@ def is_row_parallel(self, model, param_name): if elem is not None: return elem.code == "Row" + def assert_device_map(device_map, num_blocks): blocks = list(range(0, num_blocks)) From cb724a93e1fc613887df43c4d6572f847f162bff Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Sun, 30 Jan 2022 05:59:30 +0900 Subject: [PATCH 13/20] fuse to combined qkv --- .../utils/model_parallel_utils.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index a0773f2bd588ee..7e206b053867a0 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -21,11 +21,11 @@ class TPInfo(object): def __init__( self, *name, - fuse: bool = False, + combined_qkv: bool = False, reverse: bool = False, ): self.name = name - self.fuse = fuse + self.combined_qkv = combined_qkv self.reverse = reverse def __str__(self): @@ -37,7 +37,7 @@ def __repr__(self): Col = type("COLUMN", (TPInfo,), {"code": "Col"}) Row = type("ROW", (TPInfo,), {"code": "Row"}) -Update = type("UPDATE", (TPInfo,), {"code": "Update", "parallel": False}) +Update = type("UPDATE", (TPInfo,), {"code": "Update"}) class TPMapping(object): @@ -58,13 +58,12 @@ class TPMapping(object): Update("num_attention_heads", "all_head_size"), ], T5=[ - Col("Attention.q", "Attention.k", "Attention.v"), - Col("relative_attention_bias", reverse=True), - Row("DenseReluDense.wi", "Attention.o", "DenseReluDense.wo"), + Col("q", "k", "v", "DenseReluDense.wi"), + Row("o", "DenseReluDense.wo", "relative_attention_bias"), Update("d_model", "n_heads", "inner_dim"), ], GPT2=[ - Col("c_attn", reverse=True, fuse=True), + Col("c_attn", reverse=True, combined_qkv=True), Col("q_attn", reverse=True), Row("c_proj", reverse=True), Update("embed_dim", "split_size", 
"num_heads"), @@ -155,13 +154,13 @@ def search(self, model, param_name): return None - def is_fused_param(self, model, param_name): + def is_combined_qkv_param(self, model, param_name): elem = self.search(model, param_name) if elem is not None: - return elem.fuse + return elem.combined_qkv - def get_fusion_degree(self, model, param_name, module): - if self.is_fused_param(model, param_name) and hasattr(module, "weight"): + def get_combined_qkv_degree(self, model, param_name, module): + if self.is_combined_qkv_param(model, param_name) and hasattr(module, "weight"): bigger = max(module.weight.size(0), module.weight.size(1)) smaller = min(module.weight.size(0), module.weight.size(1)) return bigger // smaller From 844d27c4ae0e5cfca96c36fcde545b0f05d27860 Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Sun, 30 Jan 2022 09:22:01 +0900 Subject: [PATCH 14/20] Fix bug of mapping --- src/transformers/utils/model_parallel_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index 7e206b053867a0..5296f1966db0d7 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -44,7 +44,7 @@ class TPMapping(object): __MAPPING__ = dict( Albert=[ Col("query", "key", "value", "ffn"), - Row("dense", "ffn_output"), + Row("attention.dense", "ffn_output"), Update("num_attention_heads", "all_head_size"), ], Bart=[ @@ -64,7 +64,7 @@ class TPMapping(object): ], GPT2=[ Col("c_attn", reverse=True, combined_qkv=True), - Col("q_attn", reverse=True), + Col("c_fc", "q_attn", reverse=True), Row("c_proj", reverse=True), Update("embed_dim", "split_size", "num_heads"), ], From fb4af90aa337e2562f2fa1f91b3756583eb2e416 Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Sun, 30 Jan 2022 10:23:52 +0900 Subject: [PATCH 15/20] Add GPTNeo, GPTJ --- src/transformers/utils/model_parallel_utils.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index 5296f1966db0d7..36697ee57c2921 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -68,6 +68,16 @@ class TPMapping(object): Row("c_proj", reverse=True), Update("embed_dim", "split_size", "num_heads"), ], + GPTNeo=[ + Col("q_proj", "k_proj", "v_proj", "c_fc"), + Row("out_proj", "c_proj"), + Update("embed_dim", "num_heads"), + ], + GPTJ=[ + Col("q_proj", "k_proj", "v_proj", "fc_in"), + Row("out_proj", "fc_out"), + Update("embed_dim", "num_attention_heads"), + ], Electra=[ Col("query", "key", "value", "intermediate.dense"), Row("output.dense"), From 60ba04903aaf4bb84abf6ddb16826bff59b2f0a4 Mon Sep 17 00:00:00 2001 From: Kevin Ko Date: Sun, 6 Feb 2022 05:36:52 +0900 Subject: [PATCH 16/20] Update src/transformers/utils/model_parallel_utils.py Co-authored-by: Jake Tae --- src/transformers/utils/model_parallel_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index 36697ee57c2921..94bb8587cd7499 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -103,7 +103,7 @@ def __init__(self): copy_elem.name = name cache_tp_mapping[cls].append(copy_elem) - self.__MAPPING__ = {cls: {} for cls in cache_tp_mapping} + self.__MAPPING__ = {cls: defaultdict(list) for cls in cache_tp_mapping} 
# clear exist mapping rather than making new mapping dict for cls, mapping in cache_tp_mapping.items(): From b6ea797284795d3e559ef5e7195700ee51554e6b Mon Sep 17 00:00:00 2001 From: Kevin Ko Date: Sun, 6 Feb 2022 05:36:57 +0900 Subject: [PATCH 17/20] Update src/transformers/utils/model_parallel_utils.py Co-authored-by: Jake Tae --- src/transformers/utils/model_parallel_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index 94bb8587cd7499..d3adbf0d343a75 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -14,6 +14,7 @@ # limitations under the License. import copy import importlib +from collections import defaultdict from math import ceil From 958003b379c8f6e34d5d7f59973419e11e2846c1 Mon Sep 17 00:00:00 2001 From: Kevin Ko Date: Sun, 6 Feb 2022 05:37:02 +0900 Subject: [PATCH 18/20] Update src/transformers/utils/model_parallel_utils.py Co-authored-by: Jake Tae --- src/transformers/utils/model_parallel_utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index d3adbf0d343a75..241d2b29ba81a5 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -109,10 +109,7 @@ def __init__(self): for cls, mapping in cache_tp_mapping.items(): for elem in mapping: - if elem.code in self.__MAPPING__[cls]: - self.__MAPPING__[cls][elem.code].append(elem) - else: - self.__MAPPING__[cls][elem.code] = [elem] + self.__MAPPING__[cls][elem.code].append(elem) @staticmethod def _load_class_by_model_name(model_name): From 2f3230d980d4514bc28889a1c8392d6e817fb611 Mon Sep 17 00:00:00 2001 From: Kevin Ko Date: Sun, 6 Feb 2022 05:37:09 +0900 Subject: [PATCH 19/20] Update src/transformers/utils/model_parallel_utils.py Co-authored-by: Jake Tae --- src/transformers/utils/model_parallel_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index 241d2b29ba81a5..1e96e6f7940b5e 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -143,6 +143,8 @@ def update_attrs(self, model): def search(self, model, param_name): mapping = self.get_mapping(model) + if mapping is None: + raise ValueError(f"{model} does not support tensor parallelism.") count_contain_elem_in_param = 0 param_split = param_name.split(".") first_check = [] From b9e9ace52187359cebdc184a3c856fd9e67b3b1e Mon Sep 17 00:00:00 2001 From: Kevin Ko Date: Sun, 6 Feb 2022 05:37:13 +0900 Subject: [PATCH 20/20] Update src/transformers/utils/model_parallel_utils.py Co-authored-by: Jake Tae --- src/transformers/utils/model_parallel_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index 1e96e6f7940b5e..a34aa194d12f18 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -123,7 +123,7 @@ def _load_class_by_model_name(model_name): def get_mapping(self, model): for cls, mapping in self.__MAPPING__.items(): if isinstance(model, cls): - return mapping + return dict(mapping) return None def column_parallel_params(self, model):
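
Note for reviewers, appended after the series rather than inside any patch: a minimal usage sketch of the TPMapping helper as it stands after patch 20/20. It assumes a transformers checkout with this series applied; the import path below only mirrors the file modified here (src/transformers/utils/model_parallel_utils.py), none of this is a released API, and the printed values are illustrative.

    from transformers import GPT2LMHeadModel
    from transformers.utils.model_parallel_utils import TPMapping

    model = GPT2LMHeadModel.from_pretrained("gpt2")
    tp_mapping = TPMapping()

    # Per-architecture descriptors: column-/row-parallel parameters and the
    # attributes that must be updated after the weights are split.
    print(tp_mapping.column_parallel_params(model))  # [COLUMN(c_attn), COLUMN(c_fc), COLUMN(q_attn)]
    print(tp_mapping.row_parallel_params(model))     # [ROW(c_proj)]
    print(tp_mapping.update_attrs(model))            # [UPDATE(embed_dim), UPDATE(split_size), UPDATE(num_heads)]

    # Per-parameter queries used when deciding how to slice a given weight.
    name = "transformer.h.0.attn.c_attn"
    module = model.transformer.h[0].attn.c_attn
    print(tp_mapping.is_column_parallel(model, name))               # True
    print(tp_mapping.is_reversed_param(model, name))                # True: GPT2's Conv1D stores weights transposed vs. nn.Linear
    print(tp_mapping.get_combined_qkv_degree(model, name, module))  # 3: c_attn fuses the q/k/v projections

One behavior worth flagging in review: search() raises ValueError for unsupported architectures (patch 19), but for supported models it returns None when a parameter name matches no mapping entry, so is_column_parallel / is_row_parallel / is_reversed_param return None rather than False for non-parallelizable parameters such as embeddings or LayerNorm; callers should treat None as falsy.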