From ec54d819e9b238581e55f2bf4d17b7bc3c67b2f6 Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Thu, 27 Jan 2022 08:38:49 +0900 Subject: [PATCH 01/20] Add tensor parallelism related mappings --- .../utils/model_parallel_utils.py | 148 ++++++++++++++++++ 1 file changed, 148 insertions(+) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index abddd6c60faccf..3df7eeade1757d 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -15,6 +15,154 @@ from math import ceil +from transformers import BertPreTrainedModel, GPT2PreTrainedModel, TransfoXLPreTrainedModel + + +# ie. nn.Linear(3 * dim, dim) +# only defined the models that have fused attention layer. +FUSED_ATTENTION_MAPPING = { + GPT2PreTrainedModel: {"attn.c_attn": 3, "crossattention.c_attn": 2}, + TransfoXLPreTrainedModel: {"qkv_net": 3}, +} + +# ie. nn.Linear(out_dim, in_dim) or Conv1D() +# only defined the models that have reversed parameters. +REVERSED_PARAM_MAPPING = { + GPT2PreTrainedModel: ["attn", "crossattention", "mlp"], + TransfoXLPreTrainedModel: ["qkv_net"], +} + +# All the mapping for tensor parallelism +TENSOR_PARALLEL_MAPPING = { + BertPreTrainedModel: { + "column_parallel": ["query", "key", "value", "intermediate.dense"], + "row_parallel": ["output.dense"], + "update_attrs": ["num_attention_heads", "all_head_size"], + }, + GPT2PreTrainedModel: { + "column_parallel": ["c_attn", "q_attn", "c_fc"], + "row_parallel": ["c_proj"], + "update_attrs": ["embed_dim", "split_size", "num_heads"], + }, +} + + +def get_mapping(model, mapping): + """ + Helper function to find mapping by model + + Args: + model (PreTrainedModel): model object + mapping (Dict): mapping object + + Returns: + Any: mapping object + + Examples: + >>> lm_head_model = GPT2LMHeadModel.from_pretrained("gpt2") + >>> get_map(lm_head_model, TENSOR_PARALLEL_MAPPING) + { + "column_parallel": ["c_attn", "q_attn", "c_fc"], + "row_parallel": ["c_proj"], + "update_attrs": ["embed_dim", "split_size", "num_heads"], + } + >>> get_map(lm_head_model, FUSED_ATTENTION_MAPPING) + {"attn.c_attn": 3, "crossattention.c_attn": 2} + + >>> seq_clf_model = GPT2ForSequenceClassification.from_pretrained("gpt2") + >>> get_map(seq_clf_model, TENSOR_PARALLEL_MAPPING) + { + "column_parallel": ["c_attn", "q_attn", "c_fc"], + "row_parallel": ["c_proj"], + "update_attrs": ["embed_dim", "split_size", "num_heads"], + } + >>> get_map(seq_clf_model, FUSED_ATTENTION_MAPPING) + {"attn.c_attn": 3, "crossattention.c_attn": 2} + + """ + for pretrained_model_cls, value in mapping.items(): + if isinstance(model, pretrained_model_cls): + return value + + return None + + +def get_tensor_parallel_mapping(model): + """ + Get tensor parallel mapping by model + + Args: + model (PreTrainedModel): model object + + Returns: + Dict: tensor parallel mapping + + Examples: + >>> lm_head_model = GPT2LMHeadModel.from_pretrained("gpt2") + >>> get_tensor_parallel_mapping(lm_head_model) + { + "column_parallel": ["c_attn", "q_attn", "c_fc"], + "row_parallel": ["c_proj"], + "update_attrs": ["embed_dim", "split_size", "num_heads"], + } + """ + return get_mapping(model, TENSOR_PARALLEL_MAPPING) + + +def is_reversed_param(model, param_name): + """ + Check reversed parameter or not. + + Args: + model (PreTrainedModel): model object + param_name (str): the name of parameter (e.g. 
'transformer.h.0.attn...') + + Notes: + ``Conv1D`` of GPT2 and ``qkv_net`` of TransfoXL have reversed parameters + + Returns: + bool: whether reversed parameter or not. + + Examples: + >>> is_reversed_param(model, 'transformer.h.0.attn.c_attn') + True + >>> is_reversed_param(model, 'transformer.wte') + False + """ + mapping = get_mapping(model, REVERSED_PARAM_MAPPING) + + if mapping is not None: + return any([i in param_name for i in mapping]) + + return False + + +def get_fusion_degree(model, param_name): + """ + Get fused attention layer degree + + Args: + model (PreTrainedModel): model object + param_name (str): the name of parameter (e.g. 'transformer.h.0.attn...') + + Notes: + The `c_attn` layer in the self-attention layer of GPT2 is size of (dim * 3, dim). + In this case, the fusion degree is 3. + + The `c_attn` layer in the cross-attention attention layer of GPT2 is size of (dim * 2, dim). + In this case, the fusion degree is 2. + + Returns: + int: the fusion degree + """ + mapping = get_mapping(model, FUSED_ATTENTION_MAPPING) + + if mapping is not None: + for key, degree in mapping.items(): + if key in param_name: + return degree + return 1 + def assert_device_map(device_map, num_blocks): blocks = list(range(0, num_blocks)) From e79ff071eba3a224774b8393176dda8c87106a2d Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Thu, 27 Jan 2022 08:39:34 +0900 Subject: [PATCH 02/20] Modify 'transformers' to '..' --- src/transformers/utils/model_parallel_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index 3df7eeade1757d..199c0f0fc02936 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -15,7 +15,7 @@ from math import ceil -from transformers import BertPreTrainedModel, GPT2PreTrainedModel, TransfoXLPreTrainedModel +from .. import BertPreTrainedModel, GPT2PreTrainedModel, TransfoXLPreTrainedModel # ie. nn.Linear(3 * dim, dim) From 0d50c471cf21789940285c8288f85022cb1bd0ad Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Thu, 27 Jan 2022 08:55:42 +0900 Subject: [PATCH 03/20] Fix circular import problem --- .../utils/model_parallel_utils.py | 148 ------------------ .../utils/tensor_parallel_utils.py | 147 +++++++++++++++++ 2 files changed, 147 insertions(+), 148 deletions(-) create mode 100644 src/transformers/utils/tensor_parallel_utils.py diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index 199c0f0fc02936..abddd6c60faccf 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -15,154 +15,6 @@ from math import ceil -from .. import BertPreTrainedModel, GPT2PreTrainedModel, TransfoXLPreTrainedModel - - -# ie. nn.Linear(3 * dim, dim) -# only defined the models that have fused attention layer. -FUSED_ATTENTION_MAPPING = { - GPT2PreTrainedModel: {"attn.c_attn": 3, "crossattention.c_attn": 2}, - TransfoXLPreTrainedModel: {"qkv_net": 3}, -} - -# ie. nn.Linear(out_dim, in_dim) or Conv1D() -# only defined the models that have reversed parameters. 
-REVERSED_PARAM_MAPPING = { - GPT2PreTrainedModel: ["attn", "crossattention", "mlp"], - TransfoXLPreTrainedModel: ["qkv_net"], -} - -# All the mapping for tensor parallelism -TENSOR_PARALLEL_MAPPING = { - BertPreTrainedModel: { - "column_parallel": ["query", "key", "value", "intermediate.dense"], - "row_parallel": ["output.dense"], - "update_attrs": ["num_attention_heads", "all_head_size"], - }, - GPT2PreTrainedModel: { - "column_parallel": ["c_attn", "q_attn", "c_fc"], - "row_parallel": ["c_proj"], - "update_attrs": ["embed_dim", "split_size", "num_heads"], - }, -} - - -def get_mapping(model, mapping): - """ - Helper function to find mapping by model - - Args: - model (PreTrainedModel): model object - mapping (Dict): mapping object - - Returns: - Any: mapping object - - Examples: - >>> lm_head_model = GPT2LMHeadModel.from_pretrained("gpt2") - >>> get_map(lm_head_model, TENSOR_PARALLEL_MAPPING) - { - "column_parallel": ["c_attn", "q_attn", "c_fc"], - "row_parallel": ["c_proj"], - "update_attrs": ["embed_dim", "split_size", "num_heads"], - } - >>> get_map(lm_head_model, FUSED_ATTENTION_MAPPING) - {"attn.c_attn": 3, "crossattention.c_attn": 2} - - >>> seq_clf_model = GPT2ForSequenceClassification.from_pretrained("gpt2") - >>> get_map(seq_clf_model, TENSOR_PARALLEL_MAPPING) - { - "column_parallel": ["c_attn", "q_attn", "c_fc"], - "row_parallel": ["c_proj"], - "update_attrs": ["embed_dim", "split_size", "num_heads"], - } - >>> get_map(seq_clf_model, FUSED_ATTENTION_MAPPING) - {"attn.c_attn": 3, "crossattention.c_attn": 2} - - """ - for pretrained_model_cls, value in mapping.items(): - if isinstance(model, pretrained_model_cls): - return value - - return None - - -def get_tensor_parallel_mapping(model): - """ - Get tensor parallel mapping by model - - Args: - model (PreTrainedModel): model object - - Returns: - Dict: tensor parallel mapping - - Examples: - >>> lm_head_model = GPT2LMHeadModel.from_pretrained("gpt2") - >>> get_tensor_parallel_mapping(lm_head_model) - { - "column_parallel": ["c_attn", "q_attn", "c_fc"], - "row_parallel": ["c_proj"], - "update_attrs": ["embed_dim", "split_size", "num_heads"], - } - """ - return get_mapping(model, TENSOR_PARALLEL_MAPPING) - - -def is_reversed_param(model, param_name): - """ - Check reversed parameter or not. - - Args: - model (PreTrainedModel): model object - param_name (str): the name of parameter (e.g. 'transformer.h.0.attn...') - - Notes: - ``Conv1D`` of GPT2 and ``qkv_net`` of TransfoXL have reversed parameters - - Returns: - bool: whether reversed parameter or not. - - Examples: - >>> is_reversed_param(model, 'transformer.h.0.attn.c_attn') - True - >>> is_reversed_param(model, 'transformer.wte') - False - """ - mapping = get_mapping(model, REVERSED_PARAM_MAPPING) - - if mapping is not None: - return any([i in param_name for i in mapping]) - - return False - - -def get_fusion_degree(model, param_name): - """ - Get fused attention layer degree - - Args: - model (PreTrainedModel): model object - param_name (str): the name of parameter (e.g. 'transformer.h.0.attn...') - - Notes: - The `c_attn` layer in the self-attention layer of GPT2 is size of (dim * 3, dim). - In this case, the fusion degree is 3. - - The `c_attn` layer in the cross-attention attention layer of GPT2 is size of (dim * 2, dim). - In this case, the fusion degree is 2. 
- - Returns: - int: the fusion degree - """ - mapping = get_mapping(model, FUSED_ATTENTION_MAPPING) - - if mapping is not None: - for key, degree in mapping.items(): - if key in param_name: - return degree - return 1 - def assert_device_map(device_map, num_blocks): blocks = list(range(0, num_blocks)) diff --git a/src/transformers/utils/tensor_parallel_utils.py b/src/transformers/utils/tensor_parallel_utils.py new file mode 100644 index 00000000000000..a53725ca19ae8d --- /dev/null +++ b/src/transformers/utils/tensor_parallel_utils.py @@ -0,0 +1,147 @@ +from .. import BertPreTrainedModel, GPT2PreTrainedModel, TransfoXLPreTrainedModel + + +# ie. nn.Linear(3 * dim, dim) +# only defined the models that have fused attention layer. +FUSED_ATTENTION_MAPPING = { + GPT2PreTrainedModel: {"attn.c_attn": 3, "crossattention.c_attn": 2}, + TransfoXLPreTrainedModel: {"qkv_net": 3}, +} + +# ie. nn.Linear(out_dim, in_dim) or Conv1D() +# only defined the models that have reversed parameters. +REVERSED_PARAM_MAPPING = { + GPT2PreTrainedModel: ["attn", "crossattention", "mlp"], + TransfoXLPreTrainedModel: ["qkv_net"], +} + +# All the mapping for tensor parallelism +TENSOR_PARALLEL_MAPPING = { + BertPreTrainedModel: { + "column_parallel": ["query", "key", "value", "intermediate.dense"], + "row_parallel": ["output.dense"], + "update_attrs": ["num_attention_heads", "all_head_size"], + }, + GPT2PreTrainedModel: { + "column_parallel": ["c_attn", "q_attn", "c_fc"], + "row_parallel": ["c_proj"], + "update_attrs": ["embed_dim", "split_size", "num_heads"], + }, +} + + +def get_mapping(model, mapping): + """ + Helper function to find mapping by model + + Args: + model (PreTrainedModel): model object + mapping (Dict): mapping object + + Returns: + Any: mapping object + + Examples: + >>> lm_head_model = GPT2LMHeadModel.from_pretrained("gpt2") + >>> get_map(lm_head_model, TENSOR_PARALLEL_MAPPING) + { + "column_parallel": ["c_attn", "q_attn", "c_fc"], + "row_parallel": ["c_proj"], + "update_attrs": ["embed_dim", "split_size", "num_heads"], + } + >>> get_map(lm_head_model, FUSED_ATTENTION_MAPPING) + {"attn.c_attn": 3, "crossattention.c_attn": 2} + + >>> seq_clf_model = GPT2ForSequenceClassification.from_pretrained("gpt2") + >>> get_map(seq_clf_model, TENSOR_PARALLEL_MAPPING) + { + "column_parallel": ["c_attn", "q_attn", "c_fc"], + "row_parallel": ["c_proj"], + "update_attrs": ["embed_dim", "split_size", "num_heads"], + } + >>> get_map(seq_clf_model, FUSED_ATTENTION_MAPPING) + {"attn.c_attn": 3, "crossattention.c_attn": 2} + + """ + for pretrained_model_cls, value in mapping.items(): + if isinstance(model, pretrained_model_cls): + return value + + return None + + +def get_tensor_parallel_mapping(model): + """ + Get tensor parallel mapping by model + + Args: + model (PreTrainedModel): model object + + Returns: + Dict: tensor parallel mapping + + Examples: + >>> lm_head_model = GPT2LMHeadModel.from_pretrained("gpt2") + >>> get_tensor_parallel_mapping(lm_head_model) + { + "column_parallel": ["c_attn", "q_attn", "c_fc"], + "row_parallel": ["c_proj"], + "update_attrs": ["embed_dim", "split_size", "num_heads"], + } + """ + return get_mapping(model, TENSOR_PARALLEL_MAPPING) + + +def is_reversed_param(model, param_name): + """ + Check reversed parameter or not. + + Args: + model (PreTrainedModel): model object + param_name (str): the name of parameter (e.g. 'transformer.h.0.attn...') + + Notes: + ``Conv1D`` of GPT2 and ``qkv_net`` of TransfoXL have reversed parameters + + Returns: + bool: whether reversed parameter or not. 
+ + Examples: + >>> is_reversed_param(model, 'transformer.h.0.attn.c_attn') + True + >>> is_reversed_param(model, 'transformer.wte') + False + """ + mapping = get_mapping(model, REVERSED_PARAM_MAPPING) + + if mapping is not None: + return any([i in param_name for i in mapping]) + + return False + + +def get_fusion_degree(model, param_name): + """ + Get fused attention layer degree + + Args: + model (PreTrainedModel): model object + param_name (str): the name of parameter (e.g. 'transformer.h.0.attn...') + + Notes: + The `c_attn` layer in the self-attention layer of GPT2 is size of (dim * 3, dim). + In this case, the fusion degree is 3. + + The `c_attn` layer in the cross-attention attention layer of GPT2 is size of (dim * 2, dim). + In this case, the fusion degree is 2. + + Returns: + int: the fusion degree + """ + mapping = get_mapping(model, FUSED_ATTENTION_MAPPING) + + if mapping is not None: + for key, degree in mapping.items(): + if key in param_name: + return degree + return 1 From 04089dc577b1da3bc9436ff4fe02fe35c72e33a0 Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Thu, 27 Jan 2022 22:12:17 +0900 Subject: [PATCH 04/20] Add T5 --- .../utils/tensor_parallel_utils.py | 58 +++++++++++++------ 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/src/transformers/utils/tensor_parallel_utils.py b/src/transformers/utils/tensor_parallel_utils.py index a53725ca19ae8d..6360e0a8a7ab7b 100644 --- a/src/transformers/utils/tensor_parallel_utils.py +++ b/src/transformers/utils/tensor_parallel_utils.py @@ -1,32 +1,52 @@ -from .. import BertPreTrainedModel, GPT2PreTrainedModel, TransfoXLPreTrainedModel +from .. import BertPreTrainedModel, GPT2PreTrainedModel, T5PreTrainedModel, TransfoXLPreTrainedModel -# ie. nn.Linear(3 * dim, dim) -# only defined the models that have fused attention layer. -FUSED_ATTENTION_MAPPING = { - GPT2PreTrainedModel: {"attn.c_attn": 3, "crossattention.c_attn": 2}, - TransfoXLPreTrainedModel: {"qkv_net": 3}, -} +""" +All the mapping for tensor parallelism. +This mapping is following the follow format. -# ie. nn.Linear(out_dim, in_dim) or Conv1D() -# only defined the models that have reversed parameters. -REVERSED_PARAM_MAPPING = { - GPT2PreTrainedModel: ["attn", "crossattention", "mlp"], - TransfoXLPreTrainedModel: ["qkv_net"], +TENSOR_PARALLEL_MAPPING = { + PreTrainedModel class: { + "col": list of column parallel parameters, + "row": list of row parallel parameters, + "update": list of attributes to be updated, + "col_no_replacement": list of column parallel parameters without module replacement (Optional) + "row_no_replacement": list of row parallel parameters without module replacement (Optional), + ... + could be added more to avoid exceptions. 
+ } } -# All the mapping for tensor parallelism +""" TENSOR_PARALLEL_MAPPING = { BertPreTrainedModel: { - "column_parallel": ["query", "key", "value", "intermediate.dense"], - "row_parallel": ["output.dense"], - "update_attrs": ["num_attention_heads", "all_head_size"], + "col": ["query", "key", "value", "intermediate.dense"], + "row": ["output.dense"], + "update": ["num_attention_heads", "all_head_size"], }, GPT2PreTrainedModel: { - "column_parallel": ["c_attn", "q_attn", "c_fc"], - "row_parallel": ["c_proj"], - "update_attrs": ["embed_dim", "split_size", "num_heads"], + "col": ["c_attn", "q_attn", "c_fc"], + "row": ["c_proj"], + "update": ["embed_dim", "split_size", "num_heads"], }, + T5PreTrainedModel: { + "col": ["Attention.q", "Attention.k", "Attention.v", "DenseReluDense.wi"], + "row": ["Attention.o", "DenseReluDense.wo"], + "row_no_replacement": ["relative_attention_bias"], + "update": ["d_model", "n_heads", "inner_dim"], + }, +} + +# Optional: fused attention layers like nn.Linear(3 * dim, dim). +FUSED_ATTENTION_MAPPING = { + GPT2PreTrainedModel: {"attn.c_attn": 3, "crossattention.c_attn": 2}, + TransfoXLPreTrainedModel: {"qkv_net": 3}, +} + +# Optional: reversed parameters like nn.Linear(out_dim, in_dim) or Conv1D(). +REVERSED_PARAM_MAPPING = { + GPT2PreTrainedModel: ["attn", "crossattention", "mlp"], + TransfoXLPreTrainedModel: ["qkv_net"], } From 7353d3ee52d261150c464c5e5555cb36b964afd9 Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Fri, 28 Jan 2022 23:34:21 +0900 Subject: [PATCH 05/20] Add copy mapping --- .../utils/tensor_parallel_utils.py | 69 +++++++++++++------ 1 file changed, 49 insertions(+), 20 deletions(-) diff --git a/src/transformers/utils/tensor_parallel_utils.py b/src/transformers/utils/tensor_parallel_utils.py index 6360e0a8a7ab7b..b6773c745f6fe1 100644 --- a/src/transformers/utils/tensor_parallel_utils.py +++ b/src/transformers/utils/tensor_parallel_utils.py @@ -1,4 +1,24 @@ -from .. import BertPreTrainedModel, GPT2PreTrainedModel, T5PreTrainedModel, TransfoXLPreTrainedModel +# coding=utf-8 +# Copyright 2021 TUNiB Inc and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from transformers import ( + BertPreTrainedModel, + GPT2PreTrainedModel, + TransfoXLPreTrainedModel, + T5PreTrainedModel, + RobertaPreTrainedModel, +) """ @@ -10,14 +30,22 @@ "col": list of column parallel parameters, "row": list of row parallel parameters, "update": list of attributes to be updated, - "col_no_replacement": list of column parallel parameters without module replacement (Optional) - "row_no_replacement": list of row parallel parameters without module replacement (Optional), + "col_no_replacement": list of column parallel parameters without module replacement (opt) + "row_no_replacement": list of row parallel parameters without module replacement (opt), ... could be added more to avoid exceptions. 
} } +Or if a model A has the same map with the other model B, define like: + +TENSOR_PARALLEL_MAPPING = { + PreTrainedModel class A: PreTrainedModel class B +} + +Then, call ``copy_mapping(PreTrainedModel class A)``. """ + TENSOR_PARALLEL_MAPPING = { BertPreTrainedModel: { "col": ["query", "key", "value", "intermediate.dense"], @@ -35,15 +63,27 @@ "row_no_replacement": ["relative_attention_bias"], "update": ["d_model", "n_heads", "inner_dim"], }, + RobertaPreTrainedModel: BertPreTrainedModel, } -# Optional: fused attention layers like nn.Linear(3 * dim, dim). + +def copy_mapping(model_cls): + TENSOR_PARALLEL_MAPPING[model_cls] = TENSOR_PARALLEL_MAPPING[ + TENSOR_PARALLEL_MAPPING[model_cls] + ] + + +# Copy the same mapping. +copy_mapping(RobertaPreTrainedModel) + + +# ie. nn.Linear(3 * dim, dim) (opt) FUSED_ATTENTION_MAPPING = { GPT2PreTrainedModel: {"attn.c_attn": 3, "crossattention.c_attn": 2}, TransfoXLPreTrainedModel: {"qkv_net": 3}, } -# Optional: reversed parameters like nn.Linear(out_dim, in_dim) or Conv1D(). +# ie. nn.Linear(out_dim, in_dim) or Conv1D() (opt) REVERSED_PARAM_MAPPING = { GPT2PreTrainedModel: ["attn", "crossattention", "mlp"], TransfoXLPreTrainedModel: ["qkv_net"], @@ -52,11 +92,11 @@ def get_mapping(model, mapping): """ - Helper function to find mapping by model + Helper function to find Args: model (PreTrainedModel): model object - mapping (Dict): mapping object + mapping (Dict): map object Returns: Any: mapping object @@ -115,22 +155,14 @@ def get_tensor_parallel_mapping(model): def is_reversed_param(model, param_name): """ Check reversed parameter or not. + e.g. ``Conv1D`` of GPT2 and ``qkv_net`` of TransfoXL have reversed parameters Args: model (PreTrainedModel): model object param_name (str): the name of parameter (e.g. 'transformer.h.0.attn...') - Notes: - ``Conv1D`` of GPT2 and ``qkv_net`` of TransfoXL have reversed parameters - Returns: bool: whether reversed parameter or not. - - Examples: - >>> is_reversed_param(model, 'transformer.h.0.attn.c_attn') - True - >>> is_reversed_param(model, 'transformer.wte') - False """ mapping = get_mapping(model, REVERSED_PARAM_MAPPING) @@ -149,12 +181,9 @@ def get_fusion_degree(model, param_name): param_name (str): the name of parameter (e.g. 'transformer.h.0.attn...') Notes: - The `c_attn` layer in the self-attention layer of GPT2 is size of (dim * 3, dim). + The `c_attn` layer that has size of (dim * 3, dim) in GPT2. In this case, the fusion degree is 3. - The `c_attn` layer in the cross-attention attention layer of GPT2 is size of (dim * 2, dim). - In this case, the fusion degree is 2. 
- Returns: int: the fusion degree """ From bec6a8465e6bdd8129fcf8de7d488dcbfccadcd3 Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Fri, 28 Jan 2022 23:37:04 +0900 Subject: [PATCH 06/20] Style code --- src/transformers/utils/tensor_parallel_utils.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/transformers/utils/tensor_parallel_utils.py b/src/transformers/utils/tensor_parallel_utils.py index b6773c745f6fe1..bbfc61057b7db4 100644 --- a/src/transformers/utils/tensor_parallel_utils.py +++ b/src/transformers/utils/tensor_parallel_utils.py @@ -15,9 +15,9 @@ from transformers import ( BertPreTrainedModel, GPT2PreTrainedModel, - TransfoXLPreTrainedModel, - T5PreTrainedModel, RobertaPreTrainedModel, + T5PreTrainedModel, + TransfoXLPreTrainedModel, ) @@ -68,9 +68,7 @@ def copy_mapping(model_cls): - TENSOR_PARALLEL_MAPPING[model_cls] = TENSOR_PARALLEL_MAPPING[ - TENSOR_PARALLEL_MAPPING[model_cls] - ] + TENSOR_PARALLEL_MAPPING[model_cls] = TENSOR_PARALLEL_MAPPING[TENSOR_PARALLEL_MAPPING[model_cls]] # Copy the same mapping. From dc7e74f77c3476b935b83a1537a9921bef1b134d Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Fri, 28 Jan 2022 23:40:01 +0900 Subject: [PATCH 07/20] remove transformers dependancy --- src/transformers/utils/tensor_parallel_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/utils/tensor_parallel_utils.py b/src/transformers/utils/tensor_parallel_utils.py index bbfc61057b7db4..936348ab3c4e73 100644 --- a/src/transformers/utils/tensor_parallel_utils.py +++ b/src/transformers/utils/tensor_parallel_utils.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from transformers import ( +from .. import ( BertPreTrainedModel, GPT2PreTrainedModel, RobertaPreTrainedModel, From 6086772f3386defc7b6a3b4686eeede1f3fd718a Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Sat, 29 Jan 2022 18:13:39 +0900 Subject: [PATCH 08/20] Modify structure of tp mapping --- .../utils/model_parallel_utils.py | 267 +++++++++++++++++- .../utils/tensor_parallel_utils.py | 194 ------------- 2 files changed, 266 insertions(+), 195 deletions(-) delete mode 100644 src/transformers/utils/tensor_parallel_utils.py diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index abddd6c60faccf..54729fcb04d25d 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -12,10 +12,275 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import copy +import importlib from math import ceil +class TPInfo(object): + """ + A class to describe tensor parallelization information. + + Args: + name (Tuple[str]): the name of parameter + fuse (int): the degree of fusion + parallel (bool): parallelizable param or not + reverse (bool): reversed param or not + """ + + def __init__( + self, + *name, + fuse: bool = False, + parallel: bool = True, + reverse: bool = True, + # nn.Linear stores data reversely. 
+ # nn.Linear(in, out) -> Parameter(out, int) + ): + self.name = name + self.fuse = fuse + self.reverse = reverse + self.parallel = parallel + + def __str__(self): + return f"{self.__class__.__qualname__}({self.name})" + + def __repr__(self): + return self.__str__() + + +Col = type("COLUMN", (TPInfo,), {"code": "COLUMN"}) +Row = type("ROW", (TPInfo,), {"code": "ROW"}) +Update = type("UPDATE", (TPInfo,), {"code": "UPDATE", "parallel": False}) + + +class TPMapping(object): + __MAPPING__ = dict( + Albert=[ + Col("query", "key", "value", "ffn"), + Row("dense", "ffn_output"), + Update("num_attention_heads", "all_head_size"), + ], + Bart=[ + Col("q_proj", "k_proj", "v_proj", "fc1"), + Row("out_proj", "fc2"), + Update("embed_dim", "num_heads"), + ], + Bert=[ + Col("query", "key", "value", "intermediate.dense"), + Row("output.dense"), + Update("num_attention_heads", "all_head_size"), + ], + T5=[ + Col("Attention.q", "Attention.k", "Attention.v"), + Col("relative_attention_bias", reverse=False), + Row("DenseReluDense.wi", "Attention.o", "DenseReluDense.wo"), + Update("d_model", "n_heads", "inner_dim"), + ], + GPT2=[ + Col("c_attn", reverse=False, fuse=True), + Col("q_attn", reverse=False), + Row("c_proj", reverse=False), + Update("embed_dim", "split_size", "num_heads"), + ], + Electra=[ + Col("query", "key", "value", "intermediate.dense"), + Row("output.dense"), + Update("num_attention_heads", "all_head_size"), + ], + Roberta=[ + Col("query", "key", "value", "intermediate.dense"), + Row("output.dense"), + Update("num_attention_heads", "all_head_size"), + ], + ) + + def __init__(self): + cache_tp_mapping = {} + + for cls_name, mapping in self.__MAPPING__.items(): + cls = self._load_class_by_model_name(cls_name) + cache_tp_mapping[cls] = [] + + for elem in mapping: + for name in elem.name: + copy_elem = copy.deepcopy(elem) + copy_elem.name = name + cache_tp_mapping[cls].append(copy_elem) + + self.__MAPPING__ = {cls: {} for cls in cache_tp_mapping} + # clear exist mapping rather than making new mapping dict + + for cls, mapping in cache_tp_mapping.items(): + for elem in mapping: + if elem.code in self.__MAPPING__[cls]: + self.__MAPPING__[cls][elem.code].append(elem) + else: + self.__MAPPING__[cls][elem.code] = [elem] + + @staticmethod + def _load_class_by_model_name(model_name): + """ + Load base class obj by class name + Args: + model_name (str): model name (e.g. Bert, GPT2, T5, ...) + + Returns: + class: XXXPreTrainedModel + """ + transformers = importlib.import_module("transformers") + cls = getattr(transformers, f"{model_name}PreTrainedModel", None) + if cls is None: + cls = getattr(transformers, f"{model_name}PretrainedModel", None) + assert cls is not None, f"Can not import the model named {cls}." + return cls + + def get_mapping(self, model): + """ + Get mapping by model obj + + Args: + model (PreTrainedModel): model object (e.g. 
BertForSequenceClassification) + + Returns: + dict: mapping by model + """ + for cls, mapping in self.__MAPPING__.items(): + if isinstance(model, cls): + return mapping + return None + + def column_parallel_params(self, model): + """ + Get list of column parallel param elements + + Args: + model (PreTrainedModel): model obj + + Returns: + List[COLUMN]: list of column parallel param elements + """ + mapping = self.get_mapping(model) + if mapping is not None: + return mapping["COL"] + + def row_parallel_params(self, model): + """ + Get list of row parallel param elements + + Args: + model (PreTrainedModel): model obj + + Returns: + List[ROW]: list of row parallel param elements + """ + mapping = self.get_mapping(model) + if mapping is not None: + return mapping["ROW"] + + def update_attrs(self, model): + """ + Get list of update attribute elements + + Args: + model (PreTrainedModel): model obj + + Returns: + List[UPDATE]: list of update attribute elements + """ + mapping = self.get_mapping(model) + if mapping is not None: + return mapping["UPDATE"] + + def search(self, model, param_name): + """ + Get element by parameter name + + Args: + model (PreTrainedModel): model obj + + Returns: + TPInfo: element by parameter name + """ + mapping = self.get_mapping(model) + count_contain_elem_in_param = False + param_split = param_name.split(".") + + for code, elem in mapping.items(): + elem_split = elem.name.split(".") + for _elem_split in elem_split: + if _elem_split in param_split: + count_contain_elem_in_param += 1 + if count_contain_elem_in_param == len(elem_split): + return elem + + return None + + def is_fused_param(self, model, param_name): + """ + Check whether the param is fused or not + + Args: + model (PreTrainedModel): model obj + param_name (str): name of parameter + + Returns: + bool: whether the param is fused or not + """ + elem = self.search(model, param_name) + if elem is not None: + return elem.fuse + + def get_fusion_degree(self, model, param_name, module): + """ + Get fusion degree + + Args: + model (PreTrainedModel): model obj + param_name (str): name of parameter + module (nn.Module): module that has `weight` parameter + + Returns: + int: fusion degree of module + """ + if self.is_fused_param(model, param_name) and hasattr(module, "weight"): + bigger = max(module.weight.size(0), module.weight.size(1)) + smaller = min(module.weight.size(0), module.weight.size(1)) + return bigger // smaller + return 1 + + def is_reversed_param(self, model, param_name): + """ + Check whether the parameter is reversed or not + + Args: + model (PreTrainedModel): model obj + param_name (str): name of parameter + + Returns: + bool: whether the param is reversed or not + """ + elem = self.search(model, param_name) + if elem is not None: + return elem.reverse + + def is_parallelizable_param(self, model, param_name): + """ + Check whether the parameter is parallelizable or not + + Args: + model (PreTrainedModel): model obj + param_name (str): name of parameter + + Returns: + bool: whether the param is parallelizable or not + """ + + elem = self.search(model, param_name) + if elem is not None: + return elem.parallel + + def assert_device_map(device_map, num_blocks): blocks = list(range(0, num_blocks)) diff --git a/src/transformers/utils/tensor_parallel_utils.py b/src/transformers/utils/tensor_parallel_utils.py deleted file mode 100644 index 936348ab3c4e73..00000000000000 --- a/src/transformers/utils/tensor_parallel_utils.py +++ /dev/null @@ -1,194 +0,0 @@ -# coding=utf-8 -# Copyright 2021 TUNiB Inc and The 
HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from .. import ( - BertPreTrainedModel, - GPT2PreTrainedModel, - RobertaPreTrainedModel, - T5PreTrainedModel, - TransfoXLPreTrainedModel, -) - - -""" -All the mapping for tensor parallelism. -This mapping is following the follow format. - -TENSOR_PARALLEL_MAPPING = { - PreTrainedModel class: { - "col": list of column parallel parameters, - "row": list of row parallel parameters, - "update": list of attributes to be updated, - "col_no_replacement": list of column parallel parameters without module replacement (opt) - "row_no_replacement": list of row parallel parameters without module replacement (opt), - ... - could be added more to avoid exceptions. - } -} - -Or if a model A has the same map with the other model B, define like: - -TENSOR_PARALLEL_MAPPING = { - PreTrainedModel class A: PreTrainedModel class B -} - -Then, call ``copy_mapping(PreTrainedModel class A)``. -""" - -TENSOR_PARALLEL_MAPPING = { - BertPreTrainedModel: { - "col": ["query", "key", "value", "intermediate.dense"], - "row": ["output.dense"], - "update": ["num_attention_heads", "all_head_size"], - }, - GPT2PreTrainedModel: { - "col": ["c_attn", "q_attn", "c_fc"], - "row": ["c_proj"], - "update": ["embed_dim", "split_size", "num_heads"], - }, - T5PreTrainedModel: { - "col": ["Attention.q", "Attention.k", "Attention.v", "DenseReluDense.wi"], - "row": ["Attention.o", "DenseReluDense.wo"], - "row_no_replacement": ["relative_attention_bias"], - "update": ["d_model", "n_heads", "inner_dim"], - }, - RobertaPreTrainedModel: BertPreTrainedModel, -} - - -def copy_mapping(model_cls): - TENSOR_PARALLEL_MAPPING[model_cls] = TENSOR_PARALLEL_MAPPING[TENSOR_PARALLEL_MAPPING[model_cls]] - - -# Copy the same mapping. -copy_mapping(RobertaPreTrainedModel) - - -# ie. nn.Linear(3 * dim, dim) (opt) -FUSED_ATTENTION_MAPPING = { - GPT2PreTrainedModel: {"attn.c_attn": 3, "crossattention.c_attn": 2}, - TransfoXLPreTrainedModel: {"qkv_net": 3}, -} - -# ie. 
nn.Linear(out_dim, in_dim) or Conv1D() (opt) -REVERSED_PARAM_MAPPING = { - GPT2PreTrainedModel: ["attn", "crossattention", "mlp"], - TransfoXLPreTrainedModel: ["qkv_net"], -} - - -def get_mapping(model, mapping): - """ - Helper function to find - - Args: - model (PreTrainedModel): model object - mapping (Dict): map object - - Returns: - Any: mapping object - - Examples: - >>> lm_head_model = GPT2LMHeadModel.from_pretrained("gpt2") - >>> get_map(lm_head_model, TENSOR_PARALLEL_MAPPING) - { - "column_parallel": ["c_attn", "q_attn", "c_fc"], - "row_parallel": ["c_proj"], - "update_attrs": ["embed_dim", "split_size", "num_heads"], - } - >>> get_map(lm_head_model, FUSED_ATTENTION_MAPPING) - {"attn.c_attn": 3, "crossattention.c_attn": 2} - - >>> seq_clf_model = GPT2ForSequenceClassification.from_pretrained("gpt2") - >>> get_map(seq_clf_model, TENSOR_PARALLEL_MAPPING) - { - "column_parallel": ["c_attn", "q_attn", "c_fc"], - "row_parallel": ["c_proj"], - "update_attrs": ["embed_dim", "split_size", "num_heads"], - } - >>> get_map(seq_clf_model, FUSED_ATTENTION_MAPPING) - {"attn.c_attn": 3, "crossattention.c_attn": 2} - - """ - for pretrained_model_cls, value in mapping.items(): - if isinstance(model, pretrained_model_cls): - return value - - return None - - -def get_tensor_parallel_mapping(model): - """ - Get tensor parallel mapping by model - - Args: - model (PreTrainedModel): model object - - Returns: - Dict: tensor parallel mapping - - Examples: - >>> lm_head_model = GPT2LMHeadModel.from_pretrained("gpt2") - >>> get_tensor_parallel_mapping(lm_head_model) - { - "column_parallel": ["c_attn", "q_attn", "c_fc"], - "row_parallel": ["c_proj"], - "update_attrs": ["embed_dim", "split_size", "num_heads"], - } - """ - return get_mapping(model, TENSOR_PARALLEL_MAPPING) - - -def is_reversed_param(model, param_name): - """ - Check reversed parameter or not. - e.g. ``Conv1D`` of GPT2 and ``qkv_net`` of TransfoXL have reversed parameters - - Args: - model (PreTrainedModel): model object - param_name (str): the name of parameter (e.g. 'transformer.h.0.attn...') - - Returns: - bool: whether reversed parameter or not. - """ - mapping = get_mapping(model, REVERSED_PARAM_MAPPING) - - if mapping is not None: - return any([i in param_name for i in mapping]) - - return False - - -def get_fusion_degree(model, param_name): - """ - Get fused attention layer degree - - Args: - model (PreTrainedModel): model object - param_name (str): the name of parameter (e.g. 'transformer.h.0.attn...') - - Notes: - The `c_attn` layer that has size of (dim * 3, dim) in GPT2. - In this case, the fusion degree is 3. - - Returns: - int: the fusion degree - """ - mapping = get_mapping(model, FUSED_ATTENTION_MAPPING) - - if mapping is not None: - for key, degree in mapping.items(): - if key in param_name: - return degree - return 1 From 320a6e0382315d88f852fc733af768d773a36ed8 Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Sat, 29 Jan 2022 19:20:44 +0900 Subject: [PATCH 09/20] remove all the docstring --- .../utils/model_parallel_utils.py | 105 ------------------ 1 file changed, 105 deletions(-) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index 54729fcb04d25d..a1eae461353068 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -18,16 +18,6 @@ class TPInfo(object): - """ - A class to describe tensor parallelization information. 
- - Args: - name (Tuple[str]): the name of parameter - fuse (int): the degree of fusion - parallel (bool): parallelizable param or not - reverse (bool): reversed param or not - """ - def __init__( self, *name, @@ -120,14 +110,6 @@ def __init__(self): @staticmethod def _load_class_by_model_name(model_name): - """ - Load base class obj by class name - Args: - model_name (str): model name (e.g. Bert, GPT2, T5, ...) - - Returns: - class: XXXPreTrainedModel - """ transformers = importlib.import_module("transformers") cls = getattr(transformers, f"{model_name}PreTrainedModel", None) if cls is None: @@ -136,72 +118,27 @@ def _load_class_by_model_name(model_name): return cls def get_mapping(self, model): - """ - Get mapping by model obj - - Args: - model (PreTrainedModel): model object (e.g. BertForSequenceClassification) - - Returns: - dict: mapping by model - """ for cls, mapping in self.__MAPPING__.items(): if isinstance(model, cls): return mapping return None def column_parallel_params(self, model): - """ - Get list of column parallel param elements - - Args: - model (PreTrainedModel): model obj - - Returns: - List[COLUMN]: list of column parallel param elements - """ mapping = self.get_mapping(model) if mapping is not None: return mapping["COL"] def row_parallel_params(self, model): - """ - Get list of row parallel param elements - - Args: - model (PreTrainedModel): model obj - - Returns: - List[ROW]: list of row parallel param elements - """ mapping = self.get_mapping(model) if mapping is not None: return mapping["ROW"] def update_attrs(self, model): - """ - Get list of update attribute elements - - Args: - model (PreTrainedModel): model obj - - Returns: - List[UPDATE]: list of update attribute elements - """ mapping = self.get_mapping(model) if mapping is not None: return mapping["UPDATE"] def search(self, model, param_name): - """ - Get element by parameter name - - Args: - model (PreTrainedModel): model obj - - Returns: - TPInfo: element by parameter name - """ mapping = self.get_mapping(model) count_contain_elem_in_param = False param_split = param_name.split(".") @@ -217,32 +154,11 @@ def search(self, model, param_name): return None def is_fused_param(self, model, param_name): - """ - Check whether the param is fused or not - - Args: - model (PreTrainedModel): model obj - param_name (str): name of parameter - - Returns: - bool: whether the param is fused or not - """ elem = self.search(model, param_name) if elem is not None: return elem.fuse def get_fusion_degree(self, model, param_name, module): - """ - Get fusion degree - - Args: - model (PreTrainedModel): model obj - param_name (str): name of parameter - module (nn.Module): module that has `weight` parameter - - Returns: - int: fusion degree of module - """ if self.is_fused_param(model, param_name) and hasattr(module, "weight"): bigger = max(module.weight.size(0), module.weight.size(1)) smaller = min(module.weight.size(0), module.weight.size(1)) @@ -250,32 +166,11 @@ def get_fusion_degree(self, model, param_name, module): return 1 def is_reversed_param(self, model, param_name): - """ - Check whether the parameter is reversed or not - - Args: - model (PreTrainedModel): model obj - param_name (str): name of parameter - - Returns: - bool: whether the param is reversed or not - """ elem = self.search(model, param_name) if elem is not None: return elem.reverse def is_parallelizable_param(self, model, param_name): - """ - Check whether the parameter is parallelizable or not - - Args: - model (PreTrainedModel): model obj - param_name (str): 
name of parameter - - Returns: - bool: whether the param is parallelizable or not - """ - elem = self.search(model, param_name) if elem is not None: return elem.parallel From ded2fd70bff43a34bb0799718fc1709101f9a936 Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Sat, 29 Jan 2022 19:47:18 +0900 Subject: [PATCH 10/20] Add code --- src/transformers/utils/model_parallel_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index a1eae461353068..9c1032d3536c3f 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -31,6 +31,7 @@ def __init__( self.fuse = fuse self.reverse = reverse self.parallel = parallel + self.code = None def __str__(self): return f"{self.__class__.__qualname__}({self.name})" From 5e9b0ef6e895e978a4f74d4bac540f76867a52d7 Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Sat, 29 Jan 2022 20:52:47 +0900 Subject: [PATCH 11/20] Fix bug --- .../utils/model_parallel_utils.py | 51 ++++++++++--------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index 9c1032d3536c3f..52062c05ee1823 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -16,22 +16,17 @@ import importlib from math import ceil - class TPInfo(object): + def __init__( self, *name, fuse: bool = False, - parallel: bool = True, - reverse: bool = True, - # nn.Linear stores data reversely. - # nn.Linear(in, out) -> Parameter(out, int) + reverse: bool = False, ): self.name = name self.fuse = fuse self.reverse = reverse - self.parallel = parallel - self.code = None def __str__(self): return f"{self.__class__.__qualname__}({self.name})" @@ -40,9 +35,9 @@ def __repr__(self): return self.__str__() -Col = type("COLUMN", (TPInfo,), {"code": "COLUMN"}) -Row = type("ROW", (TPInfo,), {"code": "ROW"}) -Update = type("UPDATE", (TPInfo,), {"code": "UPDATE", "parallel": False}) +Col = type("COLUMN", (TPInfo,), {"code": "Col"}) +Row = type("ROW", (TPInfo,), {"code": "Row"}) +Update = type("UPDATE", (TPInfo,), {"code": "Update", "parallel": False}) class TPMapping(object): @@ -64,14 +59,14 @@ class TPMapping(object): ], T5=[ Col("Attention.q", "Attention.k", "Attention.v"), - Col("relative_attention_bias", reverse=False), + Col("relative_attention_bias", reverse=True), Row("DenseReluDense.wi", "Attention.o", "DenseReluDense.wo"), Update("d_model", "n_heads", "inner_dim"), ], GPT2=[ - Col("c_attn", reverse=False, fuse=True), - Col("q_attn", reverse=False), - Row("c_proj", reverse=False), + Col("c_attn", reverse=True, fuse=True), + Col("q_attn", reverse=True), + Row("c_proj", reverse=True), Update("embed_dim", "split_size", "num_heads"), ], Electra=[ @@ -127,27 +122,33 @@ def get_mapping(self, model): def column_parallel_params(self, model): mapping = self.get_mapping(model) if mapping is not None: - return mapping["COL"] + return mapping["Col"] def row_parallel_params(self, model): mapping = self.get_mapping(model) if mapping is not None: - return mapping["ROW"] + return mapping["Row"] def update_attrs(self, model): mapping = self.get_mapping(model) if mapping is not None: - return mapping["UPDATE"] + return mapping["Update"] def search(self, model, param_name): mapping = self.get_mapping(model) - count_contain_elem_in_param = False + count_contain_elem_in_param = 0 param_split = param_name.split(".") + 
first_check = [] + + for code, elems in mapping.items(): + for elem in elems: + if elem.name in param_name: + first_check.append(elem) - for code, elem in mapping.items(): + for elem in first_check: elem_split = elem.name.split(".") - for _elem_split in elem_split: - if _elem_split in param_split: + for split in elem_split: + if split in param_split: count_contain_elem_in_param += 1 if count_contain_elem_in_param == len(elem_split): return elem @@ -171,11 +172,15 @@ def is_reversed_param(self, model, param_name): if elem is not None: return elem.reverse - def is_parallelizable_param(self, model, param_name): + def is_column_parallel(self, model, param_name): elem = self.search(model, param_name) if elem is not None: - return elem.parallel + return elem.code == "Col" + def is_row_parallel(self, model, param_name): + elem = self.search(model, param_name) + if elem is not None: + return elem.code == "Row" def assert_device_map(device_map, num_blocks): blocks = list(range(0, num_blocks)) From c671435fafb0c5c80c8cc72af7d6564e086b8830 Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Sat, 29 Jan 2022 20:56:44 +0900 Subject: [PATCH 12/20] Style code --- src/transformers/utils/model_parallel_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index 52062c05ee1823..a0773f2bd588ee 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -16,8 +16,8 @@ import importlib from math import ceil -class TPInfo(object): +class TPInfo(object): def __init__( self, *name, @@ -182,6 +182,7 @@ def is_row_parallel(self, model, param_name): if elem is not None: return elem.code == "Row" + def assert_device_map(device_map, num_blocks): blocks = list(range(0, num_blocks)) From cb724a93e1fc613887df43c4d6572f847f162bff Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Sun, 30 Jan 2022 05:59:30 +0900 Subject: [PATCH 13/20] fuse to combined qkv --- .../utils/model_parallel_utils.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index a0773f2bd588ee..7e206b053867a0 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -21,11 +21,11 @@ class TPInfo(object): def __init__( self, *name, - fuse: bool = False, + combined_qkv: bool = False, reverse: bool = False, ): self.name = name - self.fuse = fuse + self.combined_qkv = combined_qkv self.reverse = reverse def __str__(self): @@ -37,7 +37,7 @@ def __repr__(self): Col = type("COLUMN", (TPInfo,), {"code": "Col"}) Row = type("ROW", (TPInfo,), {"code": "Row"}) -Update = type("UPDATE", (TPInfo,), {"code": "Update", "parallel": False}) +Update = type("UPDATE", (TPInfo,), {"code": "Update"}) class TPMapping(object): @@ -58,13 +58,12 @@ class TPMapping(object): Update("num_attention_heads", "all_head_size"), ], T5=[ - Col("Attention.q", "Attention.k", "Attention.v"), - Col("relative_attention_bias", reverse=True), - Row("DenseReluDense.wi", "Attention.o", "DenseReluDense.wo"), + Col("q", "k", "v", "DenseReluDense.wi"), + Row("o", "DenseReluDense.wo", "relative_attention_bias"), Update("d_model", "n_heads", "inner_dim"), ], GPT2=[ - Col("c_attn", reverse=True, fuse=True), + Col("c_attn", reverse=True, combined_qkv=True), Col("q_attn", reverse=True), Row("c_proj", reverse=True), Update("embed_dim", "split_size", 
"num_heads"), @@ -155,13 +154,13 @@ def search(self, model, param_name): return None - def is_fused_param(self, model, param_name): + def is_combined_qkv_param(self, model, param_name): elem = self.search(model, param_name) if elem is not None: - return elem.fuse + return elem.combined_qkv - def get_fusion_degree(self, model, param_name, module): - if self.is_fused_param(model, param_name) and hasattr(module, "weight"): + def get_combined_qkv_degree(self, model, param_name, module): + if self.is_combined_qkv_param(model, param_name) and hasattr(module, "weight"): bigger = max(module.weight.size(0), module.weight.size(1)) smaller = min(module.weight.size(0), module.weight.size(1)) return bigger // smaller From 844d27c4ae0e5cfca96c36fcde545b0f05d27860 Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Sun, 30 Jan 2022 09:22:01 +0900 Subject: [PATCH 14/20] Fix bug of mapping --- src/transformers/utils/model_parallel_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index 7e206b053867a0..5296f1966db0d7 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -44,7 +44,7 @@ class TPMapping(object): __MAPPING__ = dict( Albert=[ Col("query", "key", "value", "ffn"), - Row("dense", "ffn_output"), + Row("attention.dense", "ffn_output"), Update("num_attention_heads", "all_head_size"), ], Bart=[ @@ -64,7 +64,7 @@ class TPMapping(object): ], GPT2=[ Col("c_attn", reverse=True, combined_qkv=True), - Col("q_attn", reverse=True), + Col("c_fc", "q_attn", reverse=True), Row("c_proj", reverse=True), Update("embed_dim", "split_size", "num_heads"), ], From fb4af90aa337e2562f2fa1f91b3756583eb2e416 Mon Sep 17 00:00:00 2001 From: hyunwoongko Date: Sun, 30 Jan 2022 10:23:52 +0900 Subject: [PATCH 15/20] Add GPTNeo, GPTJ --- src/transformers/utils/model_parallel_utils.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index 5296f1966db0d7..36697ee57c2921 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -68,6 +68,16 @@ class TPMapping(object): Row("c_proj", reverse=True), Update("embed_dim", "split_size", "num_heads"), ], + GPTNeo=[ + Col("q_proj", "k_proj", "v_proj", "c_fc"), + Row("out_proj", "c_proj"), + Update("embed_dim", "num_heads"), + ], + GPTJ=[ + Col("q_proj", "k_proj", "v_proj", "fc_in"), + Row("out_proj", "fc_out"), + Update("embed_dim", "num_attention_heads"), + ], Electra=[ Col("query", "key", "value", "intermediate.dense"), Row("output.dense"), From 60ba04903aaf4bb84abf6ddb16826bff59b2f0a4 Mon Sep 17 00:00:00 2001 From: Kevin Ko Date: Sun, 6 Feb 2022 05:36:52 +0900 Subject: [PATCH 16/20] Update src/transformers/utils/model_parallel_utils.py Co-authored-by: Jake Tae --- src/transformers/utils/model_parallel_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index 36697ee57c2921..94bb8587cd7499 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -103,7 +103,7 @@ def __init__(self): copy_elem.name = name cache_tp_mapping[cls].append(copy_elem) - self.__MAPPING__ = {cls: {} for cls in cache_tp_mapping} + self.__MAPPING__ = {cls: defaultdict(list) for cls in cache_tp_mapping} 
# clear exist mapping rather than making new mapping dict for cls, mapping in cache_tp_mapping.items(): From b6ea797284795d3e559ef5e7195700ee51554e6b Mon Sep 17 00:00:00 2001 From: Kevin Ko Date: Sun, 6 Feb 2022 05:36:57 +0900 Subject: [PATCH 17/20] Update src/transformers/utils/model_parallel_utils.py Co-authored-by: Jake Tae --- src/transformers/utils/model_parallel_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index 94bb8587cd7499..d3adbf0d343a75 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -14,6 +14,7 @@ # limitations under the License. import copy import importlib +from collections import defaultdict from math import ceil From 958003b379c8f6e34d5d7f59973419e11e2846c1 Mon Sep 17 00:00:00 2001 From: Kevin Ko Date: Sun, 6 Feb 2022 05:37:02 +0900 Subject: [PATCH 18/20] Update src/transformers/utils/model_parallel_utils.py Co-authored-by: Jake Tae --- src/transformers/utils/model_parallel_utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index d3adbf0d343a75..241d2b29ba81a5 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -109,10 +109,7 @@ def __init__(self): for cls, mapping in cache_tp_mapping.items(): for elem in mapping: - if elem.code in self.__MAPPING__[cls]: - self.__MAPPING__[cls][elem.code].append(elem) - else: - self.__MAPPING__[cls][elem.code] = [elem] + self.__MAPPING__[cls][elem.code].append(elem) @staticmethod def _load_class_by_model_name(model_name): From 2f3230d980d4514bc28889a1c8392d6e817fb611 Mon Sep 17 00:00:00 2001 From: Kevin Ko Date: Sun, 6 Feb 2022 05:37:09 +0900 Subject: [PATCH 19/20] Update src/transformers/utils/model_parallel_utils.py Co-authored-by: Jake Tae --- src/transformers/utils/model_parallel_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index 241d2b29ba81a5..1e96e6f7940b5e 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -143,6 +143,8 @@ def update_attrs(self, model): def search(self, model, param_name): mapping = self.get_mapping(model) + if mapping is None: + raise ValueError(f"{model} does not support tensor parallelism.") count_contain_elem_in_param = 0 param_split = param_name.split(".") first_check = [] From b9e9ace52187359cebdc184a3c856fd9e67b3b1e Mon Sep 17 00:00:00 2001 From: Kevin Ko Date: Sun, 6 Feb 2022 05:37:13 +0900 Subject: [PATCH 20/20] Update src/transformers/utils/model_parallel_utils.py Co-authored-by: Jake Tae --- src/transformers/utils/model_parallel_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index 1e96e6f7940b5e..a34aa194d12f18 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -123,7 +123,7 @@ def _load_class_by_model_name(model_name): def get_mapping(self, model): for cls, mapping in self.__MAPPING__.items(): if isinstance(model, cls): - return mapping + return dict(mapping) return None def column_parallel_params(self, model):
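
Note for reviewers, appended after the series rather than inside any patch: a minimal usage sketch of the TPMapping helper as it stands after patch 20/20. It assumes a transformers checkout with this series applied; the import path below only mirrors the file modified here (src/transformers/utils/model_parallel_utils.py), none of this is a released API, and the printed values are illustrative.

    from transformers import GPT2LMHeadModel
    from transformers.utils.model_parallel_utils import TPMapping

    model = GPT2LMHeadModel.from_pretrained("gpt2")
    tp_mapping = TPMapping()

    # Per-architecture descriptors: column-/row-parallel parameters and the
    # attributes that must be updated after the weights are split.
    print(tp_mapping.column_parallel_params(model))  # [COLUMN(c_attn), COLUMN(c_fc), COLUMN(q_attn)]
    print(tp_mapping.row_parallel_params(model))     # [ROW(c_proj)]
    print(tp_mapping.update_attrs(model))            # [UPDATE(embed_dim), UPDATE(split_size), UPDATE(num_heads)]

    # Per-parameter queries used when deciding how to slice a given weight.
    name = "transformer.h.0.attn.c_attn"
    module = model.transformer.h[0].attn.c_attn
    print(tp_mapping.is_column_parallel(model, name))               # True
    print(tp_mapping.is_reversed_param(model, name))                # True: GPT2's Conv1D stores weights transposed vs. nn.Linear
    print(tp_mapping.get_combined_qkv_degree(model, name, module))  # 3: c_attn fuses the q/k/v projections

One behavior worth flagging in review: search() raises ValueError for unsupported architectures (patch 19), but for supported models it returns None when a parameter name matches no mapping entry, so is_column_parallel / is_row_parallel / is_reversed_param return None rather than False for non-parallelizable parameters such as embeddings or LayerNorm; callers should treat None as falsy.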