stack commits
lvdongyi committed Dec 2, 2024
1 parent d455181 commit 5df3873
Showing 18 changed files with 1,422 additions and 153 deletions.
34 changes: 0 additions & 34 deletions llm/config/qwen2moe/lora_argument.json

This file was deleted.

40 changes: 0 additions & 40 deletions llm/config/qwen2moe/pretrain_argument.json

This file was deleted.

33 changes: 0 additions & 33 deletions llm/config/qwen2moe/sft_argument.json

This file was deleted.

182 changes: 182 additions & 0 deletions llm/experimental/layers/cache_kv.py
@@ -0,0 +1,182 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import ParamAttr
from paddle.nn import Layer
from paddle.nn.initializer import Constant
from paddle.nn.quant.format import ConvertibleQuantedLayer


class CacheKVMatMul(Layer):
def __init__(self):
super().__init__()

def forward(self, x, y, transpose_x=False, transpose_y=False, name=None):
return paddle.matmul(x, y, transpose_x, transpose_y, name)


class QuantizedCacheKVMatMul(ConvertibleQuantedLayer):
def __init__(self, layer: Layer, q_config):
super().__init__()
# For FakeQuant
self.activation_quanter = None
self.weight_quanter = None
if q_config.activation is not None:
self.activation_quanter = q_config.activation._instance(layer)

def forward(self, x, y, transpose_x=False, transpose_y=False, name=None):
# quantize-dequantize (fake quant) the second matmul operand (the cached tensor)
if self.activation_quanter is not None:
y = self.activation_quanter(y)
return paddle.matmul(x, y, transpose_x, transpose_y, name)

def weights_to_quanters(self):
return [("weight", "weight_quanter")]

def activation_quanters(self):
return ["activation_quanter"]


class ShiftSmoothCacheKVMatMul(Layer):
"""
The computational logic of ShiftSmoothCacheKVMatMul is the same as CacheKVMatMul.
The only difference is that its inputs are shift-smoothed before the matmul.
"""

def __init__(self):
super().__init__()
self.sequence_parallel = False
self.dtype = None

def forward(
self,
x,
y,
transpose_x=False,
transpose_y=False,
perm_x=None,
perm_y=None,
use_smooth_x=False,
use_smooth_out=False,
name=None,
sequence_parallel=False,
):
self.sequence_parallel = sequence_parallel
# smooth
smooth_x, smooth_y = self._smooth(x, y, use_smooth_x)
# transpose
if perm_x is not None:
smooth_x = paddle.transpose(smooth_x, perm=perm_x)
if perm_y is not None:
smooth_y = paddle.transpose(smooth_y, perm=perm_y)
# matmul output
out = paddle.matmul(smooth_x, smooth_y, transpose_x, transpose_y, name)
if not use_smooth_out:
return out
else:
# combine heads
if self.sequence_parallel:
out = paddle.transpose(out, perm=[2, 0, 1, 3])
else:
out = paddle.transpose(out, perm=[0, 2, 1, 3])
return paddle.multiply(out, self.smooth_weight)

def _smooth(self, x, y, use_smooth_x):
# For ShiftSmooth
smooth_shape = [1]
self.dtype = y.dtype
if not hasattr(self, "smooth_weight"):
self.smooth_weight = self.create_parameter(
shape=smooth_shape, attr=ParamAttr(initializer=Constant(value=1.0)), dtype=self.dtype
)
smooth_y = y
smooth_y = paddle.divide(smooth_y, self.smooth_weight)

if use_smooth_x:
smooth_x = x
x = paddle.multiply(smooth_x, self.smooth_weight)
return x, smooth_y

def convert_weight(self, smooth_weight=None):
if smooth_weight is not None:
self.smooth_weight.set_value(smooth_weight.squeeze().cast(self.dtype))


class QuantizedShiftSmoothCacheKVMatMul(ConvertibleQuantedLayer):
"""
The computational logic of QuantizedShiftSmoothCacheKVMatMul is the same as ShiftSmoothCacheKVMatMul.
The only difference is that its activation input is fake-quantized before the matmul.
"""

def __init__(self, layer: Layer, q_config):
super().__init__()

# For FakeQuant
self.weight_quanter = None
self.activation_quanter = None
self.smooth_weight = layer.smooth_weight
if q_config.activation is not None:
self.activation_quanter = q_config.activation._instance(layer)

def forward(
self,
x,
y,
transpose_x=False,
transpose_y=False,
perm_x=None,
perm_y=None,
use_smooth_x=False,
use_smooth_out=False,
name=None,
sequence_parallel=False,
):
# smooth
smooth_x, smooth_y = self._smooth(x, y, use_smooth_x)
# quantize-dequantize (fake quant) the smoothed operand
if self.activation_quanter is not None:
smooth_y = self.activation_quanter(smooth_y)
# transpose
if perm_x is not None:
smooth_x = paddle.transpose(smooth_x, perm=perm_x)
if perm_y is not None:
smooth_y = paddle.transpose(smooth_y, perm=perm_y)
# matmul output
out = paddle.matmul(smooth_x, smooth_y, transpose_x, transpose_y, name)
if not use_smooth_out:
return out
else:
# combine heads
if sequence_parallel:
out = paddle.transpose(out, perm=[2, 0, 1, 3])
else:
out = paddle.transpose(out, perm=[0, 2, 1, 3])
return paddle.multiply(out, self.smooth_weight)

def _smooth(self, x, y, use_smooth_x):
# For ShiftSmooth
self.dtype = y.dtype
smooth_y = y
smooth_y = paddle.divide(smooth_y, self.smooth_weight)

if use_smooth_x:
smooth_x = x
x = paddle.multiply(smooth_x, self.smooth_weight)
return x, smooth_y

def weights_to_quanters(self):
return [("weight", "weight_quanter")]

def activation_quanters(self):
return ["activation_quanter"]
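Below is a minimal usage sketch for the layers added in this file. It is illustrative only: the import path assumes the repository root is on PYTHONPATH, and the tensor shapes are made-up examples rather than anything mandated by this commit.

# Hypothetical smoke test for the cache-KV matmul layers above (not part of the commit).
import paddle

from llm.experimental.layers.cache_kv import CacheKVMatMul, ShiftSmoothCacheKVMatMul

# Illustrative shapes: attention probabilities [batch, heads, seq, seq]
# against values [batch, heads, seq, head_dim].
probs = paddle.rand([1, 8, 16, 16])
value = paddle.rand([1, 8, 16, 64])

plain = CacheKVMatMul()
out = plain(probs, value)  # equivalent to paddle.matmul(probs, value)
print(out.shape)  # [1, 8, 16, 64]

smooth = ShiftSmoothCacheKVMatMul()
# With use_smooth_out=True the layer divides the second operand by its learnable
# smooth_weight, runs the matmul, transposes the result to [batch, seq, heads, head_dim],
# and multiplies smooth_weight back in.
out_smooth = smooth(probs, value, use_smooth_out=True)
print(out_smooth.shape)  # [1, 16, 8, 64]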
91 changes: 91 additions & 0 deletions llm/experimental/layers/custom_attention.py
@@ -0,0 +1,91 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Custom Attention Layer for quantization.
"""
# import paddle
import paddle.tensor as tensor
from paddle.nn import Layer
from paddle.nn.quant.format import ConvertibleQuantedLayer


class QuantizedCustomAttentionLayer(ConvertibleQuantedLayer):
"""
Quantized Custom Attention Layer.
"""

def __init__(self, layer: Layer, q_config=None):
"""
Initialize the QuantizeWrapper class.
Args:
layer (Layer): The layer to be quantized.
q_config (QuantConfig, optional): The quantization configuration. Defaults to None.
"""
super().__init__()
# hard-coded: reuse the weight quant config to build the K-cache activation quanter
self.activation_quanter_k = q_config.weight._instance(layer)
self.activation_quanter_v = q_config.activation._instance(layer)
self.layer = layer
self.enable_fake_quant = False
self.quant_info = None
layer_name = self.layer.full_name()
self.layer_id = int(layer_name.split("_")[-1])
self.kv_losses = {}

def forward(
self,
q,
config,
k,
v,
attention_mask,
output_attentions,
# alibi,
# attn_mask_startend_row_indices,
# sequence_parallel,
**kwargs
):
"""forward"""
if self.enable_fake_quant:
self.collect_kv_quant_policy(q, k, v, **kwargs)
perm = [0, 2, 1, 3] # [1, 2, 0, 3] if self.sequence_parallel else [0, 2, 1, 3]
tmp_k = tensor.transpose(x=k, perm=perm)
tmp_v = tensor.transpose(x=v, perm=perm)
if self.activation_quanter_k is not None:
tmp_k = self.activation_quanter_k(tmp_k)
if self.activation_quanter_v is not None:
tmp_v = self.activation_quanter_v(tmp_v)
k = tensor.transpose(x=tmp_k, perm=perm)
v = tensor.transpose(x=tmp_v, perm=perm)
return self.layer(
q,
config,
k,
v,
attention_mask,
output_attentions,
# alibi,
# attn_mask_startend_row_indices,
# sequence_parallel,
**kwargs,
)

def weights_to_quanters(self):
"""weights to quanters"""
return []

def activation_quanters(self):
"""activation to quanters"""
return ["activation_quanter_k", "activation_quanter_v"]
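A hedged sketch of how this wrapper might be exercised follows. The _StubQuantConfig and _ToyAttention classes are stand-ins invented for illustration (a real q_config would come from Paddle's quantization config machinery), and the [batch, seq_len, num_heads, head_dim] layout is an assumption inferred from the transpose pattern above.

# Illustrative-only sketch: the stub config and toy attention module below are
# assumptions for demonstration, not part of this commit or of the Paddle API.
import paddle
from paddle.nn import Layer

from llm.experimental.layers.custom_attention import QuantizedCustomAttentionLayer


class _IdentityQuanterFactory:
    """Stub for q_config.weight / q_config.activation; a real config would return a FakeQuanter layer."""

    def _instance(self, layer):
        return lambda x: x  # no-op "quanter" for demonstration


class _StubQuantConfig:
    weight = _IdentityQuanterFactory()
    activation = _IdentityQuanterFactory()


class _ToyAttention(Layer):
    """Toy attention with the (q, config, k, v, attention_mask, output_attentions)
    signature the wrapper forwards to; inputs assumed [batch, seq_len, num_heads, head_dim]."""

    def forward(self, q, config, k, v, attention_mask, output_attentions, **kwargs):
        qh, kh, vh = (x.transpose(perm=[0, 2, 1, 3]) for x in (q, k, v))  # -> [b, h, s, d]
        probs = paddle.nn.functional.softmax(paddle.matmul(qh, kh, transpose_y=True))
        out = paddle.matmul(probs, vh)
        return out.transpose(perm=[0, 2, 1, 3])  # back to [b, s, h, d]


q = paddle.rand([1, 16, 8, 64])
k = paddle.rand([1, 16, 8, 64])
v = paddle.rand([1, 16, 8, 64])

wrapped = QuantizedCustomAttentionLayer(_ToyAttention(), q_config=_StubQuantConfig())
# The wrapper transposes k/v to [batch, heads, seq, head_dim], applies the
# (here no-op) activation quanters, transposes back, and calls the wrapped layer.
out = wrapped(q, None, k, v, attention_mask=None, output_attentions=False)
print(out.shape)  # [1, 16, 8, 64]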