[GPU] activations scaling to resolve accuracy issues for infer precision of f16 #27265

Status: Open
Wants to merge 40 commits into base: master

Commits (40):
abf9c98: added the static scaling feature (e-ddykim, Oct 16, 2024)
c8d05b4: added a new rt_info scale_factor (e-ddykim, Oct 16, 2024)
2b2b658: fp16 scaling for vae decoder of sdxl (e-ddykim, Oct 24, 2024)
230c4f2: resolved accuracy issue in transformer of flux.1 (e-ddykim, Oct 27, 2024)
eb215f3: removed unnecessary codes (e-ddykim, Oct 27, 2024)
8ec7b80: removed unnecessary codes (e-ddykim, Oct 27, 2024)
9b2d48f: renamed to ActivationsScaling (e-ddykim, Oct 28, 2024)
f24dce6: updated code style (e-ddykim, Oct 28, 2024)
8897e19: updated to use multiple MatcherPass (e-ddykim, Oct 29, 2024)
e019834: updated code style (e-ddykim, Oct 29, 2024)
243e954: updated code style (e-ddykim, Oct 29, 2024)
267c74e: added unit tests (e-ddykim, Oct 29, 2024)
76e1aca: update code style (e-ddykim, Oct 29, 2024)
2656b57: updated code style (e-ddykim, Oct 29, 2024)
e831e72: updated code style (e-ddykim, Oct 29, 2024)
54e342c: updated code style (e-ddykim, Oct 29, 2024)
eb00539: updated for transformer of FLUX.1 (e-ddykim, Nov 4, 2024)
4fa0650: disabled FullyConnectedPerLayerScaling (e-ddykim, Nov 4, 2024)
409b236: added unit tests (e-ddykim, Nov 4, 2024)
8a80b6a: fixed code style (e-ddykim, Nov 4, 2024)
82c35b6: Enable FullyConnectedHorizontalFusion with activations scaling (andrew-k-park, Nov 5, 2024)
736a738: updated ScaleDownMultipleLayers (e-ddykim, Nov 11, 2024)
a8f7ca4: updated code style (e-ddykim, Nov 11, 2024)
753e870: reading ACTIVATIONS_SCALE_FACTOR from rt_info (e-ddykim, Nov 12, 2024)
8811c65: updated to use LPT (e-ddykim, Nov 20, 2024)
4704692: fixed for flux.1 dynamic model (e-ddykim, Nov 26, 2024)
126fd97: fix merging faults (e-ddykim, Nov 26, 2024)
4469c2c: fixes for flux.1 (e-ddykim, Nov 28, 2024)
98cc716: update not to add redundant Convert (e-ddykim, Nov 29, 2024)
86c5006: updated apply_rt_info (e-ddykim, Nov 29, 2024)
ec9c175: added a new ScaleDownFusion pass (e-ddykim, Dec 2, 2024)
300ec4e: added a new param useDefaultTransformation for activations scaling (e-ddykim, Dec 2, 2024)
ff0adb0: update code style (e-ddykim, Dec 2, 2024)
8452092: update code style (e-ddykim, Dec 2, 2024)
a736d9a: updated clamp_fp16 tests (e-ddykim, Dec 2, 2024)
22fdf6b: code cleanup (e-ddykim, Dec 2, 2024)
7683365: code cleanup (e-ddykim, Dec 3, 2024)
8b966bf: update code style (e-ddykim, Dec 3, 2024)
2f77bf8: remove redundant code (e-ddykim, Dec 3, 2024)
f7262b4: updated activations scaling tests (e-ddykim, Dec 3, 2024)
Files changed:
@@ -251,11 +251,13 @@ class LP_TRANSFORMATIONS_API LayerTransformation : public ov::pass::MatcherPass
         element::Type deqPrecision = element::f32,
         const std::vector<ov::element::Type> defaultPrecisions =
             { ov::element::u8, ov::element::i8 },
-        const bool reshapeIgnorePerTensorQuantizationCheck = false) :
+        const bool reshapeIgnorePerTensorQuantizationCheck = false,
+        const bool useDefaultTransformation = true) :
         updatePrecisions(updatePrecisions),
         deqPrecision(deqPrecision),
         defaultPrecisions(defaultPrecisions),
-        reshapeIgnorePerTensorQuantizationCheck(reshapeIgnorePerTensorQuantizationCheck) {}
+        reshapeIgnorePerTensorQuantizationCheck(reshapeIgnorePerTensorQuantizationCheck),
+        useDefaultTransformation(useDefaultTransformation) {}

     Params& setUpdatePrecisions(const bool updatePrecisions) {
         this->updatePrecisions = updatePrecisions;
@@ -280,6 +282,8 @@ class LP_TRANSFORMATIONS_API LayerTransformation : public ov::pass::MatcherPass
     std::vector<ov::element::Type> defaultPrecisions;
     // to support GPU workarround to keep Reshape and MatMul in FP32
     bool reshapeIgnorePerTensorQuantizationCheck;
+    // for MultiplyPartialTransformation to support Activations Scaling
+    bool useDefaultTransformation;
 };

 class PrecisionDetails {
@@ -351,6 +355,7 @@ class LP_TRANSFORMATIONS_API LayerTransformation : public ov::pass::MatcherPass
     element::Type deqPrecision;
     std::vector<ov::element::Type> defaultPrecisions;
     bool reshapeIgnorePerTensorQuantizationCheck;
+    bool useDefaultTransformation;

     static constexpr char originalLayerPostfix[] = "_original";
     TransformationContext* context;
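To make the new flag concrete, here is a minimal sketch of how a plugin might construct LPT Params with useDefaultTransformation disabled. The include path, namespace, and the leading updatePrecisions argument are assumptions (the hunk above only shows the trailing parameters), so treat this as illustration rather than the PR's actual call site:

```cpp
// Sketch only: constructing LPT Params with the new flag turned off, which
// selects the activations-scaling variant of MultiplyPartialTransformation.
#include "low_precision/layer_transformation.hpp"  // assumed include path

using ov::pass::low_precision::LayerTransformation;

LayerTransformation::Params make_scaling_params() {
    return LayerTransformation::Params(
        true,                                // updatePrecisions (assumed; not shown in the hunk)
        ov::element::f32,                    // deqPrecision
        {ov::element::u8, ov::element::i8},  // defaultPrecisions
        false,                               // reshapeIgnorePerTensorQuantizationCheck
        false);                              // useDefaultTransformation: keep the fused scale on the output
}
```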
@@ -45,6 +45,7 @@ LayerTransformation::LayerTransformation(const Params& params) :
     deqPrecision(params.deqPrecision),
     defaultPrecisions(params.defaultPrecisions),
     reshapeIgnorePerTensorQuantizationCheck(params.reshapeIgnorePerTensorQuantizationCheck),
+    useDefaultTransformation(params.useDefaultTransformation),
     context(nullptr) {}

 void LayerTransformation::setContext(TransformationContext* context) noexcept {
@@ -133,24 +133,30 @@ bool MultiplyPartialTransformation::transform(TransformationContext& context, ov

     // before: Y = (SC1 * (X1 - SH1)) * (SC2 * X2)
-    // after : Y = (SC1' * (X1 - SH1)) * (X2) , where :
-    //         SC1' = SC1 * SC2
+    // if useDefaultTransformation = true
+    // after : Y = (SC1' * (X1 - SH1)) * (X2) , where :
+    //         SC1' = SC1 * SC2
+    // else
+    // after : Y = ((X1 - SH1) * X2) * SC1' , where :
+    //         SC1' = SC1 * SC2
     auto newMultiplyValuesFullPath = fold<ov::opset1::Multiply>(multiplyValuesEmptyPath, multiplyValuesFullPath);
     OutputVector inputs{ {}, {} };
-    inputs[emptyPathIndex] = dequantizationEmptyPath.data;
+    inputs[emptyPathIndex] = useDefaultTransformation ? dequantizationEmptyPath.data : newMultiplyValuesFullPath;
+    auto input_for_fullPath = useDefaultTransformation ? newMultiplyValuesFullPath :
+                                                         dequantizationEmptyPath.data.get_node_shared_ptr();

     ov::Output<ov::Node> parent0 = dequantizationFullPath.subtract == nullptr ?
         (dequantizationFullPath.convert == nullptr ? dequantizationFullPath.data : dequantizationFullPath.convert) :
         dequantizationFullPath.subtract;

     inputs[fullPathIndex] =
-        parent0.get_node()->get_output_element_type(0) == newMultiplyValuesFullPath->get_output_element_type(0) ?
-        std::make_shared<ov::opset1::Multiply>(parent0, newMultiplyValuesFullPath) :
+        parent0.get_node()->get_output_element_type(0) == input_for_fullPath->get_output_element_type(0) ?
+        std::make_shared<ov::opset1::Multiply>(parent0, input_for_fullPath) :
         std::make_shared<ov::op::TypeRelaxed<ov::opset1::Multiply>>(
             std::vector<element::Type>{element::f32, element::f32},
             std::vector<element::Type>{element::f32},
             ov::op::TemporaryReplaceOutputType(parent0, element::f32).get(),
-            ov::op::TemporaryReplaceOutputType(newMultiplyValuesFullPath, element::f32).get());
+            ov::op::TemporaryReplaceOutputType(input_for_fullPath, element::f32).get());

     newMultiply = std::make_shared<ov::op::TypeRelaxed<ov::opset1::Multiply>>(
         std::vector<element::Type>{element::f32, element::f32},
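The two rewrites in the comment above are algebraically equivalent; the non-default form only attaches the fused scale SC1' = SC1 * SC2 as a separate multiply on the output instead of folding it into the empty-path dequantization. A small standalone sanity check with arbitrary constants (not code from the PR) illustrates this:

```cpp
// Standalone check that both rewrites equal the original expression.
#include <cassert>
#include <cmath>

int main() {
    const float X1 = 3.0f, SH1 = 1.0f, X2 = 4.0f;  // arbitrary activation values
    const float SC1 = 0.5f, SC2 = 0.25f;           // dequantization scales
    const float SC1p = SC1 * SC2;                  // SC1' = SC1 * SC2

    const float before       = (SC1 * (X1 - SH1)) * (SC2 * X2);  // original expression
    const float default_form = (SC1p * (X1 - SH1)) * X2;         // useDefaultTransformation = true
    const float scaled_form  = ((X1 - SH1) * X2) * SC1p;         // useDefaultTransformation = false

    assert(std::fabs(before - default_form) < 1e-6f);
    assert(std::fabs(before - scaled_form) < 1e-6f);
    return 0;
}
```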
@@ -0,0 +1,75 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <memory>
+
+#include "openvino/pass/matcher_pass.hpp"
+#include "transformations_visibility.hpp"
+
+namespace ov {
+namespace pass {
+
+class TRANSFORMATIONS_API ActivationsScaling;
+
+namespace activations_scaling {
+
+TRANSFORMATIONS_API void mark_as_scale_down_node(const std::shared_ptr<Node>& node);
+
+TRANSFORMATIONS_API bool is_scale_down_node(const std::shared_ptr<const Node>& node);
+
+class TRANSFORMATIONS_API ScaleDownNode : public RuntimeAttribute {
+public:
+    OPENVINO_RTTI("scale_down_node", "0");
+
+    bool is_copyable() const override {
+        return false;
+    }
+};
+
+class TRANSFORMATIONS_API ScaleDownSingleLayer;
+class TRANSFORMATIONS_API ScaleDownFusion;
+class TRANSFORMATIONS_API MulGroupNormTransformation;
+class TRANSFORMATIONS_API MulMVNTransformation;
+class TRANSFORMATIONS_API MulConcatTransformation;
+
+}  // namespace activations_scaling
+}  // namespace pass
+}  // namespace ov
+
+// ActivationsScaling makes activation values smaller to prevent overflow due to the limited range of FP16
+// This feature is controlled by ov::hint::activations_scale_factor.
+// For example, when this property is set as 16, activations are divided by 16.
+// If ov::hint::activations_scale_factor is less than zero, it is disabled.
+
+class ov::pass::activations_scaling::ScaleDownSingleLayer : public ov::pass::MatcherPass {
+public:
+    OPENVINO_RTTI("ScaleDownSingleLayer", "0");
+    ScaleDownSingleLayer(float scale_factor, ov::element::Type scaled_prec);
+};
+
+class ov::pass::activations_scaling::ScaleDownFusion : public ov::pass::MatcherPass {
+public:
+    OPENVINO_RTTI("ScaleDownFusion", "0");
+    ScaleDownFusion(float scale_factor);
+};
+
+class ov::pass::activations_scaling::MulGroupNormTransformation : public ov::pass::MatcherPass {
+public:
+    OPENVINO_RTTI("MulGroupNormTransformation", "0");
+    MulGroupNormTransformation();
+};
+
+class ov::pass::activations_scaling::MulMVNTransformation : public ov::pass::MatcherPass {
+public:
+    OPENVINO_RTTI("MulMVNTransformation", "0");
+    MulMVNTransformation();
+};
+
+class ov::pass::activations_scaling::MulConcatTransformation : public ov::pass::MatcherPass {
+public:
+    OPENVINO_RTTI("MulConcatTransformation", "0");
+    MulConcatTransformation();
+};
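The header above only declares the passes; per its comment, the feature is driven by the ov::hint::activations_scale_factor property. A minimal usage sketch follows, assuming the property is exposed exactly as named in that comment; the model path, device string, and scale value 8.0 are illustrative assumptions:

```cpp
#include "openvino/openvino.hpp"

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");  // placeholder path

    // Request f16 inference on GPU with activations scaled down by 8:
    // intermediate activations are divided by 8 to stay inside the FP16 range,
    // and the passes declared above propagate/undo the scale where needed.
    auto compiled = core.compile_model(model, "GPU",
                                       ov::hint::inference_precision(ov::element::f16),
                                       ov::hint::activations_scale_factor(8.0f));

    auto request = compiled.create_infer_request();
    return 0;
}
```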
@@ -12,7 +12,7 @@ namespace ov {

 TRANSFORMATIONS_API void mark_as_dequantization_node(const std::shared_ptr<Node>& node);

-TRANSFORMATIONS_API bool is_dequantization_node(const std::shared_ptr<Node>& node);
+TRANSFORMATIONS_API bool is_dequantization_node(const std::shared_ptr<const Node>& node);

 /**
  * @ingroup ov_runtime_attr_api