[CPU][Ref] Support Reduce ops with empty input (#27603)
### Details:
- *The main part of this PR is contributed by #27438.*
- *My revision is placed in the last commit and covers changes to the Reduce node of the CPU plugin, mainly the following:*

1. [x64] Prevent the `divisor` in `reduce_kernel_post_process` from becoming zero, and enable post-ops fusion for ReduceMean.
2. [x64] Add `axesZeroDim` and `axesZeroDimFusing` to the test cases, so that all of the newly added test cases take exactly the newly added "early return" code block, where the input tensor is empty and the output tensor is not.
3. [x64] For empty input combined with low-precision post-ops fusion, use an intermediate buffer to set default results before post-ops fusion (a minimal sketch of this early-return path follows this list).
4. [arm] `makeExecutor` is skipped for empty input on ARM, because the ACL library does not support empty input tensors (e.g., `NEReduceMean::validate` returns an error). Besides, due to the early return, the executor is not needed anyway.
5. [arm] The ARM transformations ConvertReduceProd(Min, Max, Sum) are disabled for empty inputs to avoid producing an empty output.
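
For illustration only, the snippet below is a minimal, hypothetical sketch of the early-return behavior described in items 1–3: when the input tensor is empty but the output tensor is not, each output element is filled with a reduction-specific default value, any fused post-op is still applied, and the mean divisor is clamped to 1 so it can never be zero. It is not the plugin's actual code; `ReduceType`, `default_value`, and `reduce_empty_input` are names introduced here for the example, and the default values are assumptions.

```cpp
// Hypothetical sketch only: illustrates the intended empty-input behavior,
// not the actual OpenVINO CPU-plugin implementation.
#include <algorithm>
#include <cstddef>
#include <limits>
#include <vector>

enum class ReduceType { Sum, Prod, Max, Min, Mean };

// Assumed identity/default value written to the output when the reduced
// input is empty (e.g. a zero-sized dimension is reduced away).
float default_value(ReduceType type) {
    switch (type) {
    case ReduceType::Sum:
    case ReduceType::Mean: return 0.0f;                                  // empty sum/mean -> 0
    case ReduceType::Prod: return 1.0f;                                  // empty product -> 1
    case ReduceType::Max:  return std::numeric_limits<float>::lowest();  // identity of max
    case ReduceType::Min:  return std::numeric_limits<float>::max();     // identity of min
    }
    return 0.0f;
}

// Early-return path: the input is empty, the output is not. Fill the output
// with defaults, then run the (optional) fused post-op, e.g. a scale-shift.
void reduce_empty_input(std::vector<float>& dst,
                        ReduceType type,
                        std::size_t reduced_element_count,  // 0 for an empty input
                        float scale = 1.0f,
                        float shift = 0.0f) {
    // Guard the mean divisor so it can never be zero (item 1 above).
    const float divisor = reduced_element_count == 0
                              ? 1.0f
                              : static_cast<float>(reduced_element_count);

    std::fill(dst.begin(), dst.end(), default_value(type));
    for (auto& v : dst) {
        if (type == ReduceType::Mean)
            v /= divisor;          // a no-op for empty input, but safe by construction
        v = v * scale + shift;     // stands in for reduce_kernel_post_process
    }
}
```

Conceptually this mirrors the guard added in `reduce_kernel_post_process` below (`empty_input ? 1 : ...` for the divisor), which keeps the fused ReduceMean path well-defined even when there is nothing to reduce.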

### Tickets:
 - *[CVS-117469](https://jira.devtools.intel.com/browse/CVS-117469)*

---------

Co-authored-by: mitruska <[email protected]>
xuchen-intel and mitruska authored Nov 29, 2024
1 parent 301d6de commit 4070bea
Showing 17 changed files with 349 additions and 18 deletions.

@@ -72,7 +72,7 @@ ov::matcher_pass_callback ConvertReduceBase::convert_reduce_to_pooling() {
return [&](ov::pass::pattern::Matcher& m) {
auto reduce = std::dynamic_pointer_cast<T>(m.get_match_root());

if (!reduce || transformation_callback(reduce)) {
if (!reduce || transformation_callback(reduce) || ov::shape_size(reduce->input_value(0).get_shape()) == 0) {
return false;
}

4 changes: 4 additions & 0 deletions src/core/reference/include/openvino/reference/reduce_mean.hpp
@@ -26,6 +26,10 @@ void reduce_mean(const T* in, T* out, const Shape& in_shape, const AxisSet& redu
reduce_sum(in, out, in_shape, reduction_axes);

const auto out_shape = util::reduce(in_shape, reduction_axes);
if (shape_size(in_shape) == 0) {
return;
}

const auto out_size = shape_size(out_shape);
const auto count = static_cast<T>(shape_size(in_shape) / out_size);
std::transform(out, std::next(out, out_size), out, [count](const T value) {

64 changes: 48 additions & 16 deletions src/plugins/intel_cpu/src/nodes/reduce.cpp
@@ -2020,6 +2020,7 @@ void Reduce::initSupportedPrimitiveDescriptors() {
config.outConfs[0].setMemDesc(creatorsMap.at(outFormat)->createSharedDesc(outPrecision, getOutputShapeAtPort(0)));

if (useAclExecutor) {
#if defined (OV_CPU_WITH_ACL)
std::vector<MemoryDescPtr> srcMemoryDescs;
for (size_t i = 0; i < config.inConfs.size(); i++) {
srcMemoryDescs.push_back(config.inConfs[i].getMemDesc());
@@ -2034,22 +2035,29 @@
if (!factory->isEmpty()) {
supportedPrimitiveDescriptors.push_back({config, impl_type, factory});
}
#endif
} else {
supportedPrimitiveDescriptors.push_back({config, impl_type});
}
};

#if defined (OV_CPU_WITH_ACL)
reduceAttrs.operation = algorithm;
reduceAttrs.keepDims = keep_dims;
reduceAttrs.axes = raw_axes;
for (auto &axis : reduceAttrs.axes) {
if (axis < 0)
axis += static_cast<int>(getInputShapeAtPort(REDUCE_DATA).getRank());
// acl doesn't support empty input
if (!isDynamicNode() && shape_size(getInputShapeAtPort(REDUCE_DATA).getStaticDims()) == 0) {
canUseAclExecutor = false;
} else {
reduceAttrs.operation = algorithm;
reduceAttrs.keepDims = keep_dims;
reduceAttrs.axes = raw_axes;
for (auto &axis : reduceAttrs.axes) {
if (axis < 0)
axis += static_cast<int>(getInputShapeAtPort(REDUCE_DATA).getRank());
}
pushDesc(LayoutType::nspc, LayoutType::nspc, input_prec, output_prec, impl_desc_type::undef, true);
pushDesc(LayoutType::ncsp, LayoutType::ncsp, input_prec, output_prec, impl_desc_type::undef, true);
canUseAclExecutor = !supportedPrimitiveDescriptors.empty();
}
pushDesc(LayoutType::nspc, LayoutType::nspc, input_prec, output_prec, impl_desc_type::undef, true);
pushDesc(LayoutType::ncsp, LayoutType::ncsp, input_prec, output_prec, impl_desc_type::undef, true);
canUseAclExecutor = !supportedPrimitiveDescriptors.empty();

if (canUseAclExecutor)
return;
#endif
@@ -2089,10 +2097,16 @@ void Reduce::initSupportedPrimitiveDescriptors() {
}

bool Reduce::isExecutable() const {
return !isInputTensorAtPortEmpty(REDUCE_DATA);
return !isOutputTensorAtPortEmpty(0);
}

void Reduce::prepareParams() {
auto srcMemPtr = getSrcMemoryAtPort(REDUCE_DATA);
auto dstMemPtr = getDstMemoryAtPort(0);
const auto& src_shape = srcMemPtr->getStaticDims();
dst_size = dstMemPtr->getSize();
empty_input = shape_size(src_shape) == 0;
#if defined (OV_CPU_WITH_ACL)
if (canUseAclExecutor) {
std::vector<MemoryDescPtr> srcMemoryDescs;
for (size_t i = 0; i < getParentEdges().size(); i++) {
@@ -2102,11 +2116,15 @@ void Reduce::prepareParams() {
dstMemoryDescs.push_back(getDstMemoryAtPort(0)->getDescPtr());

auto selectedPD = getSelectedPrimitiveDescriptor();
aclExecPtr = selectedPD->getExecutorFactoryAs<ReduceExecutorFactory>()->makeExecutor(reduceAttrs, srcMemoryDescs, dstMemoryDescs, {});
selectedPD->setImplementationType(aclExecPtr->getImplType());

if (!empty_input) {
aclExecPtr = selectedPD->getExecutorFactoryAs<ReduceExecutorFactory>()->makeExecutor(reduceAttrs, srcMemoryDescs, dstMemoryDescs, {});
selectedPD->setImplementationType(aclExecPtr->getImplType());
} else {
selectedPD->setImplementationType(acl);
}
return;
}
#endif

src_dims = getParentEdgeAt(REDUCE_DATA)->getMemory().getDesc().getShape().getDims();
std::vector<int> reduce_axes;
@@ -2116,9 +2134,7 @@ void Reduce::prepareParams() {
reduce_axes = raw_axes;
}

auto dstMemPtr = getDstMemoryAtPort(0);
const VectorDims &dst_dims = dstMemPtr->getDesc().getShape().getDims();
dst_size = dstMemPtr->getSize();
calc_process_dst_dims(reduce_axes, dst_dims);
if (jit_mode) {
set_reduce_dim_flags();
@@ -2274,11 +2290,26 @@ void Reduce::execute(dnnl::stream strm) {
const uint8_t *src_data = srcMemPtr->getDataAs<const uint8_t>();
uint8_t *dst_data = dstMemPtr->getDataAs<uint8_t>();

if (empty_input && dst_size > 0) {
#if defined(OPENVINO_ARCH_X86_64)
output_info_reassign(&dst_data);
init_dst_data(dst_data, dst_size);
output_info_restore(&dst_data);
if (attr.get()->post_ops_.len() != 0) {
reduce_kernel_post_process(dst_data);
}
#else
init_dst_data(dst_data, dst_size);
#endif
return;
}

if (jit_mode) {
if (is_hybrid_layout) {
dst_data = reinterpret_cast<uint8_t *>(prc_mem.get_data_handle());
}
reduce_type(src_data, dst_data);
#if defined (OV_CPU_WITH_ACL)
} else if (aclExecPtr) {
std::vector<MemoryCPtr> srcMemory;
for (size_t i = 0; i < getParentEdges().size(); i++) {
@@ -2288,6 +2319,7 @@ void Reduce::execute(dnnl::stream strm) {
dstMemory.push_back(getDstMemoryAtPort(0));

aclExecPtr->exec(srcMemory, dstMemory, postOpsDataPtrs.data());
#endif
} else {
if (layout == ReduceLayoutType::reduce_ncsp) {
auto in_ptr = reinterpret_cast<const float *>(src_data);
@@ -2725,7 +2757,7 @@ inline void Reduce::reduce_kernel_process(const uint8_t *in_p, uint8_t *out_p, s

inline void Reduce::reduce_kernel_post_process(uint8_t *out_ptr) {
const uint8_t *in_ptr = fuse_low_precision ? static_cast<uint8_t *>(&intermediate_buf[0]) : nullptr;
const size_t integerDivisor = IB * IC * ID * IH * IW / (OB * OC * OD * OH * OW);
const size_t integerDivisor = empty_input ? 1 : IB * IC * ID * IH * IW / (OB * OC * OD * OH * OW);
const float divisor = static_cast<float>(integerDivisor);
if (layout == ReduceLayoutType::reduce_ncsp) {
parallel_for2d(OB, OC, [&](size_t ob, size_t oc) {

3 changes: 3 additions & 0 deletions src/plugins/intel_cpu/src/nodes/reduce.h
@@ -152,6 +152,7 @@ class Reduce : public Node {
bool ReduceCDW_opt = false;
bool use_aux_kernel = false;
bool set_use_aux_kernel = false;
bool empty_input = false;
bool ReduceN, ReduceC, ReduceD, ReduceH, ReduceW;
size_t IB, IC, ID, IH, IW;
size_t OB, OC, OD, OH, OW;
@@ -188,9 +189,11 @@ class Reduce : public Node {

std::string errorPrefix;

#if defined (OV_CPU_WITH_ACL)
ReduceAttrs reduceAttrs;
bool canUseAclExecutor = false;
std::shared_ptr<ReduceExecutor> aclExecPtr = nullptr;
#endif
};

} // namespace node

@@ -23,6 +23,9 @@ ov::matcher_pass_callback ov::intel_cpu::ConvertReduceMultiAxisBase::convert_red
if (!reduction_axes) {
return false;
}
if (!reduce->is_dynamic() && ov::shape_size(input0.get_shape()) == 0) {
return false;
}
if (ov::shape_size(input1.get_shape()) <= 1) {
return false;
}

@@ -243,6 +243,25 @@ const std::vector<ov::test::utils::ReductionType>& reductionTypes() {
return reductionTypes;
}

const std::vector<ov::test::utils::ReductionType>& reductionTypesArithmetic() {
static const std::vector<ov::test::utils::ReductionType> reductionTypesArithmetic = {
ov::test::utils::ReductionType::Mean,
ov::test::utils::ReductionType::Sum,
ov::test::utils::ReductionType::Prod,
ov::test::utils::ReductionType::L1,
ov::test::utils::ReductionType::L2,
};
return reductionTypesArithmetic;
}

const std::vector<ov::test::utils::ReductionType>& reductionTypesCompare() {
static const std::vector<ov::test::utils::ReductionType> reductionTypesCompare = {
ov::test::utils::ReductionType::Max,
ov::test::utils::ReductionType::Min,
};
return reductionTypesCompare;
}

const std::vector<ElementType>& inpOutPrc() {
static const std::vector<ElementType> inpOutPrc = {ElementType::f32};
return inpOutPrc;

@@ -52,6 +52,8 @@ const std::vector<std::vector<int>>& axes();
const std::vector<std::vector<int>>& axesND();
const std::vector<ov::test::utils::OpType>& opTypes();
const std::vector<utils::ReductionType>& reductionTypes();
const std::vector<utils::ReductionType>& reductionTypesArithmetic();
const std::vector<utils::ReductionType>& reductionTypesCompare();
const std::vector<ElementType>& inpOutPrc();
const std::vector<std::map<std::string, ov::element::Type>> additionalConfig();
const std::vector<std::map<std::string, ov::element::Type>> additionalConfigFP32();

@@ -20,6 +20,11 @@ std::vector<std::vector<ov::test::InputShape>> inputShapes_5D = {
{{{}, {{2, 19, 2, 2, 9}}}},
};

std::vector<std::vector<ov::test::InputShape>> inputShapes_5D_ZeroDim = {
{{{}, {{2, 19, 0, 2, 9}}}},
{{{}, {{2, 19, 0, 2, 0}}}},
};

const std::vector<std::vector<int>> axes5D = {
{2, 4},
{1, 2, 4},
@@ -70,6 +75,20 @@ const auto params_MultiAxis_5D_ref = testing::Combine(
testing::Values(emptyFusingSpec),
testing::ValuesIn(config_infer_prec_f32));

const auto params_MultiAxis_5D_ZeroDim_ref = testing::Combine(
testing::Combine(
testing::ValuesIn(axes5D),
testing::Values(ov::test::utils::OpType::VECTOR),
testing::Values(true),
testing::ValuesIn(reductionTypes()),
testing::ValuesIn(inpOutPrc()),
testing::Values(ElementType::undefined),
testing::Values(ElementType::undefined),
testing::ValuesIn(inputShapes_5D_ZeroDim)),
testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D_ref)),
testing::Values(emptyFusingSpec),
testing::ValuesIn(additionalConfigFP32()));

//There are dedicated instances of smoke_Reduce_MultiAxis_5D_CPU test in arm and x64 folders
//because ACL does not support 0 as reduction axis
INSTANTIATE_TEST_SUITE_P(
@@ -87,6 +106,13 @@ INSTANTIATE_TEST_SUITE_P(
ReduceCPULayerTest::getTestCaseName
);

INSTANTIATE_TEST_SUITE_P(
smoke_Reduce_MultiAxis_5D_ZeroDim_CPU_ref,
ReduceCPULayerTest,
params_MultiAxis_5D_ZeroDim_ref,
ReduceCPULayerTest::getTestCaseName
);

} // namespace
} // namespace Reduce
} // namespace test

@@ -53,6 +53,12 @@ std::vector<std::vector<ov::test::InputShape>> inputShapes_SingleBatch_dyn = {
{{{{1, 5}, 19, {1, 5}, {1, 10}}, {{1, 19, 2, 2}, {1, 19, 2, 9}}}},
};

std::vector<std::vector<ov::test::InputShape>> inputShapes_Dynmic_ZeroDim = {
{{{-1, -1, -1, -1}, {{2, 0, 3, 9}}}},
{{{2, 0, -1, -1}, {{2, 0, 3, 9}}}},
{{{2, 0, -1, -1}, {{2, 0, 3, 0}}}}
};

std::vector<CPUSpecificParams> cpuParams_3D = {
CPUSpecificParams({ncw}, {ncw}, {}, {}),
};
@@ -99,6 +105,10 @@ const std::vector<std::vector<int>> axesGather = {
{3}
};

const std::vector<std::vector<int>> axesZeroDimFusing = {
{1, 3},
};

std::vector<CPUSpecificParams> cpuParams_5D = {
CPUSpecificParams({nCdhw16c}, {nCdhw16c}, {}, {}),
CPUSpecificParams({ndhwc}, {ndhwc}, {}, {}),
@@ -144,6 +154,17 @@ const auto fusingFakeQuantizeTranspose = fusingSpecificParams{std::make_shared<p
return ov::builder::subgraph::makeTranspose(fakeQuantize, transpose);
}, "FakeQuantize(PerTensor)"}}), {"FakeQuantize"}};

const std::vector<fusingSpecificParams> fusingParamsFullSet {
emptyFusingSpec,
/* activations */
fusingSwish,
/* FQ */
fusingFakeQuantizePerChannelRelu,
fusingFakeQuantizePerTensorRelu,
/* another patterns */
fusingScaleShift
};

const std::vector<fusingSpecificParams> fusingParamsSet {
/* activations */
fusingSwish,
@@ -600,6 +621,34 @@ const auto params_LowPrecision_fusing = testing::Combine(
testing::ValuesIn(fusingParamsSet_LowPrecision),
testing::ValuesIn(additionalConfig()));

const auto params_DimZero_Arithmetic_fusing = testing::Combine(
testing::Combine(
testing::ValuesIn(axesZeroDimFusing),
testing::Values(ov::test::utils::OpType::VECTOR),
testing::Values(true),
testing::ValuesIn(reductionTypesArithmetic()),
testing::ValuesIn(inpOutPrc()),
testing::Values(ElementType::undefined),
testing::Values(ElementType::undefined),
testing::ValuesIn(inputShapes_Dynmic_ZeroDim)),
testing::Values(emptyCPUSpec),
testing::ValuesIn(fusingParamsFullSet),
testing::ValuesIn(additionalConfig()));

const auto params_DimZero_Compare_fusing = testing::Combine(
testing::Combine(
testing::ValuesIn(axesZeroDimFusing),
testing::Values(ov::test::utils::OpType::VECTOR),
testing::Values(true),
testing::ValuesIn(reductionTypesCompare()),
testing::ValuesIn(inpOutPrc()),
testing::Values(ElementType::undefined),
testing::Values(ElementType::undefined),
testing::ValuesIn(inputShapes_Dynmic_ZeroDim)),
testing::Values(emptyCPUSpec),
testing::ValuesIn(fusingParamsFullSet),
testing::ValuesIn(additionalConfigFP32()));

INSTANTIATE_TEST_SUITE_P(
smoke_Reduce_OneAxis_fusing_CPU,
ReduceCPULayerTest,
@@ -635,6 +684,20 @@ INSTANTIATE_TEST_SUITE_P(
ReduceCPULayerTest::getTestCaseName
);

INSTANTIATE_TEST_SUITE_P(
smoke_Reduce_DimZero_Arithmetic_fusing_CPU,
ReduceCPULayerTest,
params_DimZero_Arithmetic_fusing,
ReduceCPULayerTest::getTestCaseName
);

INSTANTIATE_TEST_SUITE_P(
smoke_Reduce_DimZero_Compare_fusing_CPU,
ReduceCPULayerTest,
params_DimZero_Compare_fusing,
ReduceCPULayerTest::getTestCaseName
);

/* ================================ 2.2 Fusion - KeepNoDims ================================ */
const auto params_OneAxis_fusing_KeepNoDims = testing::Combine(
testing::Combine(
@@ -702,4 +765,4 @@ INSTANTIATE_TEST_SUITE_P(
} // namespace
} // namespace Reduce
} // namespace test
} // namespace ov
} // namespace ov