Skip to content

Commit

Permalink
[CPU] Finalize fixing empty input of Reduce node on CPU part
Browse files Browse the repository at this point in the history
  • Loading branch information
xuchen-intel committed Nov 27, 2024
1 parent 9a51543 commit fe888ef
Show file tree
Hide file tree
Showing 8 changed files with 245 additions and 52 deletions.
59 changes: 44 additions & 15 deletions src/plugins/intel_cpu/src/nodes/reduce.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2020,6 +2020,7 @@ void Reduce::initSupportedPrimitiveDescriptors() {
config.outConfs[0].setMemDesc(creatorsMap.at(outFormat)->createSharedDesc(outPrecision, getOutputShapeAtPort(0)));

if (useAclExecutor) {
#if defined (OV_CPU_WITH_ACL)
std::vector<MemoryDescPtr> srcMemoryDescs;
for (size_t i = 0; i < config.inConfs.size(); i++) {
srcMemoryDescs.push_back(config.inConfs[i].getMemDesc());
Expand All @@ -2029,11 +2030,27 @@ void Reduce::initSupportedPrimitiveDescriptors() {
dstMemoryDescs.push_back(config.outConfs[i].getMemDesc());
}

auto factory = std::make_shared<ReduceExecutorFactory>(reduceAttrs, srcMemoryDescs, dstMemoryDescs,
auto attrs = reduceAttrs;
bool apply_ref = customImplPriorities.size() > 0 && customImplPriorities[0] == ref;
bool single_axis_only = one_of(algorithm, Algorithm::ReduceSum, Algorithm::ReduceMax,
Algorithm::ReduceMin, Algorithm::ReduceProd);
// For the case of empty input, transformations ConvertReduceProd(Min, Max, Sum) are disabled to avoid empty output.
// So these 4 reduce modes with empty input reducing more than one axis are not supported by acl executor. Then
// factory->isEmpty() returns true, supportedPrimitiveDescriptors is empty. Though we don't actually need these acl
// kernels in execution, here we pass a fake axis {1} to pass assertion of "!supportedPrimitiveDescriptors.empty()"
// in Node::filterSupportedPrimitiveDescriptors().
if (!apply_ref && !isDynamicNode() && single_axis_only) {
const auto& src_shape = getInputShapeAtPort(REDUCE_DATA).getStaticDims();
if (shape_size(src_shape) == 0) {
attrs.axes = {1};
}
}
auto factory = std::make_shared<ReduceExecutorFactory>(attrs, srcMemoryDescs, dstMemoryDescs,
std::make_shared<ExecutorContext>(context, getImplPriority()));
if (!factory->isEmpty()) {
supportedPrimitiveDescriptors.push_back({config, impl_type, factory});
}
#endif
} else {
supportedPrimitiveDescriptors.push_back({config, impl_type});
}
Expand Down Expand Up @@ -2093,6 +2110,12 @@ bool Reduce::isExecutable() const {
}

void Reduce::prepareParams() {
auto srcMemPtr = getSrcMemoryAtPort(REDUCE_DATA);
auto dstMemPtr = getDstMemoryAtPort(0);
const auto& src_shape = srcMemPtr->getStaticDims();
dst_size = dstMemPtr->getSize();
empty_input = shape_size(src_shape) == 0 || srcMemPtr->getSize() == 0;
#if defined (OV_CPU_WITH_ACL)
if (canUseAclExecutor) {
std::vector<MemoryDescPtr> srcMemoryDescs;
for (size_t i = 0; i < getParentEdges().size(); i++) {
Expand All @@ -2102,11 +2125,15 @@ void Reduce::prepareParams() {
dstMemoryDescs.push_back(getDstMemoryAtPort(0)->getDescPtr());

auto selectedPD = getSelectedPrimitiveDescriptor();
aclExecPtr = selectedPD->getExecutorFactoryAs<ReduceExecutorFactory>()->makeExecutor(reduceAttrs, srcMemoryDescs, dstMemoryDescs, {});
selectedPD->setImplementationType(aclExecPtr->getImplType());

if (!empty_input) {
aclExecPtr = selectedPD->getExecutorFactoryAs<ReduceExecutorFactory>()->makeExecutor(reduceAttrs, srcMemoryDescs, dstMemoryDescs, {});
selectedPD->setImplementationType(aclExecPtr->getImplType());
} else {
selectedPD->setImplementationType(acl);
}
return;
}
#endif

src_dims = getParentEdgeAt(REDUCE_DATA)->getMemory().getDesc().getShape().getDims();
std::vector<int> reduce_axes;
Expand All @@ -2116,9 +2143,7 @@ void Reduce::prepareParams() {
reduce_axes = raw_axes;
}

auto dstMemPtr = getDstMemoryAtPort(0);
const VectorDims &dst_dims = dstMemPtr->getDesc().getShape().getDims();
dst_size = dstMemPtr->getSize();
calc_process_dst_dims(reduce_axes, dst_dims);
if (jit_mode) {
set_reduce_dim_flags();
Expand Down Expand Up @@ -2274,15 +2299,17 @@ void Reduce::execute(dnnl::stream strm) {
const uint8_t *src_data = srcMemPtr->getDataAs<const uint8_t>();
uint8_t *dst_data = dstMemPtr->getDataAs<uint8_t>();

const auto& src_shape = srcMemPtr->getStaticDims();
if ((shape_size(src_shape) == 0 || srcMemPtr->getSize() == 0)) {
if (dstMemPtr->getSize() > 0) {
init_dst_data(dst_data, dstMemPtr->getSize());
const bool skip_post_process = getAlgorithm() == Algorithm::ReduceMean || attr.get()->post_ops_.len() == 0;
if (!skip_post_process) {
reduce_kernel_post_process(dst_data);
}
if (empty_input && dst_size > 0) {
#if defined(OPENVINO_ARCH_X86_64)
output_info_reassign(&dst_data);
init_dst_data(dst_data, dst_size);
output_info_restore(&dst_data);
if (attr.get()->post_ops_.len() != 0) {
reduce_kernel_post_process(dst_data);
}
#else
init_dst_data(dst_data, dst_size);
#endif
return;
}

Expand All @@ -2291,6 +2318,7 @@ void Reduce::execute(dnnl::stream strm) {
dst_data = reinterpret_cast<uint8_t *>(prc_mem.get_data_handle());
}
reduce_type(src_data, dst_data);
#if defined (OV_CPU_WITH_ACL)
} else if (aclExecPtr) {
std::vector<MemoryCPtr> srcMemory;
for (size_t i = 0; i < getParentEdges().size(); i++) {
Expand All @@ -2300,6 +2328,7 @@ void Reduce::execute(dnnl::stream strm) {
dstMemory.push_back(getDstMemoryAtPort(0));

aclExecPtr->exec(srcMemory, dstMemory, postOpsDataPtrs.data());
#endif
} else {
if (layout == ReduceLayoutType::reduce_ncsp) {
auto in_ptr = reinterpret_cast<const float *>(src_data);
Expand Down Expand Up @@ -2737,7 +2766,7 @@ inline void Reduce::reduce_kernel_process(const uint8_t *in_p, uint8_t *out_p, s

inline void Reduce::reduce_kernel_post_process(uint8_t *out_ptr) {
const uint8_t *in_ptr = fuse_low_precision ? static_cast<uint8_t *>(&intermediate_buf[0]) : nullptr;
const size_t integerDivisor = IB * IC * ID * IH * IW / (OB * OC * OD * OH * OW);
const size_t integerDivisor = empty_input ? 1 : IB * IC * ID * IH * IW / (OB * OC * OD * OH * OW);
const float divisor = static_cast<float>(integerDivisor);
if (layout == ReduceLayoutType::reduce_ncsp) {
parallel_for2d(OB, OC, [&](size_t ob, size_t oc) {
Expand Down
3 changes: 3 additions & 0 deletions src/plugins/intel_cpu/src/nodes/reduce.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ class Reduce : public Node {
bool ReduceCDW_opt = false;
bool use_aux_kernel = false;
bool set_use_aux_kernel = false;
bool empty_input = false;
bool ReduceN, ReduceC, ReduceD, ReduceH, ReduceW;
size_t IB, IC, ID, IH, IW;
size_t OB, OC, OD, OH, OW;
Expand Down Expand Up @@ -188,9 +189,11 @@ class Reduce : public Node {

std::string errorPrefix;

#if defined (OV_CPU_WITH_ACL)
ReduceAttrs reduceAttrs;
bool canUseAclExecutor = false;
std::shared_ptr<ReduceExecutor> aclExecPtr = nullptr;
#endif
};

} // namespace node
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ ov::matcher_pass_callback ov::intel_cpu::ConvertReduceMultiAxisBase::convert_red
if (!reduction_axes) {
return false;
}
if (!reduce->is_dynamic() && ov::shape_size(input0.get_shape()) == 0) {
return false;
}
if (ov::shape_size(input1.get_shape()) <= 1) {
return false;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,25 @@ const std::vector<ov::test::utils::ReductionType>& reductionTypes() {
return reductionTypes;
}

const std::vector<ov::test::utils::ReductionType>& reductionTypesArithmetic() {
static const std::vector<ov::test::utils::ReductionType> reductionTypesArithmetic = {
ov::test::utils::ReductionType::Mean,
ov::test::utils::ReductionType::Sum,
ov::test::utils::ReductionType::Prod,
ov::test::utils::ReductionType::L1,
ov::test::utils::ReductionType::L2,
};
return reductionTypesArithmetic;
}

const std::vector<ov::test::utils::ReductionType>& reductionTypesCompare() {
static const std::vector<ov::test::utils::ReductionType> reductionTypesCompare = {
ov::test::utils::ReductionType::Max,
ov::test::utils::ReductionType::Min,
};
return reductionTypesCompare;
}

const std::vector<ElementType>& inpOutPrc() {
static const std::vector<ElementType> inpOutPrc = {ElementType::f32};
return inpOutPrc;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ const std::vector<std::vector<int>>& axes();
const std::vector<std::vector<int>>& axesND();
const std::vector<ov::test::utils::OpType>& opTypes();
const std::vector<utils::ReductionType>& reductionTypes();
const std::vector<utils::ReductionType>& reductionTypesArithmetic();
const std::vector<utils::ReductionType>& reductionTypesCompare();
const std::vector<ElementType>& inpOutPrc();
const std::vector<std::map<std::string, ov::element::Type>> additionalConfig();
const std::vector<std::map<std::string, ov::element::Type>> additionalConfigFP32();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,11 @@ namespace {

std::vector<std::vector<ov::test::InputShape>> inputShapes_5D = {
{{{}, {{2, 19, 2, 2, 9}}}},
{{{}, {{0, 19, 2, 2, 9}}}},
{{{}, {{1, 0, 0, 2, 9}}}},
};

std::vector<std::vector<ov::test::InputShape>> inputShapes_5D_ZeroDim = {
{{{}, {{2, 19, 0, 2, 9}}}},
{{{}, {{2, 19, 0, 2, 0}}}},
};

const std::vector<std::vector<int>> axes5D = {
Expand All @@ -46,6 +49,34 @@ const auto params_MultiAxis_5D = testing::Combine(
testing::Values(emptyFusingSpec),
testing::ValuesIn(additionalConfig()));

const auto params_MultiAxis_5D_ZeroDim = testing::Combine(
testing::Combine(
testing::ValuesIn(axes5D),
testing::Values(ov::test::utils::OpType::VECTOR),
testing::Values(true),
testing::ValuesIn(reductionTypesArithmetic()),
testing::ValuesIn(inpOutPrc()),
testing::Values(ElementType::undefined),
testing::Values(ElementType::undefined),
testing::ValuesIn(inputShapes_5D_ZeroDim)),
testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D)),
testing::Values(emptyFusingSpec),
testing::ValuesIn(additionalConfig()));

const auto params_MultiAxis_5D_ZeroDim_Compare = testing::Combine(
testing::Combine(
testing::ValuesIn(axes5D),
testing::Values(ov::test::utils::OpType::VECTOR),
testing::Values(true),
testing::ValuesIn(reductionTypesCompare()),
testing::ValuesIn(inpOutPrc()),
testing::Values(ElementType::undefined),
testing::Values(ElementType::undefined),
testing::ValuesIn(inputShapes_5D_ZeroDim)),
testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D)),
testing::Values(emptyFusingSpec),
testing::ValuesIn(additionalConfigFP32()));

const std::vector<std::vector<int>> axes5D_ref = {
{0}
};
Expand Down Expand Up @@ -81,6 +112,20 @@ INSTANTIATE_TEST_SUITE_P(
ReduceCPULayerTest::getTestCaseName
);

INSTANTIATE_TEST_SUITE_P(
smoke_Reduce_MultiAxis_5D_ZeroDim_CPU,
ReduceCPULayerTest,
params_MultiAxis_5D_ZeroDim,
ReduceCPULayerTest::getTestCaseName
);

INSTANTIATE_TEST_SUITE_P(
smoke_Reduce_MultiAxis_5D_ZeroDim_Compare_CPU,
ReduceCPULayerTest,
params_MultiAxis_5D_ZeroDim_Compare,
ReduceCPULayerTest::getTestCaseName
);

// Reference implementation testing of ACL unsupported case
INSTANTIATE_TEST_SUITE_P(
smoke_Reduce_MultiAxis_5D_CPU_ref,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,6 @@ INSTANTIATE_TEST_SUITE_P(
ReduceCPULayerTest::getTestCaseName
);


INSTANTIATE_TEST_SUITE_P(
smoke_Reduce_Int32_CPU,
ReduceCPULayerTest,
Expand Down
Loading

0 comments on commit fe888ef

Please sign in to comment.