[CPU][Ref] Support Reduce ops with empty input (#27603)
### Details:
- *The main part of this PR is contributed by #27438.*
- *My revision is placed in the last commit and covers changes to the Reduce node of the CPU plugin, mainly the following:*

1. [x64] Prevent the `divisor` in `reduce_kernel_post_process` from becoming zero, and enable post-ops fusion for ReduceMean.
2. [x64] Add `axesZeroDim` and `axesZeroDimFusing` to the test cases, so that all of the newly added test cases take exactly the newly added "early return" code block, where the input tensor is empty and the output tensor is not.
3. [x64] For empty input combined with low-precision post-ops fusion, use an intermediate buffer to set default results before post-ops fusion (a minimal sketch of this early-return path follows this list).
4. [arm] `makeExecutor` is skipped for empty input on ARM, because the ACL library does not support empty input tensors (e.g., `NEReduceMean::validate` returns an error). Besides, due to the early return, the executor is not needed anyway.
5. [arm] The ARM transformations ConvertReduceProd(Min, Max, Sum) are disabled for empty inputs to avoid producing an empty output.
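
For illustration only, the snippet below is a minimal, hypothetical sketch of the early-return behavior described in items 1–3: when the input tensor is empty but the output tensor is not, each output element is filled with a reduction-specific default value, any fused post-op is still applied, and the mean divisor is clamped to 1 so it can never be zero. It is not the plugin's actual code; `ReduceType`, `default_value`, and `reduce_empty_input` are names introduced here for the example, and the default values are assumptions.

```cpp
// Hypothetical sketch only: illustrates the intended empty-input behavior,
// not the actual OpenVINO CPU-plugin implementation.
#include <algorithm>
#include <cstddef>
#include <limits>
#include <vector>

enum class ReduceType { Sum, Prod, Max, Min, Mean };

// Assumed identity/default value written to the output when the reduced
// input is empty (e.g. a zero-sized dimension is reduced away).
float default_value(ReduceType type) {
    switch (type) {
    case ReduceType::Sum:
    case ReduceType::Mean: return 0.0f;                                  // empty sum/mean -> 0
    case ReduceType::Prod: return 1.0f;                                  // empty product -> 1
    case ReduceType::Max:  return std::numeric_limits<float>::lowest();  // identity of max
    case ReduceType::Min:  return std::numeric_limits<float>::max();     // identity of min
    }
    return 0.0f;
}

// Early-return path: the input is empty, the output is not. Fill the output
// with defaults, then run the (optional) fused post-op, e.g. a scale-shift.
void reduce_empty_input(std::vector<float>& dst,
                        ReduceType type,
                        std::size_t reduced_element_count,  // 0 for an empty input
                        float scale = 1.0f,
                        float shift = 0.0f) {
    // Guard the mean divisor so it can never be zero (item 1 above).
    const float divisor = reduced_element_count == 0
                              ? 1.0f
                              : static_cast<float>(reduced_element_count);

    std::fill(dst.begin(), dst.end(), default_value(type));
    for (auto& v : dst) {
        if (type == ReduceType::Mean)
            v /= divisor;          // a no-op for empty input, but safe by construction
        v = v * scale + shift;     // stands in for reduce_kernel_post_process
    }
}
```

Conceptually this mirrors the guard added in `reduce_kernel_post_process` below (`empty_input ? 1 : ...` for the divisor), which keeps the fused ReduceMean path well-defined even when there is nothing to reduce.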

### Tickets:
 - *[CVS-117469](https://jira.devtools.intel.com/browse/CVS-117469)*

---------

Co-authored-by: mitruska <[email protected]>
xuchen-intel and mitruska authored Nov 29, 2024
1 parent 301d6de commit 4070bea
Showing 17 changed files with 349 additions and 18 deletions.

@@ -72,7 +72,7 @@ ov::matcher_pass_callback ConvertReduceBase::convert_reduce_to_pooling() {
return [&](ov::pass::pattern::Matcher& m) {
auto reduce = std::dynamic_pointer_cast<T>(m.get_match_root());

if (!reduce || transformation_callback(reduce)) {
if (!reduce || transformation_callback(reduce) || ov::shape_size(reduce->input_value(0).get_shape()) == 0) {
return false;
}

4 changes: 4 additions & 0 deletions src/core/reference/include/openvino/reference/reduce_mean.hpp
@@ -26,6 +26,10 @@ void reduce_mean(const T* in, T* out, const Shape& in_shape, const AxisSet& redu
reduce_sum(in, out, in_shape, reduction_axes);

const auto out_shape = util::reduce(in_shape, reduction_axes);
if (shape_size(in_shape) == 0) {
return;
}

const auto out_size = shape_size(out_shape);
const auto count = static_cast<T>(shape_size(in_shape) / out_size);
std::transform(out, std::next(out, out_size), out, [count](const T value) {

64 changes: 48 additions & 16 deletions src/plugins/intel_cpu/src/nodes/reduce.cpp
@@ -2020,6 +2020,7 @@ void Reduce::initSupportedPrimitiveDescriptors() {
config.outConfs[0].setMemDesc(creatorsMap.at(outFormat)->createSharedDesc(outPrecision, getOutputShapeAtPort(0)));

if (useAclExecutor) {
#if defined (OV_CPU_WITH_ACL)
std::vector<MemoryDescPtr> srcMemoryDescs;
for (size_t i = 0; i < config.inConfs.size(); i++) {
srcMemoryDescs.push_back(config.inConfs[i].getMemDesc());
@@ -2034,22 +2035,29 @@
if (!factory->isEmpty()) {
supportedPrimitiveDescriptors.push_back({config, impl_type, factory});
}
#endif
} else {
supportedPrimitiveDescriptors.push_back({config, impl_type});
}
};

#if defined (OV_CPU_WITH_ACL)
reduceAttrs.operation = algorithm;
reduceAttrs.keepDims = keep_dims;
reduceAttrs.axes = raw_axes;
for (auto &axis : reduceAttrs.axes) {
if (axis < 0)
axis += static_cast<int>(getInputShapeAtPort(REDUCE_DATA).getRank());
// acl doesn't support empty input
if (!isDynamicNode() && shape_size(getInputShapeAtPort(REDUCE_DATA).getStaticDims()) == 0) {
canUseAclExecutor = false;
} else {
reduceAttrs.operation = algorithm;
reduceAttrs.keepDims = keep_dims;
reduceAttrs.axes = raw_axes;
for (auto &axis : reduceAttrs.axes) {
if (axis < 0)
axis += static_cast<int>(getInputShapeAtPort(REDUCE_DATA).getRank());
}
pushDesc(LayoutType::nspc, LayoutType::nspc, input_prec, output_prec, impl_desc_type::undef, true);
pushDesc(LayoutType::ncsp, LayoutType::ncsp, input_prec, output_prec, impl_desc_type::undef, true);
canUseAclExecutor = !supportedPrimitiveDescriptors.empty();
}
pushDesc(LayoutType::nspc, LayoutType::nspc, input_prec, output_prec, impl_desc_type::undef, true);
pushDesc(LayoutType::ncsp, LayoutType::ncsp, input_prec, output_prec, impl_desc_type::undef, true);
canUseAclExecutor = !supportedPrimitiveDescriptors.empty();

if (canUseAclExecutor)
return;
#endif
@@ -2089,10 +2097,16 @@ void Reduce::initSupportedPrimitiveDescriptors() {
}

bool Reduce::isExecutable() const {
return !isInputTensorAtPortEmpty(REDUCE_DATA);
return !isOutputTensorAtPortEmpty(0);
}

void Reduce::prepareParams() {
auto srcMemPtr = getSrcMemoryAtPort(REDUCE_DATA);
auto dstMemPtr = getDstMemoryAtPort(0);
const auto& src_shape = srcMemPtr->getStaticDims();
dst_size = dstMemPtr->getSize();
empty_input = shape_size(src_shape) == 0;
#if defined (OV_CPU_WITH_ACL)
if (canUseAclExecutor) {
std::vector<MemoryDescPtr> srcMemoryDescs;
for (size_t i = 0; i < getParentEdges().size(); i++) {
@@ -2102,11 +2116,15 @@ void Reduce::prepareParams() {
dstMemoryDescs.push_back(getDstMemoryAtPort(0)->getDescPtr());

auto selectedPD = getSelectedPrimitiveDescriptor();
aclExecPtr = selectedPD->getExecutorFactoryAs<ReduceExecutorFactory>()->makeExecutor(reduceAttrs, srcMemoryDescs, dstMemoryDescs, {});
selectedPD->setImplementationType(aclExecPtr->getImplType());

if (!empty_input) {
aclExecPtr = selectedPD->getExecutorFactoryAs<ReduceExecutorFactory>()->makeExecutor(reduceAttrs, srcMemoryDescs, dstMemoryDescs, {});
selectedPD->setImplementationType(aclExecPtr->getImplType());
} else {
selectedPD->setImplementationType(acl);
}
return;
}
#endif

src_dims = getParentEdgeAt(REDUCE_DATA)->getMemory().getDesc().getShape().getDims();
std::vector<int> reduce_axes;
@@ -2116,9 +2134,7 @@ void Reduce::prepareParams() {
reduce_axes = raw_axes;
}

auto dstMemPtr = getDstMemoryAtPort(0);
const VectorDims &dst_dims = dstMemPtr->getDesc().getShape().getDims();
dst_size = dstMemPtr->getSize();
calc_process_dst_dims(reduce_axes, dst_dims);
if (jit_mode) {
set_reduce_dim_flags();
@@ -2274,11 +2290,26 @@ void Reduce::execute(dnnl::stream strm) {
const uint8_t *src_data = srcMemPtr->getDataAs<const uint8_t>();
uint8_t *dst_data = dstMemPtr->getDataAs<uint8_t>();

if (empty_input && dst_size > 0) {
#if defined(OPENVINO_ARCH_X86_64)
output_info_reassign(&dst_data);
init_dst_data(dst_data, dst_size);
output_info_restore(&dst_data);
if (attr.get()->post_ops_.len() != 0) {
reduce_kernel_post_process(dst_data);
}
#else
init_dst_data(dst_data, dst_size);
#endif
return;
}

if (jit_mode) {
if (is_hybrid_layout) {
dst_data = reinterpret_cast<uint8_t *>(prc_mem.get_data_handle());
}
reduce_type(src_data, dst_data);
#if defined (OV_CPU_WITH_ACL)
} else if (aclExecPtr) {
std::vector<MemoryCPtr> srcMemory;
for (size_t i = 0; i < getParentEdges().size(); i++) {
@@ -2288,6 +2319,7 @@ void Reduce::execute(dnnl::stream strm) {
dstMemory.push_back(getDstMemoryAtPort(0));

aclExecPtr->exec(srcMemory, dstMemory, postOpsDataPtrs.data());
#endif
} else {
if (layout == ReduceLayoutType::reduce_ncsp) {
auto in_ptr = reinterpret_cast<const float *>(src_data);
@@ -2725,7 +2757,7 @@ inline void Reduce::reduce_kernel_process(const uint8_t *in_p, uint8_t *out_p, s

inline void Reduce::reduce_kernel_post_process(uint8_t *out_ptr) {
const uint8_t *in_ptr = fuse_low_precision ? static_cast<uint8_t *>(&intermediate_buf[0]) : nullptr;
const size_t integerDivisor = IB * IC * ID * IH * IW / (OB * OC * OD * OH * OW);
const size_t integerDivisor = empty_input ? 1 : IB * IC * ID * IH * IW / (OB * OC * OD * OH * OW);
const float divisor = static_cast<float>(integerDivisor);
if (layout == ReduceLayoutType::reduce_ncsp) {
parallel_for2d(OB, OC, [&](size_t ob, size_t oc) {

3 changes: 3 additions & 0 deletions src/plugins/intel_cpu/src/nodes/reduce.h
@@ -152,6 +152,7 @@ class Reduce : public Node {
bool ReduceCDW_opt = false;
bool use_aux_kernel = false;
bool set_use_aux_kernel = false;
bool empty_input = false;
bool ReduceN, ReduceC, ReduceD, ReduceH, ReduceW;
size_t IB, IC, ID, IH, IW;
size_t OB, OC, OD, OH, OW;
@@ -188,9 +189,11 @@ class Reduce : public Node {

std::string errorPrefix;

#if defined (OV_CPU_WITH_ACL)
ReduceAttrs reduceAttrs;
bool canUseAclExecutor = false;
std::shared_ptr<ReduceExecutor> aclExecPtr = nullptr;
#endif
};

} // namespace node

@@ -23,6 +23,9 @@ ov::matcher_pass_callback ov::intel_cpu::ConvertReduceMultiAxisBase::convert_red
if (!reduction_axes) {
return false;
}
if (!reduce->is_dynamic() && ov::shape_size(input0.get_shape()) == 0) {
return false;
}
if (ov::shape_size(input1.get_shape()) <= 1) {
return false;
}

@@ -243,6 +243,25 @@ const std::vector<ov::test::utils::ReductionType>& reductionTypes() {
return reductionTypes;
}

const std::vector<ov::test::utils::ReductionType>& reductionTypesArithmetic() {
static const std::vector<ov::test::utils::ReductionType> reductionTypesArithmetic = {
ov::test::utils::ReductionType::Mean,
ov::test::utils::ReductionType::Sum,
ov::test::utils::ReductionType::Prod,
ov::test::utils::ReductionType::L1,
ov::test::utils::ReductionType::L2,
};
return reductionTypesArithmetic;
}

const std::vector<ov::test::utils::ReductionType>& reductionTypesCompare() {
static const std::vector<ov::test::utils::ReductionType> reductionTypesCompare = {
ov::test::utils::ReductionType::Max,
ov::test::utils::ReductionType::Min,
};
return reductionTypesCompare;
}

const std::vector<ElementType>& inpOutPrc() {
static const std::vector<ElementType> inpOutPrc = {ElementType::f32};
return inpOutPrc;

@@ -52,6 +52,8 @@ const std::vector<std::vector<int>>& axes();
const std::vector<std::vector<int>>& axesND();
const std::vector<ov::test::utils::OpType>& opTypes();
const std::vector<utils::ReductionType>& reductionTypes();
const std::vector<utils::ReductionType>& reductionTypesArithmetic();
const std::vector<utils::ReductionType>& reductionTypesCompare();
const std::vector<ElementType>& inpOutPrc();
const std::vector<std::map<std::string, ov::element::Type>> additionalConfig();
const std::vector<std::map<std::string, ov::element::Type>> additionalConfigFP32();

@@ -20,6 +20,11 @@ std::vector<std::vector<ov::test::InputShape>> inputShapes_5D = {
{{{}, {{2, 19, 2, 2, 9}}}},
};

std::vector<std::vector<ov::test::InputShape>> inputShapes_5D_ZeroDim = {
{{{}, {{2, 19, 0, 2, 9}}}},
{{{}, {{2, 19, 0, 2, 0}}}},
};

const std::vector<std::vector<int>> axes5D = {
{2, 4},
{1, 2, 4},
@@ -70,6 +75,20 @@ const auto params_MultiAxis_5D_ref = testing::Combine(
testing::Values(emptyFusingSpec),
testing::ValuesIn(config_infer_prec_f32));

const auto params_MultiAxis_5D_ZeroDim_ref = testing::Combine(
testing::Combine(
testing::ValuesIn(axes5D),
testing::Values(ov::test::utils::OpType::VECTOR),
testing::Values(true),
testing::ValuesIn(reductionTypes()),
testing::ValuesIn(inpOutPrc()),
testing::Values(ElementType::undefined),
testing::Values(ElementType::undefined),
testing::ValuesIn(inputShapes_5D_ZeroDim)),
testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D_ref)),
testing::Values(emptyFusingSpec),
testing::ValuesIn(additionalConfigFP32()));

//There are dedicated instances of smoke_Reduce_MultiAxis_5D_CPU test in arm and x64 folders
//because ACL does not support 0 as reduction axis
INSTANTIATE_TEST_SUITE_P(
@@ -87,6 +106,13 @@ INSTANTIATE_TEST_SUITE_P(
ReduceCPULayerTest::getTestCaseName
);

INSTANTIATE_TEST_SUITE_P(
smoke_Reduce_MultiAxis_5D_ZeroDim_CPU_ref,
ReduceCPULayerTest,
params_MultiAxis_5D_ZeroDim_ref,
ReduceCPULayerTest::getTestCaseName
);

} // namespace
} // namespace Reduce
} // namespace test

@@ -53,6 +53,12 @@ std::vector<std::vector<ov::test::InputShape>> inputShapes_SingleBatch_dyn = {
{{{{1, 5}, 19, {1, 5}, {1, 10}}, {{1, 19, 2, 2}, {1, 19, 2, 9}}}},
};

std::vector<std::vector<ov::test::InputShape>> inputShapes_Dynmic_ZeroDim = {
{{{-1, -1, -1, -1}, {{2, 0, 3, 9}}}},
{{{2, 0, -1, -1}, {{2, 0, 3, 9}}}},
{{{2, 0, -1, -1}, {{2, 0, 3, 0}}}}
};

std::vector<CPUSpecificParams> cpuParams_3D = {
CPUSpecificParams({ncw}, {ncw}, {}, {}),
};
@@ -99,6 +105,10 @@ const std::vector<std::vector<int>> axesGather = {
{3}
};

const std::vector<std::vector<int>> axesZeroDimFusing = {
{1, 3},
};

std::vector<CPUSpecificParams> cpuParams_5D = {
CPUSpecificParams({nCdhw16c}, {nCdhw16c}, {}, {}),
CPUSpecificParams({ndhwc}, {ndhwc}, {}, {}),
@@ -144,6 +154,17 @@ const auto fusingFakeQuantizeTranspose = fusingSpecificParams{std::make_shared<p
return ov::builder::subgraph::makeTranspose(fakeQuantize, transpose);
}, "FakeQuantize(PerTensor)"}}), {"FakeQuantize"}};

const std::vector<fusingSpecificParams> fusingParamsFullSet {
emptyFusingSpec,
/* activations */
fusingSwish,
/* FQ */
fusingFakeQuantizePerChannelRelu,
fusingFakeQuantizePerTensorRelu,
/* another patterns */
fusingScaleShift
};

const std::vector<fusingSpecificParams> fusingParamsSet {
/* activations */
fusingSwish,
@@ -600,6 +621,34 @@ const auto params_LowPrecision_fusing = testing::Combine(
testing::ValuesIn(fusingParamsSet_LowPrecision),
testing::ValuesIn(additionalConfig()));

const auto params_DimZero_Arithmetic_fusing = testing::Combine(
testing::Combine(
testing::ValuesIn(axesZeroDimFusing),
testing::Values(ov::test::utils::OpType::VECTOR),
testing::Values(true),
testing::ValuesIn(reductionTypesArithmetic()),
testing::ValuesIn(inpOutPrc()),
testing::Values(ElementType::undefined),
testing::Values(ElementType::undefined),
testing::ValuesIn(inputShapes_Dynmic_ZeroDim)),
testing::Values(emptyCPUSpec),
testing::ValuesIn(fusingParamsFullSet),
testing::ValuesIn(additionalConfig()));

const auto params_DimZero_Compare_fusing = testing::Combine(
testing::Combine(
testing::ValuesIn(axesZeroDimFusing),
testing::Values(ov::test::utils::OpType::VECTOR),
testing::Values(true),
testing::ValuesIn(reductionTypesCompare()),
testing::ValuesIn(inpOutPrc()),
testing::Values(ElementType::undefined),
testing::Values(ElementType::undefined),
testing::ValuesIn(inputShapes_Dynmic_ZeroDim)),
testing::Values(emptyCPUSpec),
testing::ValuesIn(fusingParamsFullSet),
testing::ValuesIn(additionalConfigFP32()));

INSTANTIATE_TEST_SUITE_P(
smoke_Reduce_OneAxis_fusing_CPU,
ReduceCPULayerTest,
@@ -635,6 +684,20 @@ INSTANTIATE_TEST_SUITE_P(
ReduceCPULayerTest::getTestCaseName
);

INSTANTIATE_TEST_SUITE_P(
smoke_Reduce_DimZero_Arithmetic_fusing_CPU,
ReduceCPULayerTest,
params_DimZero_Arithmetic_fusing,
ReduceCPULayerTest::getTestCaseName
);

INSTANTIATE_TEST_SUITE_P(
smoke_Reduce_DimZero_Compare_fusing_CPU,
ReduceCPULayerTest,
params_DimZero_Compare_fusing,
ReduceCPULayerTest::getTestCaseName
);

/* ================================ 2.2 Fusion - KeepNoDims ================================ */
const auto params_OneAxis_fusing_KeepNoDims = testing::Combine(
testing::Combine(
@@ -702,4 +765,4 @@ INSTANTIATE_TEST_SUITE_P(
} // namespace
} // namespace Reduce
} // namespace test
} // namespace ov
} // namespace ov