
[CPU][Ref] Fix Reduce ops to produce stable (default val) output for empty input #27438

Conversation

mitruska
Copy link
Contributor

@mitruska mitruska commented Nov 6, 2024

Details:

  • Fix Reduce ops to produce stable output (aligned with default values instead of random values) for an empty input, including updates to the CPU and reference implementations.

```cpp
inline void Reduce::init_dst_data(uint8_t *out_ptr, size_t dst_size) {
    switch (algorithm) {
    case Algorithm::ReduceL1:
    case Algorithm::ReduceL2:
    case Algorithm::ReduceLogSum:
    case Algorithm::ReduceLogSumExp:
    case Algorithm::ReduceMean:
    case Algorithm::ReduceOr:
    case Algorithm::ReduceSum:
    case Algorithm::ReduceSumSquare:
        memset(out_ptr, 0, dst_size);
        break;
    case Algorithm::ReduceAnd:
    case Algorithm::ReduceProd:
        if (output_prec == ov::element::f32) {
            auto out_p = reinterpret_cast<float *>(out_ptr);
            parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = static_cast<float>(1); });
        } else if (output_prec == ov::element::i32) {
            auto out_p = reinterpret_cast<int32_t *>(out_ptr);
            parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = static_cast<int32_t>(1); });
        } else if (output_prec == ov::element::bf16) {
            auto out_p = reinterpret_cast<bfloat16_t *>(out_ptr);
            parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = static_cast<bfloat16_t>(1); });
        } else if (output_prec == ov::element::f16) {
            auto out_p = reinterpret_cast<ov::float16 *>(out_ptr);
            parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = static_cast<ov::float16>(1); });
        } else if (output_prec == ov::element::u8) {
            auto out_p = reinterpret_cast<uint8_t *>(out_ptr);
            parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = static_cast<uint8_t>(1); });
        } else if (output_prec == ov::element::i8) {
            auto out_p = reinterpret_cast<int8_t *>(out_ptr);
            parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = static_cast<int8_t>(1); });
        }
        break;
    case Algorithm::ReduceMax:
        if (output_prec == ov::element::f32) {
            auto out_p = reinterpret_cast<float *>(out_ptr);
            parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<float>::lowest(); });
        } else if (output_prec == ov::element::i32) {
            auto out_p = reinterpret_cast<int32_t *>(out_ptr);
            parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<int32_t>::min(); });
        } else if (output_prec == ov::element::bf16) {
            auto out_p = reinterpret_cast<bfloat16_t *>(out_ptr);
            parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<bfloat16_t>::lowest(); });
        } else if (output_prec == ov::element::f16) {
            auto out_p = reinterpret_cast<ov::float16 *>(out_ptr);
            parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<ov::float16>::lowest(); });
        } else if (output_prec == ov::element::u8) {
            auto out_p = reinterpret_cast<uint8_t *>(out_ptr);
            parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<uint8_t>::min(); });
        } else if (output_prec == ov::element::i8) {
            auto out_p = reinterpret_cast<int8_t *>(out_ptr);
            parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<int8_t>::min(); });
        }
        break;
    case Algorithm::ReduceMin:
        if (output_prec == ov::element::f32) {
            auto out_p = reinterpret_cast<float *>(out_ptr);
            parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<float>::max(); });
        } else if (output_prec == ov::element::i32) {
            auto out_p = reinterpret_cast<int32_t *>(out_ptr);
            parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<int32_t>::max(); });
        } else if (output_prec == ov::element::bf16) {
            auto out_p = reinterpret_cast<bfloat16_t *>(out_ptr);
            parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<bfloat16_t>::max(); });
        } else if (output_prec == ov::element::f16) {
            auto out_p = reinterpret_cast<ov::float16 *>(out_ptr);
            parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<ov::float16>::max(); });
        } else if (output_prec == ov::element::u8) {
            auto out_p = reinterpret_cast<uint8_t *>(out_ptr);
            parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<uint8_t>::max(); });
        } else if (output_prec == ov::element::i8) {
            auto out_p = reinterpret_cast<int8_t *>(out_ptr);
            parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits<int8_t>::max(); });
        }
        break;
    default:  // snippet was truncated here in the original; closing added for readability
        break;
    }
}
```

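These defaults are the identity (neutral) elements of the corresponding reductions, so reducing an empty range naturally yields them. A minimal illustration, independent of the OpenVINO code, using only standard-library calls (function names are mine, for illustration only):

```cpp
#include <cassert>
#include <functional>
#include <limits>
#include <numeric>
#include <vector>

// std::accumulate over an empty range returns the init value, which is
// why Sum-like reductions default to 0 and Prod/And-like reductions to 1.
float sum_of_empty() {
    std::vector<float> v;  // empty input
    return std::accumulate(v.begin(), v.end(), 0.0f);
}

float prod_of_empty() {
    std::vector<float> v;  // empty input
    return std::accumulate(v.begin(), v.end(), 1.0f, std::multiplies<float>());
}

// For Max/Min the identity is the lowest/highest representable value,
// so any actual element in a non-empty input would override the default.
float max_identity() { return std::numeric_limits<float>::lowest(); }
float min_identity() { return std::numeric_limits<float>::max(); }
```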
  • Stable behavior for empty input has been requested for NNCF

  • Originally, random values were produced, because the common shape inference of Reduce can produce a non-empty output shape for an empty input, for example Reduce with keep_dims=False:
    InputShape{2, 0}, reduce_axes=1, OutputShape{2}

  • The common CPU Reduce::isExecutable was returning False for an empty input tensor (with a 0 dim in the input shape), so Reduce::execute was not executed at all in such a case.
    The proposed behavior is to return False for an empty output shape (with a 0 dim in the output shape),
    but when the input is empty and the output is not, the output will be filled with the default value.

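The control flow described above can be sketched as follows. This is an illustrative model only; `plan_reduce` and the `Action` enum are hypothetical names, not the CPU plugin API:

```cpp
#include <cassert>
#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

enum class Action { SkipExecution, FillDefault, RunKernel };

// Hypothetical sketch of the proposed behavior: skip execution for an
// empty output shape; fill defaults when only the input is empty;
// otherwise run the regular reduce kernel.
Action plan_reduce(const std::vector<size_t>& in_shape,
                   const std::vector<size_t>& out_shape) {
    auto numel = [](const std::vector<size_t>& s) {
        return std::accumulate(s.begin(), s.end(), size_t{1}, std::multiplies<size_t>());
    };
    if (numel(out_shape) == 0) return Action::SkipExecution;
    if (numel(in_shape) == 0) return Action::FillDefault;
    return Action::RunKernel;
}
```

For the shape-inference example above, InputShape{2, 0} with reduce_axes=1 and keep_dims=False gives OutputShape{2}, which lands on the FillDefault path.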
The approach I've tested in the second PR #27423 is similar, but it forces the CPU Reduce output value to always be 0 (which is not fully compatible with the reference implementation, TF, and ONNX).

Note: This change is considered backward compatible, with relatively low risk.
Alternatively, the common Reduce shape inference could be updated:

  • A. To produce an empty output for an empty input, keeping the 0 dim in the output shape even if that axis is requested by "axes" to be reduced.
    or
  • B. To throw an error when reduction over a 0 dimension is detected.

But those options are currently considered not backward compatible, and would require a new version of each Reduce* op.

Tickets:

  • 117469

@github-actions github-actions bot added category: Core OpenVINO Core (aka ngraph) category: CPU OpenVINO CPU plugin category: transformations OpenVINO Runtime library - Transformations category: TEMPLATE OpenVINO Template plugin labels Nov 6, 2024
@mitruska mitruska self-assigned this Nov 7, 2024
@mitruska mitruska marked this pull request as ready for review November 7, 2024 11:43
@mitruska mitruska requested review from a team as code owners November 7, 2024 11:43
@mitruska mitruska requested review from itikhono, dmitry-gorokhov and mmikolajcz and removed request for a team November 7, 2024 11:43
@@ -2274,6 +2274,14 @@ void Reduce::execute(dnnl::stream strm) {
const uint8_t *src_data = srcMemPtr->getDataAs<const uint8_t>();
uint8_t *dst_data = dstMemPtr->getDataAs<uint8_t>();

const auto& src_shape = srcMemPtr->getStaticDims();
Contributor
I am afraid this is not a fully correct fix.
Reduce node semantics support post operations, which means several ops may be executed within a single kernel,
for example Reduce + Add. It means we shouldn't only fill the output with the correct default value, but also apply the Add operation to it.
@xuchen-intel Could you please take a look? Would appreciate it if you can help to finalize the fix on the CPU side.

Contributor
@dmitry-gorokhov I agree. If the expected output shape is non-empty, then we should apply post ops fusion.
@mitruska Could you please add `reduce_kernel_post_process(dst_data);` before the newly added early return?

Contributor

An alternative way would be to disable post ops fusion here if the input is empty, which would be safer. After all, this is a corner case (reducing a dim of size 0 to 1); any incompatibility with the variants of such a case in the computation module would lead to incorrect/undefined results.

Contributor

@dmitry-gorokhov dmitry-gorokhov Nov 11, 2024

> An alternative way would be to disable post ops fusion here if the input is empty, which would be safer. After all, this is a corner case (reducing a dim of size 0 to 1); any incompatibility with the variants of such a case in the computation module would lead to incorrect/undefined results.

@xuchen-intel Shapes might be dynamic, so we don't know in advance whether the fusion should be disabled or not.

Contributor

Right! Then @mitruska, let's add `reduce_kernel_post_process(dst_data);`, with some test cases (e.g. here) to reproduce this issue beforehand if possible.
Applying `reduce_kernel_post_process(dst_data);` should work for post ops cases. Otherwise, please let me know if you need me to fix it in a separate PR for the CPU part. @dmitry-gorokhov

Contributor Author

@dmitry-gorokhov, @xuchen-intel Thank you for your support! I've added the `reduce_kernel_post_process(dst_data)` call, but it needed some special conditions to make the tests pass.

The condition:

```cpp
const bool skip_post_process = getAlgorithm() == Algorithm::ReduceMean || attr.get()->post_ops_.len() == 0;
if (!skip_post_process) {
    reduce_kernel_post_process(dst_data);
}
```

was inspired by:

```cpp
apply_division = getAlgorithm() == Algorithm::ReduceMean && attr.get()->post_ops_.len() == 0;
apply_post_kernel = !apply_division;
```

Moreover, I was able to enable only the "Swish" fusion CPU tests, because the test pipeline for the other fusions was failing for the 0-dim and dynamic cases. Those cases probably need some adjustment of the fusing test pipeline.
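The relationship between the two quoted conditions can be restated in a standalone form. This is a hypothetical sketch; `is_reduce_mean` and `post_ops_len` are illustrative stand-ins for `getAlgorithm() == Algorithm::ReduceMean` and `attr.get()->post_ops_.len()`:

```cpp
#include <cassert>

// Post processing is skipped when either ReduceMean's own division path
// applies or there are no fused post ops at all.
bool skip_post_process(bool is_reduce_mean, int post_ops_len) {
    return is_reduce_mean || post_ops_len == 0;
}

// The inspiring condition: division is applied inside the post kernel
// only for ReduceMean without any fused post ops.
bool apply_division(bool is_reduce_mean, int post_ops_len) {
    return is_reduce_mean && post_ops_len == 0;
}
```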

@mitruska mitruska requested a review from xuchen-intel November 8, 2024 14:15
@praasz
Contributor

praasz commented Nov 15, 2024

LGTM core part.

@mitruska
Contributor Author

Agreed with CPU maintainers that the work will be continued within a separate PR.

@mitruska mitruska closed this Nov 18, 2024
github-merge-queue bot pushed a commit that referenced this pull request Nov 29, 2024
### Details:
- *The main part of this PR is contributed by #27438*
- *My revision is placed in the last commit, regarding changes to the Reduce node of the CPU plugin, mainly the following:*

1. [x64] Avoid the `divisor` in `reduce_kernel_post_process` being zero, and enable post ops fusion of ReduceMean.
2. [x64] Add `axesZeroDim` and `axesZeroDimFusing` test cases, so that all the newly added test cases go exactly to the newly added "early return" code block, where the input tensor is empty and the output tensor is not.
3. [x64] For the case of an empty input combined with low precision ops fusion, use an intermediate buffer to set default results before post ops fusion.
4. [arm] `makeExecutor` is skipped for the case of an empty input on ARM, because the Acl library does not support empty input tensors (e.g., NEReduceMean::validate returns an error). Besides, because of the early return, the executor won't be needed anyway.
5. [arm] The ARM transformations ConvertReduceProd(Min, Max, Sum) are disabled to avoid empty outputs.

### Tickets:
 - *[CVS-117469](https://jira.devtools.intel.com/browse/CVS-117469)*

---------

Co-authored-by: mitruska <[email protected]>