From 8c61a95a8784fa1d4f6628bf973816ccd45abf9b Mon Sep 17 00:00:00 2001
From: Bo Zhang <105368690+zhangbopd@users.noreply.github.com>
Date: Wed, 22 Mar 2023 11:16:49 +0800
Subject: [PATCH] 【AMP OP&Test】unit test for accuracy_op (#51009)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* test_accuracy_op

* add create_test_fp/bf16_class

* cast after calculation

* change convert_uint16_to_float_ifneed

* delete TestAccuracyOpFp32 according to PR comment

* fix the rtol setting rules in bfloat16 forward
---
 paddle/phi/kernels/gpu/accuracy_kernel.cu     |  6 ++-
 .../fluid/tests/unittests/test_accuracy_op.py | 45 ++++++++++++++++++-
 2 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/paddle/phi/kernels/gpu/accuracy_kernel.cu b/paddle/phi/kernels/gpu/accuracy_kernel.cu
index f67605714aba8..a42d131a48d69 100644
--- a/paddle/phi/kernels/gpu/accuracy_kernel.cu
+++ b/paddle/phi/kernels/gpu/accuracy_kernel.cu
@@ -20,6 +20,8 @@
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
+#include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
 
@@ -34,6 +36,7 @@ __global__ void AccuracyCudaKernel(const int N,
                                    int* correct_data,
                                    T* accuracy,
                                    int* total_data) {
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   int count = 0;
   __shared__ int total[BlockSize];
 
@@ -64,7 +67,7 @@ __global__ void AccuracyCudaKernel(const int N,
 #endif
   if (threadIdx.x == 0) {
     *correct_data = result;
-    *accuracy = static_cast<T>(result) / static_cast<T>(N);
+    *accuracy = static_cast<T>(static_cast<MT>(result) / static_cast<MT>(N));
     *total_data = N;
   }
 }
@@ -136,6 +139,7 @@ PD_REGISTER_KERNEL(accuracy,
                    ALL_LAYOUT,
                    phi::AccuracyRawKernel,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    float,
                    double) {
   kernel->InputAt(1).SetDataType(phi::DataType::INT64);
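A note on the "cast after calculation" change above: bfloat16 keeps only 8 significand bits, so integer counts above 256 are not exactly representable, and the old `static_cast<T>(result) / static_cast<T>(N)` rounds both operands before the division even starts, while dividing in the wider MPTypeTrait type (float for float16/bfloat16) and casting only the final ratio rounds just once. The sketch below, plain NumPy with a simplified round-to-nearest-even bfloat16 emulation and made-up counts (none of this is Paddle code), shows how the two orderings can land on different bfloat16 values:

    import numpy as np

    def to_bf16(x):
        # Round a float32 value to the nearest bfloat16 (ties to even) and
        # return it as float32 so it stays easy to compare and print.
        u = np.asarray(x, dtype=np.float32).view(np.uint32)
        bias = np.uint32(0x7FFF) + ((u >> np.uint32(16)) & np.uint32(1))
        return ((u + bias) & np.uint32(0xFFFF0000)).view(np.float32)

    correct, total = 1388, 2952  # made-up counts, chosen to expose the rounding
    exact = correct / total  # 0.470189...

    # Old ordering: cast both integer counts to bfloat16, then divide.
    cast_first = to_bf16(to_bf16(correct) / to_bf16(total))  # -> 0.472656...
    # New ordering: divide in float (the MPTypeTrait type), cast only the result.
    cast_last = to_bf16(np.float32(correct) / np.float32(total))  # -> 0.470703...

    print(exact, float(cast_first), float(cast_last))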
diff --git a/python/paddle/fluid/tests/unittests/test_accuracy_op.py b/python/paddle/fluid/tests/unittests/test_accuracy_op.py
index b6f99020ea87f..5b579ccae6107 100755
--- a/python/paddle/fluid/tests/unittests/test_accuracy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_accuracy_op.py
@@ -19,7 +19,8 @@
 
 import paddle
 import paddle.fluid as fluid
-from paddle.fluid import Program, program_guard
+from paddle.fluid import Program, core, program_guard
+from paddle.fluid.tests.unittests.op_test import convert_float_to_uint16
 
 
 def accuracy_wrapper(infer, indices, label):
@@ -64,6 +65,48 @@ def test_check_output(self):
         self.check_output(atol=1e-3)
 
 
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA or does not support bfloat16",
+)
+class TestAccuracyOpBf16(OpTest):
+    def setUp(self):
+        self.op_type = "accuracy"
+        self.python_api = accuracy_wrapper
+        self.init_dtype()
+        n = 8192
+        infer = np.random.random((n, 1)).astype(np.float32)
+        indices = np.random.randint(0, 2, (n, 1)).astype('int64')
+        label = np.random.randint(0, 2, (n, 1)).astype('int64')
+        self.inputs = {
+            'Out': convert_float_to_uint16(infer),
+            'Indices': indices,
+            "Label": label,
+        }
+        num_correct = 0
+        for rowid in range(n):
+            for ele in indices[rowid]:
+                if ele == label[rowid]:
+                    num_correct += 1
+                    break
+        self.outputs = {
+            'Accuracy': convert_float_to_uint16(
+                np.array([num_correct / float(n)]).astype(np.float32)
+            ),
+            'Correct': np.array([num_correct]).astype("int32"),
+            'Total': np.array([n]).astype("int32"),
+        }
+
+    def init_dtype(self):
+        self.dtype = np.uint16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            self.check_output_with_place(place, atol=1e-2)
+
+
 class TestAccuracyOpError(unittest.TestCase):
     def test_type_errors(self):
         with program_guard(Program(), Program()):
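To exercise just the new bfloat16 case locally, a loader along these lines should do; it is a sketch that assumes an installed CUDA build of Paddle whose device reports bfloat16 support, run from python/paddle/fluid/tests/unittests so that test_accuracy_op is importable:

    import unittest

    # Load and run only the bfloat16 test case added by this patch.
    from test_accuracy_op import TestAccuracyOpBf16

    suite = unittest.TestLoader().loadTestsFromTestCase(TestAccuracyOpBf16)
    unittest.TextTestRunner(verbosity=2).run(suite)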