From 445489879c497ff452649a5cf182c21a082f9d93 Mon Sep 17 00:00:00 2001 From: zzk0 Date: Tue, 23 May 2023 09:42:03 +0000 Subject: [PATCH 1/8] op unittest for scatter_add --- python/tests/ops/test_scatter_add.py | 342 ++++++++++++++++++--------- 1 file changed, 226 insertions(+), 116 deletions(-) diff --git a/python/tests/ops/test_scatter_add.py b/python/tests/ops/test_scatter_add.py index 8a0e1269f9..7be09c9792 100644 --- a/python/tests/ops/test_scatter_add.py +++ b/python/tests/ops/test_scatter_add.py @@ -14,30 +14,32 @@ # See the License for the specific language governing permissions and # limitations under the License. -import unittest -import numpy as np -from op_test import OpTest, OpTestTool import paddle -import paddle.nn.functional as F -import cinn from cinn.frontend import * from cinn.common import * +from op_test import OpTest, OpTestTool +from op_test_helper import TestCaseHelper @OpTestTool.skip_if(not is_compiled_with_cuda(), "x86 test will be skipped due to timeout.") class TestScatterAddOp(OpTest): def setUp(self): - self.init_case() - self.target = DefaultNVGPUTarget() - - def init_case(self): - self.axis = 0 + print(f"\nRunning {self.__class__.__name__}: {self.case}") + self.inputs = {} + self.prepare_inputs() + + def prepare_inputs(self): + x_shape = self.case["x_shape"] + y_shape = self.case["y_shape"] + dtype = self.case["dtype"] + axis = self.case["axis"] self.inputs = { - "x": np.random.random([10, 5]).astype("float32"), - "y": np.random.random([5, 5]).astype("float32"), - "index": np.array([0, 5, 0, 9, 0]).astype("int32") + "x": self.random(x_shape, dtype), + "y": self.random(y_shape, dtype), + "index": self.random([y_shape[axis]], "int32", 0, x_shape[0]) } + self.axis = axis def build_paddle_program(self, target): x = paddle.to_tensor(self.inputs["x"], stop_gradient=True) @@ -47,24 +49,32 @@ def build_paddle_program(self, target): if pos_axis < 0: pos_axis += len(x.shape) + index_nd = [] if pos_axis == 0: - index_nd = [] for i in range(len(self.inputs["index"])): index_nd.append([self.inputs["index"][i]]) elif pos_axis == 1: - index_nd = [] for i in range(self.inputs['x'].shape[0]): index_nd.append([]) for j in range(len(self.inputs["index"])): index_nd[i].append([i, self.inputs["index"][j]]) elif pos_axis == 2: - index_nd = [] for i in range(self.inputs['x'].shape[0]): index_nd.append([]) for j in range(self.inputs['x'].shape[1]): index_nd[i].append([]) for k in range(len(self.inputs["index"])): index_nd[i][j].append([i, j, self.inputs["index"][k]]) + elif pos_axis == 3: + for i in range(self.inputs['x'].shape[0]): + index_nd.append([]) + for j in range(self.inputs['x'].shape[1]): + index_nd[i].append([]) + for k in range(self.inputs['x'].shape[2]): + index_nd[i][j].append([]) + for l in range(len(self.inputs["index"])): + index_nd[i][j][k].append( + [i, j, k, self.inputs["index"][l]]) else: self.assertTrue(False, "Axis {} No Implement".format(pos_axis)) @@ -74,10 +84,15 @@ def build_paddle_program(self, target): def build_cinn_program(self, target): builder = NetBuilder("scatter_add") - x = builder.create_input(Float(32), self.inputs["x"].shape, "x") - y = builder.create_input(Float(32), self.inputs["y"].shape, "y") + x = builder.create_input( + self.nptype2cinntype(self.inputs["x"].dtype), + self.inputs["x"].shape, "x") + y = builder.create_input( + self.nptype2cinntype(self.inputs["y"].dtype), + self.inputs["y"].shape, "y") index = builder.create_input( - Int(32), self.inputs["index"].shape, "index") + self.nptype2cinntype(self.inputs["index"].dtype), + 
self.inputs["index"].shape, "index") out = builder.scatter_add(x, y, index, self.axis) prog = builder.build() @@ -88,60 +103,195 @@ def build_cinn_program(self, target): self.cinn_outputs = [res[0]] def test_check_results(self): - self.check_outputs_and_grads() - - -class TestScatterAddCase1(TestScatterAddOp): - def init_case(self): - self.inputs = { - "x": np.random.random([10, 5]).astype("float32"), - "y": np.random.random([10, 3]).astype("float32"), - "index": np.random.randint(0, 5, size=3).astype("int32") - } - self.axis = 1 - - -class TestScatterAddCase2(TestScatterAddOp): - def init_case(self): - self.inputs = { - "x": np.random.random([10, 5, 5]).astype("float32"), - "y": np.random.random([10, 5, 3]).astype("float32"), - "index": np.random.randint(0, 5, size=3).astype("int32") - } - self.axis = -1 - - -class TestScatterAddCase3(TestScatterAddOp): - def init_case(self): - self.inputs = { - "x": np.random.random([10, 5, 5]).astype("float32"), - "y": np.random.random([10, 3, 5]).astype("float32"), - "index": np.random.randint(0, 5, size=3).astype("int32") - } - self.axis = 1 - - -class TestScatterAddCase4(TestScatterAddOp): - def init_case(self): - self.inputs = { - "x": np.random.random([10]).astype("float32"), - "y": np.random.random([1]).astype("float32"), - "index": np.random.randint(0, 10, size=1).astype("int32") - } - self.axis = -1 - - -class TestScatterAddCase5(TestScatterAddOp): - def init_case(self): - self.inputs = { - "x": np.random.random([10, 5]).astype("float32"), - "y": np.random.random([3, 5]).astype("float32"), - "index": np.random.randint(0, 10, size=3).astype("int32") + if self.case["dtype"] == "float16": + self.check_outputs_and_grads( + max_relative_error=0.01, max_absolute_error=0.01) + else: + self.check_outputs_and_grads() + + +class TestScatterAddOpShapeTest(TestCaseHelper): + def init_attrs(self): + self.class_name = "TestScatterAddOpShapeTest" + self.cls = TestScatterAddOp + self.inputs = [{ + "x_shape": [10], + "y_shape": [5], + "axis": 0 + }, { + "x_shape": [10, 8], + "y_shape": [8, 8], + "axis": 0 + }, { + "x_shape": [10, 8, 16], + "y_shape": [10, 4, 16], + "axis": 1 + }, { + "x_shape": [10, 8, 16, 32], + "y_shape": [10, 8, 20, 32], + "axis": -2 + }, { + "x_shape": [10, 8, 16, 32], + "y_shape": [10, 8, 1, 32], + "axis": -2 + }, { + "x_shape": [10, 1, 16, 32], + "y_shape": [10, 1, 8, 32], + "axis": -2 + }, { + "x_shape": [1024, 8, 16, 4], + "y_shape": [512, 8, 16, 4], + "axis": 0 + }, { + "x_shape": [2048, 8, 16, 4], + "y_shape": [1024, 8, 16, 4], + "axis": 0 + }, { + "x_shape": [1024, 8, 16, 4], + "y_shape": [2048, 8, 16, 4], + "axis": 0 + }, { + "x_shape": [1, 1, 1, 1], + "y_shape": [1, 1, 1, 1], + "axis": 0 + }, { + "x_shape": [1], + "y_shape": [8], + "axis": 0 + }] + self.dtypes = [{"dtype": "float32"}] + self.attrs = [] + + +class TestScatterAddOpDtypeTest(TestCaseHelper): + def init_attrs(self): + self.class_name = "TestScatterAddOpDtypeTest" + self.cls = TestScatterAddOp + self.inputs = [{ + "x_shape": [10], + "y_shape": [5], + "axis": 0 + }, { + "x_shape": [10, 8], + "y_shape": [8, 8], + "axis": 0 + }, { + "x_shape": [1024, 8, 16, 4], + "y_shape": [512, 8, 16, 4], + "axis": 0 + }] + self.dtypes = [ + { + "dtype": "float16" + }, + { + "dtype": "float32" + }, + { + "dtype": "float64" + }, + { + "dtype": "int32" + }, + { + "dtype": "int64" + }, + ] + self.attrs = [] + + +class TestScatterAddOpAttributeAxis(TestCaseHelper): + def init_attrs(self): + self.class_name = "TestScatterAddOpAttributeAxis" + self.cls = TestScatterAddOp + self.inputs = [ + { + 
"x_shape": [10], + "y_shape": [5], + "axis": 0 + }, + { + "x_shape": [10, 8], + "y_shape": [8, 8], + "axis": -2 + }, + { + "x_shape": [10, 8, 16], + "y_shape": [5, 8, 16], + "axis": 0 + }, + { + "x_shape": [10, 8, 16], + "y_shape": [10, 4, 16], + "axis": 1 + }, + { + "x_shape": [10, 8, 16], + "y_shape": [10, 8, 8], + "axis": 2 + }, + { + "x_shape": [10, 8, 16, 32], + "y_shape": [2, 8, 16, 32], + "axis": 0 + }, + { + "x_shape": [10, 8, 16, 32], + "y_shape": [10, 8, 8, 32], + "axis": 2 + }, + { + "x_shape": [10, 8, 16, 32], + "y_shape": [10, 8, 16, 16], + "axis": 3 + }, + { + "x_shape": [10, 8, 16, 32], + "y_shape": [10, 8, 16, 8], + "axis": -1 + }, + { + "x_shape": [10, 8, 16, 32], + "y_shape": [10, 8, 4, 32], + "axis": -2 + }, + { + "x_shape": [10, 8, 16, 32], + "y_shape": [1, 8, 16, 32], + "axis": -4 + }, + # core dumped: cuda_module.cc:118] RAW: The error `CUDA_ERROR_LAUNCH_FAILED` occurs + # while compiling the ptx! And its message is `unspecified launch failure`. + # { + # "x_shape": [10, 8, 16, 32], + # "y_shape": [10, 4, 16, 32], + # "axis": 1 + # }, + # { + # "x_shape": [10, 8, 16, 32], + # "y_shape": [10, 2, 16, 32], + # "axis": -3 + # }, + ] + self.dtypes = [{"dtype": "float32"}] + self.attrs = [] + + +# test inline compute: https://github.com/PaddlePaddle/CINN/pull/1329 +class TestScatterAddCaseInline1(TestScatterAddOp): + def setUp(self): + self.case = { + "x_shape": [10, 5], + "y_shape": [5, 5], + "index_shape": [5], + "dtype": "float32", + "index_dtype": "int32", + "axis": 0 } - self.axis = 0 - + print(f"\nRunning {self.__class__.__name__}: {self.case}") + self.inputs = {} + self.prepare_inputs() -class TestScatterAddCase6(TestScatterAddOp): def build_cinn_program(self, target): builder = NetBuilder("scatter_add") x = builder.create_input(Float(64), self.inputs["x"].shape, "x") @@ -160,7 +310,7 @@ def build_cinn_program(self, target): self.cinn_outputs = [res[0]] -class TestScatterAddCase7(TestScatterAddOp): +class TestScatterAddCaseInline2(TestScatterAddCaseInline1): def build_cinn_program(self, target): builder = NetBuilder("scatter_add") x = builder.create_input(Float(32), self.inputs["x"].shape, "x") @@ -176,50 +326,10 @@ def build_cinn_program(self, target): self.inputs["index"] ], [out]) - self.cinn_outputs = [res[0]] - - -class TestScatterAddCase8(TestScatterAddCase7): - def init_case(self): - self.axis = 0 - self.inputs = { - "x": np.random.random([10, 5]).astype("float32"), - "y": np.random.random([10, 5]).astype("float32"), - "index": np.array([0, 5, 0, 9, 0, 1, 2, 3, 4, 5]).astype("int32") - } - - -class TestScatterAddOp9(TestScatterAddOp): - def setUp(self): - self.init_case() - self.target = DefaultNVGPUTarget() - - def init_case(self): - self.axis = 0 - self.inputs = { - "x": np.random.random([10, 5]).astype("float64"), - "y": np.random.random([5, 5]).astype("float64"), - "index": np.array([0, 5, 0, 9, 0]).astype("int32") - } - - def build_cinn_program(self, target): - builder = NetBuilder("scatter_add") - x = builder.create_input(Float(64), self.inputs["x"].shape, "x") - y = builder.create_input(Float(64), self.inputs["y"].shape, "y") - index = builder.create_input( - Int(32), self.inputs["index"].shape, "index") - out = builder.scatter_add(x, y, index, self.axis) - - prog = builder.build() - res = self.get_cinn_output( - prog, target, [x, y, index], - [self.inputs["x"], self.inputs["y"], self.inputs["index"]], [out]) - - self.cinn_outputs = [res[0]] - - def test_check_results(self): - self.check_outputs_and_grads() + self.cinn_outputs = res if __name__ == 
"__main__": - unittest.main() + TestScatterAddOpShapeTest().run() + TestScatterAddOpDtypeTest().run() + TestScatterAddOpAttributeAxis().run() From cfeff2ffd50d4964eea01f7cb5871fe0404fe067 Mon Sep 17 00:00:00 2001 From: zzk0 Date: Tue, 23 May 2023 11:02:33 +0000 Subject: [PATCH 2/8] add more dtype --- cinn/runtime/cuda/cinn_cuda_runtime_source.cuh | 4 ++++ cinn/runtime/cuda/cuda_intrinsics.cc | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh b/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh index 7d0b75e476..58c9fb4717 100644 --- a/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh +++ b/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh @@ -705,6 +705,10 @@ CINN_NVGPU_GT_NUM(int64, long long int) return res; \ } +CINN_NVGPU_INDEX_ADD(bool, bool) +CINN_NVGPU_INDEX_ADD(int8, int8_t) +CINN_NVGPU_INDEX_ADD(int32, int32_t) +CINN_NVGPU_INDEX_ADD(int64, int64_t) CINN_NVGPU_INDEX_ADD(fp32, float) CINN_NVGPU_INDEX_ADD(fp64, double) #ifdef CINN_CUDA_FP16 diff --git a/cinn/runtime/cuda/cuda_intrinsics.cc b/cinn/runtime/cuda/cuda_intrinsics.cc index bdb1005ebd..cfdf91cfc0 100644 --- a/cinn/runtime/cuda/cuda_intrinsics.cc +++ b/cinn/runtime/cuda/cuda_intrinsics.cc @@ -346,6 +346,10 @@ CINN_REGISTER_HELPER(cuda_intrinsics) { .AddInputType() \ .End(); + _REGISTER_CINN_NVGPU_INDEX_ADD(bool, bool); + _REGISTER_CINN_NVGPU_INDEX_ADD(int8, int8_t); + _REGISTER_CINN_NVGPU_INDEX_ADD(int32, int32_t); + _REGISTER_CINN_NVGPU_INDEX_ADD(int64, int64_t); _REGISTER_CINN_NVGPU_INDEX_ADD(fp32, float); _REGISTER_CINN_NVGPU_INDEX_ADD(fp64, double); From 5f6b5b049703ae5486519d3d3febecf7cc47b529 Mon Sep 17 00:00:00 2001 From: zzk0 Date: Wed, 24 May 2023 09:21:00 +0000 Subject: [PATCH 3/8] add a.out --- a.out | 144 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 a.out diff --git a/a.out b/a.out new file mode 100644 index 0000000000..08f58f1227 --- /dev/null +++ b/a.out @@ -0,0 +1,144 @@ +#include + +#define CINN_WITH_CUDA +#include "bfloat16.h" +#include "float16.h" +using cinn::common::bfloat16; +using cinn::common::float16; +// using cinn::common::half4; +// using cinn::common::half8; +// using cinn::common::float8; + +#include "cinn_cuda_runtime_source.cuh" + +extern "C" { + +__global__ +void __launch_bounds__(1024) fn_scatter_add_0_35_kernel(const float* __restrict__ x, const float* __restrict__ y, const int32_t* __restrict__ index, float* __restrict__ var_1658) +{ + if (((int)blockIdx.x < 40)) { + if (((int)threadIdx.x < 1024)) { + var_1658[((1024 * (int)blockIdx.x) + (int)threadIdx.x)] = cinn_nvgpu_index_add_fp32(x[((1024 * (int)blockIdx.x) + (int)threadIdx.x)], ((int)blockIdx.x / 4), y, ((512 * ((int)threadIdx.x / 512)) + (((((int)threadIdx.x & 511) / 32) * 32) + ((1024 * ((int)blockIdx.x & 3)) + ((int)threadIdx.x & 31)))), 4096, index, 1); + }; + }; +} + +} +// +// Generated by NVIDIA NVVM Compiler +// +// Compiler Build ID: CL-29558016 +// Cuda compilation tools, release 11.2, V11.2.142 +// Based on NVVM 7.0.1 +// + +.version 7.2 +.target sm_70 +.address_size 64 + + // .globl fn_scatter_add_0_35_kernel +.global .align 4 .u32 _ZN4cinn6common7float165shiftE; +.global .align 4 .u32 _ZN4cinn6common7float169shiftSignE; +.global .align 4 .u32 _ZN4cinn6common7float164infNE; +.global .align 4 .u32 _ZN4cinn6common7float164maxNE; +.global .align 4 .u32 _ZN4cinn6common7float164minNE; +.global .align 4 .u32 _ZN4cinn6common7float164sigNE; +.global .align 4 .u32 _ZN4cinn6common7float164infCE; +.global .align 4 .u32 
_ZN4cinn6common7float164nanNE; +.global .align 4 .u32 _ZN4cinn6common7float164maxCE; +.global .align 4 .u32 _ZN4cinn6common7float164minCE; +.global .align 4 .u32 _ZN4cinn6common7float164sigCE; +.global .align 4 .u32 _ZN4cinn6common7float164mulNE; +.global .align 4 .u32 _ZN4cinn6common7float164mulCE; +.global .align 4 .u32 _ZN4cinn6common7float164subCE; +.global .align 4 .u32 _ZN4cinn6common7float164norCE; +.global .align 4 .u32 _ZN4cinn6common7float164maxDE; +.global .align 4 .u32 _ZN4cinn6common7float164minDE; + +.visible .entry fn_scatter_add_0_35_kernel( + .param .u64 fn_scatter_add_0_35_kernel_param_0, + .param .u64 fn_scatter_add_0_35_kernel_param_1, + .param .u64 fn_scatter_add_0_35_kernel_param_2, + .param .u64 fn_scatter_add_0_35_kernel_param_3 +) +.maxntid 1024, 1, 1 +{ + .reg .pred %p<8>; + .reg .f32 %f<8>; + .reg .b32 %r<29>; + .reg .b64 %rd<17>; + + + ld.param.u64 %rd3, [fn_scatter_add_0_35_kernel_param_0]; + ld.param.u64 %rd4, [fn_scatter_add_0_35_kernel_param_1]; + ld.param.u64 %rd5, [fn_scatter_add_0_35_kernel_param_2]; + ld.param.u64 %rd6, [fn_scatter_add_0_35_kernel_param_3]; + mov.u32 %r1, %ctaid.x; + setp.gt.s32 %p1, %r1, 39; + mov.u32 %r2, %tid.x; + setp.gt.s32 %p2, %r2, 1023; + or.pred %p3, %p1, %p2; + @%p3 bra LBB0_9; + + cvta.to.global.u64 %rd7, %rd3; + shl.b32 %r11, %r1, 10; + add.s32 %r3, %r11, %r2; + mul.wide.s32 %rd8, %r3, 4; + add.s64 %rd9, %rd7, %rd8; + ld.global.nc.f32 %f6, [%rd9]; + shr.s32 %r12, %r1, 31; + shr.u32 %r13, %r12, 30; + add.s32 %r14, %r1, %r13; + shr.s32 %r4, %r14, 2; + shr.s32 %r15, %r2, 31; + shr.u32 %r16, %r15, 23; + add.s32 %r17, %r2, %r16; + and.b32 %r18, %r17, -512; + and.b32 %r19, %r1, 3; + and.b32 %r20, %r2, 511; + bfi.b32 %r21, %r19, %r20, 10, 2; + add.s32 %r5, %r21, %r18; + cvta.to.global.u64 %rd1, %rd5; + cvta.to.global.u64 %rd2, %rd4; + mov.u32 %r26, -1; + +LBB0_2: + mov.u32 %r27, %r26; + +LBB0_3: + add.s32 %r8, %r27, 1; + setp.gt.s32 %p4, %r27, -1; + mov.u32 %r26, -1; + @%p4 bra LBB0_5; + + mul.wide.s32 %rd10, %r27, 4; + add.s64 %rd11, %rd1, %rd10; + ld.global.nc.u32 %r23, [%rd11+4]; + setp.ne.s32 %p5, %r23, %r4; + mov.u32 %r27, %r8; + mov.u32 %r26, %r8; + @%p5 bra LBB0_3; + +LBB0_5: + setp.lt.s32 %p6, %r26, 0; + @%p6 bra LBB0_7; + + shl.b32 %r24, %r26, 12; + add.s32 %r25, %r5, %r24; + mul.wide.s32 %rd12, %r25, 4; + add.s64 %rd13, %rd2, %rd12; + ld.global.nc.f32 %f5, [%rd13]; + add.f32 %f6, %f6, %f5; + +LBB0_7: + setp.ne.s32 %p7, %r26, -1; + @%p7 bra LBB0_2; + + cvta.to.global.u64 %rd14, %rd6; + add.s64 %rd16, %rd14, %rd8; + st.global.f32 [%rd16], %f6; + +LBB0_9: + ret; + +} \ No newline at end of file From c1e76f2830205a877cb9c491358a4be4c1176567 Mon Sep 17 00:00:00 2001 From: zzk0 Date: Fri, 26 May 2023 08:38:59 +0000 Subject: [PATCH 4/8] [WIP] scatter --- cinn/hlir/op/contrib/scatter.cc | 103 ++++--------------------- cinn/pybind/frontend.cc | 32 ++++++++ python/tests/ops/test_scatter_op.py | 114 ++++++++++++++++++++++++++++ 3 files changed, 160 insertions(+), 89 deletions(-) mode change 100755 => 100644 cinn/hlir/op/contrib/scatter.cc create mode 100644 python/tests/ops/test_scatter_op.py diff --git a/cinn/hlir/op/contrib/scatter.cc b/cinn/hlir/op/contrib/scatter.cc old mode 100755 new mode 100644 index 4ad38f19eb..cf2c52136e --- a/cinn/hlir/op/contrib/scatter.cc +++ b/cinn/hlir/op/contrib/scatter.cc @@ -28,6 +28,7 @@ #include "cinn/hlir/framework/node.h" #include "cinn/hlir/framework/op.h" #include "cinn/hlir/framework/op_strategy.h" +#include "cinn/hlir/op/op_util.h" #include "cinn/hlir/pe/elementwise.h" #include 
"cinn/hlir/pe/ir_schedule_pe.h" #include "cinn/hlir/pe/nn.h" @@ -82,7 +83,7 @@ ir::Tensor Scatter(const ir::Tensor &A, } } new_axes.push_back(pos_axis); - transpose_B = pe::Transpose(B, new_axes, B->name + "_index_transpose"); + transpose_B = pe::Transpose(B, new_axes, B->name + "_transpose"); } auto res = Compute( C->shape, @@ -176,8 +177,8 @@ std::shared_ptr StrategyForScatter(const framework::NodeA const Target &target) { auto attr_store = attrs.attr_store; CHECK(attr_store.count("axis")) << "find no attr of axis"; - int axis = absl::get(attr_store.at("axis")); - std::string op_name("scatter"); + const int axis = absl::get(attr_store.at("axis")); + const std::string op_name("scatter"); framework::CINNCompute scatter_compute([=](lang::Args args, lang::RetValue *ret) { CHECK(!args.empty()) << "The input arguments of " << op_name << " compute is empty! Please check.\n"; @@ -186,62 +187,20 @@ std::shared_ptr StrategyForScatter(const framework::NodeA Expr A = pack_args[0]; Expr B = pack_args[1]; Expr C = pack_args[2]; - CHECK(A.as_tensor()); - CHECK(B.as_tensor()); - CHECK(C.as_tensor()); - CHECK(!output_shapes.empty()); - auto tensor_A = A.as_tensor_ref(); - auto tensor_B = B.as_tensor_ref(); - auto tensor_C = C.as_tensor_ref(); - auto stages = CreateStages({tensor_A, tensor_B, tensor_C}); - VLOG(3) << "A shape: " << utils::Join(tensor_A->shape, ", ") << ", B shape: " << utils::Join(tensor_B->shape, ", ") - << ", output_shapes: " << utils::Join(output_shapes[0], ", "); std::string tensor_name = UniqName("Scatter_out"); if (FLAGS_cinn_ir_schedule) { CHECK_EQ(pack_args.size(), 4U); tensor_name = pack_args[3].operator std::string(); + VLOG(4) << A.as_tensor_ref()->name << " " << B.as_tensor_ref()->name << " " << C.as_tensor_ref()->name << " " << tensor_name; + tensor_name = "test_scatter_out"; } - ir::Tensor out = Scatter(tensor_A, tensor_B, tensor_C, target, axis, tensor_name); - std::vector res; - stages->InsertLazily(out); - res.push_back(CINNValue(out)); - CHECK(!out_type.empty()) << "Output type of " << op_name << " is empty! Please check.\n"; - res.push_back(CINNValue(stages)); - *ret = CINNValuePack{res}; - }); - - framework::CINNSchedule scatter_schedule([=](lang::Args args, lang::RetValue *ret) { - if (FLAGS_cinn_ir_schedule) { - CHECK(!args.empty()) << "The input argument of scatter_schedule is empty! Please check.\n"; - common::CINNValuePack arg_pack = args[0]; - std::vector vec_ast; - for (int i = 0; i < arg_pack.size(); i++) { - if (arg_pack[i].is_expr()) { - Expr temp = arg_pack[i]; - vec_ast.emplace_back(temp); - } - } - CHECK(!vec_ast.empty()); - ir::ModuleExpr mod_expr(vec_ast); - ir::IRSchedule ir_sch(mod_expr); - ir_sch.MergeExprs(); - long prod_size = std::accumulate(output_shapes[0].begin(), output_shapes[0].end(), 1, std::multiplies()); - if (prod_size > 1) { - pe::IRInjectiveSchedule(ir_sch, output_shapes.front(), target); - } - std::vector res{common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; - *ret = common::CINNValuePack{res}; - } else { - CHECK(!args.empty()) << "The input argument of scatter_schedule is empty! 
Please check.\n"; - CINNValuePack arg_pack = args[0]; - Expr out = arg_pack[0]; - CHECK(out.as_tensor()); - *ret = arg_pack; - } + ir::Tensor out = Scatter(A.as_tensor_ref(), B.as_tensor_ref(), C.as_tensor_ref(), target, axis, tensor_name); + auto stages = CreateStages({out}); + *ret = CINNValuePack{{CINNValue(out), CINNValue(stages)}}; }); auto strategy = std::make_shared(); - strategy->AddImpl(scatter_compute, scatter_schedule, "strategy.scatter.x86", 1); + strategy->AddImpl(scatter_compute, GetInjectiveScheduleFunc(output_shapes, target), "strategy.scatter.x86", 1); return strategy; } @@ -269,7 +228,6 @@ std::shared_ptr StrategyForScatterNd(const framework::Nod auto tensor_A = A.as_tensor_ref(); auto tensor_B = B.as_tensor_ref(); auto tensor_C = C.as_tensor_ref(); - auto stages = CreateStages({tensor_A, tensor_B, tensor_C}); VLOG(3) << "A shape: " << utils::Join(tensor_A->shape, ", ") << ", B shape: " << utils::Join(tensor_B->shape, ", ") << ", output_shapes: " << utils::Join(output_shapes[0], ", "); std::string tensor_name = UniqName("ScatterNd_out"); @@ -278,46 +236,12 @@ std::shared_ptr StrategyForScatterNd(const framework::Nod tensor_name = pack_args[3].operator std::string(); } ir::Tensor out = ScatterNd(tensor_A, tensor_B, tensor_C, target, axes, tensor_name); - std::vector res; - stages->InsertLazily(out); - res.push_back(CINNValue(out)); - CHECK(!out_type.empty()) << "Output type of " << op_name << " is empty! Please check.\n"; - res.push_back(CINNValue(stages)); - *ret = CINNValuePack{res}; - }); - - framework::CINNSchedule scatter_nd_schedule([=](lang::Args args, lang::RetValue *ret) { - if (FLAGS_cinn_ir_schedule) { - CHECK(!args.empty()) << "The input argument of scatter_nd_schedule is empty! Please check.\n"; - common::CINNValuePack arg_pack = args[0]; - std::vector vec_ast; - for (int i = 0; i < arg_pack.size(); i++) { - if (arg_pack[i].is_expr()) { - Expr temp = arg_pack[i]; - vec_ast.emplace_back(temp); - } - } - CHECK(!vec_ast.empty()); - ir::ModuleExpr mod_expr(vec_ast); - ir::IRSchedule ir_sch(mod_expr); - ir_sch.MergeExprs(); - long prod_size = std::accumulate(output_shapes[0].begin(), output_shapes[0].end(), 1, std::multiplies()); - if (prod_size > 1) { - pe::IRInjectiveSchedule(ir_sch, output_shapes.front(), target); - } - std::vector res{common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; - *ret = common::CINNValuePack{res}; - } else { - CHECK(!args.empty()) << "The input argument of scatter_nd_schedule is empty! 
Please check.\n"; - CINNValuePack arg_pack = args[0]; - Expr out = arg_pack[0]; - CHECK(out.as_tensor()); - *ret = arg_pack; - } + auto stages = CreateStages({out}); + *ret = CINNValuePack{{CINNValue(out), CINNValue(stages)}}; }); auto strategy = std::make_shared(); - strategy->AddImpl(scatter_nd_compute, scatter_nd_schedule, "strategy.scatter_nd.x86", 1); + strategy->AddImpl(scatter_nd_compute, GetInjectiveScheduleFunc(output_shapes, target), "strategy.scatter_nd.x86", 1); return strategy; } @@ -357,6 +281,7 @@ CINN_REGISTER_HELPER(scatter_ops) { .set_attr("CINNStrategy", cinn::hlir::op::StrategyForScatterNd) .set_attr("infershape", MakeOpFunction(cinn::hlir::op::InferShapeForScatter)) .set_attr("inferdtype", MakeOpFunction(cinn::hlir::op::InferDtypeForScatter)) + .set_attr("OpPattern", cinn::hlir::framework::OpPatternKind::kInjective) .set_support_level(4); return true; diff --git a/cinn/pybind/frontend.cc b/cinn/pybind/frontend.cc index 1cb719538e..2800302375 100644 --- a/cinn/pybind/frontend.cc +++ b/cinn/pybind/frontend.cc @@ -530,6 +530,38 @@ void BindFrontend(pybind11::module *m) { py::arg("updates"), py::arg("index"), py::arg("axis") = 0) + .def("scatter", + static_cast( + &NetBuilder::Scatter), + py::arg("src"), + py::arg("index"), + py::arg("out"), + py::arg("axis")) + .def("scatter", + static_cast &, const float &, const int &)>( + &NetBuilder::Scatter), + py::arg("src"), + py::arg("index"), + py::arg("shape"), + py::arg("default_value") = 0, + py::arg("axis") = 0) + .def("scatter_nd", + static_cast &)>(&NetBuilder::ScatterNd), + py::arg("src"), + py::arg("index"), + py::arg("out"), + py::arg("axis")) + .def("scatter_nd", + static_cast &, const float &, const std::vector &)>( + &NetBuilder::ScatterNd), + py::arg("src"), + py::arg("index"), + py::arg("shape"), + py::arg("default_value") = 0, + py::arg("axis") = 0) .def("isclose", &NetBuilder::IsClose, py::arg("x"), diff --git a/python/tests/ops/test_scatter_op.py b/python/tests/ops/test_scatter_op.py new file mode 100644 index 0000000000..9ead260dcc --- /dev/null +++ b/python/tests/ops/test_scatter_op.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2021 CINN Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +from op_test import OpTest, OpTestTool +import paddle +from cinn.frontend import * +from cinn.common import * + + +@OpTestTool.skip_if(not is_compiled_with_cuda(), + "x86 test will be skipped due to timeout.") +class TestScatterOp(OpTest): + def setUp(self): + self.init_case() + self.target = DefaultNVGPUTarget() + + def init_case(self): + self.axis = 0 + self.inputs = { + "x": np.random.random([10, 5]).astype("float32"), + "y": np.random.random([5, 5]).astype("float32"), + "index": np.array([0, 5, 0, 9, 0]).astype("int32"), + "src": self.random([2, 5], "float32"), + "out": self.random([3, 5], "float32"), + "index0": np.array([[0, 1, 2, 0, 0], [2, 0, 0, 1, 2]]).astype("int32"), + } + + def build_paddle_program(self, target): + x = paddle.to_tensor(self.inputs["x"], stop_gradient=True) + y = paddle.to_tensor(self.inputs["y"], stop_gradient=True) + + pos_axis = self.axis + if pos_axis < 0: + pos_axis += len(x.shape) + + if pos_axis == 0: + index_nd = [] + for i in range(len(self.inputs["index"])): + index_nd.append([self.inputs["index"][i]]) + elif pos_axis == 1: + index_nd = [] + for i in range(self.inputs['x'].shape[0]): + index_nd.append([]) + for j in range(len(self.inputs["index"])): + index_nd[i].append([i, self.inputs["index"][j]]) + elif pos_axis == 2: + index_nd = [] + for i in range(self.inputs['x'].shape[0]): + index_nd.append([]) + for j in range(self.inputs['x'].shape[1]): + index_nd[i].append([]) + for k in range(len(self.inputs["index"])): + index_nd[i][j].append([i, j, self.inputs["index"][k]]) + else: + self.assertTrue(False, "Axis {} No Implement".format(pos_axis)) + + index = paddle.to_tensor(index_nd, stop_gradient=True) + res = paddle.scatter_nd_add(x, index, y) + self.paddle_outputs = [res] + + def build_cinn_program(self, target): + builder = NetBuilder("scatter") + src = builder.create_input(OpTest.nptype2cinntype(self.inputs["src"].dtype), self.inputs["src"].shape, "src") + index = builder.create_input(OpTest.nptype2cinntype(self.inputs["index0"].dtype), self.inputs["index0"].shape, "index0") + out = builder.create_input(OpTest.nptype2cinntype(self.inputs["out"].dtype), self.inputs["out"].shape, "out") + out1 = builder.scatter(src, index, out, 0) + + print(self.inputs["src"], self.inputs["index0"], self.inputs["out"]) + + prog = builder.build() + res = self.get_cinn_output( + prog, target, [src, index, out], + [self.inputs["src"], self.inputs["index0"], self.inputs["out"]], [out1]) + + self.cinn_outputs = res + + def test_check_results(self): + self.check_outputs_and_grads() + + +# class TestScatterOp1(TestScatterOp): +# def setUp(self): +# self.init_case() +# self.target = DefaultNVGPUTarget() + +# def init_case(self): +# self.axis = 0 +# self.inputs = { +# "x": np.random.random([10, 5]).astype("int32"), +# "y": np.random.random([5, 5]).astype("int32"), +# "index": np.array([0, 5, 0, 9, 0]).astype("int32") +# } + +# def test_check_results(self): +# self.check_outputs_and_grads() + + +if __name__ == "__main__": + unittest.main() From ad8c6065c5eb60b043f68c0262c66668abd85784 Mon Sep 17 00:00:00 2001 From: zzk0 Date: Wed, 31 May 2023 01:38:12 +0000 Subject: [PATCH 5/8] op unittest for test_scatter_add --- python/tests/ops/test_scatter_add.py | 162 +++++++++++++++------------ 1 file changed, 89 insertions(+), 73 deletions(-) diff --git a/python/tests/ops/test_scatter_add.py b/python/tests/ops/test_scatter_add.py index 7be09c9792..a2879c521e 100644 --- a/python/tests/ops/test_scatter_add.py +++ 
b/python/tests/ops/test_scatter_add.py @@ -18,7 +18,7 @@ from cinn.frontend import * from cinn.common import * from op_test import OpTest, OpTestTool -from op_test_helper import TestCaseHelper +from op_test_helper import TestCaseHelper, run_test @OpTestTool.skip_if(not is_compiled_with_cuda(), @@ -37,7 +37,7 @@ def prepare_inputs(self): self.inputs = { "x": self.random(x_shape, dtype), "y": self.random(y_shape, dtype), - "index": self.random([y_shape[axis]], "int32", 0, x_shape[0]) + "index": self.random([y_shape[axis]], "int32", 0, x_shape[axis]) } self.axis = axis @@ -100,7 +100,7 @@ def build_cinn_program(self, target): prog, target, [x, y, index], [self.inputs["x"], self.inputs["y"], self.inputs["index"]], [out]) - self.cinn_outputs = [res[0]] + self.cinn_outputs = res def test_check_results(self): if self.case["dtype"] == "float16": @@ -114,51 +114,63 @@ class TestScatterAddOpShapeTest(TestCaseHelper): def init_attrs(self): self.class_name = "TestScatterAddOpShapeTest" self.cls = TestScatterAddOp - self.inputs = [{ - "x_shape": [10], - "y_shape": [5], - "axis": 0 - }, { - "x_shape": [10, 8], - "y_shape": [8, 8], - "axis": 0 - }, { - "x_shape": [10, 8, 16], - "y_shape": [10, 4, 16], - "axis": 1 - }, { - "x_shape": [10, 8, 16, 32], - "y_shape": [10, 8, 20, 32], - "axis": -2 - }, { - "x_shape": [10, 8, 16, 32], - "y_shape": [10, 8, 1, 32], - "axis": -2 - }, { - "x_shape": [10, 1, 16, 32], - "y_shape": [10, 1, 8, 32], - "axis": -2 - }, { - "x_shape": [1024, 8, 16, 4], - "y_shape": [512, 8, 16, 4], - "axis": 0 - }, { - "x_shape": [2048, 8, 16, 4], - "y_shape": [1024, 8, 16, 4], - "axis": 0 - }, { - "x_shape": [1024, 8, 16, 4], - "y_shape": [2048, 8, 16, 4], - "axis": 0 - }, { - "x_shape": [1, 1, 1, 1], - "y_shape": [1, 1, 1, 1], - "axis": 0 - }, { - "x_shape": [1], - "y_shape": [8], - "axis": 0 - }] + self.inputs = [ + { + "x_shape": [10], + "y_shape": [5], + "axis": 0 + }, + { + "x_shape": [10, 8], + "y_shape": [8, 8], + "axis": 0 + }, + { + "x_shape": [10, 8, 16], + "y_shape": [10, 4, 16], + "axis": 1 + }, + { + "x_shape": [10, 8, 16, 32], + "y_shape": [10, 8, 20, 32], + "axis": -2 + }, + { + "x_shape": [10, 8, 16, 32], + "y_shape": [10, 8, 1, 32], + "axis": -2 + }, + { + "x_shape": [10, 1, 16, 32], + "y_shape": [10, 1, 8, 32], + "axis": -2 + }, + { + "x_shape": [1024, 8, 16, 4], + "y_shape": [512, 8, 16, 4], + "axis": 0 + }, + { + "x_shape": [2048, 8, 16, 4], + "y_shape": [1024, 8, 16, 4], + "axis": 0 + }, + { + "x_shape": [1024, 8, 16, 4], + "y_shape": [2048, 8, 16, 4], + "axis": 0 + }, + { + "x_shape": [1, 1, 1, 1], + "y_shape": [1, 1, 1, 1], + "axis": 0 + }, + { + "x_shape": [1], + "y_shape": [8], + "axis": 0 + }, + ] self.dtypes = [{"dtype": "float32"}] self.attrs = [] @@ -167,19 +179,23 @@ class TestScatterAddOpDtypeTest(TestCaseHelper): def init_attrs(self): self.class_name = "TestScatterAddOpDtypeTest" self.cls = TestScatterAddOp - self.inputs = [{ - "x_shape": [10], - "y_shape": [5], - "axis": 0 - }, { - "x_shape": [10, 8], - "y_shape": [8, 8], - "axis": 0 - }, { - "x_shape": [1024, 8, 16, 4], - "y_shape": [512, 8, 16, 4], - "axis": 0 - }] + self.inputs = [ + { + "x_shape": [10], + "y_shape": [5], + "axis": 0 + }, + { + "x_shape": [10, 8], + "y_shape": [8, 8], + "axis": 0 + }, + { + "x_shape": [1024, 8, 16, 4], + "y_shape": [512, 8, 16, 4], + "axis": 0 + }, + ] self.dtypes = [ { "dtype": "float16" @@ -260,18 +276,16 @@ def init_attrs(self): "y_shape": [1, 8, 16, 32], "axis": -4 }, - # core dumped: cuda_module.cc:118] RAW: The error `CUDA_ERROR_LAUNCH_FAILED` occurs - # while 
compiling the ptx! And its message is `unspecified launch failure`. - # { - # "x_shape": [10, 8, 16, 32], - # "y_shape": [10, 4, 16, 32], - # "axis": 1 - # }, - # { - # "x_shape": [10, 8, 16, 32], - # "y_shape": [10, 2, 16, 32], - # "axis": -3 - # }, + { + "x_shape": [10, 8, 16, 32], + "y_shape": [10, 4, 16, 32], + "axis": 1 + }, + { + "x_shape": [10, 8, 16, 32], + "y_shape": [10, 2, 16, 32], + "axis": -3 + }, ] self.dtypes = [{"dtype": "float32"}] self.attrs = [] @@ -333,3 +347,5 @@ def build_cinn_program(self, target): TestScatterAddOpShapeTest().run() TestScatterAddOpDtypeTest().run() TestScatterAddOpAttributeAxis().run() + run_test(TestScatterAddCaseInline1) + run_test(TestScatterAddCaseInline2) From 9cf811eacad371bdc12b0f00b4b82b404a051f23 Mon Sep 17 00:00:00 2001 From: zzk0 Date: Thu, 1 Jun 2023 00:49:09 +0000 Subject: [PATCH 6/8] add temporary files --- cinn/hlir/op/contrib/scatter.cc | 44 +++++++++-------- python/tests/ops/test_scatter_op.py | 75 ++++++++++++++++------------- 2 files changed, 67 insertions(+), 52 deletions(-) diff --git a/cinn/hlir/op/contrib/scatter.cc b/cinn/hlir/op/contrib/scatter.cc index cf2c52136e..7733a5f92d 100644 --- a/cinn/hlir/op/contrib/scatter.cc +++ b/cinn/hlir/op/contrib/scatter.cc @@ -72,19 +72,19 @@ ir::Tensor Scatter(const ir::Tensor &A, pos_axis += C->shape.size(); } - ir::Tensor transpose_B; - if (pos_axis == A->shape.size() - 1) { - transpose_B = B; - } else { - std::vector new_axes; - for (int i = 0; i < A->shape.size(); ++i) { - if (i != pos_axis) { - new_axes.push_back(i); - } - } - new_axes.push_back(pos_axis); - transpose_B = pe::Transpose(B, new_axes, B->name + "_transpose"); - } + ir::Tensor transpose_B = B; + // if (pos_axis == A->shape.size() - 1) { + // transpose_B = B; + // } else { + // std::vector new_axes; + // for (int i = 0; i < A->shape.size(); ++i) { + // if (i != pos_axis) { + // new_axes.push_back(i); + // } + // } + // new_axes.push_back(pos_axis); + // transpose_B = pe::Transpose(B, new_axes, B->name + "_transpose"); + // } auto res = Compute( C->shape, [=](const std::vector &indices) { @@ -184,19 +184,25 @@ std::shared_ptr StrategyForScatter(const framework::NodeA CHECK(!args.empty()) << "The input arguments of " << op_name << " compute is empty! Please check.\n"; CINNValuePack pack_args = args[0]; CHECK_GE(pack_args.size(), 3U) << "3 input tensors for " << op_name << " compute\n"; - Expr A = pack_args[0]; - Expr B = pack_args[1]; - Expr C = pack_args[2]; + Expr A = pack_args[0]; + Expr B = pack_args[1]; + Expr C = pack_args[2]; std::string tensor_name = UniqName("Scatter_out"); + auto stages = CreateStages({A.as_tensor_ref(), B.as_tensor_ref(), C.as_tensor_ref()}); if (FLAGS_cinn_ir_schedule) { CHECK_EQ(pack_args.size(), 4U); tensor_name = pack_args[3].operator std::string(); - VLOG(4) << A.as_tensor_ref()->name << " " << B.as_tensor_ref()->name << " " << C.as_tensor_ref()->name << " " << tensor_name; + VLOG(4) << A.as_tensor_ref()->name << " " << B.as_tensor_ref()->name << " " << C.as_tensor_ref()->name << " " + << tensor_name; tensor_name = "test_scatter_out"; } ir::Tensor out = Scatter(A.as_tensor_ref(), B.as_tensor_ref(), C.as_tensor_ref(), target, axis, tensor_name); - auto stages = CreateStages({out}); - *ret = CINNValuePack{{CINNValue(out), CINNValue(stages)}}; + std::vector res; + stages->InsertLazily(out); + res.push_back(CINNValue(out)); + CHECK(!out_type.empty()) << "Output type of Scatter is empty! 
Please check.\n"; + res.push_back(CINNValue(stages)); + *ret = CINNValuePack{res}; }); auto strategy = std::make_shared(); diff --git a/python/tests/ops/test_scatter_op.py b/python/tests/ops/test_scatter_op.py index 9ead260dcc..31a3530f2f 100644 --- a/python/tests/ops/test_scatter_op.py +++ b/python/tests/ops/test_scatter_op.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import unittest import numpy as np from op_test import OpTest, OpTestTool @@ -30,7 +31,7 @@ def setUp(self): self.target = DefaultNVGPUTarget() def init_case(self): - self.axis = 0 + self.axis = -1 self.inputs = { "x": np.random.random([10, 5]).astype("float32"), "y": np.random.random([5, 5]).astype("float32"), @@ -39,55 +40,64 @@ def init_case(self): "out": self.random([3, 5], "float32"), "index0": np.array([[0, 1, 2, 0, 0], [2, 0, 0, 1, 2]]).astype("int32"), } + print(self.inputs["src"], '\n', self.inputs["index0"], '\n', self.inputs["out"]) def build_paddle_program(self, target): - x = paddle.to_tensor(self.inputs["x"], stop_gradient=True) - y = paddle.to_tensor(self.inputs["y"], stop_gradient=True) + # x = paddle.to_tensor(self.inputs["src"], stop_gradient=True) + # y = paddle.to_tensor(self.inputs["out"], stop_gradient=True) + # index = paddle.to_tensor(self.inputs["index0"], stop_gradient=True) + x = self.inputs["src"] + y = self.inputs["out"] + index = self.inputs["index0"] pos_axis = self.axis if pos_axis < 0: pos_axis += len(x.shape) - if pos_axis == 0: - index_nd = [] - for i in range(len(self.inputs["index"])): - index_nd.append([self.inputs["index"][i]]) - elif pos_axis == 1: - index_nd = [] - for i in range(self.inputs['x'].shape[0]): - index_nd.append([]) - for j in range(len(self.inputs["index"])): - index_nd[i].append([i, self.inputs["index"][j]]) - elif pos_axis == 2: - index_nd = [] - for i in range(self.inputs['x'].shape[0]): - index_nd.append([]) - for j in range(self.inputs['x'].shape[1]): - index_nd[i].append([]) - for k in range(len(self.inputs["index"])): - index_nd[i][j].append([i, j, self.inputs["index"][k]]) + dim = len(x.shape) + res = copy.deepcopy(y) + if dim == 1: + pass + elif dim == 2: + if pos_axis == 0: + for i in range(x.shape[0]): + for j in range(x.shape[1]): + res[index[i, j], j] = x[i, j] + elif pos_axis == 1: + for i in range(x.shape[0]): + for j in range(x.shape[1]): + res[i, index[i, j]] = x[i, j] + elif dim == 3: + pass + elif dim == 4: + pass else: - self.assertTrue(False, "Axis {} No Implement".format(pos_axis)) - - index = paddle.to_tensor(index_nd, stop_gradient=True) - res = paddle.scatter_nd_add(x, index, y) - self.paddle_outputs = [res] + raise NotImplementedError + + self.paddle_outputs = [paddle.to_tensor(res)] + print("paddle res:\n", res) def build_cinn_program(self, target): builder = NetBuilder("scatter") - src = builder.create_input(OpTest.nptype2cinntype(self.inputs["src"].dtype), self.inputs["src"].shape, "src") - index = builder.create_input(OpTest.nptype2cinntype(self.inputs["index0"].dtype), self.inputs["index0"].shape, "index0") - out = builder.create_input(OpTest.nptype2cinntype(self.inputs["out"].dtype), self.inputs["out"].shape, "out") + src = builder.create_input( + OpTest.nptype2cinntype(self.inputs["src"].dtype), + self.inputs["src"].shape, "src") + index = builder.create_input( + OpTest.nptype2cinntype(self.inputs["index0"].dtype), + self.inputs["index0"].shape, "index0") + out = builder.create_input( + OpTest.nptype2cinntype(self.inputs["out"].dtype), + 
self.inputs["out"].shape, "out") out1 = builder.scatter(src, index, out, 0) - print(self.inputs["src"], self.inputs["index0"], self.inputs["out"]) - prog = builder.build() res = self.get_cinn_output( prog, target, [src, index, out], - [self.inputs["src"], self.inputs["index0"], self.inputs["out"]], [out1]) + [self.inputs["src"], self.inputs["index0"], self.inputs["out"]], + [out1]) self.cinn_outputs = res + print("cinn res:\n", res) def test_check_results(self): self.check_outputs_and_grads() @@ -109,6 +119,5 @@ def test_check_results(self): # def test_check_results(self): # self.check_outputs_and_grads() - if __name__ == "__main__": unittest.main() From b414d8421997739b864ff0227f4d88290d36bd4f Mon Sep 17 00:00:00 2001 From: zzk0 Date: Sat, 3 Jun 2023 09:43:09 +0000 Subject: [PATCH 7/8] remove scatter & scatter_nd --- a.out | 144 ------------- cinn/frontend/net_builder.cc | 27 --- cinn/frontend/net_builder_test.cc | 151 -------------- cinn/hlir/op/contrib/CMakeLists.txt | 2 - cinn/hlir/op/contrib/scatter.cc | 294 --------------------------- cinn/hlir/op/contrib/scatter.h | 44 ---- cinn/hlir/op/contrib/scatter_test.cc | 165 --------------- cinn/hlir/op/use_ops.h | 1 - cinn/pybind/frontend.cc | 32 --- python/tests/ops/test_scatter_op.py | 123 ----------- 10 files changed, 983 deletions(-) delete mode 100644 a.out delete mode 100644 cinn/hlir/op/contrib/scatter.cc delete mode 100644 cinn/hlir/op/contrib/scatter.h delete mode 100644 cinn/hlir/op/contrib/scatter_test.cc delete mode 100644 python/tests/ops/test_scatter_op.py diff --git a/a.out b/a.out deleted file mode 100644 index 08f58f1227..0000000000 --- a/a.out +++ /dev/null @@ -1,144 +0,0 @@ -#include - -#define CINN_WITH_CUDA -#include "bfloat16.h" -#include "float16.h" -using cinn::common::bfloat16; -using cinn::common::float16; -// using cinn::common::half4; -// using cinn::common::half8; -// using cinn::common::float8; - -#include "cinn_cuda_runtime_source.cuh" - -extern "C" { - -__global__ -void __launch_bounds__(1024) fn_scatter_add_0_35_kernel(const float* __restrict__ x, const float* __restrict__ y, const int32_t* __restrict__ index, float* __restrict__ var_1658) -{ - if (((int)blockIdx.x < 40)) { - if (((int)threadIdx.x < 1024)) { - var_1658[((1024 * (int)blockIdx.x) + (int)threadIdx.x)] = cinn_nvgpu_index_add_fp32(x[((1024 * (int)blockIdx.x) + (int)threadIdx.x)], ((int)blockIdx.x / 4), y, ((512 * ((int)threadIdx.x / 512)) + (((((int)threadIdx.x & 511) / 32) * 32) + ((1024 * ((int)blockIdx.x & 3)) + ((int)threadIdx.x & 31)))), 4096, index, 1); - }; - }; -} - -} -// -// Generated by NVIDIA NVVM Compiler -// -// Compiler Build ID: CL-29558016 -// Cuda compilation tools, release 11.2, V11.2.142 -// Based on NVVM 7.0.1 -// - -.version 7.2 -.target sm_70 -.address_size 64 - - // .globl fn_scatter_add_0_35_kernel -.global .align 4 .u32 _ZN4cinn6common7float165shiftE; -.global .align 4 .u32 _ZN4cinn6common7float169shiftSignE; -.global .align 4 .u32 _ZN4cinn6common7float164infNE; -.global .align 4 .u32 _ZN4cinn6common7float164maxNE; -.global .align 4 .u32 _ZN4cinn6common7float164minNE; -.global .align 4 .u32 _ZN4cinn6common7float164sigNE; -.global .align 4 .u32 _ZN4cinn6common7float164infCE; -.global .align 4 .u32 _ZN4cinn6common7float164nanNE; -.global .align 4 .u32 _ZN4cinn6common7float164maxCE; -.global .align 4 .u32 _ZN4cinn6common7float164minCE; -.global .align 4 .u32 _ZN4cinn6common7float164sigCE; -.global .align 4 .u32 _ZN4cinn6common7float164mulNE; -.global .align 4 .u32 _ZN4cinn6common7float164mulCE; -.global .align 4 .u32 
_ZN4cinn6common7float164subCE; -.global .align 4 .u32 _ZN4cinn6common7float164norCE; -.global .align 4 .u32 _ZN4cinn6common7float164maxDE; -.global .align 4 .u32 _ZN4cinn6common7float164minDE; - -.visible .entry fn_scatter_add_0_35_kernel( - .param .u64 fn_scatter_add_0_35_kernel_param_0, - .param .u64 fn_scatter_add_0_35_kernel_param_1, - .param .u64 fn_scatter_add_0_35_kernel_param_2, - .param .u64 fn_scatter_add_0_35_kernel_param_3 -) -.maxntid 1024, 1, 1 -{ - .reg .pred %p<8>; - .reg .f32 %f<8>; - .reg .b32 %r<29>; - .reg .b64 %rd<17>; - - - ld.param.u64 %rd3, [fn_scatter_add_0_35_kernel_param_0]; - ld.param.u64 %rd4, [fn_scatter_add_0_35_kernel_param_1]; - ld.param.u64 %rd5, [fn_scatter_add_0_35_kernel_param_2]; - ld.param.u64 %rd6, [fn_scatter_add_0_35_kernel_param_3]; - mov.u32 %r1, %ctaid.x; - setp.gt.s32 %p1, %r1, 39; - mov.u32 %r2, %tid.x; - setp.gt.s32 %p2, %r2, 1023; - or.pred %p3, %p1, %p2; - @%p3 bra LBB0_9; - - cvta.to.global.u64 %rd7, %rd3; - shl.b32 %r11, %r1, 10; - add.s32 %r3, %r11, %r2; - mul.wide.s32 %rd8, %r3, 4; - add.s64 %rd9, %rd7, %rd8; - ld.global.nc.f32 %f6, [%rd9]; - shr.s32 %r12, %r1, 31; - shr.u32 %r13, %r12, 30; - add.s32 %r14, %r1, %r13; - shr.s32 %r4, %r14, 2; - shr.s32 %r15, %r2, 31; - shr.u32 %r16, %r15, 23; - add.s32 %r17, %r2, %r16; - and.b32 %r18, %r17, -512; - and.b32 %r19, %r1, 3; - and.b32 %r20, %r2, 511; - bfi.b32 %r21, %r19, %r20, 10, 2; - add.s32 %r5, %r21, %r18; - cvta.to.global.u64 %rd1, %rd5; - cvta.to.global.u64 %rd2, %rd4; - mov.u32 %r26, -1; - -LBB0_2: - mov.u32 %r27, %r26; - -LBB0_3: - add.s32 %r8, %r27, 1; - setp.gt.s32 %p4, %r27, -1; - mov.u32 %r26, -1; - @%p4 bra LBB0_5; - - mul.wide.s32 %rd10, %r27, 4; - add.s64 %rd11, %rd1, %rd10; - ld.global.nc.u32 %r23, [%rd11+4]; - setp.ne.s32 %p5, %r23, %r4; - mov.u32 %r27, %r8; - mov.u32 %r26, %r8; - @%p5 bra LBB0_3; - -LBB0_5: - setp.lt.s32 %p6, %r26, 0; - @%p6 bra LBB0_7; - - shl.b32 %r24, %r26, 12; - add.s32 %r25, %r5, %r24; - mul.wide.s32 %rd12, %r25, 4; - add.s64 %rd13, %rd2, %rd12; - ld.global.nc.f32 %f5, [%rd13]; - add.f32 %f6, %f6, %f5; - -LBB0_7: - setp.ne.s32 %p7, %r26, -1; - @%p7 bra LBB0_2; - - cvta.to.global.u64 %rd14, %rd6; - add.s64 %rd16, %rd14, %rd8; - st.global.f32 [%rd16], %f6; - -LBB0_9: - ret; - -} \ No newline at end of file diff --git a/cinn/frontend/net_builder.cc b/cinn/frontend/net_builder.cc index f3e0f47385..ede81f39fe 100644 --- a/cinn/frontend/net_builder.cc +++ b/cinn/frontend/net_builder.cc @@ -430,33 +430,6 @@ Variable NetBuilder::GatherNd(const Variable& x, const Variable& index) { return CustomInstr("gather_nd", {x, index}, {}).front(); } -Variable NetBuilder::Scatter(const Variable& src, const Variable& index, const Variable& out, const int& axis) { - return CustomInstr("scatter", {src, index, out}, {{"axis", axis}}).front(); -} -Variable NetBuilder::Scatter(const Variable& src, - const Variable& index, - const std::vector& shape, - const float& default_value, - const int& axis) { - auto out = FillConstant(shape, default_value, UniqName("fill_constant"), "float", false); - return Scatter(src, index, out, axis); -} - -Variable NetBuilder::ScatterNd(const Variable& src, - const Variable& index, - const Variable& out, - const std::vector& axes) { - return CustomInstr("scatter_nd", {src, index, out}, {{"axes", axes}}).front(); -} -Variable NetBuilder::ScatterNd(const Variable& src, - const Variable& index, - const std::vector& shape, - const float& default_value, - const std::vector& axes) { - auto out = FillConstant(shape, default_value, 
UniqName("fill_constant"), "float", false); - return ScatterNd(src, index, out, axes); -} - Variable NetBuilder::Cast(const Variable& operand, const std::string& dtype) { return CustomInstr("cast", {operand}, {{"dtype", dtype}}).front(); } diff --git a/cinn/frontend/net_builder_test.cc b/cinn/frontend/net_builder_test.cc index 413e4e618a..4e55150c0c 100644 --- a/cinn/frontend/net_builder_test.cc +++ b/cinn/frontend/net_builder_test.cc @@ -354,157 +354,6 @@ TEST(net_build, program_execute_gather_nd) { } } -TEST(net_build, program_execute_scatter) { - const float default_value = 3.14; - const int B = 3; - const int H_IN = 4; - const int H_OUT = 11; - - NetBuilder builder("net_builder"); - Placeholder input1 = builder.CreateInput(Float(32), {B, H_IN}, "In1"); - Placeholder input2 = builder.CreateInput(Int(32), {B, H_IN}, "In2"); - Variable output = builder.Scatter(input1, input2, {B, H_OUT}, default_value, 1); - auto program = builder.Build(); - -#ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); -#else - Target target = common::DefaultHostTarget(); -#endif - std::unordered_set fetch_ids; - auto graph = Optimize(&program, fetch_ids, target); - - auto scope = BuildScope(target, graph); - hlir::framework::GraphCompiler gc(target, scope, graph); - auto runtime_program = gc.Build(); - - scope->Var(std::string(input1.id())); - scope->Var(std::string(input2.id())); - scope->Var(std::string(output->id)); - - auto input1_tensor = scope->GetTensor(std::string(input1.id())); - SetRandData(input1_tensor, target); - std::vector input1_data = GetTensorData(input1_tensor, target); - - auto input2_tensor = scope->GetTensor(std::string(input2.id())); - SetRandInt(input2_tensor, target, -1, 0, H_IN); - - std::vector input2_data = GetTensorData(input2_tensor, target); - - runtime_program->Execute(); - - auto output_tensor = scope->GetTensor(std::string(output->id)); - const std::vector& output_shape = output_tensor->shape().data(); - EXPECT_EQ(output_tensor->type(), Float(32)); - EXPECT_EQ(output_shape.size(), 2UL); - EXPECT_EQ(output_shape[0], B); - EXPECT_EQ(output_shape[1], H_OUT); - - float true_data[B * H_OUT]; - for (int b = 0; b < B; ++b) { - for (int h = 0; h < H_OUT; ++h) { - int index = h + H_OUT * b; - true_data[index] = default_value; - } - } - for (int b = 0; b < B; ++b) { - for (int h = 0; h < H_IN; ++h) { - int index = h + H_IN * b; - true_data[input2_data[index] + H_OUT * b] = input1_data[index]; - } - } - - std::vector output_data = GetTensorData(output_tensor, target); - VLOG(6) << "Visualize output_data"; - for (int b = 0; b < B; ++b) { - for (int h = 0; h < H_OUT; ++h) { - std::string line; - int index = h + H_OUT * b; - float t_data = true_data[index]; - float out_data = output_data[index]; - line += (std::to_string(out_data) + ", "); - EXPECT_EQ(t_data, out_data); - VLOG(6) << line; - } - } -} - -TEST(net_build, program_execute_scatter_nd) { - Context::Global().ResetNameId(); - const float default_value = 3.14; - const int B = 3; - const int H_IN = 4; - const int H_OUT = 11; - - NetBuilder builder("net_builder"); - Placeholder input1 = builder.CreateInput(Float(32), {B, H_IN}, "In1"); - Placeholder input2 = builder.CreateInput(Int(32), {B, H_IN, 1}, "In2"); - Variable output = builder.ScatterNd(input1, input2, {B, H_OUT}, default_value, {1}); - auto program = builder.Build(); - -#ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); -#else - Target target = common::DefaultHostTarget(); -#endif - std::unordered_set fetch_ids; - auto graph = 
Optimize(&program, fetch_ids, target); - - auto scope = BuildScope(target, graph); - hlir::framework::GraphCompiler gc(target, scope, graph); - auto runtime_program = gc.Build(); - - scope->Var(std::string(input1.id())); - scope->Var(std::string(input2.id())); - scope->Var(std::string(output->id)); - - auto input1_tensor = scope->GetTensor(std::string(input1.id())); - SetRandData(input1_tensor, target); - - auto input2_tensor = scope->GetTensor(std::string(input2.id())); - SetRandInt(input2_tensor, target); - - runtime_program->Execute(); - - std::vector input1_data = GetTensorData(input1_tensor, target); - std::vector input2_data = GetTensorData(input2_tensor, target); - - auto output_tensor = scope->GetTensor(std::string(output->id)); - const std::vector& output_shape = output_tensor->shape().data(); - EXPECT_EQ(output_tensor->type(), Float(32)); - EXPECT_EQ(output_shape.size(), 2UL); - EXPECT_EQ(output_shape[0], B); - EXPECT_EQ(output_shape[1], H_OUT); - - float true_data[B * H_OUT]; - for (int b = 0; b < B; ++b) { - for (int h = 0; h < H_OUT; ++h) { - int index = h + H_OUT * b; - true_data[index] = default_value; - } - } - for (int b = 0; b < B; ++b) { - for (int h = 0; h < H_IN; ++h) { - int index = h + H_IN * b; - true_data[input2_data[index] + H_OUT * b] = input1_data[index]; - } - } - - std::vector output_data = GetTensorData(output_tensor, target); - VLOG(6) << "Visualize output_data"; - for (int b = 0; b < B; ++b) { - for (int h = 0; h < H_OUT; ++h) { - std::string line; - int index = h + H_OUT * b; - float t_data = true_data[index]; - float out_data = output_data[index]; - line += (std::to_string(out_data) + ", "); - EXPECT_EQ(t_data, out_data); - VLOG(6) << line; - } - } -} - TEST(net_build, program_execute_cast) { const int B = 4; const int H = 7; diff --git a/cinn/hlir/op/contrib/CMakeLists.txt b/cinn/hlir/op/contrib/CMakeLists.txt index fde84a733e..48565a4edb 100644 --- a/cinn/hlir/op/contrib/CMakeLists.txt +++ b/cinn/hlir/op/contrib/CMakeLists.txt @@ -2,7 +2,6 @@ core_gather_headers() gather_srcs(cinnapi_src SRCS gather_nd.cc - scatter.cc flip.cc sort.cc argmin.cc @@ -22,7 +21,6 @@ gather_srcs(cinnapi_src SRCS ) cc_test(test_gather_nd SRCS gather_nd_test.cc DEPS cinncore) -cc_test(test_scatter SRCS scatter_test.cc DEPS cinncore) cc_test(test_sort SRCS sort_test.cc DEPS cinncore) cc_test(test_argmin SRCS argmin_test.cc DEPS cinncore) cc_test(test_argmax SRCS argmax_test.cc DEPS cinncore) diff --git a/cinn/hlir/op/contrib/scatter.cc b/cinn/hlir/op/contrib/scatter.cc deleted file mode 100644 index 7733a5f92d..0000000000 --- a/cinn/hlir/op/contrib/scatter.cc +++ /dev/null @@ -1,294 +0,0 @@ -// Copyright (c) 2022 CINN Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "cinn/hlir/op/contrib/scatter.h" - -#include - -#include -#include -#include -#include - -#include "cinn/common/cas.h" -#include "cinn/common/common.h" -#include "cinn/common/context.h" -#include "cinn/common/macros.h" -#include "cinn/hlir/framework/node.h" -#include "cinn/hlir/framework/op.h" -#include "cinn/hlir/framework/op_strategy.h" -#include "cinn/hlir/op/op_util.h" -#include "cinn/hlir/pe/elementwise.h" -#include "cinn/hlir/pe/ir_schedule_pe.h" -#include "cinn/hlir/pe/nn.h" -#include "cinn/hlir/pe/schedule.h" -#include "cinn/hlir/pe/transform.h" -#include "cinn/ir/ir.h" -#include "cinn/ir/ir_base.h" -#include "cinn/ir/tensor.h" -#include "cinn/lang/builtin.h" -#include "cinn/lang/compute.h" - -DECLARE_bool(cinn_ir_schedule); - -namespace cinn { -namespace hlir { -namespace op { - -using common::CINNValue; -using common::CINNValuePack; - -ir::Tensor Scatter(const ir::Tensor &A, - const ir::Tensor &B, - const ir::Tensor &C, - const common::Target &target, - const int &axis, - const std::string &name) { - CHECK_EQ(A->shape.size(), B->shape.size()); - CHECK_EQ(A->shape.size(), C->shape.size()); - - std::string extern_fun_name; - if (target.arch == common::Target::Arch::NVGPU) { - extern_fun_name.assign("cinn_cuda_find_int_nd"); - } else if (target.arch == common::Target::Arch::X86) { - extern_fun_name.assign("cinn_host_find_int_nd"); - } else { - LOG(FATAL) << "Scatter only support X86 and NVGPU ! Please Check.\n"; - } - - int pos_axis = axis; - if (pos_axis < 0) { - pos_axis += C->shape.size(); - } - - ir::Tensor transpose_B = B; - // if (pos_axis == A->shape.size() - 1) { - // transpose_B = B; - // } else { - // std::vector new_axes; - // for (int i = 0; i < A->shape.size(); ++i) { - // if (i != pos_axis) { - // new_axes.push_back(i); - // } - // } - // new_axes.push_back(pos_axis); - // transpose_B = pe::Transpose(B, new_axes, B->name + "_transpose"); - // } - auto res = Compute( - C->shape, - [=](const std::vector &indices) { - Expr offset(0); - for (int i = 0; i < indices.size(); i++) { - if (i != pos_axis) { - offset = offset * C->shape[i] + indices[i]; - } - } - auto B_shape_axis = B->shape[pos_axis]; - offset = common::AutoSimplify(offset * B_shape_axis); - auto idx = lang::CallExtern(extern_fun_name, {transpose_B, B_shape_axis, indices[pos_axis], offset, Expr(1)}); - std::vector A_indices(indices); - A_indices[pos_axis] = idx; - auto keep = ir::EQ::Make(idx, Expr(-1)); - return ir::Select::Make(keep, C(indices), A(A_indices)); - }, - name); - return res; -} - -ir::Tensor ScatterNd(const ir::Tensor &A, - const ir::Tensor &B, - const ir::Tensor &C, - const common::Target &target, - const std::vector &axes, - const std::string &name) { - CHECK(!A->shape.empty()); - CHECK_EQ(A->shape.size() + 1, B->shape.size()); - CHECK_EQ(A->shape.size() + axes.size() - 1, C->shape.size()); - - std::string extern_fun_name; - if (target.arch == common::Target::Arch::NVGPU) { - extern_fun_name.assign("cinn_cuda_find_int_nd"); - } else if (target.arch == common::Target::Arch::X86) { - extern_fun_name.assign("cinn_host_find_int_nd"); - } else { - LOG(FATAL) << "ScatterNd only support X86 and NVGPU ! 
Please Check.\n"; - } - - std::vector pos_axes; - for (auto axis : axes) { - if (axis < 0) { - pos_axes.push_back(axis + C->shape.size()); - } else { - pos_axes.push_back(axis); - } - } - - auto res = Compute( - C->shape, - [=](const std::vector &indices) { - auto offset = Expr(0); - std::vector A_indices; - for (int i = 0; i < indices.size(); i++) { - if (std::find(pos_axes.begin(), pos_axes.end(), i) == pos_axes.end()) { - offset = offset * C->shape[i] + indices[i]; - A_indices.push_back(indices[i]); - } - } - offset = offset * B->shape[B->shape.size() - 2] * B->shape[B->shape.size() - 1]; - auto keep = Expr(true); - std::vector idx; - for (int i = 0; i < pos_axes.size(); ++i) { - auto cur_idx = lang::CallExtern(extern_fun_name, - {B, - B->shape[B->shape.size() - 2], - indices[pos_axes[i]], - common::AutoSimplify(offset + Expr(i)), - Expr(static_cast(pos_axes.size()))}); - if (idx.empty()) { - idx.push_back(cur_idx); - A_indices.push_back(cur_idx); - } else { - keep = ir::And::Make(keep, ir::EQ::Make(idx[0], cur_idx)); - idx[0] = cur_idx; - } - } - keep = common::AutoSimplify(ir::And::Make(keep, ir::EQ::Make(idx[0], Expr(-1)))); - return ir::Select::Make(keep, C(indices), A(A_indices)); - }, - name); - return res; -} - -std::shared_ptr StrategyForScatter(const framework::NodeAttr &attrs, - const std::vector &inputs, - const std::vector &out_type, - const std::vector> &output_shapes, - const Target &target) { - auto attr_store = attrs.attr_store; - CHECK(attr_store.count("axis")) << "find no attr of axis"; - const int axis = absl::get(attr_store.at("axis")); - const std::string op_name("scatter"); - - framework::CINNCompute scatter_compute([=](lang::Args args, lang::RetValue *ret) { - CHECK(!args.empty()) << "The input arguments of " << op_name << " compute is empty! Please check.\n"; - CINNValuePack pack_args = args[0]; - CHECK_GE(pack_args.size(), 3U) << "3 input tensors for " << op_name << " compute\n"; - Expr A = pack_args[0]; - Expr B = pack_args[1]; - Expr C = pack_args[2]; - std::string tensor_name = UniqName("Scatter_out"); - auto stages = CreateStages({A.as_tensor_ref(), B.as_tensor_ref(), C.as_tensor_ref()}); - if (FLAGS_cinn_ir_schedule) { - CHECK_EQ(pack_args.size(), 4U); - tensor_name = pack_args[3].operator std::string(); - VLOG(4) << A.as_tensor_ref()->name << " " << B.as_tensor_ref()->name << " " << C.as_tensor_ref()->name << " " - << tensor_name; - tensor_name = "test_scatter_out"; - } - ir::Tensor out = Scatter(A.as_tensor_ref(), B.as_tensor_ref(), C.as_tensor_ref(), target, axis, tensor_name); - std::vector res; - stages->InsertLazily(out); - res.push_back(CINNValue(out)); - CHECK(!out_type.empty()) << "Output type of Scatter is empty! Please check.\n"; - res.push_back(CINNValue(stages)); - *ret = CINNValuePack{res}; - }); - - auto strategy = std::make_shared(); - strategy->AddImpl(scatter_compute, GetInjectiveScheduleFunc(output_shapes, target), "strategy.scatter.x86", 1); - return strategy; -} - -std::shared_ptr StrategyForScatterNd(const framework::NodeAttr &attrs, - const std::vector &inputs, - const std::vector &out_type, - const std::vector> &output_shapes, - const Target &target) { - auto attr_store = attrs.attr_store; - CHECK(attr_store.count("axes")) << "find no attr of axis"; - std::vector axes = absl::get>(attr_store.at("axes")); - std::string op_name("scatter_nd"); - - framework::CINNCompute scatter_nd_compute([=](lang::Args args, lang::RetValue *ret) { - CHECK(!args.empty()) << "The input arguments of " << op_name << " compute is empty! 
Please check.\n"; - CINNValuePack pack_args = args[0]; - CHECK_GE(pack_args.size(), 3U) << "3 input tensors for " << op_name << " compute\n"; - Expr A = pack_args[0]; - Expr B = pack_args[1]; - Expr C = pack_args[2]; - CHECK(A.as_tensor()); - CHECK(B.as_tensor()); - CHECK(C.as_tensor()); - CHECK(!output_shapes.empty()); - auto tensor_A = A.as_tensor_ref(); - auto tensor_B = B.as_tensor_ref(); - auto tensor_C = C.as_tensor_ref(); - VLOG(3) << "A shape: " << utils::Join(tensor_A->shape, ", ") << ", B shape: " << utils::Join(tensor_B->shape, ", ") - << ", output_shapes: " << utils::Join(output_shapes[0], ", "); - std::string tensor_name = UniqName("ScatterNd_out"); - if (FLAGS_cinn_ir_schedule) { - CHECK_EQ(pack_args.size(), 4U); - tensor_name = pack_args[3].operator std::string(); - } - ir::Tensor out = ScatterNd(tensor_A, tensor_B, tensor_C, target, axes, tensor_name); - auto stages = CreateStages({out}); - *ret = CINNValuePack{{CINNValue(out), CINNValue(stages)}}; - }); - - auto strategy = std::make_shared(); - strategy->AddImpl(scatter_nd_compute, GetInjectiveScheduleFunc(output_shapes, target), "strategy.scatter_nd.x86", 1); - return strategy; -} - -std::vector> InferShapeForScatter(const std::vector> &inputs_shape, - const framework::AttrMapType &attrs) { - CHECK_EQ(inputs_shape.size(), 3U) << "The input's shape size should be 3! Please check again."; - std::vector> res{inputs_shape[2]}; - return res; -} - -std::vector InferDtypeForScatter(const std::vector &inputs_type, const framework::AttrMapType &attrs) { - CHECK_EQ(inputs_type.size(), 3U) << "The input's type size should be 3! Please check again."; - CHECK_EQ(inputs_type[1], Int(32)) << "The index's type should be int! Please check again."; - std::vector res{inputs_type[2]}; - return res; -} - -} // namespace op -} // namespace hlir -} // namespace cinn - -CINN_REGISTER_HELPER(scatter_ops) { - CINN_REGISTER_OP(scatter) - .describe("Scatter.") - .set_num_inputs(3) - .set_num_outputs(1) - .set_attr("CINNStrategy", cinn::hlir::op::StrategyForScatter) - .set_attr("infershape", MakeOpFunction(cinn::hlir::op::InferShapeForScatter)) - .set_attr("inferdtype", MakeOpFunction(cinn::hlir::op::InferDtypeForScatter)) - .set_attr("OpPattern", cinn::hlir::framework::OpPatternKind::kInjective) - .set_support_level(4); - - CINN_REGISTER_OP(scatter_nd) - .describe("ScatterNd.") - .set_num_inputs(3) - .set_num_outputs(1) - .set_attr("CINNStrategy", cinn::hlir::op::StrategyForScatterNd) - .set_attr("infershape", MakeOpFunction(cinn::hlir::op::InferShapeForScatter)) - .set_attr("inferdtype", MakeOpFunction(cinn::hlir::op::InferDtypeForScatter)) - .set_attr("OpPattern", cinn::hlir::framework::OpPatternKind::kInjective) - .set_support_level(4); - - return true; -} diff --git a/cinn/hlir/op/contrib/scatter.h b/cinn/hlir/op/contrib/scatter.h deleted file mode 100644 index aa8cbcc8ae..0000000000 --- a/cinn/hlir/op/contrib/scatter.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2022 CINN Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <vector>
-
-#include "cinn/ir/ir.h"
-#include "cinn/ir/ir_base.h"
-#include "cinn/ir/tensor.h"
-
-namespace cinn {
-namespace hlir {
-namespace op {
-
-ir::Tensor Scatter(const ir::Tensor& A,
-                   const ir::Tensor& B,
-                   const ir::Tensor& out,
-                   const common::Target& target,
-                   const int& axis,
-                   const std::string& name);
-
-ir::Tensor ScatterNd(const ir::Tensor& A,
-                     const ir::Tensor& B,
-                     const ir::Tensor& out,
-                     const common::Target& target,
-                     const std::vector<int>& axes,
-                     const std::string& name);
-
-}  // namespace op
-}  // namespace hlir
-}  // namespace cinn
diff --git a/cinn/hlir/op/contrib/scatter_test.cc b/cinn/hlir/op/contrib/scatter_test.cc
deleted file mode 100644
index 51441a4cdc..0000000000
--- a/cinn/hlir/op/contrib/scatter_test.cc
+++ /dev/null
@@ -1,165 +0,0 @@
-// Copyright (c) 2022 CINN Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "cinn/hlir/op/contrib/scatter.h"
-
-#include <glog/logging.h>
-#include <gtest/gtest.h>
-
-#include <string>
-#include <vector>
-
-#include "cinn/backends/codegen_c.h"
-#include "cinn/backends/codegen_c_x86.h"
-#include "cinn/backends/codegen_cuda_dev.h"
-#include "cinn/common/context.h"
-#include "cinn/lang/lower.h"
-#include "cinn/lang/placeholder.h"
-#include "cinn/poly/stage.h"
-
-namespace cinn {
-namespace hlir {
-namespace op {
-
-TEST(GenerateCode_Cpu, Scatter) {
-  common::Context::Global().ResetNameId();
-
-  auto target = common::DefaultHostTarget();
-
-  ir::Expr n(4);
-  ir::Expr h_in(8);
-  ir::Expr h_out(14);
-
-  lang::Placeholder<float> in1("in1", {n, h_in});
-  lang::Placeholder<int32_t> in2("in2", {n, h_in});
-  lang::Placeholder<float> out("out", {n, h_out});
-  ir::Tensor res = Scatter(in1, in2, out, target, 1, "test_scatter_out");
-
-  poly::StageMap stages = poly::CreateStages({res});
-  std::vector<ir::LoweredFunc> funcs =
-      lang::LowerVec("TestGenerateCodeCpu_Scatter", stages, {res}, {}, {}, nullptr, target, true);
-
-  VLOG(6) << "Expr before CPU codegen:";
-  VLOG(6) << funcs[0]->body;
-
-  ir::Module::Builder builder("Scatter_Module", target);
-  for (auto& f : funcs) {
-    builder.AddFunction(f);
-  }
-
-  backends::CodeGenCX86 codegen(target, backends::CodeGenCX86::Feature::AVX512);
-  codegen.SetInlineBuiltinCodes(false);
-  std::string code = codegen.Compile(builder.Build(), backends::CodeGenC::OutputKind::CImpl);
-  VLOG(6) << "Cpu Codegen result:";
-  VLOG(6) << code << std::endl;
-
-  auto target_source = R"ROC(
-#include <cinn_runtime.h>
-#include <stdio.h>
-
-void TestGenerateCodeCpu_Scatter(void* _args, int32_t num_args)
-{
-  cinn_buffer_t* _test_scatter_out = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
-  cinn_buffer_t* _in1 = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_float32_t(), { 4, 8 });
-  cinn_buffer_t* _in2 = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_int32_t(), { 4, 8 });
-  cinn_buffer_t* _out = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_float32_t(), { 4, 14 });
-  cinn_buffer_malloc((void*)(0), _test_scatter_out);
-  cinn_buffer_malloc((void*)(0), _in1);
-  cinn_buffer_malloc((void*)(0), _in2);
-  cinn_buffer_malloc((void*)(0), _out);
-  const float* in1 = ((const float*)(_in1->memory));
-  const int32_t* in2 = ((const int32_t*)(_in2->memory));
-  const float* out = ((const float*)(_out->memory));
-  float* test_scatter_out = ((float*)(_test_scatter_out->memory));
-  for (int32_t i = 0; i < 4; i += 1) {
-    for (int32_t j = 0; j < 14; j += 1) {
-      test_scatter_out[((14 * i) + j)] = (((cinn_host_find_int_nd(_in2, 8, j, (8 * i), 1) == -1)) ? out[((14 * i) + j)] : in1[((8 * i) + cinn_host_find_int_nd(_in2, 8, j, (8 * i), 1))]);
-    };
-  };
-  cinn_buffer_free((void*)(0), _in1);
-  cinn_buffer_free((void*)(0), _in2);
-  cinn_buffer_free((void*)(0), _out);
-  cinn_buffer_free((void*)(0), _test_scatter_out);
-}
-)ROC";
-  CHECK_EQ(utils::Trim(code), utils::Trim(target_source));
-}
-
-TEST(GenerateCode_Cpu, ScatterNd) {
-  common::Context::Global().ResetNameId();
-
-  auto target = common::DefaultHostTarget();
-
-  ir::Expr n(4);
-  ir::Expr h_in(8);
-  ir::Expr h_out(14);
-
-  lang::Placeholder<float> in1("in1", {n, h_in});
-  lang::Placeholder<int32_t> in2("in2", {n, h_in, ir::Expr(1)});
-  lang::Placeholder<float> out("out", {n, h_out});
-  ir::Tensor res = ScatterNd(in1, in2, out, target, {1}, "test_scatter_out");
-
-  poly::StageMap stages = poly::CreateStages({res});
-  std::vector<ir::LoweredFunc> funcs =
-      lang::LowerVec("TestGenerateCodeCpu_Scatter", stages, {res}, {}, {}, nullptr, target, true);
-
-  VLOG(6) << "Expr before CPU codegen:";
-  VLOG(6) << funcs[0]->body;
-
-  ir::Module::Builder builder("Scatter_Module", target);
-  for (auto& f : funcs) {
-    builder.AddFunction(f);
-  }
-
-  backends::CodeGenCX86 codegen(target, backends::CodeGenCX86::Feature::AVX512);
-  codegen.SetInlineBuiltinCodes(false);
-  std::string code = codegen.Compile(builder.Build(), backends::CodeGenC::OutputKind::CImpl);
-  VLOG(6) << "Cpu Codegen result:";
-  VLOG(6) << code << std::endl;
-
-  auto target_source = R"ROC(
-#include <cinn_runtime.h>
-#include <stdio.h>
-
-void TestGenerateCodeCpu_Scatter(void* _args, int32_t num_args)
-{
-  cinn_buffer_t* _test_scatter_out = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
-  cinn_buffer_t* _in1 = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_float32_t(), { 4, 8 });
-  cinn_buffer_t* _in2 = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_int32_t(), { 4, 8, 1 });
-  cinn_buffer_t* _out = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_float32_t(), { 4, 14 });
-  cinn_buffer_malloc((void*)(0), _test_scatter_out);
-  cinn_buffer_malloc((void*)(0), _in1);
-  cinn_buffer_malloc((void*)(0), _in2);
-  cinn_buffer_malloc((void*)(0), _out);
-  const float* in1 = ((const float*)(_in1->memory));
-  const int32_t* in2 = ((const int32_t*)(_in2->memory));
-  const float* out = ((const float*)(_out->memory));
-  float* test_scatter_out = ((float*)(_test_scatter_out->memory));
-  for (int32_t i = 0; i < 4; i += 1) {
-    for (int32_t j = 0; j < 14; j += 1) {
-      test_scatter_out[((14 * i) + j)] = (((cinn_host_find_int_nd(_in2, 8, j, (8 * i), 1) == -1)) ? out[((14 * i) + j)] : in1[((8 * i) + cinn_host_find_int_nd(_in2, 8, j, (8 * i), 1))]);
-    };
-  };
-  cinn_buffer_free((void*)(0), _in1);
-  cinn_buffer_free((void*)(0), _in2);
-  cinn_buffer_free((void*)(0), _out);
-  cinn_buffer_free((void*)(0), _test_scatter_out);
-}
-)ROC";
-  CHECK_EQ(utils::Trim(code), utils::Trim(target_source));
-}
-
-}  // namespace op
-}  // namespace hlir
-}  // namespace cinn
diff --git a/cinn/hlir/op/use_ops.h b/cinn/hlir/op/use_ops.h
index 3650878d76..e29941efd7 100644
--- a/cinn/hlir/op/use_ops.h
+++ b/cinn/hlir/op/use_ops.h
@@ -23,7 +23,6 @@ CINN_USE_REGISTER(broadcast_grad_ops)
 CINN_USE_REGISTER(elementwise_ops)
 CINN_USE_REGISTER(transform_ops)
 CINN_USE_REGISTER(gather_nd_ops)
-CINN_USE_REGISTER(scatter_ops)
 CINN_USE_REGISTER(sort_ops)
 CINN_USE_REGISTER(argmin_ops)
 CINN_USE_REGISTER(argmax_ops)
diff --git a/cinn/pybind/frontend.cc b/cinn/pybind/frontend.cc
index 2800302375..1cb719538e 100644
--- a/cinn/pybind/frontend.cc
+++ b/cinn/pybind/frontend.cc
@@ -530,38 +530,6 @@ void BindFrontend(pybind11::module *m) {
            py::arg("updates"),
            py::arg("index"),
            py::arg("axis") = 0)
-      .def("scatter",
-           static_cast<Variable (NetBuilder::*)(const Variable &, const Variable &, const Variable &, const int &)>(
-               &NetBuilder::Scatter),
-           py::arg("src"),
-           py::arg("index"),
-           py::arg("out"),
-           py::arg("axis"))
-      .def("scatter",
-           static_cast<Variable (NetBuilder::*)(
-               const Variable &, const Variable &, const std::vector<int> &, const float &, const int &)>(
-               &NetBuilder::Scatter),
-           py::arg("src"),
-           py::arg("index"),
-           py::arg("shape"),
-           py::arg("default_value") = 0,
-           py::arg("axis") = 0)
-      .def("scatter_nd",
-           static_cast<Variable (NetBuilder::*)(
-               const Variable &, const Variable &, const Variable &, const std::vector<int> &)>(&NetBuilder::ScatterNd),
-           py::arg("src"),
-           py::arg("index"),
-           py::arg("out"),
-           py::arg("axis"))
-      .def("scatter_nd",
-           static_cast<Variable (NetBuilder::*)(
-               const Variable &, const Variable &, const std::vector<int> &, const float &, const std::vector<int> &)>(
-               &NetBuilder::ScatterNd),
-           py::arg("src"),
-           py::arg("index"),
-           py::arg("shape"),
-           py::arg("default_value") = 0,
-           py::arg("axis") = 0)
       .def("isclose",
            &NetBuilder::IsClose,
            py::arg("x"),
diff --git a/python/tests/ops/test_scatter_op.py b/python/tests/ops/test_scatter_op.py
deleted file mode 100644
index 31a3530f2f..0000000000
--- a/python/tests/ops/test_scatter_op.py
+++ /dev/null
@@ -1,123 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright (c) 2021 CINN Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import copy
-import unittest
-import numpy as np
-from op_test import OpTest, OpTestTool
-import paddle
-from cinn.frontend import *
-from cinn.common import *
-
-
-@OpTestTool.skip_if(not is_compiled_with_cuda(),
-                    "x86 test will be skipped due to timeout.")
-class TestScatterOp(OpTest):
-    def setUp(self):
-        self.init_case()
-        self.target = DefaultNVGPUTarget()
-
-    def init_case(self):
-        self.axis = -1
-        self.inputs = {
-            "x": np.random.random([10, 5]).astype("float32"),
-            "y": np.random.random([5, 5]).astype("float32"),
-            "index": np.array([0, 5, 0, 9, 0]).astype("int32"),
-            "src": self.random([2, 5], "float32"),
-            "out": self.random([3, 5], "float32"),
-            "index0": np.array([[0, 1, 2, 0, 0], [2, 0, 0, 1, 2]]).astype("int32"),
-        }
-        print(self.inputs["src"], '\n', self.inputs["index0"], '\n', self.inputs["out"])
-
-    def build_paddle_program(self, target):
-        # x = paddle.to_tensor(self.inputs["src"], stop_gradient=True)
-        # y = paddle.to_tensor(self.inputs["out"], stop_gradient=True)
-        # index = paddle.to_tensor(self.inputs["index0"], stop_gradient=True)
-        x = self.inputs["src"]
-        y = self.inputs["out"]
-        index = self.inputs["index0"]
-
-        pos_axis = self.axis
-        if pos_axis < 0:
-            pos_axis += len(x.shape)
-
-        dim = len(x.shape)
-        res = copy.deepcopy(y)
-        if dim == 1:
-            pass
-        elif dim == 2:
-            if pos_axis == 0:
-                for i in range(x.shape[0]):
-                    for j in range(x.shape[1]):
-                        res[index[i, j], j] = x[i, j]
-            elif pos_axis == 1:
-                for i in range(x.shape[0]):
-                    for j in range(x.shape[1]):
-                        res[i, index[i, j]] = x[i, j]
-        elif dim == 3:
-            pass
-        elif dim == 4:
-            pass
-        else:
-            raise NotImplementedError
-
-        self.paddle_outputs = [paddle.to_tensor(res)]
-        print("paddle res:\n", res)
-
-    def build_cinn_program(self, target):
-        builder = NetBuilder("scatter")
-        src = builder.create_input(
-            OpTest.nptype2cinntype(self.inputs["src"].dtype),
-            self.inputs["src"].shape, "src")
-        index = builder.create_input(
-            OpTest.nptype2cinntype(self.inputs["index0"].dtype),
-            self.inputs["index0"].shape, "index0")
-        out = builder.create_input(
-            OpTest.nptype2cinntype(self.inputs["out"].dtype),
-            self.inputs["out"].shape, "out")
-        out1 = builder.scatter(src, index, out, 0)
-
-        prog = builder.build()
-        res = self.get_cinn_output(
-            prog, target, [src, index, out],
-            [self.inputs["src"], self.inputs["index0"], self.inputs["out"]],
-            [out1])
-
-        self.cinn_outputs = res
-        print("cinn res:\n", res)
-
-    def test_check_results(self):
-        self.check_outputs_and_grads()
-
-
-# class TestScatterOp1(TestScatterOp):
-#     def setUp(self):
-#         self.init_case()
-#         self.target = DefaultNVGPUTarget()

-#     def init_case(self):
-#         self.axis = 0
-#         self.inputs = {
-#             "x": np.random.random([10, 5]).astype("int32"),
-#             "y": np.random.random([5, 5]).astype("int32"),
-#             "index": np.array([0, 5, 0, 9, 0]).astype("int32")
-#         }

-#     def test_check_results(self):
-#         self.check_outputs_and_grads()
-
-if __name__ == "__main__":
-    unittest.main()

From 2f702d6b2820f70f61ba2985f0e6a7d1bfb847c6 Mon Sep 17 00:00:00 2001
From: zzk0
Date: Sat, 3 Jun 2023 09:50:25 +0000
Subject: [PATCH 8/8] remove key

---
 cinn/auto_schedule/measure/simple_runner.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cinn/auto_schedule/measure/simple_runner.cc b/cinn/auto_schedule/measure/simple_runner.cc
index 3c2bf59a78..54660ccc93 100644
--- a/cinn/auto_schedule/measure/simple_runner.cc
+++ b/cinn/auto_schedule/measure/simple_runner.cc
@@ -39,8 +39,6 @@ static const std::unordered_map<std::string, std::vector<int>> kInitWithZeroPara
     {"lookup_table", {1}},
     {"gather", {1}},
     {"gather_nd", {1}},
{"scatter", {1}}, - {"scatter_nd", {1}}, {"scatter_assign", {2}}, {"scatter_add", {2}}, };