From 445489879c497ff452649a5cf182c21a082f9d93 Mon Sep 17 00:00:00 2001 From: zzk0 Date: Tue, 23 May 2023 09:42:03 +0000 Subject: [PATCH 1/8] op unittest for scatter_add --- python/tests/ops/test_scatter_add.py | 342 ++++++++++++++++++--------- 1 file changed, 226 insertions(+), 116 deletions(-) diff --git a/python/tests/ops/test_scatter_add.py b/python/tests/ops/test_scatter_add.py index 8a0e1269f9..7be09c9792 100644 --- a/python/tests/ops/test_scatter_add.py +++ b/python/tests/ops/test_scatter_add.py @@ -14,30 +14,32 @@ # See the License for the specific language governing permissions and # limitations under the License. -import unittest -import numpy as np -from op_test import OpTest, OpTestTool import paddle -import paddle.nn.functional as F -import cinn from cinn.frontend import * from cinn.common import * +from op_test import OpTest, OpTestTool +from op_test_helper import TestCaseHelper @OpTestTool.skip_if(not is_compiled_with_cuda(), "x86 test will be skipped due to timeout.") class TestScatterAddOp(OpTest): def setUp(self): - self.init_case() - self.target = DefaultNVGPUTarget() - - def init_case(self): - self.axis = 0 + print(f"\nRunning {self.__class__.__name__}: {self.case}") + self.inputs = {} + self.prepare_inputs() + + def prepare_inputs(self): + x_shape = self.case["x_shape"] + y_shape = self.case["y_shape"] + dtype = self.case["dtype"] + axis = self.case["axis"] self.inputs = { - "x": np.random.random([10, 5]).astype("float32"), - "y": np.random.random([5, 5]).astype("float32"), - "index": np.array([0, 5, 0, 9, 0]).astype("int32") + "x": self.random(x_shape, dtype), + "y": self.random(y_shape, dtype), + "index": self.random([y_shape[axis]], "int32", 0, x_shape[0]) } + self.axis = axis def build_paddle_program(self, target): x = paddle.to_tensor(self.inputs["x"], stop_gradient=True) @@ -47,24 +49,32 @@ def build_paddle_program(self, target): if pos_axis < 0: pos_axis += len(x.shape) + index_nd = [] if pos_axis == 0: - index_nd = [] for i in range(len(self.inputs["index"])): index_nd.append([self.inputs["index"][i]]) elif pos_axis == 1: - index_nd = [] for i in range(self.inputs['x'].shape[0]): index_nd.append([]) for j in range(len(self.inputs["index"])): index_nd[i].append([i, self.inputs["index"][j]]) elif pos_axis == 2: - index_nd = [] for i in range(self.inputs['x'].shape[0]): index_nd.append([]) for j in range(self.inputs['x'].shape[1]): index_nd[i].append([]) for k in range(len(self.inputs["index"])): index_nd[i][j].append([i, j, self.inputs["index"][k]]) + elif pos_axis == 3: + for i in range(self.inputs['x'].shape[0]): + index_nd.append([]) + for j in range(self.inputs['x'].shape[1]): + index_nd[i].append([]) + for k in range(self.inputs['x'].shape[2]): + index_nd[i][j].append([]) + for l in range(len(self.inputs["index"])): + index_nd[i][j][k].append( + [i, j, k, self.inputs["index"][l]]) else: self.assertTrue(False, "Axis {} No Implement".format(pos_axis)) @@ -74,10 +84,15 @@ def build_paddle_program(self, target): def build_cinn_program(self, target): builder = NetBuilder("scatter_add") - x = builder.create_input(Float(32), self.inputs["x"].shape, "x") - y = builder.create_input(Float(32), self.inputs["y"].shape, "y") + x = builder.create_input( + self.nptype2cinntype(self.inputs["x"].dtype), + self.inputs["x"].shape, "x") + y = builder.create_input( + self.nptype2cinntype(self.inputs["y"].dtype), + self.inputs["y"].shape, "y") index = builder.create_input( - Int(32), self.inputs["index"].shape, "index") + self.nptype2cinntype(self.inputs["index"].dtype), + 
self.inputs["index"].shape, "index") out = builder.scatter_add(x, y, index, self.axis) prog = builder.build() @@ -88,60 +103,195 @@ def build_cinn_program(self, target): self.cinn_outputs = [res[0]] def test_check_results(self): - self.check_outputs_and_grads() - - -class TestScatterAddCase1(TestScatterAddOp): - def init_case(self): - self.inputs = { - "x": np.random.random([10, 5]).astype("float32"), - "y": np.random.random([10, 3]).astype("float32"), - "index": np.random.randint(0, 5, size=3).astype("int32") - } - self.axis = 1 - - -class TestScatterAddCase2(TestScatterAddOp): - def init_case(self): - self.inputs = { - "x": np.random.random([10, 5, 5]).astype("float32"), - "y": np.random.random([10, 5, 3]).astype("float32"), - "index": np.random.randint(0, 5, size=3).astype("int32") - } - self.axis = -1 - - -class TestScatterAddCase3(TestScatterAddOp): - def init_case(self): - self.inputs = { - "x": np.random.random([10, 5, 5]).astype("float32"), - "y": np.random.random([10, 3, 5]).astype("float32"), - "index": np.random.randint(0, 5, size=3).astype("int32") - } - self.axis = 1 - - -class TestScatterAddCase4(TestScatterAddOp): - def init_case(self): - self.inputs = { - "x": np.random.random([10]).astype("float32"), - "y": np.random.random([1]).astype("float32"), - "index": np.random.randint(0, 10, size=1).astype("int32") - } - self.axis = -1 - - -class TestScatterAddCase5(TestScatterAddOp): - def init_case(self): - self.inputs = { - "x": np.random.random([10, 5]).astype("float32"), - "y": np.random.random([3, 5]).astype("float32"), - "index": np.random.randint(0, 10, size=3).astype("int32") + if self.case["dtype"] == "float16": + self.check_outputs_and_grads( + max_relative_error=0.01, max_absolute_error=0.01) + else: + self.check_outputs_and_grads() + + +class TestScatterAddOpShapeTest(TestCaseHelper): + def init_attrs(self): + self.class_name = "TestScatterAddOpShapeTest" + self.cls = TestScatterAddOp + self.inputs = [{ + "x_shape": [10], + "y_shape": [5], + "axis": 0 + }, { + "x_shape": [10, 8], + "y_shape": [8, 8], + "axis": 0 + }, { + "x_shape": [10, 8, 16], + "y_shape": [10, 4, 16], + "axis": 1 + }, { + "x_shape": [10, 8, 16, 32], + "y_shape": [10, 8, 20, 32], + "axis": -2 + }, { + "x_shape": [10, 8, 16, 32], + "y_shape": [10, 8, 1, 32], + "axis": -2 + }, { + "x_shape": [10, 1, 16, 32], + "y_shape": [10, 1, 8, 32], + "axis": -2 + }, { + "x_shape": [1024, 8, 16, 4], + "y_shape": [512, 8, 16, 4], + "axis": 0 + }, { + "x_shape": [2048, 8, 16, 4], + "y_shape": [1024, 8, 16, 4], + "axis": 0 + }, { + "x_shape": [1024, 8, 16, 4], + "y_shape": [2048, 8, 16, 4], + "axis": 0 + }, { + "x_shape": [1, 1, 1, 1], + "y_shape": [1, 1, 1, 1], + "axis": 0 + }, { + "x_shape": [1], + "y_shape": [8], + "axis": 0 + }] + self.dtypes = [{"dtype": "float32"}] + self.attrs = [] + + +class TestScatterAddOpDtypeTest(TestCaseHelper): + def init_attrs(self): + self.class_name = "TestScatterAddOpDtypeTest" + self.cls = TestScatterAddOp + self.inputs = [{ + "x_shape": [10], + "y_shape": [5], + "axis": 0 + }, { + "x_shape": [10, 8], + "y_shape": [8, 8], + "axis": 0 + }, { + "x_shape": [1024, 8, 16, 4], + "y_shape": [512, 8, 16, 4], + "axis": 0 + }] + self.dtypes = [ + { + "dtype": "float16" + }, + { + "dtype": "float32" + }, + { + "dtype": "float64" + }, + { + "dtype": "int32" + }, + { + "dtype": "int64" + }, + ] + self.attrs = [] + + +class TestScatterAddOpAttributeAxis(TestCaseHelper): + def init_attrs(self): + self.class_name = "TestScatterAddOpAttributeAxis" + self.cls = TestScatterAddOp + self.inputs = [ + { + 
"x_shape": [10], + "y_shape": [5], + "axis": 0 + }, + { + "x_shape": [10, 8], + "y_shape": [8, 8], + "axis": -2 + }, + { + "x_shape": [10, 8, 16], + "y_shape": [5, 8, 16], + "axis": 0 + }, + { + "x_shape": [10, 8, 16], + "y_shape": [10, 4, 16], + "axis": 1 + }, + { + "x_shape": [10, 8, 16], + "y_shape": [10, 8, 8], + "axis": 2 + }, + { + "x_shape": [10, 8, 16, 32], + "y_shape": [2, 8, 16, 32], + "axis": 0 + }, + { + "x_shape": [10, 8, 16, 32], + "y_shape": [10, 8, 8, 32], + "axis": 2 + }, + { + "x_shape": [10, 8, 16, 32], + "y_shape": [10, 8, 16, 16], + "axis": 3 + }, + { + "x_shape": [10, 8, 16, 32], + "y_shape": [10, 8, 16, 8], + "axis": -1 + }, + { + "x_shape": [10, 8, 16, 32], + "y_shape": [10, 8, 4, 32], + "axis": -2 + }, + { + "x_shape": [10, 8, 16, 32], + "y_shape": [1, 8, 16, 32], + "axis": -4 + }, + # core dumped: cuda_module.cc:118] RAW: The error `CUDA_ERROR_LAUNCH_FAILED` occurs + # while compiling the ptx! And its message is `unspecified launch failure`. + # { + # "x_shape": [10, 8, 16, 32], + # "y_shape": [10, 4, 16, 32], + # "axis": 1 + # }, + # { + # "x_shape": [10, 8, 16, 32], + # "y_shape": [10, 2, 16, 32], + # "axis": -3 + # }, + ] + self.dtypes = [{"dtype": "float32"}] + self.attrs = [] + + +# test inline compute: https://github.com/PaddlePaddle/CINN/pull/1329 +class TestScatterAddCaseInline1(TestScatterAddOp): + def setUp(self): + self.case = { + "x_shape": [10, 5], + "y_shape": [5, 5], + "index_shape": [5], + "dtype": "float32", + "index_dtype": "int32", + "axis": 0 } - self.axis = 0 - + print(f"\nRunning {self.__class__.__name__}: {self.case}") + self.inputs = {} + self.prepare_inputs() -class TestScatterAddCase6(TestScatterAddOp): def build_cinn_program(self, target): builder = NetBuilder("scatter_add") x = builder.create_input(Float(64), self.inputs["x"].shape, "x") @@ -160,7 +310,7 @@ def build_cinn_program(self, target): self.cinn_outputs = [res[0]] -class TestScatterAddCase7(TestScatterAddOp): +class TestScatterAddCaseInline2(TestScatterAddCaseInline1): def build_cinn_program(self, target): builder = NetBuilder("scatter_add") x = builder.create_input(Float(32), self.inputs["x"].shape, "x") @@ -176,50 +326,10 @@ def build_cinn_program(self, target): self.inputs["index"] ], [out]) - self.cinn_outputs = [res[0]] - - -class TestScatterAddCase8(TestScatterAddCase7): - def init_case(self): - self.axis = 0 - self.inputs = { - "x": np.random.random([10, 5]).astype("float32"), - "y": np.random.random([10, 5]).astype("float32"), - "index": np.array([0, 5, 0, 9, 0, 1, 2, 3, 4, 5]).astype("int32") - } - - -class TestScatterAddOp9(TestScatterAddOp): - def setUp(self): - self.init_case() - self.target = DefaultNVGPUTarget() - - def init_case(self): - self.axis = 0 - self.inputs = { - "x": np.random.random([10, 5]).astype("float64"), - "y": np.random.random([5, 5]).astype("float64"), - "index": np.array([0, 5, 0, 9, 0]).astype("int32") - } - - def build_cinn_program(self, target): - builder = NetBuilder("scatter_add") - x = builder.create_input(Float(64), self.inputs["x"].shape, "x") - y = builder.create_input(Float(64), self.inputs["y"].shape, "y") - index = builder.create_input( - Int(32), self.inputs["index"].shape, "index") - out = builder.scatter_add(x, y, index, self.axis) - - prog = builder.build() - res = self.get_cinn_output( - prog, target, [x, y, index], - [self.inputs["x"], self.inputs["y"], self.inputs["index"]], [out]) - - self.cinn_outputs = [res[0]] - - def test_check_results(self): - self.check_outputs_and_grads() + self.cinn_outputs = res if __name__ == 
"__main__": - unittest.main() + TestScatterAddOpShapeTest().run() + TestScatterAddOpDtypeTest().run() + TestScatterAddOpAttributeAxis().run() From cfeff2ffd50d4964eea01f7cb5871fe0404fe067 Mon Sep 17 00:00:00 2001 From: zzk0 Date: Tue, 23 May 2023 11:02:33 +0000 Subject: [PATCH 2/8] add more dtype --- cinn/runtime/cuda/cinn_cuda_runtime_source.cuh | 4 ++++ cinn/runtime/cuda/cuda_intrinsics.cc | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh b/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh index 7d0b75e476..58c9fb4717 100644 --- a/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh +++ b/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh @@ -705,6 +705,10 @@ CINN_NVGPU_GT_NUM(int64, long long int) return res; \ } +CINN_NVGPU_INDEX_ADD(bool, bool) +CINN_NVGPU_INDEX_ADD(int8, int8_t) +CINN_NVGPU_INDEX_ADD(int32, int32_t) +CINN_NVGPU_INDEX_ADD(int64, int64_t) CINN_NVGPU_INDEX_ADD(fp32, float) CINN_NVGPU_INDEX_ADD(fp64, double) #ifdef CINN_CUDA_FP16 diff --git a/cinn/runtime/cuda/cuda_intrinsics.cc b/cinn/runtime/cuda/cuda_intrinsics.cc index bdb1005ebd..cfdf91cfc0 100644 --- a/cinn/runtime/cuda/cuda_intrinsics.cc +++ b/cinn/runtime/cuda/cuda_intrinsics.cc @@ -346,6 +346,10 @@ CINN_REGISTER_HELPER(cuda_intrinsics) { .AddInputType() \ .End(); + _REGISTER_CINN_NVGPU_INDEX_ADD(bool, bool); + _REGISTER_CINN_NVGPU_INDEX_ADD(int8, int8_t); + _REGISTER_CINN_NVGPU_INDEX_ADD(int32, int32_t); + _REGISTER_CINN_NVGPU_INDEX_ADD(int64, int64_t); _REGISTER_CINN_NVGPU_INDEX_ADD(fp32, float); _REGISTER_CINN_NVGPU_INDEX_ADD(fp64, double); From 5f6b5b049703ae5486519d3d3febecf7cc47b529 Mon Sep 17 00:00:00 2001 From: zzk0 Date: Wed, 24 May 2023 09:21:00 +0000 Subject: [PATCH 3/8] add a.out --- a.out | 144 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 a.out diff --git a/a.out b/a.out new file mode 100644 index 0000000000..08f58f1227 --- /dev/null +++ b/a.out @@ -0,0 +1,144 @@ +#include + +#define CINN_WITH_CUDA +#include "bfloat16.h" +#include "float16.h" +using cinn::common::bfloat16; +using cinn::common::float16; +// using cinn::common::half4; +// using cinn::common::half8; +// using cinn::common::float8; + +#include "cinn_cuda_runtime_source.cuh" + +extern "C" { + +__global__ +void __launch_bounds__(1024) fn_scatter_add_0_35_kernel(const float* __restrict__ x, const float* __restrict__ y, const int32_t* __restrict__ index, float* __restrict__ var_1658) +{ + if (((int)blockIdx.x < 40)) { + if (((int)threadIdx.x < 1024)) { + var_1658[((1024 * (int)blockIdx.x) + (int)threadIdx.x)] = cinn_nvgpu_index_add_fp32(x[((1024 * (int)blockIdx.x) + (int)threadIdx.x)], ((int)blockIdx.x / 4), y, ((512 * ((int)threadIdx.x / 512)) + (((((int)threadIdx.x & 511) / 32) * 32) + ((1024 * ((int)blockIdx.x & 3)) + ((int)threadIdx.x & 31)))), 4096, index, 1); + }; + }; +} + +} +// +// Generated by NVIDIA NVVM Compiler +// +// Compiler Build ID: CL-29558016 +// Cuda compilation tools, release 11.2, V11.2.142 +// Based on NVVM 7.0.1 +// + +.version 7.2 +.target sm_70 +.address_size 64 + + // .globl fn_scatter_add_0_35_kernel +.global .align 4 .u32 _ZN4cinn6common7float165shiftE; +.global .align 4 .u32 _ZN4cinn6common7float169shiftSignE; +.global .align 4 .u32 _ZN4cinn6common7float164infNE; +.global .align 4 .u32 _ZN4cinn6common7float164maxNE; +.global .align 4 .u32 _ZN4cinn6common7float164minNE; +.global .align 4 .u32 _ZN4cinn6common7float164sigNE; +.global .align 4 .u32 _ZN4cinn6common7float164infCE; +.global .align 4 .u32 
_ZN4cinn6common7float164nanNE; +.global .align 4 .u32 _ZN4cinn6common7float164maxCE; +.global .align 4 .u32 _ZN4cinn6common7float164minCE; +.global .align 4 .u32 _ZN4cinn6common7float164sigCE; +.global .align 4 .u32 _ZN4cinn6common7float164mulNE; +.global .align 4 .u32 _ZN4cinn6common7float164mulCE; +.global .align 4 .u32 _ZN4cinn6common7float164subCE; +.global .align 4 .u32 _ZN4cinn6common7float164norCE; +.global .align 4 .u32 _ZN4cinn6common7float164maxDE; +.global .align 4 .u32 _ZN4cinn6common7float164minDE; + +.visible .entry fn_scatter_add_0_35_kernel( + .param .u64 fn_scatter_add_0_35_kernel_param_0, + .param .u64 fn_scatter_add_0_35_kernel_param_1, + .param .u64 fn_scatter_add_0_35_kernel_param_2, + .param .u64 fn_scatter_add_0_35_kernel_param_3 +) +.maxntid 1024, 1, 1 +{ + .reg .pred %p<8>; + .reg .f32 %f<8>; + .reg .b32 %r<29>; + .reg .b64 %rd<17>; + + + ld.param.u64 %rd3, [fn_scatter_add_0_35_kernel_param_0]; + ld.param.u64 %rd4, [fn_scatter_add_0_35_kernel_param_1]; + ld.param.u64 %rd5, [fn_scatter_add_0_35_kernel_param_2]; + ld.param.u64 %rd6, [fn_scatter_add_0_35_kernel_param_3]; + mov.u32 %r1, %ctaid.x; + setp.gt.s32 %p1, %r1, 39; + mov.u32 %r2, %tid.x; + setp.gt.s32 %p2, %r2, 1023; + or.pred %p3, %p1, %p2; + @%p3 bra LBB0_9; + + cvta.to.global.u64 %rd7, %rd3; + shl.b32 %r11, %r1, 10; + add.s32 %r3, %r11, %r2; + mul.wide.s32 %rd8, %r3, 4; + add.s64 %rd9, %rd7, %rd8; + ld.global.nc.f32 %f6, [%rd9]; + shr.s32 %r12, %r1, 31; + shr.u32 %r13, %r12, 30; + add.s32 %r14, %r1, %r13; + shr.s32 %r4, %r14, 2; + shr.s32 %r15, %r2, 31; + shr.u32 %r16, %r15, 23; + add.s32 %r17, %r2, %r16; + and.b32 %r18, %r17, -512; + and.b32 %r19, %r1, 3; + and.b32 %r20, %r2, 511; + bfi.b32 %r21, %r19, %r20, 10, 2; + add.s32 %r5, %r21, %r18; + cvta.to.global.u64 %rd1, %rd5; + cvta.to.global.u64 %rd2, %rd4; + mov.u32 %r26, -1; + +LBB0_2: + mov.u32 %r27, %r26; + +LBB0_3: + add.s32 %r8, %r27, 1; + setp.gt.s32 %p4, %r27, -1; + mov.u32 %r26, -1; + @%p4 bra LBB0_5; + + mul.wide.s32 %rd10, %r27, 4; + add.s64 %rd11, %rd1, %rd10; + ld.global.nc.u32 %r23, [%rd11+4]; + setp.ne.s32 %p5, %r23, %r4; + mov.u32 %r27, %r8; + mov.u32 %r26, %r8; + @%p5 bra LBB0_3; + +LBB0_5: + setp.lt.s32 %p6, %r26, 0; + @%p6 bra LBB0_7; + + shl.b32 %r24, %r26, 12; + add.s32 %r25, %r5, %r24; + mul.wide.s32 %rd12, %r25, 4; + add.s64 %rd13, %rd2, %rd12; + ld.global.nc.f32 %f5, [%rd13]; + add.f32 %f6, %f6, %f5; + +LBB0_7: + setp.ne.s32 %p7, %r26, -1; + @%p7 bra LBB0_2; + + cvta.to.global.u64 %rd14, %rd6; + add.s64 %rd16, %rd14, %rd8; + st.global.f32 [%rd16], %f6; + +LBB0_9: + ret; + +} \ No newline at end of file From c1e76f2830205a877cb9c491358a4be4c1176567 Mon Sep 17 00:00:00 2001 From: zzk0 Date: Fri, 26 May 2023 08:38:59 +0000 Subject: [PATCH 4/8] [WIP] scatter --- cinn/hlir/op/contrib/scatter.cc | 103 ++++--------------------- cinn/pybind/frontend.cc | 32 ++++++++ python/tests/ops/test_scatter_op.py | 114 ++++++++++++++++++++++++++++ 3 files changed, 160 insertions(+), 89 deletions(-) mode change 100755 => 100644 cinn/hlir/op/contrib/scatter.cc create mode 100644 python/tests/ops/test_scatter_op.py diff --git a/cinn/hlir/op/contrib/scatter.cc b/cinn/hlir/op/contrib/scatter.cc old mode 100755 new mode 100644 index 4ad38f19eb..cf2c52136e --- a/cinn/hlir/op/contrib/scatter.cc +++ b/cinn/hlir/op/contrib/scatter.cc @@ -28,6 +28,7 @@ #include "cinn/hlir/framework/node.h" #include "cinn/hlir/framework/op.h" #include "cinn/hlir/framework/op_strategy.h" +#include "cinn/hlir/op/op_util.h" #include "cinn/hlir/pe/elementwise.h" #include 
"cinn/hlir/pe/ir_schedule_pe.h" #include "cinn/hlir/pe/nn.h" @@ -82,7 +83,7 @@ ir::Tensor Scatter(const ir::Tensor &A, } } new_axes.push_back(pos_axis); - transpose_B = pe::Transpose(B, new_axes, B->name + "_index_transpose"); + transpose_B = pe::Transpose(B, new_axes, B->name + "_transpose"); } auto res = Compute( C->shape, @@ -176,8 +177,8 @@ std::shared_ptr StrategyForScatter(const framework::NodeA const Target &target) { auto attr_store = attrs.attr_store; CHECK(attr_store.count("axis")) << "find no attr of axis"; - int axis = absl::get(attr_store.at("axis")); - std::string op_name("scatter"); + const int axis = absl::get(attr_store.at("axis")); + const std::string op_name("scatter"); framework::CINNCompute scatter_compute([=](lang::Args args, lang::RetValue *ret) { CHECK(!args.empty()) << "The input arguments of " << op_name << " compute is empty! Please check.\n"; @@ -186,62 +187,20 @@ std::shared_ptr StrategyForScatter(const framework::NodeA Expr A = pack_args[0]; Expr B = pack_args[1]; Expr C = pack_args[2]; - CHECK(A.as_tensor()); - CHECK(B.as_tensor()); - CHECK(C.as_tensor()); - CHECK(!output_shapes.empty()); - auto tensor_A = A.as_tensor_ref(); - auto tensor_B = B.as_tensor_ref(); - auto tensor_C = C.as_tensor_ref(); - auto stages = CreateStages({tensor_A, tensor_B, tensor_C}); - VLOG(3) << "A shape: " << utils::Join(tensor_A->shape, ", ") << ", B shape: " << utils::Join(tensor_B->shape, ", ") - << ", output_shapes: " << utils::Join(output_shapes[0], ", "); std::string tensor_name = UniqName("Scatter_out"); if (FLAGS_cinn_ir_schedule) { CHECK_EQ(pack_args.size(), 4U); tensor_name = pack_args[3].operator std::string(); + VLOG(4) << A.as_tensor_ref()->name << " " << B.as_tensor_ref()->name << " " << C.as_tensor_ref()->name << " " << tensor_name; + tensor_name = "test_scatter_out"; } - ir::Tensor out = Scatter(tensor_A, tensor_B, tensor_C, target, axis, tensor_name); - std::vector res; - stages->InsertLazily(out); - res.push_back(CINNValue(out)); - CHECK(!out_type.empty()) << "Output type of " << op_name << " is empty! Please check.\n"; - res.push_back(CINNValue(stages)); - *ret = CINNValuePack{res}; - }); - - framework::CINNSchedule scatter_schedule([=](lang::Args args, lang::RetValue *ret) { - if (FLAGS_cinn_ir_schedule) { - CHECK(!args.empty()) << "The input argument of scatter_schedule is empty! Please check.\n"; - common::CINNValuePack arg_pack = args[0]; - std::vector vec_ast; - for (int i = 0; i < arg_pack.size(); i++) { - if (arg_pack[i].is_expr()) { - Expr temp = arg_pack[i]; - vec_ast.emplace_back(temp); - } - } - CHECK(!vec_ast.empty()); - ir::ModuleExpr mod_expr(vec_ast); - ir::IRSchedule ir_sch(mod_expr); - ir_sch.MergeExprs(); - long prod_size = std::accumulate(output_shapes[0].begin(), output_shapes[0].end(), 1, std::multiplies()); - if (prod_size > 1) { - pe::IRInjectiveSchedule(ir_sch, output_shapes.front(), target); - } - std::vector res{common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; - *ret = common::CINNValuePack{res}; - } else { - CHECK(!args.empty()) << "The input argument of scatter_schedule is empty! 
Please check.\n"; - CINNValuePack arg_pack = args[0]; - Expr out = arg_pack[0]; - CHECK(out.as_tensor()); - *ret = arg_pack; - } + ir::Tensor out = Scatter(A.as_tensor_ref(), B.as_tensor_ref(), C.as_tensor_ref(), target, axis, tensor_name); + auto stages = CreateStages({out}); + *ret = CINNValuePack{{CINNValue(out), CINNValue(stages)}}; }); auto strategy = std::make_shared(); - strategy->AddImpl(scatter_compute, scatter_schedule, "strategy.scatter.x86", 1); + strategy->AddImpl(scatter_compute, GetInjectiveScheduleFunc(output_shapes, target), "strategy.scatter.x86", 1); return strategy; } @@ -269,7 +228,6 @@ std::shared_ptr StrategyForScatterNd(const framework::Nod auto tensor_A = A.as_tensor_ref(); auto tensor_B = B.as_tensor_ref(); auto tensor_C = C.as_tensor_ref(); - auto stages = CreateStages({tensor_A, tensor_B, tensor_C}); VLOG(3) << "A shape: " << utils::Join(tensor_A->shape, ", ") << ", B shape: " << utils::Join(tensor_B->shape, ", ") << ", output_shapes: " << utils::Join(output_shapes[0], ", "); std::string tensor_name = UniqName("ScatterNd_out"); @@ -278,46 +236,12 @@ std::shared_ptr StrategyForScatterNd(const framework::Nod tensor_name = pack_args[3].operator std::string(); } ir::Tensor out = ScatterNd(tensor_A, tensor_B, tensor_C, target, axes, tensor_name); - std::vector res; - stages->InsertLazily(out); - res.push_back(CINNValue(out)); - CHECK(!out_type.empty()) << "Output type of " << op_name << " is empty! Please check.\n"; - res.push_back(CINNValue(stages)); - *ret = CINNValuePack{res}; - }); - - framework::CINNSchedule scatter_nd_schedule([=](lang::Args args, lang::RetValue *ret) { - if (FLAGS_cinn_ir_schedule) { - CHECK(!args.empty()) << "The input argument of scatter_nd_schedule is empty! Please check.\n"; - common::CINNValuePack arg_pack = args[0]; - std::vector vec_ast; - for (int i = 0; i < arg_pack.size(); i++) { - if (arg_pack[i].is_expr()) { - Expr temp = arg_pack[i]; - vec_ast.emplace_back(temp); - } - } - CHECK(!vec_ast.empty()); - ir::ModuleExpr mod_expr(vec_ast); - ir::IRSchedule ir_sch(mod_expr); - ir_sch.MergeExprs(); - long prod_size = std::accumulate(output_shapes[0].begin(), output_shapes[0].end(), 1, std::multiplies()); - if (prod_size > 1) { - pe::IRInjectiveSchedule(ir_sch, output_shapes.front(), target); - } - std::vector res{common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; - *ret = common::CINNValuePack{res}; - } else { - CHECK(!args.empty()) << "The input argument of scatter_nd_schedule is empty! 
Please check.\n"; - CINNValuePack arg_pack = args[0]; - Expr out = arg_pack[0]; - CHECK(out.as_tensor()); - *ret = arg_pack; - } + auto stages = CreateStages({out}); + *ret = CINNValuePack{{CINNValue(out), CINNValue(stages)}}; }); auto strategy = std::make_shared(); - strategy->AddImpl(scatter_nd_compute, scatter_nd_schedule, "strategy.scatter_nd.x86", 1); + strategy->AddImpl(scatter_nd_compute, GetInjectiveScheduleFunc(output_shapes, target), "strategy.scatter_nd.x86", 1); return strategy; } @@ -357,6 +281,7 @@ CINN_REGISTER_HELPER(scatter_ops) { .set_attr("CINNStrategy", cinn::hlir::op::StrategyForScatterNd) .set_attr("infershape", MakeOpFunction(cinn::hlir::op::InferShapeForScatter)) .set_attr("inferdtype", MakeOpFunction(cinn::hlir::op::InferDtypeForScatter)) + .set_attr("OpPattern", cinn::hlir::framework::OpPatternKind::kInjective) .set_support_level(4); return true; diff --git a/cinn/pybind/frontend.cc b/cinn/pybind/frontend.cc index 1cb719538e..2800302375 100644 --- a/cinn/pybind/frontend.cc +++ b/cinn/pybind/frontend.cc @@ -530,6 +530,38 @@ void BindFrontend(pybind11::module *m) { py::arg("updates"), py::arg("index"), py::arg("axis") = 0) + .def("scatter", + static_cast( + &NetBuilder::Scatter), + py::arg("src"), + py::arg("index"), + py::arg("out"), + py::arg("axis")) + .def("scatter", + static_cast &, const float &, const int &)>( + &NetBuilder::Scatter), + py::arg("src"), + py::arg("index"), + py::arg("shape"), + py::arg("default_value") = 0, + py::arg("axis") = 0) + .def("scatter_nd", + static_cast &)>(&NetBuilder::ScatterNd), + py::arg("src"), + py::arg("index"), + py::arg("out"), + py::arg("axis")) + .def("scatter_nd", + static_cast &, const float &, const std::vector &)>( + &NetBuilder::ScatterNd), + py::arg("src"), + py::arg("index"), + py::arg("shape"), + py::arg("default_value") = 0, + py::arg("axis") = 0) .def("isclose", &NetBuilder::IsClose, py::arg("x"), diff --git a/python/tests/ops/test_scatter_op.py b/python/tests/ops/test_scatter_op.py new file mode 100644 index 0000000000..9ead260dcc --- /dev/null +++ b/python/tests/ops/test_scatter_op.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2021 CINN Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +from op_test import OpTest, OpTestTool +import paddle +from cinn.frontend import * +from cinn.common import * + + +@OpTestTool.skip_if(not is_compiled_with_cuda(), + "x86 test will be skipped due to timeout.") +class TestScatterOp(OpTest): + def setUp(self): + self.init_case() + self.target = DefaultNVGPUTarget() + + def init_case(self): + self.axis = 0 + self.inputs = { + "x": np.random.random([10, 5]).astype("float32"), + "y": np.random.random([5, 5]).astype("float32"), + "index": np.array([0, 5, 0, 9, 0]).astype("int32"), + "src": self.random([2, 5], "float32"), + "out": self.random([3, 5], "float32"), + "index0": np.array([[0, 1, 2, 0, 0], [2, 0, 0, 1, 2]]).astype("int32"), + } + + def build_paddle_program(self, target): + x = paddle.to_tensor(self.inputs["x"], stop_gradient=True) + y = paddle.to_tensor(self.inputs["y"], stop_gradient=True) + + pos_axis = self.axis + if pos_axis < 0: + pos_axis += len(x.shape) + + if pos_axis == 0: + index_nd = [] + for i in range(len(self.inputs["index"])): + index_nd.append([self.inputs["index"][i]]) + elif pos_axis == 1: + index_nd = [] + for i in range(self.inputs['x'].shape[0]): + index_nd.append([]) + for j in range(len(self.inputs["index"])): + index_nd[i].append([i, self.inputs["index"][j]]) + elif pos_axis == 2: + index_nd = [] + for i in range(self.inputs['x'].shape[0]): + index_nd.append([]) + for j in range(self.inputs['x'].shape[1]): + index_nd[i].append([]) + for k in range(len(self.inputs["index"])): + index_nd[i][j].append([i, j, self.inputs["index"][k]]) + else: + self.assertTrue(False, "Axis {} No Implement".format(pos_axis)) + + index = paddle.to_tensor(index_nd, stop_gradient=True) + res = paddle.scatter_nd_add(x, index, y) + self.paddle_outputs = [res] + + def build_cinn_program(self, target): + builder = NetBuilder("scatter") + src = builder.create_input(OpTest.nptype2cinntype(self.inputs["src"].dtype), self.inputs["src"].shape, "src") + index = builder.create_input(OpTest.nptype2cinntype(self.inputs["index0"].dtype), self.inputs["index0"].shape, "index0") + out = builder.create_input(OpTest.nptype2cinntype(self.inputs["out"].dtype), self.inputs["out"].shape, "out") + out1 = builder.scatter(src, index, out, 0) + + print(self.inputs["src"], self.inputs["index0"], self.inputs["out"]) + + prog = builder.build() + res = self.get_cinn_output( + prog, target, [src, index, out], + [self.inputs["src"], self.inputs["index0"], self.inputs["out"]], [out1]) + + self.cinn_outputs = res + + def test_check_results(self): + self.check_outputs_and_grads() + + +# class TestScatterOp1(TestScatterOp): +# def setUp(self): +# self.init_case() +# self.target = DefaultNVGPUTarget() + +# def init_case(self): +# self.axis = 0 +# self.inputs = { +# "x": np.random.random([10, 5]).astype("int32"), +# "y": np.random.random([5, 5]).astype("int32"), +# "index": np.array([0, 5, 0, 9, 0]).astype("int32") +# } + +# def test_check_results(self): +# self.check_outputs_and_grads() + + +if __name__ == "__main__": + unittest.main() From ad8c6065c5eb60b043f68c0262c66668abd85784 Mon Sep 17 00:00:00 2001 From: zzk0 Date: Wed, 31 May 2023 01:38:12 +0000 Subject: [PATCH 5/8] op unittest for test_scatter_add --- python/tests/ops/test_scatter_add.py | 162 +++++++++++++++------------ 1 file changed, 89 insertions(+), 73 deletions(-) diff --git a/python/tests/ops/test_scatter_add.py b/python/tests/ops/test_scatter_add.py index 7be09c9792..a2879c521e 100644 --- a/python/tests/ops/test_scatter_add.py +++ 
b/python/tests/ops/test_scatter_add.py @@ -18,7 +18,7 @@ from cinn.frontend import * from cinn.common import * from op_test import OpTest, OpTestTool -from op_test_helper import TestCaseHelper +from op_test_helper import TestCaseHelper, run_test @OpTestTool.skip_if(not is_compiled_with_cuda(), @@ -37,7 +37,7 @@ def prepare_inputs(self): self.inputs = { "x": self.random(x_shape, dtype), "y": self.random(y_shape, dtype), - "index": self.random([y_shape[axis]], "int32", 0, x_shape[0]) + "index": self.random([y_shape[axis]], "int32", 0, x_shape[axis]) } self.axis = axis @@ -100,7 +100,7 @@ def build_cinn_program(self, target): prog, target, [x, y, index], [self.inputs["x"], self.inputs["y"], self.inputs["index"]], [out]) - self.cinn_outputs = [res[0]] + self.cinn_outputs = res def test_check_results(self): if self.case["dtype"] == "float16": @@ -114,51 +114,63 @@ class TestScatterAddOpShapeTest(TestCaseHelper): def init_attrs(self): self.class_name = "TestScatterAddOpShapeTest" self.cls = TestScatterAddOp - self.inputs = [{ - "x_shape": [10], - "y_shape": [5], - "axis": 0 - }, { - "x_shape": [10, 8], - "y_shape": [8, 8], - "axis": 0 - }, { - "x_shape": [10, 8, 16], - "y_shape": [10, 4, 16], - "axis": 1 - }, { - "x_shape": [10, 8, 16, 32], - "y_shape": [10, 8, 20, 32], - "axis": -2 - }, { - "x_shape": [10, 8, 16, 32], - "y_shape": [10, 8, 1, 32], - "axis": -2 - }, { - "x_shape": [10, 1, 16, 32], - "y_shape": [10, 1, 8, 32], - "axis": -2 - }, { - "x_shape": [1024, 8, 16, 4], - "y_shape": [512, 8, 16, 4], - "axis": 0 - }, { - "x_shape": [2048, 8, 16, 4], - "y_shape": [1024, 8, 16, 4], - "axis": 0 - }, { - "x_shape": [1024, 8, 16, 4], - "y_shape": [2048, 8, 16, 4], - "axis": 0 - }, { - "x_shape": [1, 1, 1, 1], - "y_shape": [1, 1, 1, 1], - "axis": 0 - }, { - "x_shape": [1], - "y_shape": [8], - "axis": 0 - }] + self.inputs = [ + { + "x_shape": [10], + "y_shape": [5], + "axis": 0 + }, + { + "x_shape": [10, 8], + "y_shape": [8, 8], + "axis": 0 + }, + { + "x_shape": [10, 8, 16], + "y_shape": [10, 4, 16], + "axis": 1 + }, + { + "x_shape": [10, 8, 16, 32], + "y_shape": [10, 8, 20, 32], + "axis": -2 + }, + { + "x_shape": [10, 8, 16, 32], + "y_shape": [10, 8, 1, 32], + "axis": -2 + }, + { + "x_shape": [10, 1, 16, 32], + "y_shape": [10, 1, 8, 32], + "axis": -2 + }, + { + "x_shape": [1024, 8, 16, 4], + "y_shape": [512, 8, 16, 4], + "axis": 0 + }, + { + "x_shape": [2048, 8, 16, 4], + "y_shape": [1024, 8, 16, 4], + "axis": 0 + }, + { + "x_shape": [1024, 8, 16, 4], + "y_shape": [2048, 8, 16, 4], + "axis": 0 + }, + { + "x_shape": [1, 1, 1, 1], + "y_shape": [1, 1, 1, 1], + "axis": 0 + }, + { + "x_shape": [1], + "y_shape": [8], + "axis": 0 + }, + ] self.dtypes = [{"dtype": "float32"}] self.attrs = [] @@ -167,19 +179,23 @@ class TestScatterAddOpDtypeTest(TestCaseHelper): def init_attrs(self): self.class_name = "TestScatterAddOpDtypeTest" self.cls = TestScatterAddOp - self.inputs = [{ - "x_shape": [10], - "y_shape": [5], - "axis": 0 - }, { - "x_shape": [10, 8], - "y_shape": [8, 8], - "axis": 0 - }, { - "x_shape": [1024, 8, 16, 4], - "y_shape": [512, 8, 16, 4], - "axis": 0 - }] + self.inputs = [ + { + "x_shape": [10], + "y_shape": [5], + "axis": 0 + }, + { + "x_shape": [10, 8], + "y_shape": [8, 8], + "axis": 0 + }, + { + "x_shape": [1024, 8, 16, 4], + "y_shape": [512, 8, 16, 4], + "axis": 0 + }, + ] self.dtypes = [ { "dtype": "float16" @@ -260,18 +276,16 @@ def init_attrs(self): "y_shape": [1, 8, 16, 32], "axis": -4 }, - # core dumped: cuda_module.cc:118] RAW: The error `CUDA_ERROR_LAUNCH_FAILED` occurs - # while 
compiling the ptx! And its message is `unspecified launch failure`. - # { - # "x_shape": [10, 8, 16, 32], - # "y_shape": [10, 4, 16, 32], - # "axis": 1 - # }, - # { - # "x_shape": [10, 8, 16, 32], - # "y_shape": [10, 2, 16, 32], - # "axis": -3 - # }, + { + "x_shape": [10, 8, 16, 32], + "y_shape": [10, 4, 16, 32], + "axis": 1 + }, + { + "x_shape": [10, 8, 16, 32], + "y_shape": [10, 2, 16, 32], + "axis": -3 + }, ] self.dtypes = [{"dtype": "float32"}] self.attrs = [] @@ -333,3 +347,5 @@ def build_cinn_program(self, target): TestScatterAddOpShapeTest().run() TestScatterAddOpDtypeTest().run() TestScatterAddOpAttributeAxis().run() + run_test(TestScatterAddCaseInline1) + run_test(TestScatterAddCaseInline2) From 9cf811eacad371bdc12b0f00b4b82b404a051f23 Mon Sep 17 00:00:00 2001 From: zzk0 Date: Thu, 1 Jun 2023 00:49:09 +0000 Subject: [PATCH 6/8] add temporary files --- cinn/hlir/op/contrib/scatter.cc | 44 +++++++++-------- python/tests/ops/test_scatter_op.py | 75 ++++++++++++++++------------- 2 files changed, 67 insertions(+), 52 deletions(-) diff --git a/cinn/hlir/op/contrib/scatter.cc b/cinn/hlir/op/contrib/scatter.cc index cf2c52136e..7733a5f92d 100644 --- a/cinn/hlir/op/contrib/scatter.cc +++ b/cinn/hlir/op/contrib/scatter.cc @@ -72,19 +72,19 @@ ir::Tensor Scatter(const ir::Tensor &A, pos_axis += C->shape.size(); } - ir::Tensor transpose_B; - if (pos_axis == A->shape.size() - 1) { - transpose_B = B; - } else { - std::vector new_axes; - for (int i = 0; i < A->shape.size(); ++i) { - if (i != pos_axis) { - new_axes.push_back(i); - } - } - new_axes.push_back(pos_axis); - transpose_B = pe::Transpose(B, new_axes, B->name + "_transpose"); - } + ir::Tensor transpose_B = B; + // if (pos_axis == A->shape.size() - 1) { + // transpose_B = B; + // } else { + // std::vector new_axes; + // for (int i = 0; i < A->shape.size(); ++i) { + // if (i != pos_axis) { + // new_axes.push_back(i); + // } + // } + // new_axes.push_back(pos_axis); + // transpose_B = pe::Transpose(B, new_axes, B->name + "_transpose"); + // } auto res = Compute( C->shape, [=](const std::vector &indices) { @@ -184,19 +184,25 @@ std::shared_ptr StrategyForScatter(const framework::NodeA CHECK(!args.empty()) << "The input arguments of " << op_name << " compute is empty! Please check.\n"; CINNValuePack pack_args = args[0]; CHECK_GE(pack_args.size(), 3U) << "3 input tensors for " << op_name << " compute\n"; - Expr A = pack_args[0]; - Expr B = pack_args[1]; - Expr C = pack_args[2]; + Expr A = pack_args[0]; + Expr B = pack_args[1]; + Expr C = pack_args[2]; std::string tensor_name = UniqName("Scatter_out"); + auto stages = CreateStages({A.as_tensor_ref(), B.as_tensor_ref(), C.as_tensor_ref()}); if (FLAGS_cinn_ir_schedule) { CHECK_EQ(pack_args.size(), 4U); tensor_name = pack_args[3].operator std::string(); - VLOG(4) << A.as_tensor_ref()->name << " " << B.as_tensor_ref()->name << " " << C.as_tensor_ref()->name << " " << tensor_name; + VLOG(4) << A.as_tensor_ref()->name << " " << B.as_tensor_ref()->name << " " << C.as_tensor_ref()->name << " " + << tensor_name; tensor_name = "test_scatter_out"; } ir::Tensor out = Scatter(A.as_tensor_ref(), B.as_tensor_ref(), C.as_tensor_ref(), target, axis, tensor_name); - auto stages = CreateStages({out}); - *ret = CINNValuePack{{CINNValue(out), CINNValue(stages)}}; + std::vector res; + stages->InsertLazily(out); + res.push_back(CINNValue(out)); + CHECK(!out_type.empty()) << "Output type of Scatter is empty! 
Please check.\n"; + res.push_back(CINNValue(stages)); + *ret = CINNValuePack{res}; }); auto strategy = std::make_shared(); diff --git a/python/tests/ops/test_scatter_op.py b/python/tests/ops/test_scatter_op.py index 9ead260dcc..31a3530f2f 100644 --- a/python/tests/ops/test_scatter_op.py +++ b/python/tests/ops/test_scatter_op.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import unittest import numpy as np from op_test import OpTest, OpTestTool @@ -30,7 +31,7 @@ def setUp(self): self.target = DefaultNVGPUTarget() def init_case(self): - self.axis = 0 + self.axis = -1 self.inputs = { "x": np.random.random([10, 5]).astype("float32"), "y": np.random.random([5, 5]).astype("float32"), @@ -39,55 +40,64 @@ def init_case(self): "out": self.random([3, 5], "float32"), "index0": np.array([[0, 1, 2, 0, 0], [2, 0, 0, 1, 2]]).astype("int32"), } + print(self.inputs["src"], '\n', self.inputs["index0"], '\n', self.inputs["out"]) def build_paddle_program(self, target): - x = paddle.to_tensor(self.inputs["x"], stop_gradient=True) - y = paddle.to_tensor(self.inputs["y"], stop_gradient=True) + # x = paddle.to_tensor(self.inputs["src"], stop_gradient=True) + # y = paddle.to_tensor(self.inputs["out"], stop_gradient=True) + # index = paddle.to_tensor(self.inputs["index0"], stop_gradient=True) + x = self.inputs["src"] + y = self.inputs["out"] + index = self.inputs["index0"] pos_axis = self.axis if pos_axis < 0: pos_axis += len(x.shape) - if pos_axis == 0: - index_nd = [] - for i in range(len(self.inputs["index"])): - index_nd.append([self.inputs["index"][i]]) - elif pos_axis == 1: - index_nd = [] - for i in range(self.inputs['x'].shape[0]): - index_nd.append([]) - for j in range(len(self.inputs["index"])): - index_nd[i].append([i, self.inputs["index"][j]]) - elif pos_axis == 2: - index_nd = [] - for i in range(self.inputs['x'].shape[0]): - index_nd.append([]) - for j in range(self.inputs['x'].shape[1]): - index_nd[i].append([]) - for k in range(len(self.inputs["index"])): - index_nd[i][j].append([i, j, self.inputs["index"][k]]) + dim = len(x.shape) + res = copy.deepcopy(y) + if dim == 1: + pass + elif dim == 2: + if pos_axis == 0: + for i in range(x.shape[0]): + for j in range(x.shape[1]): + res[index[i, j], j] = x[i, j] + elif pos_axis == 1: + for i in range(x.shape[0]): + for j in range(x.shape[1]): + res[i, index[i, j]] = x[i, j] + elif dim == 3: + pass + elif dim == 4: + pass else: - self.assertTrue(False, "Axis {} No Implement".format(pos_axis)) - - index = paddle.to_tensor(index_nd, stop_gradient=True) - res = paddle.scatter_nd_add(x, index, y) - self.paddle_outputs = [res] + raise NotImplementedError + + self.paddle_outputs = [paddle.to_tensor(res)] + print("paddle res:\n", res) def build_cinn_program(self, target): builder = NetBuilder("scatter") - src = builder.create_input(OpTest.nptype2cinntype(self.inputs["src"].dtype), self.inputs["src"].shape, "src") - index = builder.create_input(OpTest.nptype2cinntype(self.inputs["index0"].dtype), self.inputs["index0"].shape, "index0") - out = builder.create_input(OpTest.nptype2cinntype(self.inputs["out"].dtype), self.inputs["out"].shape, "out") + src = builder.create_input( + OpTest.nptype2cinntype(self.inputs["src"].dtype), + self.inputs["src"].shape, "src") + index = builder.create_input( + OpTest.nptype2cinntype(self.inputs["index0"].dtype), + self.inputs["index0"].shape, "index0") + out = builder.create_input( + OpTest.nptype2cinntype(self.inputs["out"].dtype), + 
self.inputs["out"].shape, "out") out1 = builder.scatter(src, index, out, 0) - print(self.inputs["src"], self.inputs["index0"], self.inputs["out"]) - prog = builder.build() res = self.get_cinn_output( prog, target, [src, index, out], - [self.inputs["src"], self.inputs["index0"], self.inputs["out"]], [out1]) + [self.inputs["src"], self.inputs["index0"], self.inputs["out"]], + [out1]) self.cinn_outputs = res + print("cinn res:\n", res) def test_check_results(self): self.check_outputs_and_grads() @@ -109,6 +119,5 @@ def test_check_results(self): # def test_check_results(self): # self.check_outputs_and_grads() - if __name__ == "__main__": unittest.main() From b414d8421997739b864ff0227f4d88290d36bd4f Mon Sep 17 00:00:00 2001 From: zzk0 Date: Sat, 3 Jun 2023 09:43:09 +0000 Subject: [PATCH 7/8] remove scatter & scatter_nd --- a.out | 144 ------------- cinn/frontend/net_builder.cc | 27 --- cinn/frontend/net_builder_test.cc | 151 -------------- cinn/hlir/op/contrib/CMakeLists.txt | 2 - cinn/hlir/op/contrib/scatter.cc | 294 --------------------------- cinn/hlir/op/contrib/scatter.h | 44 ---- cinn/hlir/op/contrib/scatter_test.cc | 165 --------------- cinn/hlir/op/use_ops.h | 1 - cinn/pybind/frontend.cc | 32 --- python/tests/ops/test_scatter_op.py | 123 ----------- 10 files changed, 983 deletions(-) delete mode 100644 a.out delete mode 100644 cinn/hlir/op/contrib/scatter.cc delete mode 100644 cinn/hlir/op/contrib/scatter.h delete mode 100644 cinn/hlir/op/contrib/scatter_test.cc delete mode 100644 python/tests/ops/test_scatter_op.py diff --git a/a.out b/a.out deleted file mode 100644 index 08f58f1227..0000000000 --- a/a.out +++ /dev/null @@ -1,144 +0,0 @@ -#include - -#define CINN_WITH_CUDA -#include "bfloat16.h" -#include "float16.h" -using cinn::common::bfloat16; -using cinn::common::float16; -// using cinn::common::half4; -// using cinn::common::half8; -// using cinn::common::float8; - -#include "cinn_cuda_runtime_source.cuh" - -extern "C" { - -__global__ -void __launch_bounds__(1024) fn_scatter_add_0_35_kernel(const float* __restrict__ x, const float* __restrict__ y, const int32_t* __restrict__ index, float* __restrict__ var_1658) -{ - if (((int)blockIdx.x < 40)) { - if (((int)threadIdx.x < 1024)) { - var_1658[((1024 * (int)blockIdx.x) + (int)threadIdx.x)] = cinn_nvgpu_index_add_fp32(x[((1024 * (int)blockIdx.x) + (int)threadIdx.x)], ((int)blockIdx.x / 4), y, ((512 * ((int)threadIdx.x / 512)) + (((((int)threadIdx.x & 511) / 32) * 32) + ((1024 * ((int)blockIdx.x & 3)) + ((int)threadIdx.x & 31)))), 4096, index, 1); - }; - }; -} - -} -// -// Generated by NVIDIA NVVM Compiler -// -// Compiler Build ID: CL-29558016 -// Cuda compilation tools, release 11.2, V11.2.142 -// Based on NVVM 7.0.1 -// - -.version 7.2 -.target sm_70 -.address_size 64 - - // .globl fn_scatter_add_0_35_kernel -.global .align 4 .u32 _ZN4cinn6common7float165shiftE; -.global .align 4 .u32 _ZN4cinn6common7float169shiftSignE; -.global .align 4 .u32 _ZN4cinn6common7float164infNE; -.global .align 4 .u32 _ZN4cinn6common7float164maxNE; -.global .align 4 .u32 _ZN4cinn6common7float164minNE; -.global .align 4 .u32 _ZN4cinn6common7float164sigNE; -.global .align 4 .u32 _ZN4cinn6common7float164infCE; -.global .align 4 .u32 _ZN4cinn6common7float164nanNE; -.global .align 4 .u32 _ZN4cinn6common7float164maxCE; -.global .align 4 .u32 _ZN4cinn6common7float164minCE; -.global .align 4 .u32 _ZN4cinn6common7float164sigCE; -.global .align 4 .u32 _ZN4cinn6common7float164mulNE; -.global .align 4 .u32 _ZN4cinn6common7float164mulCE; -.global .align 4 .u32 
_ZN4cinn6common7float164subCE; -.global .align 4 .u32 _ZN4cinn6common7float164norCE; -.global .align 4 .u32 _ZN4cinn6common7float164maxDE; -.global .align 4 .u32 _ZN4cinn6common7float164minDE; - -.visible .entry fn_scatter_add_0_35_kernel( - .param .u64 fn_scatter_add_0_35_kernel_param_0, - .param .u64 fn_scatter_add_0_35_kernel_param_1, - .param .u64 fn_scatter_add_0_35_kernel_param_2, - .param .u64 fn_scatter_add_0_35_kernel_param_3 -) -.maxntid 1024, 1, 1 -{ - .reg .pred %p<8>; - .reg .f32 %f<8>; - .reg .b32 %r<29>; - .reg .b64 %rd<17>; - - - ld.param.u64 %rd3, [fn_scatter_add_0_35_kernel_param_0]; - ld.param.u64 %rd4, [fn_scatter_add_0_35_kernel_param_1]; - ld.param.u64 %rd5, [fn_scatter_add_0_35_kernel_param_2]; - ld.param.u64 %rd6, [fn_scatter_add_0_35_kernel_param_3]; - mov.u32 %r1, %ctaid.x; - setp.gt.s32 %p1, %r1, 39; - mov.u32 %r2, %tid.x; - setp.gt.s32 %p2, %r2, 1023; - or.pred %p3, %p1, %p2; - @%p3 bra LBB0_9; - - cvta.to.global.u64 %rd7, %rd3; - shl.b32 %r11, %r1, 10; - add.s32 %r3, %r11, %r2; - mul.wide.s32 %rd8, %r3, 4; - add.s64 %rd9, %rd7, %rd8; - ld.global.nc.f32 %f6, [%rd9]; - shr.s32 %r12, %r1, 31; - shr.u32 %r13, %r12, 30; - add.s32 %r14, %r1, %r13; - shr.s32 %r4, %r14, 2; - shr.s32 %r15, %r2, 31; - shr.u32 %r16, %r15, 23; - add.s32 %r17, %r2, %r16; - and.b32 %r18, %r17, -512; - and.b32 %r19, %r1, 3; - and.b32 %r20, %r2, 511; - bfi.b32 %r21, %r19, %r20, 10, 2; - add.s32 %r5, %r21, %r18; - cvta.to.global.u64 %rd1, %rd5; - cvta.to.global.u64 %rd2, %rd4; - mov.u32 %r26, -1; - -LBB0_2: - mov.u32 %r27, %r26; - -LBB0_3: - add.s32 %r8, %r27, 1; - setp.gt.s32 %p4, %r27, -1; - mov.u32 %r26, -1; - @%p4 bra LBB0_5; - - mul.wide.s32 %rd10, %r27, 4; - add.s64 %rd11, %rd1, %rd10; - ld.global.nc.u32 %r23, [%rd11+4]; - setp.ne.s32 %p5, %r23, %r4; - mov.u32 %r27, %r8; - mov.u32 %r26, %r8; - @%p5 bra LBB0_3; - -LBB0_5: - setp.lt.s32 %p6, %r26, 0; - @%p6 bra LBB0_7; - - shl.b32 %r24, %r26, 12; - add.s32 %r25, %r5, %r24; - mul.wide.s32 %rd12, %r25, 4; - add.s64 %rd13, %rd2, %rd12; - ld.global.nc.f32 %f5, [%rd13]; - add.f32 %f6, %f6, %f5; - -LBB0_7: - setp.ne.s32 %p7, %r26, -1; - @%p7 bra LBB0_2; - - cvta.to.global.u64 %rd14, %rd6; - add.s64 %rd16, %rd14, %rd8; - st.global.f32 [%rd16], %f6; - -LBB0_9: - ret; - -} \ No newline at end of file diff --git a/cinn/frontend/net_builder.cc b/cinn/frontend/net_builder.cc index f3e0f47385..ede81f39fe 100644 --- a/cinn/frontend/net_builder.cc +++ b/cinn/frontend/net_builder.cc @@ -430,33 +430,6 @@ Variable NetBuilder::GatherNd(const Variable& x, const Variable& index) { return CustomInstr("gather_nd", {x, index}, {}).front(); } -Variable NetBuilder::Scatter(const Variable& src, const Variable& index, const Variable& out, const int& axis) { - return CustomInstr("scatter", {src, index, out}, {{"axis", axis}}).front(); -} -Variable NetBuilder::Scatter(const Variable& src, - const Variable& index, - const std::vector& shape, - const float& default_value, - const int& axis) { - auto out = FillConstant(shape, default_value, UniqName("fill_constant"), "float", false); - return Scatter(src, index, out, axis); -} - -Variable NetBuilder::ScatterNd(const Variable& src, - const Variable& index, - const Variable& out, - const std::vector& axes) { - return CustomInstr("scatter_nd", {src, index, out}, {{"axes", axes}}).front(); -} -Variable NetBuilder::ScatterNd(const Variable& src, - const Variable& index, - const std::vector& shape, - const float& default_value, - const std::vector& axes) { - auto out = FillConstant(shape, default_value, 
UniqName("fill_constant"), "float", false); - return ScatterNd(src, index, out, axes); -} - Variable NetBuilder::Cast(const Variable& operand, const std::string& dtype) { return CustomInstr("cast", {operand}, {{"dtype", dtype}}).front(); } diff --git a/cinn/frontend/net_builder_test.cc b/cinn/frontend/net_builder_test.cc index 413e4e618a..4e55150c0c 100644 --- a/cinn/frontend/net_builder_test.cc +++ b/cinn/frontend/net_builder_test.cc @@ -354,157 +354,6 @@ TEST(net_build, program_execute_gather_nd) { } } -TEST(net_build, program_execute_scatter) { - const float default_value = 3.14; - const int B = 3; - const int H_IN = 4; - const int H_OUT = 11; - - NetBuilder builder("net_builder"); - Placeholder input1 = builder.CreateInput(Float(32), {B, H_IN}, "In1"); - Placeholder input2 = builder.CreateInput(Int(32), {B, H_IN}, "In2"); - Variable output = builder.Scatter(input1, input2, {B, H_OUT}, default_value, 1); - auto program = builder.Build(); - -#ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); -#else - Target target = common::DefaultHostTarget(); -#endif - std::unordered_set fetch_ids; - auto graph = Optimize(&program, fetch_ids, target); - - auto scope = BuildScope(target, graph); - hlir::framework::GraphCompiler gc(target, scope, graph); - auto runtime_program = gc.Build(); - - scope->Var(std::string(input1.id())); - scope->Var(std::string(input2.id())); - scope->Var(std::string(output->id)); - - auto input1_tensor = scope->GetTensor(std::string(input1.id())); - SetRandData(input1_tensor, target); - std::vector input1_data = GetTensorData(input1_tensor, target); - - auto input2_tensor = scope->GetTensor(std::string(input2.id())); - SetRandInt(input2_tensor, target, -1, 0, H_IN); - - std::vector input2_data = GetTensorData(input2_tensor, target); - - runtime_program->Execute(); - - auto output_tensor = scope->GetTensor(std::string(output->id)); - const std::vector& output_shape = output_tensor->shape().data(); - EXPECT_EQ(output_tensor->type(), Float(32)); - EXPECT_EQ(output_shape.size(), 2UL); - EXPECT_EQ(output_shape[0], B); - EXPECT_EQ(output_shape[1], H_OUT); - - float true_data[B * H_OUT]; - for (int b = 0; b < B; ++b) { - for (int h = 0; h < H_OUT; ++h) { - int index = h + H_OUT * b; - true_data[index] = default_value; - } - } - for (int b = 0; b < B; ++b) { - for (int h = 0; h < H_IN; ++h) { - int index = h + H_IN * b; - true_data[input2_data[index] + H_OUT * b] = input1_data[index]; - } - } - - std::vector output_data = GetTensorData(output_tensor, target); - VLOG(6) << "Visualize output_data"; - for (int b = 0; b < B; ++b) { - for (int h = 0; h < H_OUT; ++h) { - std::string line; - int index = h + H_OUT * b; - float t_data = true_data[index]; - float out_data = output_data[index]; - line += (std::to_string(out_data) + ", "); - EXPECT_EQ(t_data, out_data); - VLOG(6) << line; - } - } -} - -TEST(net_build, program_execute_scatter_nd) { - Context::Global().ResetNameId(); - const float default_value = 3.14; - const int B = 3; - const int H_IN = 4; - const int H_OUT = 11; - - NetBuilder builder("net_builder"); - Placeholder input1 = builder.CreateInput(Float(32), {B, H_IN}, "In1"); - Placeholder input2 = builder.CreateInput(Int(32), {B, H_IN, 1}, "In2"); - Variable output = builder.ScatterNd(input1, input2, {B, H_OUT}, default_value, {1}); - auto program = builder.Build(); - -#ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); -#else - Target target = common::DefaultHostTarget(); -#endif - std::unordered_set fetch_ids; - auto graph = 
Optimize(&program, fetch_ids, target); - - auto scope = BuildScope(target, graph); - hlir::framework::GraphCompiler gc(target, scope, graph); - auto runtime_program = gc.Build(); - - scope->Var(std::string(input1.id())); - scope->Var(std::string(input2.id())); - scope->Var(std::string(output->id)); - - auto input1_tensor = scope->GetTensor(std::string(input1.id())); - SetRandData(input1_tensor, target); - - auto input2_tensor = scope->GetTensor(std::string(input2.id())); - SetRandInt(input2_tensor, target); - - runtime_program->Execute(); - - std::vector input1_data = GetTensorData(input1_tensor, target); - std::vector input2_data = GetTensorData(input2_tensor, target); - - auto output_tensor = scope->GetTensor(std::string(output->id)); - const std::vector& output_shape = output_tensor->shape().data(); - EXPECT_EQ(output_tensor->type(), Float(32)); - EXPECT_EQ(output_shape.size(), 2UL); - EXPECT_EQ(output_shape[0], B); - EXPECT_EQ(output_shape[1], H_OUT); - - float true_data[B * H_OUT]; - for (int b = 0; b < B; ++b) { - for (int h = 0; h < H_OUT; ++h) { - int index = h + H_OUT * b; - true_data[index] = default_value; - } - } - for (int b = 0; b < B; ++b) { - for (int h = 0; h < H_IN; ++h) { - int index = h + H_IN * b; - true_data[input2_data[index] + H_OUT * b] = input1_data[index]; - } - } - - std::vector output_data = GetTensorData(output_tensor, target); - VLOG(6) << "Visualize output_data"; - for (int b = 0; b < B; ++b) { - for (int h = 0; h < H_OUT; ++h) { - std::string line; - int index = h + H_OUT * b; - float t_data = true_data[index]; - float out_data = output_data[index]; - line += (std::to_string(out_data) + ", "); - EXPECT_EQ(t_data, out_data); - VLOG(6) << line; - } - } -} - TEST(net_build, program_execute_cast) { const int B = 4; const int H = 7; diff --git a/cinn/hlir/op/contrib/CMakeLists.txt b/cinn/hlir/op/contrib/CMakeLists.txt index fde84a733e..48565a4edb 100644 --- a/cinn/hlir/op/contrib/CMakeLists.txt +++ b/cinn/hlir/op/contrib/CMakeLists.txt @@ -2,7 +2,6 @@ core_gather_headers() gather_srcs(cinnapi_src SRCS gather_nd.cc - scatter.cc flip.cc sort.cc argmin.cc @@ -22,7 +21,6 @@ gather_srcs(cinnapi_src SRCS ) cc_test(test_gather_nd SRCS gather_nd_test.cc DEPS cinncore) -cc_test(test_scatter SRCS scatter_test.cc DEPS cinncore) cc_test(test_sort SRCS sort_test.cc DEPS cinncore) cc_test(test_argmin SRCS argmin_test.cc DEPS cinncore) cc_test(test_argmax SRCS argmax_test.cc DEPS cinncore) diff --git a/cinn/hlir/op/contrib/scatter.cc b/cinn/hlir/op/contrib/scatter.cc deleted file mode 100644 index 7733a5f92d..0000000000 --- a/cinn/hlir/op/contrib/scatter.cc +++ /dev/null @@ -1,294 +0,0 @@ -// Copyright (c) 2022 CINN Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "cinn/hlir/op/contrib/scatter.h" - -#include - -#include -#include -#include -#include - -#include "cinn/common/cas.h" -#include "cinn/common/common.h" -#include "cinn/common/context.h" -#include "cinn/common/macros.h" -#include "cinn/hlir/framework/node.h" -#include "cinn/hlir/framework/op.h" -#include "cinn/hlir/framework/op_strategy.h" -#include "cinn/hlir/op/op_util.h" -#include "cinn/hlir/pe/elementwise.h" -#include "cinn/hlir/pe/ir_schedule_pe.h" -#include "cinn/hlir/pe/nn.h" -#include "cinn/hlir/pe/schedule.h" -#include "cinn/hlir/pe/transform.h" -#include "cinn/ir/ir.h" -#include "cinn/ir/ir_base.h" -#include "cinn/ir/tensor.h" -#include "cinn/lang/builtin.h" -#include "cinn/lang/compute.h" - -DECLARE_bool(cinn_ir_schedule); - -namespace cinn { -namespace hlir { -namespace op { - -using common::CINNValue; -using common::CINNValuePack; - -ir::Tensor Scatter(const ir::Tensor &A, - const ir::Tensor &B, - const ir::Tensor &C, - const common::Target &target, - const int &axis, - const std::string &name) { - CHECK_EQ(A->shape.size(), B->shape.size()); - CHECK_EQ(A->shape.size(), C->shape.size()); - - std::string extern_fun_name; - if (target.arch == common::Target::Arch::NVGPU) { - extern_fun_name.assign("cinn_cuda_find_int_nd"); - } else if (target.arch == common::Target::Arch::X86) { - extern_fun_name.assign("cinn_host_find_int_nd"); - } else { - LOG(FATAL) << "Scatter only support X86 and NVGPU ! Please Check.\n"; - } - - int pos_axis = axis; - if (pos_axis < 0) { - pos_axis += C->shape.size(); - } - - ir::Tensor transpose_B = B; - // if (pos_axis == A->shape.size() - 1) { - // transpose_B = B; - // } else { - // std::vector new_axes; - // for (int i = 0; i < A->shape.size(); ++i) { - // if (i != pos_axis) { - // new_axes.push_back(i); - // } - // } - // new_axes.push_back(pos_axis); - // transpose_B = pe::Transpose(B, new_axes, B->name + "_transpose"); - // } - auto res = Compute( - C->shape, - [=](const std::vector &indices) { - Expr offset(0); - for (int i = 0; i < indices.size(); i++) { - if (i != pos_axis) { - offset = offset * C->shape[i] + indices[i]; - } - } - auto B_shape_axis = B->shape[pos_axis]; - offset = common::AutoSimplify(offset * B_shape_axis); - auto idx = lang::CallExtern(extern_fun_name, {transpose_B, B_shape_axis, indices[pos_axis], offset, Expr(1)}); - std::vector A_indices(indices); - A_indices[pos_axis] = idx; - auto keep = ir::EQ::Make(idx, Expr(-1)); - return ir::Select::Make(keep, C(indices), A(A_indices)); - }, - name); - return res; -} - -ir::Tensor ScatterNd(const ir::Tensor &A, - const ir::Tensor &B, - const ir::Tensor &C, - const common::Target &target, - const std::vector &axes, - const std::string &name) { - CHECK(!A->shape.empty()); - CHECK_EQ(A->shape.size() + 1, B->shape.size()); - CHECK_EQ(A->shape.size() + axes.size() - 1, C->shape.size()); - - std::string extern_fun_name; - if (target.arch == common::Target::Arch::NVGPU) { - extern_fun_name.assign("cinn_cuda_find_int_nd"); - } else if (target.arch == common::Target::Arch::X86) { - extern_fun_name.assign("cinn_host_find_int_nd"); - } else { - LOG(FATAL) << "ScatterNd only support X86 and NVGPU ! 
Please Check.\n"; - } - - std::vector pos_axes; - for (auto axis : axes) { - if (axis < 0) { - pos_axes.push_back(axis + C->shape.size()); - } else { - pos_axes.push_back(axis); - } - } - - auto res = Compute( - C->shape, - [=](const std::vector &indices) { - auto offset = Expr(0); - std::vector A_indices; - for (int i = 0; i < indices.size(); i++) { - if (std::find(pos_axes.begin(), pos_axes.end(), i) == pos_axes.end()) { - offset = offset * C->shape[i] + indices[i]; - A_indices.push_back(indices[i]); - } - } - offset = offset * B->shape[B->shape.size() - 2] * B->shape[B->shape.size() - 1]; - auto keep = Expr(true); - std::vector idx; - for (int i = 0; i < pos_axes.size(); ++i) { - auto cur_idx = lang::CallExtern(extern_fun_name, - {B, - B->shape[B->shape.size() - 2], - indices[pos_axes[i]], - common::AutoSimplify(offset + Expr(i)), - Expr(static_cast(pos_axes.size()))}); - if (idx.empty()) { - idx.push_back(cur_idx); - A_indices.push_back(cur_idx); - } else { - keep = ir::And::Make(keep, ir::EQ::Make(idx[0], cur_idx)); - idx[0] = cur_idx; - } - } - keep = common::AutoSimplify(ir::And::Make(keep, ir::EQ::Make(idx[0], Expr(-1)))); - return ir::Select::Make(keep, C(indices), A(A_indices)); - }, - name); - return res; -} - -std::shared_ptr StrategyForScatter(const framework::NodeAttr &attrs, - const std::vector &inputs, - const std::vector &out_type, - const std::vector> &output_shapes, - const Target &target) { - auto attr_store = attrs.attr_store; - CHECK(attr_store.count("axis")) << "find no attr of axis"; - const int axis = absl::get(attr_store.at("axis")); - const std::string op_name("scatter"); - - framework::CINNCompute scatter_compute([=](lang::Args args, lang::RetValue *ret) { - CHECK(!args.empty()) << "The input arguments of " << op_name << " compute is empty! Please check.\n"; - CINNValuePack pack_args = args[0]; - CHECK_GE(pack_args.size(), 3U) << "3 input tensors for " << op_name << " compute\n"; - Expr A = pack_args[0]; - Expr B = pack_args[1]; - Expr C = pack_args[2]; - std::string tensor_name = UniqName("Scatter_out"); - auto stages = CreateStages({A.as_tensor_ref(), B.as_tensor_ref(), C.as_tensor_ref()}); - if (FLAGS_cinn_ir_schedule) { - CHECK_EQ(pack_args.size(), 4U); - tensor_name = pack_args[3].operator std::string(); - VLOG(4) << A.as_tensor_ref()->name << " " << B.as_tensor_ref()->name << " " << C.as_tensor_ref()->name << " " - << tensor_name; - tensor_name = "test_scatter_out"; - } - ir::Tensor out = Scatter(A.as_tensor_ref(), B.as_tensor_ref(), C.as_tensor_ref(), target, axis, tensor_name); - std::vector res; - stages->InsertLazily(out); - res.push_back(CINNValue(out)); - CHECK(!out_type.empty()) << "Output type of Scatter is empty! Please check.\n"; - res.push_back(CINNValue(stages)); - *ret = CINNValuePack{res}; - }); - - auto strategy = std::make_shared(); - strategy->AddImpl(scatter_compute, GetInjectiveScheduleFunc(output_shapes, target), "strategy.scatter.x86", 1); - return strategy; -} - -std::shared_ptr StrategyForScatterNd(const framework::NodeAttr &attrs, - const std::vector &inputs, - const std::vector &out_type, - const std::vector> &output_shapes, - const Target &target) { - auto attr_store = attrs.attr_store; - CHECK(attr_store.count("axes")) << "find no attr of axis"; - std::vector axes = absl::get>(attr_store.at("axes")); - std::string op_name("scatter_nd"); - - framework::CINNCompute scatter_nd_compute([=](lang::Args args, lang::RetValue *ret) { - CHECK(!args.empty()) << "The input arguments of " << op_name << " compute is empty! 
Please check.\n"; - CINNValuePack pack_args = args[0]; - CHECK_GE(pack_args.size(), 3U) << "3 input tensors for " << op_name << " compute\n"; - Expr A = pack_args[0]; - Expr B = pack_args[1]; - Expr C = pack_args[2]; - CHECK(A.as_tensor()); - CHECK(B.as_tensor()); - CHECK(C.as_tensor()); - CHECK(!output_shapes.empty()); - auto tensor_A = A.as_tensor_ref(); - auto tensor_B = B.as_tensor_ref(); - auto tensor_C = C.as_tensor_ref(); - VLOG(3) << "A shape: " << utils::Join(tensor_A->shape, ", ") << ", B shape: " << utils::Join(tensor_B->shape, ", ") - << ", output_shapes: " << utils::Join(output_shapes[0], ", "); - std::string tensor_name = UniqName("ScatterNd_out"); - if (FLAGS_cinn_ir_schedule) { - CHECK_EQ(pack_args.size(), 4U); - tensor_name = pack_args[3].operator std::string(); - } - ir::Tensor out = ScatterNd(tensor_A, tensor_B, tensor_C, target, axes, tensor_name); - auto stages = CreateStages({out}); - *ret = CINNValuePack{{CINNValue(out), CINNValue(stages)}}; - }); - - auto strategy = std::make_shared(); - strategy->AddImpl(scatter_nd_compute, GetInjectiveScheduleFunc(output_shapes, target), "strategy.scatter_nd.x86", 1); - return strategy; -} - -std::vector> InferShapeForScatter(const std::vector> &inputs_shape, - const framework::AttrMapType &attrs) { - CHECK_EQ(inputs_shape.size(), 3U) << "The input's shape size should be 3! Please check again."; - std::vector> res{inputs_shape[2]}; - return res; -} - -std::vector InferDtypeForScatter(const std::vector &inputs_type, const framework::AttrMapType &attrs) { - CHECK_EQ(inputs_type.size(), 3U) << "The input's type size should be 3! Please check again."; - CHECK_EQ(inputs_type[1], Int(32)) << "The index's type should be int! Please check again."; - std::vector res{inputs_type[2]}; - return res; -} - -} // namespace op -} // namespace hlir -} // namespace cinn - -CINN_REGISTER_HELPER(scatter_ops) { - CINN_REGISTER_OP(scatter) - .describe("Scatter.") - .set_num_inputs(3) - .set_num_outputs(1) - .set_attr("CINNStrategy", cinn::hlir::op::StrategyForScatter) - .set_attr("infershape", MakeOpFunction(cinn::hlir::op::InferShapeForScatter)) - .set_attr("inferdtype", MakeOpFunction(cinn::hlir::op::InferDtypeForScatter)) - .set_attr("OpPattern", cinn::hlir::framework::OpPatternKind::kInjective) - .set_support_level(4); - - CINN_REGISTER_OP(scatter_nd) - .describe("ScatterNd.") - .set_num_inputs(3) - .set_num_outputs(1) - .set_attr("CINNStrategy", cinn::hlir::op::StrategyForScatterNd) - .set_attr("infershape", MakeOpFunction(cinn::hlir::op::InferShapeForScatter)) - .set_attr("inferdtype", MakeOpFunction(cinn::hlir::op::InferDtypeForScatter)) - .set_attr("OpPattern", cinn::hlir::framework::OpPatternKind::kInjective) - .set_support_level(4); - - return true; -} diff --git a/cinn/hlir/op/contrib/scatter.h b/cinn/hlir/op/contrib/scatter.h deleted file mode 100644 index aa8cbcc8ae..0000000000 --- a/cinn/hlir/op/contrib/scatter.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2022 CINN Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <vector>
-
-#include "cinn/ir/ir.h"
-#include "cinn/ir/ir_base.h"
-#include "cinn/ir/tensor.h"
-
-namespace cinn {
-namespace hlir {
-namespace op {
-
-ir::Tensor Scatter(const ir::Tensor& A,
-                   const ir::Tensor& B,
-                   const ir::Tensor& out,
-                   const common::Target& target,
-                   const int& axis,
-                   const std::string& name);
-
-ir::Tensor ScatterNd(const ir::Tensor& A,
-                     const ir::Tensor& B,
-                     const ir::Tensor& out,
-                     const common::Target& target,
-                     const std::vector<int>& axes,
-                     const std::string& name);
-
-}  // namespace op
-}  // namespace hlir
-}  // namespace cinn
diff --git a/cinn/hlir/op/contrib/scatter_test.cc b/cinn/hlir/op/contrib/scatter_test.cc
deleted file mode 100644
index 51441a4cdc..0000000000
--- a/cinn/hlir/op/contrib/scatter_test.cc
+++ /dev/null
@@ -1,165 +0,0 @@
-// Copyright (c) 2022 CINN Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "cinn/hlir/op/contrib/scatter.h"
-
-#include <glog/logging.h>
-#include <gtest/gtest.h>
-
-#include <string>
-#include <vector>
-
-#include "cinn/backends/codegen_c.h"
-#include "cinn/backends/codegen_c_x86.h"
-#include "cinn/backends/codegen_cuda_dev.h"
-#include "cinn/common/context.h"
-#include "cinn/lang/lower.h"
-#include "cinn/lang/placeholder.h"
-#include "cinn/poly/stage.h"
-
-namespace cinn {
-namespace hlir {
-namespace op {
-
-TEST(GenerateCode_Cpu, Scatter) {
-  common::Context::Global().ResetNameId();
-
-  auto target = common::DefaultHostTarget();
-
-  ir::Expr n(4);
-  ir::Expr h_in(8);
-  ir::Expr h_out(14);
-
-  lang::Placeholder<float> in1("in1", {n, h_in});
-  lang::Placeholder<int32_t> in2("in2", {n, h_in});
-  lang::Placeholder<float> out("out", {n, h_out});
-  ir::Tensor res = Scatter(in1, in2, out, target, 1, "test_scatter_out");
-
-  poly::StageMap stages = poly::CreateStages({res});
-  std::vector<ir::LoweredFunc> funcs =
-      lang::LowerVec("TestGenerateCodeCpu_Scatter", stages, {res}, {}, {}, nullptr, target, true);
-
-  VLOG(6) << "Expr before CPU codegen:";
-  VLOG(6) << funcs[0]->body;
-
-  ir::Module::Builder builder("Scatter_Module", target);
-  for (auto& f : funcs) {
-    builder.AddFunction(f);
-  }
-
-  backends::CodeGenCX86 codegen(target, backends::CodeGenCX86::Feature::AVX512);
-  codegen.SetInlineBuiltinCodes(false);
-  std::string code = codegen.Compile(builder.Build(), backends::CodeGenC::OutputKind::CImpl);
-  VLOG(6) << "Cpu Codegen result:";
-  VLOG(6) << code << std::endl;
-
-  auto target_source = R"ROC(
-#include <cinn_runtime.h>
-#include <stdio.h>
-
-void TestGenerateCodeCpu_Scatter(void* _args, int32_t num_args)
-{
-  cinn_buffer_t* _test_scatter_out = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
-  cinn_buffer_t* _in1 = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_float32_t(), { 4, 8 });
-  cinn_buffer_t* _in2 = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_int32_t(), { 4, 8 });
-  cinn_buffer_t* _out = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_float32_t(), { 4, 14 });
-  cinn_buffer_malloc((void*)(0), _test_scatter_out);
-  cinn_buffer_malloc((void*)(0), _in1);
-  cinn_buffer_malloc((void*)(0), _in2);
-  cinn_buffer_malloc((void*)(0), _out);
-  const float* in1 = ((const float*)(_in1->memory));
-  const int32_t* in2 = ((const int32_t*)(_in2->memory));
-  const float* out = ((const float*)(_out->memory));
-  float* test_scatter_out = ((float*)(_test_scatter_out->memory));
-  for (int32_t i = 0; i < 4; i += 1) {
-    for (int32_t j = 0; j < 14; j += 1) {
-      test_scatter_out[((14 * i) + j)] = (((cinn_host_find_int_nd(_in2, 8, j, (8 * i), 1) == -1)) ? out[((14 * i) + j)] : in1[((8 * i) + cinn_host_find_int_nd(_in2, 8, j, (8 * i), 1))]);
-    };
-  };
-  cinn_buffer_free((void*)(0), _in1);
-  cinn_buffer_free((void*)(0), _in2);
-  cinn_buffer_free((void*)(0), _out);
-  cinn_buffer_free((void*)(0), _test_scatter_out);
-}
-)ROC";
-  CHECK_EQ(utils::Trim(code), utils::Trim(target_source));
-}
-
-TEST(GenerateCode_Cpu, ScatterNd) {
-  common::Context::Global().ResetNameId();
-
-  auto target = common::DefaultHostTarget();
-
-  ir::Expr n(4);
-  ir::Expr h_in(8);
-  ir::Expr h_out(14);
-
-  lang::Placeholder<float> in1("in1", {n, h_in});
-  lang::Placeholder<int32_t> in2("in2", {n, h_in, ir::Expr(1)});
-  lang::Placeholder<float> out("out", {n, h_out});
-  ir::Tensor res = ScatterNd(in1, in2, out, target, {1}, "test_scatter_out");
-
-  poly::StageMap stages = poly::CreateStages({res});
-  std::vector<ir::LoweredFunc> funcs =
-      lang::LowerVec("TestGenerateCodeCpu_Scatter", stages, {res}, {}, {}, nullptr, target, true);
-
-  VLOG(6) << "Expr before CPU codegen:";
-  VLOG(6) << funcs[0]->body;
-
-  ir::Module::Builder builder("Scatter_Module", target);
-  for (auto& f : funcs) {
-    builder.AddFunction(f);
-  }
-
-  backends::CodeGenCX86 codegen(target, backends::CodeGenCX86::Feature::AVX512);
-  codegen.SetInlineBuiltinCodes(false);
-  std::string code = codegen.Compile(builder.Build(), backends::CodeGenC::OutputKind::CImpl);
-  VLOG(6) << "Cpu Codegen result:";
-  VLOG(6) << code << std::endl;
-
-  auto target_source = R"ROC(
-#include <cinn_runtime.h>
-#include <stdio.h>
-
-void TestGenerateCodeCpu_Scatter(void* _args, int32_t num_args)
-{
-  cinn_buffer_t* _test_scatter_out = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
-  cinn_buffer_t* _in1 = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_float32_t(), { 4, 8 });
-  cinn_buffer_t* _in2 = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_int32_t(), { 4, 8, 1 });
-  cinn_buffer_t* _out = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_float32_t(), { 4, 14 });
-  cinn_buffer_malloc((void*)(0), _test_scatter_out);
-  cinn_buffer_malloc((void*)(0), _in1);
-  cinn_buffer_malloc((void*)(0), _in2);
-  cinn_buffer_malloc((void*)(0), _out);
-  const float* in1 = ((const float*)(_in1->memory));
-  const int32_t* in2 = ((const int32_t*)(_in2->memory));
-  const float* out = ((const float*)(_out->memory));
-  float* test_scatter_out = ((float*)(_test_scatter_out->memory));
-  for (int32_t i = 0; i < 4; i += 1) {
-    for (int32_t j = 0; j < 14; j += 1) {
-      test_scatter_out[((14 * i) + j)] = (((cinn_host_find_int_nd(_in2, 8, j, (8 * i), 1) == -1)) ? out[((14 * i) + j)] : in1[((8 * i) + cinn_host_find_int_nd(_in2, 8, j, (8 * i), 1))]);
-    };
-  };
-  cinn_buffer_free((void*)(0), _in1);
-  cinn_buffer_free((void*)(0), _in2);
-  cinn_buffer_free((void*)(0), _out);
-  cinn_buffer_free((void*)(0), _test_scatter_out);
-}
-)ROC";
-  CHECK_EQ(utils::Trim(code), utils::Trim(target_source));
-}
-
-}  // namespace op
-}  // namespace hlir
-}  // namespace cinn
diff --git a/cinn/hlir/op/use_ops.h b/cinn/hlir/op/use_ops.h
index 3650878d76..e29941efd7 100644
--- a/cinn/hlir/op/use_ops.h
+++ b/cinn/hlir/op/use_ops.h
@@ -23,7 +23,6 @@ CINN_USE_REGISTER(broadcast_grad_ops)
 CINN_USE_REGISTER(elementwise_ops)
 CINN_USE_REGISTER(transform_ops)
 CINN_USE_REGISTER(gather_nd_ops)
-CINN_USE_REGISTER(scatter_ops)
 CINN_USE_REGISTER(sort_ops)
 CINN_USE_REGISTER(argmin_ops)
 CINN_USE_REGISTER(argmax_ops)
diff --git a/cinn/pybind/frontend.cc b/cinn/pybind/frontend.cc
index 2800302375..1cb719538e 100644
--- a/cinn/pybind/frontend.cc
+++ b/cinn/pybind/frontend.cc
@@ -530,38 +530,6 @@ void BindFrontend(pybind11::module *m) {
            py::arg("updates"),
            py::arg("index"),
            py::arg("axis") = 0)
-      .def("scatter",
-           static_cast<Variable (NetBuilder::*)(const Variable &, const Variable &, const Variable &, const int &)>(
-               &NetBuilder::Scatter),
-           py::arg("src"),
-           py::arg("index"),
-           py::arg("out"),
-           py::arg("axis"))
-      .def("scatter",
-           static_cast<Variable (NetBuilder::*)(
-               const Variable &, const Variable &, const std::vector<int> &, const float &, const int &)>(
-               &NetBuilder::Scatter),
-           py::arg("src"),
-           py::arg("index"),
-           py::arg("shape"),
-           py::arg("default_value") = 0,
-           py::arg("axis") = 0)
-      .def("scatter_nd",
-           static_cast<Variable (NetBuilder::*)(
-               const Variable &, const Variable &, const Variable &, const std::vector<int> &)>(&NetBuilder::ScatterNd),
-           py::arg("src"),
-           py::arg("index"),
-           py::arg("out"),
-           py::arg("axis"))
-      .def("scatter_nd",
-           static_cast<Variable (NetBuilder::*)(
-               const Variable &, const Variable &, const std::vector<int> &, const float &, const std::vector<int> &)>(
-               &NetBuilder::ScatterNd),
-           py::arg("src"),
-           py::arg("index"),
-           py::arg("shape"),
-           py::arg("default_value") = 0,
-           py::arg("axis") = 0)
       .def("isclose",
            &NetBuilder::IsClose,
            py::arg("x"),
diff --git a/python/tests/ops/test_scatter_op.py b/python/tests/ops/test_scatter_op.py
deleted file mode 100644
index 31a3530f2f..0000000000
--- a/python/tests/ops/test_scatter_op.py
+++ /dev/null
@@ -1,123 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright (c) 2021 CINN Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import copy
-import unittest
-import numpy as np
-from op_test import OpTest, OpTestTool
-import paddle
-from cinn.frontend import *
-from cinn.common import *
-
-
-@OpTestTool.skip_if(not is_compiled_with_cuda(),
-                    "x86 test will be skipped due to timeout.")
-class TestScatterOp(OpTest):
-    def setUp(self):
-        self.init_case()
-        self.target = DefaultNVGPUTarget()
-
-    def init_case(self):
-        self.axis = -1
-        self.inputs = {
-            "x": np.random.random([10, 5]).astype("float32"),
-            "y": np.random.random([5, 5]).astype("float32"),
-            "index": np.array([0, 5, 0, 9, 0]).astype("int32"),
-            "src": self.random([2, 5], "float32"),
-            "out": self.random([3, 5], "float32"),
-            "index0": np.array([[0, 1, 2, 0, 0], [2, 0, 0, 1, 2]]).astype("int32"),
-        }
-        print(self.inputs["src"], '\n', self.inputs["index0"], '\n', self.inputs["out"])
-
-    def build_paddle_program(self, target):
-        # x = paddle.to_tensor(self.inputs["src"], stop_gradient=True)
-        # y = paddle.to_tensor(self.inputs["out"], stop_gradient=True)
-        # index = paddle.to_tensor(self.inputs["index0"], stop_gradient=True)
-        x = self.inputs["src"]
-        y = self.inputs["out"]
-        index = self.inputs["index0"]
-
-        pos_axis = self.axis
-        if pos_axis < 0:
-            pos_axis += len(x.shape)
-
-        dim = len(x.shape)
-        res = copy.deepcopy(y)
-        if dim == 1:
-            pass
-        elif dim == 2:
-            if pos_axis == 0:
-                for i in range(x.shape[0]):
-                    for j in range(x.shape[1]):
-                        res[index[i, j], j] = x[i, j]
-            elif pos_axis == 1:
-                for i in range(x.shape[0]):
-                    for j in range(x.shape[1]):
-                        res[i, index[i, j]] = x[i, j]
-        elif dim == 3:
-            pass
-        elif dim == 4:
-            pass
-        else:
-            raise NotImplementedError
-
-        self.paddle_outputs = [paddle.to_tensor(res)]
-        print("paddle res:\n", res)
-
-    def build_cinn_program(self, target):
-        builder = NetBuilder("scatter")
-        src = builder.create_input(
-            OpTest.nptype2cinntype(self.inputs["src"].dtype),
-            self.inputs["src"].shape, "src")
-        index = builder.create_input(
-            OpTest.nptype2cinntype(self.inputs["index0"].dtype),
-            self.inputs["index0"].shape, "index0")
-        out = builder.create_input(
-            OpTest.nptype2cinntype(self.inputs["out"].dtype),
-            self.inputs["out"].shape, "out")
-        out1 = builder.scatter(src, index, out, 0)
-
-        prog = builder.build()
-        res = self.get_cinn_output(
-            prog, target, [src, index, out],
-            [self.inputs["src"], self.inputs["index0"], self.inputs["out"]],
-            [out1])
-
-        self.cinn_outputs = res
-        print("cinn res:\n", res)
-
-    def test_check_results(self):
-        self.check_outputs_and_grads()
-
-
-# class TestScatterOp1(TestScatterOp):
-#     def setUp(self):
-#         self.init_case()
-#         self.target = DefaultNVGPUTarget()

-#     def init_case(self):
-#         self.axis = 0
-#         self.inputs = {
-#             "x": np.random.random([10, 5]).astype("int32"),
-#             "y": np.random.random([5, 5]).astype("int32"),
-#             "index": np.array([0, 5, 0, 9, 0]).astype("int32")
-#         }

-#     def test_check_results(self):
-#         self.check_outputs_and_grads()
-
-if __name__ == "__main__":
-    unittest.main()

From 2f702d6b2820f70f61ba2985f0e6a7d1bfb847c6 Mon Sep 17 00:00:00 2001
From: zzk0
Date: Sat, 3 Jun 2023 09:50:25 +0000
Subject: [PATCH 8/8] remove key

---
 cinn/auto_schedule/measure/simple_runner.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cinn/auto_schedule/measure/simple_runner.cc b/cinn/auto_schedule/measure/simple_runner.cc
index 3c2bf59a78..54660ccc93 100644
--- a/cinn/auto_schedule/measure/simple_runner.cc
+++ b/cinn/auto_schedule/measure/simple_runner.cc
@@ -39,8 +39,6 @@ static const std::unordered_map<std::string, std::vector<int>> kInitWithZeroPara
     {"lookup_table", {1}},
     {"gather", {1}},
     {"gather_nd", {1}},
{"scatter", {1}}, - {"scatter_nd", {1}}, {"scatter_assign", {2}}, {"scatter_add", {2}}, };