From ba365c277d2dd201fc0456c2c0ee93bc0034b9c5 Mon Sep 17 00:00:00 2001 From: Feng Yuan Date: Fri, 28 Jun 2024 04:39:16 +0800 Subject: [PATCH 1/2] Add aten::addcdiv and its variants Signed-off-by: Feng Yuan --- src/ATen/native/xpu/PointwiseOps.cpp | 57 ++++++++ src/ATen/native/xpu/XPUFallback.template | 1 - src/ATen/native/xpu/sycl/DistanceKernels.cpp | 129 +++++++++--------- .../native/xpu/sycl/PointwiseOpsKernels.cpp | 55 +++++++- .../native/xpu/sycl/PointwiseOpsKernels.h | 2 + test/xpu/xpu_test_utils.py | 1 + yaml/xpu_functions.yaml | 3 + 7 files changed, 178 insertions(+), 70 deletions(-) diff --git a/src/ATen/native/xpu/PointwiseOps.cpp b/src/ATen/native/xpu/PointwiseOps.cpp index 210cec3e6..a01bdc391 100644 --- a/src/ATen/native/xpu/PointwiseOps.cpp +++ b/src/ATen/native/xpu/PointwiseOps.cpp @@ -6,6 +6,63 @@ namespace at { +TensorIterator addcdiv_meta( + const Tensor& self, + const Tensor& tensor1, + const Tensor& tensor2, + const Scalar& value, + Tensor& out) { + if (isIntegralType(tensor1.scalar_type(), /*includeBool=*/true) && + isIntegralType(tensor2.scalar_type(), /*includeBool=*/true)) { + TORCH_CHECK( + false, + "Integer division with addcdiv is no longer supported, and in a future ", + "release addcdiv will perform a true division of tensor1 and tensor2. ", + "The historic addcdiv behavior can be implemented as ", + "(input + value * torch.trunc(tensor1 / tensor2)).to(input.dtype) ", + "for integer inputs and as ", + "(input + value * tensor1 / tensor2) for float inputs. ", + "The future addcdiv behavior is just the latter implementation: ", + "(input + value * tensor1 / tensor2), for all dtypes."); + } + + TensorIterator iter; + iter.build_ternary_op(out, self, tensor1, tensor2); + return iter; +} + +Tensor& XPUNativeFunctions::addcdiv_out( + const Tensor& self, + const Tensor& tensor1, + const Tensor& tensor2, + const Scalar& value, + Tensor& out) { + auto iter = addcdiv_meta(self, tensor1, tensor2, value, out); + native::xpu::addcdiv_kernel(iter, value); + return out; +} + +Tensor XPUNativeFunctions::addcdiv( + const Tensor& self, + const Tensor& tensor1, + const Tensor& tensor2, + const Scalar& value) { + Tensor out; + auto iter = addcdiv_meta(self, tensor1, tensor2, value, out); + native::xpu::addcdiv_kernel(iter, value); + return iter.output(); +} + +Tensor& XPUNativeFunctions::addcdiv_( + Tensor& self, + const Tensor& tensor1, + const Tensor& tensor2, + const Scalar& value) { + auto iter = addcdiv_meta(self, tensor1, tensor2, value, self); + native::xpu::addcdiv_kernel(iter, value); + return self; +} + TensorIterator addcmul_meta( const Tensor& self, const Tensor& tensor1, diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 0ab03111c..fccecbb53 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -164,7 +164,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "adaptive_max_pool2d.out", "adaptive_max_pool3d_backward.grad_input", "adaptive_max_pool3d.out", - "addcdiv.out", "aminmax.out", "angle", "argmin.out", diff --git a/src/ATen/native/xpu/sycl/DistanceKernels.cpp b/src/ATen/native/xpu/sycl/DistanceKernels.cpp index 689bf1e42..b1780c082 100644 --- a/src/ATen/native/xpu/sycl/DistanceKernels.cpp +++ b/src/ATen/native/xpu/sycl/DistanceKernels.cpp @@ -321,72 +321,69 @@ void cdist_kernel( const int64_t r2 = x2_expanded.size(-2); const int64_t m = x1_expanded.size(-1); - AT_DISPATCH_FLOATING_TYPES( - x1_expanded.scalar_type(), - "cdist_xpu", - [&] { - if (p == 0.0) { - 
-          launch_cdist_forward_kernel<scalar_t, dists_zero<scalar_t>, 0>(
-              result,
-              x1_expanded,
-              x2_expanded,
-              p,
-              r1,
-              r2,
-              m,
-              r1 * r2,
-              r1 * m,
-              r2 * m);
-        } else if (p == 1.0) {
-          launch_cdist_forward_kernel<scalar_t, dists_one<scalar_t>, 1>(
-              result,
-              x1_expanded,
-              x2_expanded,
-              p,
-              r1,
-              r2,
-              m,
-              r1 * r2,
-              r1 * m,
-              r2 * m);
-        } else if (p == 2.0) {
-          launch_cdist_forward_kernel<scalar_t, dists_two<scalar_t>, 2>(
-              result,
-              x1_expanded,
-              x2_expanded,
-              p,
-              r1,
-              r2,
-              m,
-              r1 * r2,
-              r1 * m,
-              r2 * m);
-        } else if (std::isinf(p)) {
-          launch_cdist_forward_kernel<scalar_t, dists_inf<scalar_t>, 3>(
-              result,
-              x1_expanded,
-              x2_expanded,
-              p,
-              r1,
-              r2,
-              m,
-              r1 * r2,
-              r1 * m,
-              r2 * m);
-        } else {
-          launch_cdist_forward_kernel<scalar_t, dists_p<scalar_t>, 4>(
-              result,
-              x1_expanded,
-              x2_expanded,
-              p,
-              r1,
-              r2,
-              m,
-              r1 * r2,
-              r1 * m,
-              r2 * m);
-        }
-      });
+  AT_DISPATCH_FLOATING_TYPES(x1_expanded.scalar_type(), "cdist_xpu", [&] {
+    if (p == 0.0) {
+      launch_cdist_forward_kernel<scalar_t, dists_zero<scalar_t>, 0>(
+          result,
+          x1_expanded,
+          x2_expanded,
+          p,
+          r1,
+          r2,
+          m,
+          r1 * r2,
+          r1 * m,
+          r2 * m);
+    } else if (p == 1.0) {
+      launch_cdist_forward_kernel<scalar_t, dists_one<scalar_t>, 1>(
+          result,
+          x1_expanded,
+          x2_expanded,
+          p,
+          r1,
+          r2,
+          m,
+          r1 * r2,
+          r1 * m,
+          r2 * m);
+    } else if (p == 2.0) {
+      launch_cdist_forward_kernel<scalar_t, dists_two<scalar_t>, 2>(
+          result,
+          x1_expanded,
+          x2_expanded,
+          p,
+          r1,
+          r2,
+          m,
+          r1 * r2,
+          r1 * m,
+          r2 * m);
+    } else if (std::isinf(p)) {
+      launch_cdist_forward_kernel<scalar_t, dists_inf<scalar_t>, 3>(
+          result,
+          x1_expanded,
+          x2_expanded,
+          p,
+          r1,
+          r2,
+          m,
+          r1 * r2,
+          r1 * m,
+          r2 * m);
+    } else {
+      launch_cdist_forward_kernel<scalar_t, dists_p<scalar_t>, 4>(
+          result,
+          x1_expanded,
+          x2_expanded,
+          p,
+          r1,
+          r2,
+          m,
+          r1 * r2,
+          r1 * m,
+          r2 * m);
+    }
+  });
 }
 
 } // namespace at::native::xpu
diff --git a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp
index b36e53ee0..6882ce902 100644
--- a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp
+++ b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp
@@ -1,4 +1,5 @@
 #include
+#include
 #include
 #include
 
@@ -6,14 +7,14 @@
 namespace at::native::xpu {
 
 template <typename scalar_t>
-struct AddcmulKernelFunctor {
+struct AddcmulFunctor {
   using opmath_t = at::opmath_type<scalar_t>;
   scalar_t operator()(scalar_t a, scalar_t b, scalar_t c) const {
     return static_cast<opmath_t>(a) +
         alpha_ * static_cast<opmath_t>(b) * static_cast<opmath_t>(c);
   }
 
-  AddcmulKernelFunctor(opmath_t alpha) : alpha_(alpha) {}
+  AddcmulFunctor(opmath_t alpha) : alpha_(alpha) {}
 
  private:
   opmath_t alpha_;
@@ -28,9 +29,57 @@ void addcmul_kernel(TensorIterator& iter, Scalar value) {
       [&]() {
         using opmath_t = at::opmath_type<scalar_t>;
         auto alpha = value.to<opmath_t>();
-        AddcmulKernelFunctor<scalar_t> f(alpha);
+        AddcmulFunctor<scalar_t> f(alpha);
         gpu_kernel(iter, f);
       });
 }
 
+template <typename scalar_t>
+struct AddcdivFunctor {
+  using accscalar_t = at::acc_type<scalar_t, true>;
+  scalar_t operator()(scalar_t a, scalar_t b, scalar_t c) const {
+    return a + alpha_ * (b / static_cast<accscalar_t>(c));
+  }
+
+  AddcdivFunctor(accscalar_t alpha) : alpha_(alpha) {}
+
+ private:
+  accscalar_t alpha_;
+};
+
+template <typename scalar_t>
+struct AddcdivComplexFunctor {
+  scalar_t operator()(scalar_t a, scalar_t b, scalar_t c) const {
+    return a + alpha_ * (b / c);
+  }
+
+  AddcdivComplexFunctor(scalar_t alpha) : alpha_(alpha) {}
+
+ private:
+  scalar_t alpha_;
+};
+
+void addcdiv_kernel(TensorIterator& iter, Scalar value) {
+  auto dtype = iter.common_dtype();
+  if (at::isComplexType(dtype)) {
+    AT_DISPATCH_COMPLEX_TYPES(dtype, "addcdiv_xpu", [&]() {
+      auto alpha = value.to<scalar_t>();
+      AddcdivComplexFunctor<scalar_t> f(alpha);
+      gpu_kernel(iter, f);
+    });
+  } else {
+    AT_DISPATCH_ALL_TYPES_AND2(
+        at::ScalarType::Half,
+        at::ScalarType::BFloat16,
+        iter.dtype(),
+        "addcdiv_xpu",
+        [&]() {
+          using accscalar_t = at::acc_type<scalar_t, true>;
+          auto alpha = value.to<accscalar_t>();
+          AddcdivFunctor<scalar_t> f(alpha);
+          gpu_kernel(iter, f);
+        });
+  }
+}
+
 } // namespace at::native::xpu
diff --git a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h
index 986fa4667..4f2e2b19d 100644
--- a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h
+++ b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h
@@ -6,4 +6,6 @@ namespace at::native::xpu {
 
 void addcmul_kernel(TensorIterator& iter, Scalar value);
 
+void addcdiv_kernel(TensorIterator& iter, Scalar value);
+
 } // namespace at::native::xpu
diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py
index eecc5e89b..46385b8e1 100644
--- a/test/xpu/xpu_test_utils.py
+++ b/test/xpu/xpu_test_utils.py
@@ -30,6 +30,7 @@
     "bitwise_or",
     "bitwise_xor",
     "addcmul",
+    "addcdiv",
    "clamp",
     "clamp_max",
     "clamp_min",
diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml
index 595337c75..4d320d260 100644
--- a/yaml/xpu_functions.yaml
+++ b/yaml/xpu_functions.yaml
@@ -410,6 +410,9 @@ supported:
   - acosh.out
   - addr
   - addr.out
+  - addcdiv.out
+  - addcdiv
+  - addcdiv_
   - addcmul.out
   - addcmul
   - addcmul_

From 8e60307d97055a9090e54c031a412a7a6f8b40d6 Mon Sep 17 00:00:00 2001
From: Feng Yuan
Date: Mon, 8 Jul 2024 21:19:18 +0800
Subject: [PATCH 2/2] Fixing compilation issues

---
 src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp
index 3179ae6bf..d38f511d7 100644
--- a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp
+++ b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp
@@ -27,7 +27,7 @@ struct AddcmulComplexFunctor {
     return a + alpha_ * b * c;
   }
 
-  AddcmulComplexFunctor(accscalar_t alpha) : alpha_(alpha) {}
+  AddcmulComplexFunctor(scalar_t alpha) : alpha_(alpha) {}
 
  private:
   scalar_t alpha_;
@@ -47,7 +47,7 @@ void addcmul_kernel(TensorIterator& iter, Scalar value) {
       iter.dtype(),
       "addcmul_xpu",
       [&]() {
-        using accscalar_t = at::accscalar_type;
+        using accscalar_t = at::acc_type<scalar_t, true>;
         auto alpha = value.to<accscalar_t>();
         gpu_kernel(iter, AddcmulFunctor<scalar_t>(alpha));
       });
@@ -102,6 +102,7 @@ void addcdiv_kernel(TensorIterator& iter, Scalar value) {
   }
 }
 
+template <typename scalar_t>
 struct MSEBackwardFunctor {
   scalar_t operator()(scalar_t a, scalar_t b, scalar_t c) const {
     return alpha_ * (a - b) * c;
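
Quick sanity check (not part of the patch or its test suite): the kernels above implement out = input + value * tensor1 / tensor2, with true division for floating and complex dtypes and a hard error for integer/bool tensor1 and tensor2, matching the TORCH_CHECK in addcdiv_meta. A minimal Python sketch of the expected behavior follows; the device-selection fallback, the tensor names, and the availability of an XPU-enabled PyTorch build are assumptions for illustration only.

    import torch

    # Hypothetical setup: fall back to CPU if this build has no XPU device.
    device = "xpu" if hasattr(torch, "xpu") and torch.xpu.is_available() else "cpu"

    x = torch.randn(4, device=device)
    t1 = torch.randn(4, device=device)
    t2 = torch.rand(4, device=device) + 0.1  # keep divisors away from zero

    # Functional form -> addcdiv / addcdiv.out entries in xpu_functions.yaml.
    out = torch.addcdiv(x, t1, t2, value=0.5)
    ref = x + 0.5 * t1 / t2
    assert torch.allclose(out, ref)

    # In-place form -> addcdiv_.
    x.addcdiv_(t1, t2, value=0.5)
    assert torch.allclose(x, ref)

    # Integer inputs are rejected, mirroring the check in addcdiv_meta.
    i1 = torch.ones(4, dtype=torch.int32, device=device)
    try:
        torch.addcdiv(i1, i1, i1)
    except RuntimeError as e:
        print("expected:", e)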