diff --git a/docs/reference/index.rst b/docs/reference/index.rst index c2b74eabee..d3271e5732 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -39,3 +39,4 @@ The MIOpen API library is structured as follows: * :doc:`ReLU <../doxygen/html/group___re_l_u>` (experimental) * :doc:`Kthvalue <../doxygen/html/group__kthvalue>` (experimental) * :doc:`GLU <../doxygen/html/group__glu>` (experimental) + * :doc:`Cumulative Reduction <./group___cumulative_reduction>` (experimental) diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 60d6fe6ce6..a9b6d94f9b 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -40,6 +40,7 @@ add_executable(MIOpenDriver dm_convfp16.cpp dm_convfp8.cpp dm_convint8.cpp + dm_cumulative_reduction.cpp dm_dropout.cpp dm_fusion.cpp dm_gemm.cpp diff --git a/driver/cumulative_reduction_driver.hpp b/driver/cumulative_reduction_driver.hpp new file mode 100644 index 0000000000..9a30a3690b --- /dev/null +++ b/driver/cumulative_reduction_driver.hpp @@ -0,0 +1,457 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include "driver.hpp" +#include "mloCumulativeReductionHost.hpp" +#include "tensor_driver.hpp" +#include "timer.hpp" + +#include <../test/ford.hpp> +#include <../test/verify.hpp> + +#include + +inline std::vector GetStrides(std::vector lengths, bool contiguous) +{ + if(!contiguous) + std::swap(lengths.front(), lengths.back()); + std::vector strides(lengths.size()); + strides.back() = 1; + for(int i = lengths.size() - 2; i >= 0; --i) + strides[i] = strides[i + 1] * lengths[i + 1]; + if(!contiguous) + std::swap(strides.front(), strides.back()); + return strides; +} + +template +class CumulativeReductionDriver : public Driver +{ +public: + CumulativeReductionDriver() : Driver() + { + miopenCreateTensorDescriptor(&inputDesc); + miopenCreateTensorDescriptor(&outputDesc); + miopenCreateTensorDescriptor(&indicesDesc); + + data_type = miopen_type{}; + } + + int AddCmdLineArgs() override; + int ParseCmdLineArgs(int argc, char* argv[]) override; + InputFlags& GetInputFlags() override { return inflags; } + + int GetandSetData() override; + + int AllocateBuffersAndCopy() override; + + int RunForwardGPU() override; + int RunForwardCPU(); + + int RunBackwardGPU() override; + int RunBackwardCPU(); + + Tref GetTolerance(); + int VerifyBackward() override; + int VerifyForward() override; + ~CumulativeReductionDriver() override + { + miopenDestroyTensorDescriptor(inputDesc); + miopenDestroyTensorDescriptor(outputDesc); + miopenDestroyTensorDescriptor(indicesDesc); + } + +private: + InputFlags inflags; + + int forw; + + miopenTensorDescriptor_t inputDesc; + miopenTensorDescriptor_t outputDesc; + miopenTensorDescriptor_t indicesDesc; + + std::unique_ptr input_dev; + std::unique_ptr output_dev; + std::unique_ptr indices_dev; + + std::vector input; + std::vector output; + std::vector indices; + + std::vector output_host; + std::vector indices_host; + + int dim; + bool exclusive; + bool reverse; + + miopenCumOp_t cumOp; +}; + +template +int CumulativeReductionDriver::ParseCmdLineArgs(int argc, char* argv[]) +{ + inflags.Parse(argc, argv); + + if(inflags.GetValueInt("time") == 1) + { + miopenEnableProfiling(GetHandle(), true); + } + + auto inTensorParam = inflags.GetValueTensor("input"); + auto input_length = inTensorParam.lengths; + if(input_length.empty()) + { + std::cout << "Tensor must not be empty"; + return miopenStatusBadParm; + } + + int contiguous = inflags.GetValueInt("Contiguous"); + if(contiguous != 0 && contiguous != 1) + { + std::cerr << "Error Tensor Contiguous should be 0 or 1" << std::endl; + return miopenStatusBadParm; + } + + std::vector cumOpList = { + MIOPEN_CUM_MAX, MIOPEN_CUM_MIN, MIOPEN_CUM_SUM, MIOPEN_CUM_PROD}; + int cumOpInt = inflags.GetValueInt("CumulativeOperation"); + bool valid = true; + for(auto op : cumOpList) + if(cumOpInt != static_cast(op)) + { + valid = false; + break; + } + if(valid) + { + std::cerr << "Error CumulativeOperation value should be in set {" << cumOpList << "}" + << std::endl; + return miopenStatusBadParm; + } + + return miopenStatusSuccess; +} + +template +int CumulativeReductionDriver::GetandSetData() +{ + dim = inflags.GetValueInt("dim"); + exclusive = (inflags.GetValueInt("exclusive") != 0); + reverse = (inflags.GetValueInt("reverse") != 0); + cumOp = (miopenCumOp_t)inflags.GetValueInt("CumulativeOperation"); + + auto lengths = inflags.GetValueTensor("input").lengths; + auto strides = GetStrides(lengths, inflags.GetValueInt("Contiguous") != 0); + + if(SetTensorNd(inputDesc, lengths, strides, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing input tensor: " + inflags.GetValueStr("input") + "."); + + if(SetTensorNd(outputDesc, lengths, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing output tensor"); + + if(SetTensorNd(indicesDesc, lengths, miopenInt64) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing indices tensor"); + + return miopenStatusSuccess; +} + +template +int CumulativeReductionDriver::AddCmdLineArgs() +{ + inflags.AddInputFlag( + "forw", 'F', "1", "Run only Forward CumulativeReduction (Default=1)", "int"); + inflags.AddTensorFlag("input", 'D', "256x4x256", "input tensor descriptor"); + inflags.AddInputFlag( + "dim", 'd', "0", "The dimension to do the operation over (Default=0)", "int"); + inflags.AddInputFlag("exclusive", + 'e', + "0", + "Enable exclusive calculation. 0 for False, 1 for True (Default=0)", + "int"); + inflags.AddInputFlag( + "reverse", + 'r', + "0", + "Reverse the calculation order to back to front. 0 for False, 1 for True (Default=0)", + "int"); + inflags.AddInputFlag("CumulativeOperation", + 'O', + "1", + "Operator used. 1 for Max, 2 for Min, 3 for Sum, 4 for Prod (Default=1)", + "int"); + inflags.AddInputFlag("Contiguous", + 'C', + "1", + "Is input tensor contiguous? (Default=1 for contiguous tensor)", + "int"); + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', "0", "Verify Each Layer (Default=0)", "int"); + inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +template +int CumulativeReductionDriver::AllocateBuffersAndCopy() +{ + size_t input_sz = miopen::deref(inputDesc).GetElementSpace(); + size_t output_sz = miopen::deref(outputDesc).GetElementSpace(); + size_t indices_sz = miopen::deref(indicesDesc).GetElementSpace(); + + uint32_t ctx = 0; + + input_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + output_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); + indices_dev = std::unique_ptr(new GPUMem(ctx, indices_sz, sizeof(int64_t))); + + input = std::vector(input_sz); + output = std::vector(output_sz, static_cast(0.0f)); + indices = std::vector(indices_sz, static_cast(-1)); + + output_host = std::vector(output_sz, static_cast(0.0f)); + indices_host = std::vector(indices_sz, static_cast(-1)); + + for(int i = 0; i < input_sz; i++) + input[i] = prng::gen_A_to_B(static_cast(-100), static_cast(100)); + + if(input_dev->ToGPU(GetStream(), input.data()) != 0) + { + std::cerr << "Error copying (input) to GPU, size: " << input_dev->GetSize() << std::endl; + return miopenStatusAllocFailed; + } + + if(output_dev->ToGPU(GetStream(), output.data()) != 0) + { + std::cerr << "Error copying (output) to GPU, size: " << output_dev->GetSize() << std::endl; + return miopenStatusAllocFailed; + } + + if(indices_dev->ToGPU(GetStream(), indices.data()) != 0) + { + std::cerr << "Error copying (indices) to GPU, size: " << indices_dev->GetSize() + << std::endl; + return miopenStatusAllocFailed; + } + + return miopenStatusSuccess; +} + +template +int CumulativeReductionDriver::RunForwardGPU() +{ + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenCumulativeReductionForward( + GetHandle(), + inputDesc, + input_dev->GetMem(), + outputDesc, + output_dev->GetMem(), + indicesDesc, + (cumOp == MIOPEN_CUM_MAX || cumOp == MIOPEN_CUM_MIN ? indices_dev->GetMem() : nullptr), + dim, + exclusive, + reverse, + cumOp); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Forward Cumulative Reduction Elapsed: " + << t.gettime_ms() / iter << " ms" << std::endl; + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Forward Cumulative Reduction Elapsed: " << kernel_average_time + << " ms" << std::endl; + } + + if(output_dev->FromGPU(GetStream(), output.data()) != 0) + { + std::cerr << "Error copying (output_dev) from GPU, size: " << output_dev->GetSize() + << std::endl; + return miopenStatusInternalError; + } + if(indices_dev->FromGPU(GetStream(), indices.data()) != 0) + { + std::cerr << "Error copying (indices_dev) from GPU, size: " << indices_dev->GetSize() + << std::endl; + return miopenStatusInternalError; + } + + return miopenStatusSuccess; +} + +template +int CumulativeReductionDriver::RunForwardCPU() +{ + int32_t mloStatus = miopenStatusSuccess; + + switch(cumOp) + { + case MIOPEN_CUM_MAX: + mloStatus = + mloCumulativeReductionForwardRunHost(inputDesc, + outputDesc, + indicesDesc, + input.data(), + output_host.data(), + indices_host.data(), + dim, + exclusive, + reverse); + break; + case MIOPEN_CUM_MIN: + mloStatus = + mloCumulativeReductionForwardRunHost(inputDesc, + outputDesc, + indicesDesc, + input.data(), + output_host.data(), + indices_host.data(), + dim, + exclusive, + reverse); + break; + case MIOPEN_CUM_SUM: + mloStatus = + mloCumulativeReductionForwardRunHost(inputDesc, + outputDesc, + indicesDesc, + input.data(), + output_host.data(), + nullptr, + dim, + exclusive, + reverse); + break; + case MIOPEN_CUM_PROD: + mloStatus = + mloCumulativeReductionForwardRunHost(inputDesc, + outputDesc, + indicesDesc, + input.data(), + output_host.data(), + nullptr, + dim, + exclusive, + reverse); + break; + default: + std::cout << "The CPU version of Cumulative Reduction with Operation code " << cumOp + << " has not been implemented" << std::endl; + mloStatus = miopenStatusNotImplemented; + break; + } + + return mloStatus; +} + +template +int CumulativeReductionDriver::RunBackwardGPU() +{ + return miopenStatusNotImplemented; +} + +template +int CumulativeReductionDriver::RunBackwardCPU() +{ + return miopenStatusNotImplemented; +} + +template +Tref CumulativeReductionDriver::GetTolerance() +{ + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. + if(std::is_same::value) + tolerance *= 8.0; + return tolerance; +} + +template +int CumulativeReductionDriver::VerifyForward() +{ + RunForwardCPU(); + const Tref tolerance = GetTolerance(); + auto error_output = miopen::rms_range(output_host, output); + auto error_indices = miopen::rms_range(indices_host, indices); + + if(!std::isfinite(error_output) || error_output > tolerance) + { + std::cout << "Forward Cumulative Reduction Output FAILED: " << error_output << " > " + << tolerance << std::endl; + return EC_VerifyFwd; + } + else + { + std::cout << "Forward Cumulative Reduction Output Verifies OK on CPU reference (" + << error_output << " < " << tolerance << ')' << std::endl; + } + + if(!std::isfinite(error_indices) || error_indices > tolerance) + { + std::cout << "Forward Cumulative Reduction Indices FAILED: " << error_indices << " > " + << tolerance << std::endl; + return EC_VerifyFwd; + } + else + { + std::cout << "Forward Cumulative Reduction Indices Verifies OK on CPU reference (" + << error_indices << " < " << tolerance << ')' << std::endl; + } + + return miopenStatusSuccess; +} + +template +int CumulativeReductionDriver::VerifyBackward() +{ + return miopenStatusNotImplemented; +} diff --git a/driver/dm_cumulative_reduction.cpp b/driver/dm_cumulative_reduction.cpp new file mode 100644 index 0000000000..09da7b45b5 --- /dev/null +++ b/driver/dm_cumulative_reduction.cpp @@ -0,0 +1,41 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "registry_driver_maker.hpp" +#include "cumulative_reduction_driver.hpp" + +static Driver* makeDriver(const std::string& base_arg) +{ + if(base_arg == "cum") + return new CumulativeReductionDriver(); + if(base_arg == "cumfp16") + return new CumulativeReductionDriver(); + if(base_arg == "cumbfp16") + return new CumulativeReductionDriver(); + return nullptr; +} + +REGISTER_DRIVER_MAKER(makeDriver); diff --git a/driver/driver.hpp b/driver/driver.hpp index d77d5d02d2..e04b985723 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -314,7 +314,8 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, " "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], rope[bfp16|fp16], " "prelu[bfp16|fp16], kthvalue[bfp16|fp16], glu[bfp16|fp16], softmarginloss[bfp16|fp16], " - "multimarginloss[bfp16|fp16]\n"); + "multimarginloss[bfp16|fp16], cum[bfp16|fp16]\n"); + exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -352,7 +353,7 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "kthvaluebfp16" && arg != "glu" && arg != "glufp16" && arg != "glubfp16" && arg != "softmarginloss" && arg != "softmarginlossfp16" && arg != "softmarginlossbfp16" && arg != "multimarginloss" && arg != "multimarginlossfp16" && arg != "multimarginlossbfp16" && - arg != "--version") + arg != "cum" && arg != "cumfp16" && arg != "cumbfp16" && arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff --git a/driver/mloCumulativeReductionHost.hpp b/driver/mloCumulativeReductionHost.hpp new file mode 100644 index 0000000000..9d7f7cf574 --- /dev/null +++ b/driver/mloCumulativeReductionHost.hpp @@ -0,0 +1,151 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include <../test/ford.hpp> + +#include +#include +#include + +#include + +inline constexpr void update() {} + +template +inline constexpr void update(T& a, T b, Ts&... c, Ts... d) +{ + a = b; + update(c..., d...); +} + +template +struct reduce_func_base +{ + reduce_func_base(){}; + virtual ~reduce_func_base(){}; + virtual inline bool isbetter(const T& /*a*/, const T& /*b*/) const { return false; } + virtual inline void combine(T& a, T b) const { a = b; } + inline constexpr void calculate(T& a, T b, Ts&... c, Ts... d) const + { + if(!isbetter(a, b)) + { + combine(a, b); + update(c..., d...); + } + } +}; + +template +struct reduce_func : reduce_func_base +{ + virtual ~reduce_func(){}; +}; + +template +struct reduce_func : reduce_func_base +{ + const float START_VAL = -std::numeric_limits::max(); + inline bool isbetter(const T& a, const T& b) const { return a > b; } +}; + +template +struct reduce_func : reduce_func_base +{ + const float START_VAL = std::numeric_limits::max(); + inline bool isbetter(const T& a, const T& b) const { return a < b; } +}; + +template +struct reduce_func : reduce_func_base +{ + const float START_VAL = 0.0f; + inline void combine(T& a, T b) const { a += b; } +}; + +template +struct reduce_func : reduce_func_base +{ + const float START_VAL = 1.0f; + inline void combine(T& a, T b) const { a *= b; } +}; + +template +int32_t mloCumulativeReductionForwardRunHost(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + const miopenTensorDescriptor_t indicesDesc, + const Tgpu* input, + Tcheck* output_host, + int64_t* indices_host, + const int dim, + const bool exclusive, + const bool reverse) +{ + const int ndims = miopen::deref(inputDesc).GetNumDims(); + const auto exec_dim = ((dim % ndims) + ndims) % ndims; + + auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); + auto output_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputDesc)); + auto indices_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(indicesDesc)); + + auto size = miopen::deref(inputDesc).GetElementSize(); + auto inner_size = miopen::deref(inputDesc).GetLengths()[exec_dim]; + auto outer_size = size / inner_size; + + auto op_worker = reduce_func{}; + + tensor_view_t<5> ignore_dim_input_tv = input_tv; + ignore_dim_input_tv.size[exec_dim] = 1; + + par_ford(outer_size)([&](int64_t gid) { + auto tensor_layout = tensor_layout_t<5>(ignore_dim_input_tv, gid); + float cum_val = op_worker.START_VAL; + int64_t cum_idx = (reverse ? input_tv.size[exec_dim] - 1 : 0); + + ford(inner_size)([&](int64_t idx) { + int64_t tmp_idx = + (reverse ? input_tv.size[exec_dim] - (idx - exclusive) - 1 : (idx - exclusive)); + float tmp_val = op_worker.START_VAL; + if(0 <= tmp_idx && tmp_idx < inner_size) + { + tensor_layout.layout[exec_dim] = tmp_idx; + tmp_val = static_cast(input[input_tv.get_tensor_view_idx(tensor_layout)]); + } + + op_worker.calculate(cum_val, tmp_val, cum_idx, tmp_idx); + + tensor_layout.layout[exec_dim] = (reverse ? input_tv.size[exec_dim] - idx - 1 : idx); + if(output_host) + output_host[output_tv.get_tensor_view_idx(tensor_layout)] = + static_cast(cum_val); + if(indices_host) + indices_host[indices_tv.get_tensor_view_idx(tensor_layout)] = cum_idx; + }); + }); + + return miopenStatusSuccess; +} diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 67652ab832..9f70a3ef31 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -72,6 +72,7 @@ * @defgroup ReduceCalculation * @defgroup RotaryPositionalEmbeddings * @defgroup ReLU + * @defgroup CumulativeReduction * */ @@ -8176,6 +8177,61 @@ MIOPEN_EXPORT miopenStatus_t miopenMultiMarginLossForward(miopenHandle_t handle, // CLOSEOUT LossFunction DOXYGEN GROUP #endif // MIOPEN_BETA_API +#ifdef MIOPEN_BETA_API + +/*! @ingroup CumulativeReduction + * @enum miopenCumOp_t + * Cumulative Reduction operation types + */ +using miopenCumOp_t = enum { + MIOPEN_CUM_MAX = 1, /*!< the operation is getting the maximun value and index of the reduced + elements */ + MIOPEN_CUM_MIN = 2, /*!< the operation is getting the minimum value and index of the reduced + elements */ + MIOPEN_CUM_SUM = 3, /*!< the operation is getting the sum of the reduced elements */ + MIOPEN_CUM_PROD = 4, /*!< the operation is getting the product of the reduced elements */ +}; +#endif // MIOPEN_BETA_API + +#ifdef MIOPEN_BETA_API + +// CumulativeReduction APIs +/** @addtogroup CumulativeReduction + * + * @{ + */ + +/*! @brief Execute a Cumulative Reduction forward layer + * + * @param handle MIOpen handle (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param input Data tensor input (input) + * @param outputDesc Tensor descriptor for output tensor (input) + * @param output Data tensor output, using `nullptr` if it is not needed (output) + * @param indicesDesc Tensor descriptor for indices tensor (input) + * @param indices Data tensor indices, using `nullptr` if it is not needed (output) + * @param dim The dimension to do the operation over (input) + * @param exclusive Exclusive operation (input) + * @param reverse Reverse the operation order (input) + * @param cumOp Enumerant specifying the operation used by Cumulative Reduction + * (input) + */ +MIOPEN_EXPORT miopenStatus_t miopenCumulativeReductionForward(miopenHandle_t handle, + miopenTensorDescriptor_t inputDesc, + const void* input, + miopenTensorDescriptor_t outputDesc, + void* output, + miopenTensorDescriptor_t indicesDesc, + void* indices, + int dim, + bool exclusive, + bool reverse, + miopenCumOp_t cumOp); + +/** @} */ +// CLOSEOUT CUMULATIVE REDUCTION DOXYGEN GROUP +#endif // MIOPEN_BETA_API + #ifdef __cplusplus } #endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 92e4f4264a..0192f2a6b8 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -112,6 +112,8 @@ set( MIOpen_Source convolution_api.cpp ctc.cpp ctc_api.cpp + cumulative_reduction/problem_description.cpp + cumulative_reduction_api.cpp db.cpp db_record.cpp driver_arguments.cpp @@ -302,6 +304,8 @@ set( MIOpen_Source solver/conv_ck_igemm_fwd_bias_res_add_activ_fused.cpp solver/conv_ocl_dir2Dfwd_fused.cpp solver/conv_winoRxS_fused.cpp + solver/cumulative_reduction/forward_cumulative_reduction_contiguous_lastdim.cpp + solver/cumulative_reduction/utils.cpp solver/glu/backward_glu.cpp solver/glu/forward_glu.cpp solver/groupnorm/forward_groupnorm.cpp @@ -473,6 +477,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/Conv_Winograd_v30_3_1_metadata.inc kernels/MIOpenReduceCalculation.hpp kernels/MIOpenReduceExtreme.hpp + kernels/MIOpenCumulativeReduction.hpp kernels/bfloat16_dev.hpp kernels/block_reduce.hpp kernels/conv_common.inc @@ -531,6 +536,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/MIOpenConvDirUni.cl kernels/MIOpenConvDirBatchNormActiv.cl kernels/MIOpenConvDirGenFwd.cl + kernels/MIOpenCumulativeReduction.cpp kernels/MIOpenGLU.cpp kernels/MIOpenGroupNorm.cpp kernels/MIOpenGetitem.cpp @@ -669,6 +675,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN adam.cpp addlayernorm.cpp cat.cpp + cumulative_reduction.cpp exec_utils.cpp groupnorm.cpp getitem.cpp diff --git a/src/cumulative_reduction.cpp b/src/cumulative_reduction.cpp new file mode 100644 index 0000000000..c6dae2f420 --- /dev/null +++ b/src/cumulative_reduction.cpp @@ -0,0 +1,78 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#include +#include + +namespace miopen { +namespace cumulative_reduction { + +miopenStatus_t CumulativeReductionForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const TensorDescriptor& indicesDesc, + Data_t indices, + const int dim, + const bool exclusive, + const bool reverse, + const miopenCumOp_t cumOp) +{ + const auto problem = cumulative_reduction::ForwardProblemDescription{ + inputDesc, outputDesc, indicesDesc, dim, cumOp}; + + const auto invoke_params = [&]() { + auto tmp = cumulative_reduction::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.inputDesc = &inputDesc; + tmp.outputDesc = &outputDesc; + tmp.indicesDesc = &indicesDesc; + tmp.input = input; + tmp.output = output; + tmp.indices = indices; + + tmp.dim = dim; + tmp.exclusive = exclusive; + tmp.reverse = reverse; + + return tmp; + }(); + + const auto algo = AlgorithmName{"CumulativeReductionForward"}; + const auto solvers = + solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace cumulative_reduction +} // namespace miopen diff --git a/src/cumulative_reduction/problem_description.cpp b/src/cumulative_reduction/problem_description.cpp new file mode 100644 index 0000000000..4ef94ef138 --- /dev/null +++ b/src/cumulative_reduction/problem_description.cpp @@ -0,0 +1,72 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include + +#include + +namespace miopen { + +namespace cumulative_reduction { + +bool checkSameLength(const TensorDescriptor& x, const TensorDescriptor& y) +{ + if(x.GetNumDims() != y.GetNumDims()) + return false; + for(int i = 0; i < x.GetNumDims(); ++i) + { + if(x.GetLengths()[i] != y.GetLengths()[i]) + return false; + } + return true; +} + +NetworkConfig ForwardProblemDescription::MakeNetworkConfig() const +{ + auto input_dtype = inputDesc.GetType(); + auto output_dtype = outputDesc.GetType(); + auto size = inputDesc.GetElementSize(); + auto inner_size = inputDesc.GetLengths()[dim]; + auto outer_size = size / inner_size; + + std::ostringstream ss; + + ss << "cum_reduc_fwd"; + ss << "idtype" << input_dtype; + ss << "odtype" << output_dtype; + ss << "outer" << outer_size; + ss << "inner" << inner_size; + ss << "op" << cumOp; + ss << "packed" << IsAllPacked(); + ss << "dimstride1" << IsAllDimStride1(); + + return NetworkConfig{ss.str()}; +} + +} // namespace cumulative_reduction + +} // namespace miopen diff --git a/src/cumulative_reduction_api.cpp b/src/cumulative_reduction_api.cpp new file mode 100644 index 0000000000..80185a75cb --- /dev/null +++ b/src/cumulative_reduction_api.cpp @@ -0,0 +1,108 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include + +static void LogCmdCumulativeReduction(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + const miopenTensorDescriptor_t indicesDesc, + const int dim, + const bool exclusive, + const bool reverse, + const miopenCumOp_t cumOp, + const bool is_fwd) +{ + if(miopen::IsLoggingCmd()) + { + std::stringstream ss; + auto dtype = miopen::deref(inputDesc).GetType(); + if(dtype == miopenHalf) + { + ss << "cumulative_reductionfp16"; + } + else if(dtype == miopenFloat) + { + ss << "cumulative_reductionfp32"; + } + else if(dtype == miopenBFloat16) + { + ss << "cumulative_reductionbfp16"; + } + + MIOPEN_LOG_FUNCTION(inputDesc, outputDesc, indicesDesc); + ss << " -d " << dim; + ss << " --excl " << exclusive; + ss << " --rev " << reverse; + ss << " --op " << cumOp; + ss << " -F " << ((is_fwd) ? "1" : "2"); + + MIOPEN_LOG_DRIVER_CMD(ss.str()); + } +} + +extern "C" miopenStatus_t +miopenCumulativeReductionForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const miopenTensorDescriptor_t indicesDesc, + void* indices, + const int dim, + const bool exclusive, + const bool reverse, + const miopenCumOp_t cumOp) +{ + MIOPEN_LOG_FUNCTION(handle, + inputDesc, + input, + outputDesc, + output, + indicesDesc, + indices, + dim, + exclusive, + reverse, + cumOp); + + LogCmdCumulativeReduction( + inputDesc, outputDesc, indicesDesc, dim, exclusive, reverse, cumOp, true); + return miopen::try_([&] { + miopen::cumulative_reduction::CumulativeReductionForward(miopen::deref(handle), + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(outputDesc), + DataCast(output), + miopen::deref(indicesDesc), + DataCast(indices), + dim, + exclusive, + reverse, + cumOp); + }); +} diff --git a/src/include/miopen/cumulative_reduction.hpp b/src/include/miopen/cumulative_reduction.hpp new file mode 100644 index 0000000000..fc1ac43162 --- /dev/null +++ b/src/include/miopen/cumulative_reduction.hpp @@ -0,0 +1,50 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include + +namespace miopen { +struct Handle; +struct TensorDescriptor; + +namespace cumulative_reduction { + +MIOPEN_INTERNALS_EXPORT miopenStatus_t +CumulativeReductionForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const TensorDescriptor& indicesDesc, + Data_t indices, + int dim, + bool exclusive, + bool reverse, + miopenCumOp_t cumOp); + +} // namespace cumulative_reduction +} // namespace miopen diff --git a/src/include/miopen/cumulative_reduction/invoke_params.hpp b/src/include/miopen/cumulative_reduction/invoke_params.hpp new file mode 100644 index 0000000000..fac69df6f0 --- /dev/null +++ b/src/include/miopen/cumulative_reduction/invoke_params.hpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include +#include + +namespace miopen { + +namespace cumulative_reduction { + +struct InvokeParams : public miopen::InvokeParams +{ + InvokeParams() = default; + + const TensorDescriptor* inputDesc = nullptr; + const TensorDescriptor* outputDesc = nullptr; + const TensorDescriptor* indicesDesc = nullptr; + + ConstData_t input = nullptr; + Data_t output = nullptr; + Data_t indices = nullptr; + + int dim = 0; + bool exclusive = false; + bool reverse = false; + + std::size_t GetWorkspaceSize() const { return 0; } + Data_t GetWorkspace() const { return nullptr; } +}; + +} // namespace cumulative_reduction + +} // namespace miopen diff --git a/src/include/miopen/cumulative_reduction/problem_description.hpp b/src/include/miopen/cumulative_reduction/problem_description.hpp new file mode 100644 index 0000000000..1cd2cf621c --- /dev/null +++ b/src/include/miopen/cumulative_reduction/problem_description.hpp @@ -0,0 +1,132 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include +#include +#include + +#include + +namespace miopen { + +struct NetworkConfig; + +namespace cumulative_reduction { + +bool checkSameLength(const TensorDescriptor& x, const TensorDescriptor& y); + +struct ForwardProblemDescription : ProblemDescriptionBase +{ + ForwardProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& outputDesc_, + const TensorDescriptor& indicesDesc_, + const int& dim_, + const miopenCumOp_t& cumOp_) + : inputDesc(inputDesc_), + outputDesc(outputDesc_), + indicesDesc(indicesDesc_), + dim(dim_), + cumOp(cumOp_) + { + if(IsValidDim()) + dim = (dim < 0 ? dim + inputDesc.GetNumDims() : dim); + IsValidIndicesType(); + IsSameLength(); + } + + const TensorDescriptor& GetInputDesc() const { return inputDesc; } + const TensorDescriptor& GetOutputDesc() const { return outputDesc; } + const TensorDescriptor& GetIndicesDesc() const { return indicesDesc; } + const int& GetDim() const { return dim; } + const miopenCumOp_t& GetCumOp() const { return cumOp; } + + bool IsValidDim() const + { + const int ndims = inputDesc.GetNumDims(); + if(dim < -ndims || ndims - 1 < dim) + { + MIOPEN_THROW(miopenStatusBadParm, + (std::stringstream() + << "Cumulative Reduction: Operating dim value must be in range [" + << -ndims << "," << ndims - 1 << "].") + .str()); + } + return true; + } + + bool IsValidIndicesType() const + { + if(indicesDesc.GetElementSize() > 0 && indicesDesc.GetType() != miopenInt64) + MIOPEN_THROW(miopenStatusBadParm, + "Cumulative Reduction: Indices tensor type must be int64."); + return true; + } + + bool IsSameLength() const + { + if(outputDesc.GetElementSize() > 0 && !checkSameLength(inputDesc, outputDesc)) + MIOPEN_THROW(miopenStatusBadParm, + "Cumulative Reduction: Input and Output tensor sizes do not match."); + if(indicesDesc.GetElementSize() > 0 && !checkSameLength(inputDesc, indicesDesc)) + MIOPEN_THROW(miopenStatusBadParm, + "Cumulative Reduction: Input and Indices tensor sizes do not match."); + return true; + } + + bool IsAllPacked() const + { + if(!inputDesc.IsPacked() || !outputDesc.IsPacked() || !indicesDesc.IsPacked()) + return false; + return true; + } + + bool IsAllDimStride1() const + { + if(inputDesc.GetStrides()[dim] != 1) + return false; + if(outputDesc.GetElementSize() > 0 && outputDesc.GetStrides()[dim] != 1) + return false; + if(indicesDesc.GetElementSize() > 0 && indicesDesc.GetStrides()[dim] != 1) + return false; + return true; + } + + NetworkConfig MakeNetworkConfig() const override; + +private: + TensorDescriptor inputDesc; + TensorDescriptor outputDesc; + TensorDescriptor indicesDesc; + int dim; + miopenCumOp_t cumOp; + + NetworkConfig MakeForwardNetworkConfig() const; +}; + +} // namespace cumulative_reduction + +} // namespace miopen diff --git a/src/include/miopen/cumulative_reduction/solvers.hpp b/src/include/miopen/cumulative_reduction/solvers.hpp new file mode 100644 index 0000000000..1e78b8851d --- /dev/null +++ b/src/include/miopen/cumulative_reduction/solvers.hpp @@ -0,0 +1,59 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include +#include + +namespace miopen { + +namespace solver { + +namespace cumulative_reduction { + +using ForwardSolverBase = + NonTunableSolverBase; + +struct ForwardContiguousLastDim final : ForwardSolverBase +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable( + const ExecutionContext& context, + const miopen::cumulative_reduction::ForwardProblemDescription& problem) const override; + ConvSolution GetSolution( + const ExecutionContext& context, + const miopen::cumulative_reduction::ForwardProblemDescription& problem) const override; +}; + +} // namespace cumulative_reduction + +} // namespace solver + +} // namespace miopen diff --git a/src/include/miopen/cumulative_reduction/utils.hpp b/src/include/miopen/cumulative_reduction/utils.hpp new file mode 100644 index 0000000000..9137019ff4 --- /dev/null +++ b/src/include/miopen/cumulative_reduction/utils.hpp @@ -0,0 +1,45 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include +#include +#include +#include + +namespace miopen { +namespace solver { +namespace cumulative_reduction { + +KernelInfo make_hip_kernel(std::vector localsize, + std::vector gridsize, + std::string kernel_file, + std::string kernel_name, + KernelBuildParameters build_params); + +} // namespace cumulative_reduction +} // namespace solver +} // namespace miopen diff --git a/src/kernels/MIOpenCumulativeReduction.cpp b/src/kernels/MIOpenCumulativeReduction.cpp new file mode 100644 index 0000000000..a6567d441c --- /dev/null +++ b/src/kernels/MIOpenCumulativeReduction.cpp @@ -0,0 +1,144 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" +#include "MIOpenCumulativeReduction.hpp" + +template +__device__ inline void CumulativeReductionScan(const bool& reverse, + const uint64_t& lid, + FLOAT_ACCUM* __restrict__ a, + Ts* __restrict__... b) +{ + // reduction + uint64_t stride = 1; + while(stride <= LOCAL_SIZE) + { + uint64_t idx = (lid + 1) * stride * 2 - 1; + if(idx < LOCAL_SIZE) + reduce_func{}.calculate( + !reverse, a[idx], a[idx - stride], b[idx]..., b[idx - stride]...); + stride *= 2; + __syncthreads(); + } + + // post scan + stride = LOCAL_SIZE / 2; + while(stride > 0) + { + uint64_t idx = (lid + 1) * stride * 2 - 1; + if((idx + stride) < LOCAL_SIZE) + reduce_func{}.calculate( + !reverse, a[idx + stride], a[idx], b[idx + stride]..., b[idx]...); + stride /= 2; + __syncthreads(); + } +} + +template +__device__ void CumulativeReductionForwardContiguousLastDim(const TI* __restrict__ input, + TO* __restrict__ output, + int64_t* __restrict__ indices, + const uint64_t reduce_size, + const bool exclusive, + const bool reverse) +{ + /* + * input = packed tensor with stride[last_dim]=1, output: the same as input, indices: the same +as input + * reduce_size = input.size[last_dim] + * exclusive: TRUE to exclude input[i] when calculate output[i] + * reverse: reverse the operating order + * + * cumulative dimension = last dim + * blockSize = {1, LOCAL_SIZE} + * gridSize = {Number of input elements / input.size[last_dim], input.size[last_dim]} + */ + + __shared__ FLOAT_ACCUM otmp[LOCAL_SIZE]; + int64_t* itmp = nullptr; + if(indices) + { + __shared__ int64_t _itmp[LOCAL_SIZE]; + itmp = _itmp; + } + + uint64_t lid = threadIdx.y; + + uint64_t xid = blockIdx.x * blockDim.x + threadIdx.x; + uint64_t yid = blockIdx.y * blockDim.y + threadIdx.y; + + if(exclusive <= yid && yid < reduce_size) + { + int64_t idx = (reverse ? reduce_size - static_cast(yid) + exclusive - 1 + : static_cast(yid) - exclusive); + otmp[lid] = CVT_FLOAT2ACCUM(input[xid * reduce_size + idx]); + if(indices) + itmp[lid] = idx; + } + else + { + otmp[lid] = reduce_func{}.START_VAL; + if(indices) + itmp[lid] = (reverse ? reduce_size - static_cast(yid) + exclusive - 1 + : static_cast(yid) - exclusive); + } + __syncthreads(); + + if(indices) + CumulativeReductionScan(reverse, lid, otmp, itmp); + else + CumulativeReductionScan(reverse, lid, otmp); + + if(yid < reduce_size) + { + int64_t idx = + (reverse ? reduce_size - static_cast(yid) - 1 : static_cast(yid)); + if(output) + output[xid * reduce_size + idx] = CVT_ACCUM2FLOAT(otmp[lid]); + if(indices) + indices[xid * reduce_size + idx] = itmp[lid]; + } +} + +extern "C" __global__ void CumulativeReductionForwardContiguousLastDim(const INPUT_TYPE* input, + OUTPUT_TYPE* output, + int64_t* indices, + const uint64_t reduce_size, + const bool exclusive, + const bool reverse) +{ + // instantiate the kernel + CumulativeReductionForwardContiguousLastDim( + input, output, indices, reduce_size, exclusive, reverse); +} diff --git a/src/kernels/MIOpenCumulativeReduction.hpp b/src/kernels/MIOpenCumulativeReduction.hpp new file mode 100644 index 0000000000..7323642e8d --- /dev/null +++ b/src/kernels/MIOpenCumulativeReduction.hpp @@ -0,0 +1,116 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_KERNELS_MIOPEN_CUMULATIVE_REDUCTIONS_HPP +#define GUARD_KERNELS_MIOPEN_CUMULATIVE_REDUCTIONS_HPP + +#include "float_types.h" + +enum class CumulativeReductionOp_t +{ + Max = 1, + Min = 2, + Sum = 3, + Prod = 4, +}; + +#ifndef __HIP_DEVICE_COMPILE__ +static_assert(MIOPEN_CUM_MAX == static_cast(CumulativeReductionOp_t::Max)); +static_assert(MIOPEN_CUM_MIN == static_cast(CumulativeReductionOp_t::Min)); +static_assert(MIOPEN_CUM_SUM == static_cast(CumulativeReductionOp_t::Sum)); +static_assert(MIOPEN_CUM_PROD == static_cast(CumulativeReductionOp_t::Prod)); +#endif + +inline constexpr void update() {} +template +inline constexpr void update(T& a, T b, Ts&... c, Ts... d) +{ + a = b; + update(c..., d...); +} + +inline constexpr bool isgreater() { return false; } +template +inline constexpr bool isgreater(T& a, T b, Ts&... c, Ts... d) +{ + if(a != b) + return a > b; + return isgreater(c..., d...); +} + +template +struct reduce_func_base +{ + inline constexpr bool isbetter(const T& /*a*/, const T& /*b*/) { return false; } + inline constexpr void combine(T& a, T b) { a = b; } + inline constexpr void calculate(const bool keep_greater, T& a, T b, Ts&... c, Ts... d) + { + auto derived = static_cast(this); + if(!derived->isbetter(a, b)) + { + if(derived->isbetter(a, b) != derived->isbetter(b, a) || + isgreater(c..., d...) != keep_greater) + update(c..., d...); + derived->combine(a, b); + } + } +}; + +template +struct reduce_func; + +template +struct reduce_func + : reduce_func_base, T, Ts...> +{ + const FLOAT_ACCUM START_VAL = -MAX_VAL_ACCUM; + inline constexpr bool isbetter(const T& a, const T& b) { return a > b; } +}; + +template +struct reduce_func + : reduce_func_base, T, Ts...> +{ + const FLOAT_ACCUM START_VAL = MAX_VAL_ACCUM; + inline constexpr bool isbetter(const T& a, const T& b) { return a < b; } +}; + +template +struct reduce_func + : reduce_func_base, T, Ts...> +{ + const FLOAT_ACCUM START_VAL = CVT_FP32_2ACCUM(0.0f); + inline constexpr void combine(T& a, T b) { a += b; } +}; + +template +struct reduce_func + : reduce_func_base, T, Ts...> +{ + const FLOAT_ACCUM START_VAL = CVT_FP32_2ACCUM(1.0f); + inline constexpr void combine(T& a, T b) { a *= b; } +}; + +#endif // GUARD_GUARD_KERNELS_MIOPEN_CUMULATIVE_REDUCTIONS_HPP diff --git a/src/solver.cpp b/src/solver.cpp index 1f6873d5f7..87ec85a274 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -688,6 +689,7 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::RoPE, rope::RoPEForward{}.SolverDbId()); Register(registry, ++id, Primitive::RoPE, rope::RoPEBackward{}.SolverDbId()); + Register(registry, ++id, Primitive::ReLU, prelu::MultiWeightsBackward{}.SolverDbId()); Register(registry, ++id, Primitive::ReLU, prelu::SingleWeightBackward{}.SolverDbId()); Register(registry, ++id, Primitive::Kthvalue, kthvalue::KthvalueFwd{}.SolverDbId()); @@ -709,6 +711,11 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) multimarginloss::MultiMarginLossForward{}.SolverDbId()); Register(registry, ++id, Primitive::Mha, mha::MhaCKFlashAttentionV2Forward{}.SolverDbId()); + + Register(registry, + ++id, + Primitive::Reduce, + cumulative_reduction::ForwardContiguousLastDim{}.SolverDbId()); // IMPORTANT: New solvers should be added to the end of the function, and don't leave a white // space between this comment and the newly registered solver(s)! } diff --git a/src/solver/cumulative_reduction/forward_cumulative_reduction_contiguous_lastdim.cpp b/src/solver/cumulative_reduction/forward_cumulative_reduction_contiguous_lastdim.cpp new file mode 100644 index 0000000000..45c312f1a1 --- /dev/null +++ b/src/solver/cumulative_reduction/forward_cumulative_reduction_contiguous_lastdim.cpp @@ -0,0 +1,125 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#include +#include + +#define warpSizeCTX (context.GetStream().GetWavefrontWidth()) +#define LOCAL_SIZE_MAX 256 +#define LOCAL_SIZE_MIN warpSizeCTX + +namespace miopen { + +namespace solver { + +namespace cumulative_reduction { + +bool IsImprovementOverROCm(const ExecutionContext& /*context*/, + const miopen::cumulative_reduction::ForwardProblemDescription& problem) +{ + if(problem.GetInputDesc().GetLengths()[problem.GetDim()] > LOCAL_SIZE_MAX) + return false; + return true; +} + +bool ForwardContiguousLastDim::IsApplicable( + const ExecutionContext& context, + const miopen::cumulative_reduction::ForwardProblemDescription& problem) const +{ + if(!IsImprovementOverROCm(context, problem)) + return false; + if(!problem.IsAllPacked()) + return false; + if(!problem.IsAllDimStride1()) + return false; + return true; +} + +ConvSolution ForwardContiguousLastDim::GetSolution( + const ExecutionContext& context, + const miopen::cumulative_reduction::ForwardProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + auto dtype = problem.GetInputDesc().GetType(); + auto input_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType()); + auto cum_op = problem.GetCumOp(); + + auto size = problem.GetInputDesc().GetElementSize(); + auto inner_size = problem.GetInputDesc().GetLengths()[problem.GetDim()]; + auto outer_size = size / inner_size; + + // LOCAL_SIZE must be the smallest power of 2 that greater than inner_size and warpSize + auto local_size = LOCAL_SIZE_MIN; + while(local_size < inner_size) + local_size *= 2; + + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype}, + {"OP_TYPE", cum_op}, + {"REDUCE_SIZE", local_size}, + }; + + { + result.construction_params.push_back( + make_hip_kernel({1, local_size}, + {outer_size, inner_size}, + "MIOpenCumulativeReduction.cpp", + "CumulativeReductionForwardContiguousLastDim", + build_params)); + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + auto params = raw_params.CastTo(); + + const int ndims = deref(params.inputDesc).GetNumDims(); + const unsigned int true_dim = ((params.dim % ndims) + ndims) % ndims; + auto kernel = handle_.Run(kernels[0]); + kernel(params.input, + params.output, + params.indices, + static_cast(deref(params.inputDesc).GetLengths()[true_dim]), + params.exclusive, + params.reverse); + }; + }; + + return result; +} + +} // namespace cumulative_reduction +} // namespace solver +} // namespace miopen diff --git a/src/solver/cumulative_reduction/utils.cpp b/src/solver/cumulative_reduction/utils.cpp new file mode 100644 index 0000000000..6ee3b00a15 --- /dev/null +++ b/src/solver/cumulative_reduction/utils.cpp @@ -0,0 +1,54 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include + +namespace miopen { + +namespace solver { + +namespace cumulative_reduction { + +KernelInfo make_hip_kernel(std::vector localsize, + std::vector gridsize, + std::string kernel_file, + std::string kernel_name, + KernelBuildParameters build_params) +{ + while(localsize.size() < 3) + localsize.push_back(1); + while(gridsize.size() < 3) + gridsize.push_back(1); + for(int i = 0; i < localsize.size(); ++i) + gridsize[i] = AlignUp(gridsize[i], localsize[i]); + return KernelInfo{ + build_params.GenerateFor(kbp::HIP{}), localsize, gridsize, kernel_file, kernel_name}; +} + +} // namespace cumulative_reduction +} // namespace solver +} // namespace miopen diff --git a/test/cpu_cumulative_reduction.hpp b/test/cpu_cumulative_reduction.hpp new file mode 100644 index 0000000000..922a87af19 --- /dev/null +++ b/test/cpu_cumulative_reduction.hpp @@ -0,0 +1,145 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include "tensor_holder.hpp" +#include +#include + +#include + +inline constexpr void update() {} + +template +inline constexpr void update(T& a, T b, Ts&... c, Ts... d) +{ + a = b; + update(c..., d...); +} + +template +struct reduce_func_base +{ + reduce_func_base(){}; + virtual ~reduce_func_base(){}; + virtual inline bool isbetter(const T& /*a*/, const T& /*b*/) const { return false; } + virtual inline void combine(T& a, T b) const { a = b; } + inline constexpr void calculate(T& a, T b, Ts&... c, Ts... d) const + { + if(!isbetter(a, b)) + { + combine(a, b); + update(c..., d...); + } + } +}; + +template +struct reduce_func : reduce_func_base +{ + virtual ~reduce_func(){}; +}; + +template +struct reduce_func : reduce_func_base +{ + const float START_VAL = -std::numeric_limits::max(); + inline bool isbetter(const T& a, const T& b) const override { return a > b; } +}; + +template +struct reduce_func : reduce_func_base +{ + const float START_VAL = std::numeric_limits::max(); + inline bool isbetter(const T& a, const T& b) const override { return a < b; } +}; + +template +struct reduce_func : reduce_func_base +{ + const float START_VAL = 0.0f; + inline void combine(T& a, T b) const override { a += b; } +}; + +template +struct reduce_func : reduce_func_base +{ + const float START_VAL = 1.0f; + inline void combine(T& a, T b) const override { a *= b; } +}; + +template +void cpu_cumulative_reduction_forward(const tensor input, + tensor& ref_output, + tensor& ref_indices, + const int dim, + const bool exclusive, + const bool reverse, + const bool has_output = true, + const bool has_indices = true) +{ + const int ndims = input.desc.GetNumDims(); + const auto true_dim = ((dim % ndims) + ndims) % ndims; + + auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); + auto output_tv = miopen::get_inner_expanded_tv<5>(ref_output.desc); + auto indices_tv = miopen::get_inner_expanded_tv<5>(ref_indices.desc); + + auto size = input.desc.GetElementSize(); + auto inner_size = input.desc.GetLengths()[true_dim]; + auto outer_size = size / inner_size; + + auto op_worker = reduce_func{}; + + tensor_view_t<5> ignore_dim_input_tv = input_tv; + ignore_dim_input_tv.size[true_dim] = 1; + + par_ford(outer_size)([&](int64_t gid) { + auto tensor_layout = tensor_layout_t<5>(ignore_dim_input_tv, gid); + float cum_val = op_worker.START_VAL; + int64_t cum_idx = (reverse ? input_tv.size[true_dim] - 1 : 0); + + ford(inner_size)([&](int64_t idx) { + int64_t tmp_idx = + (reverse ? input_tv.size[true_dim] - (idx - exclusive) - 1 : (idx - exclusive)); + float tmp_val = op_worker.START_VAL; + if(0 <= tmp_idx && tmp_idx < inner_size) + { + tensor_layout.layout[true_dim] = tmp_idx; + tmp_val = static_cast(input[input_tv.get_tensor_view_idx(tensor_layout)]); + } + + op_worker.calculate(cum_val, tmp_val, cum_idx, tmp_idx); + + tensor_layout.layout[true_dim] = (reverse ? input_tv.size[true_dim] - idx - 1 : idx); + if(has_output) + ref_output[output_tv.get_tensor_view_idx(tensor_layout)] = static_cast(cum_val); + if(has_indices) + ref_indices[indices_tv.get_tensor_view_idx(tensor_layout)] = cum_idx; + }); + }); +} diff --git a/test/gtest/cumulative_reduction.cpp b/test/gtest/cumulative_reduction.cpp new file mode 100644 index 0000000000..e20afcf0b0 --- /dev/null +++ b/test/gtest/cumulative_reduction.cpp @@ -0,0 +1,81 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "cumulative_reduction.hpp" + +namespace cumulative_reduction { +using GPU_CumulativeReduction_fwd_FP32 = CumulativeReductionTest; +using GPU_CumulativeReduction_fwd_FP16 = CumulativeReductionTest; +using GPU_CumulativeReduction_fwd_BFP16 = CumulativeReductionTest; +} // namespace cumulative_reduction + +using namespace cumulative_reduction; + +TEST_P(GPU_CumulativeReduction_fwd_FP32, Test) +{ + RunTest(); + Verify(); +}; +TEST_P(GPU_CumulativeReduction_fwd_FP16, Test) +{ + RunTest(); + Verify(); +}; +TEST_P(GPU_CumulativeReduction_fwd_BFP16, Test) +{ + RunTest(); + Verify(); +}; + +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_CumulativeReduction_fwd_FP32, + testing::ValuesIn(CumulativeReductionSmokeTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_CumulativeReduction_fwd_FP16, + testing::ValuesIn(CumulativeReductionSmokeTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_CumulativeReduction_fwd_BFP16, + testing::ValuesIn(CumulativeReductionSmokeTestConfigs())); + +INSTANTIATE_TEST_SUITE_P(Perf, + GPU_CumulativeReduction_fwd_FP32, + testing::ValuesIn(CumulativeReductionPerfTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Perf, + GPU_CumulativeReduction_fwd_FP16, + testing::ValuesIn(CumulativeReductionPerfTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Perf, + GPU_CumulativeReduction_fwd_BFP16, + testing::ValuesIn(CumulativeReductionPerfTestConfigs())); + +INSTANTIATE_TEST_SUITE_P(Full, + GPU_CumulativeReduction_fwd_FP32, + testing::ValuesIn(CumulativeReductionFullTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, + GPU_CumulativeReduction_fwd_FP16, + testing::ValuesIn(CumulativeReductionFullTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, + GPU_CumulativeReduction_fwd_BFP16, + testing::ValuesIn(CumulativeReductionFullTestConfigs())); diff --git a/test/gtest/cumulative_reduction.hpp b/test/gtest/cumulative_reduction.hpp new file mode 100644 index 0000000000..b89c683512 --- /dev/null +++ b/test/gtest/cumulative_reduction.hpp @@ -0,0 +1,305 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "cpu_cumulative_reduction.hpp" +#include "get_handle.hpp" +#include "random.hpp" +#include "tensor_holder.hpp" +#include "verify.hpp" + +#include +#include +#include +#include + +#define FLOAT_ACCUM float + +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + +struct CumulativeReductionTestCase +{ + std::vector lengths; + miopenCumOp_t op; + int dim; + bool exclusive; + bool reverse; + bool contiguous; + + friend std::ostream& operator<<(std::ostream& os, const CumulativeReductionTestCase& tc) + { + return os << " Lengths:" << tc.lengths << " CumOp:" << tc.op << " Dim:" << tc.dim + << " Exclusive:" << (tc.exclusive ? "True" : "False") + << " Reverse:" << (tc.reverse ? "True" : "False") + << " Contiguous:" << (tc.contiguous ? "True" : "False"); + } +}; + +inline std::vector GetStrides(std::vector lengths, bool contiguous) +{ + if(!contiguous) + std::swap(lengths.front(), lengths.back()); + std::vector strides(lengths.size()); + strides.back() = 1; + for(int i = lengths.size() - 2; i >= 0; --i) + strides[i] = strides[i + 1] * lengths[i + 1]; + if(!contiguous) + std::swap(strides.front(), strides.back()); + return strides; +} + +inline std::vector +CumulativeReductionTestConfigs(const std::vector>& SizeList) +{ + std::vector tcs; + + std::vector ops = { + MIOPEN_CUM_MAX, MIOPEN_CUM_MIN, MIOPEN_CUM_SUM, MIOPEN_CUM_PROD}; + std::vector dims = {-1, 0}; + std::vector exclusives = {false, true}; + std::vector reverses = {false, true}; + std::vector contiguouss = {true, false}; + + auto&& handle = get_handle(); + for(const auto& lengths : SizeList) + { + auto out_strides = GetStrides(lengths, true); + auto indices_strides = GetStrides(lengths, true); + for(auto contiguous : contiguouss) + { + auto input_strides = GetStrides(lengths, contiguous); + for(auto op : ops) + { + for(auto dim : dims) + { + for(auto exclusive : exclusives) + { + if(exclusive && (op == MIOPEN_CUM_MAX || op == MIOPEN_CUM_MIN)) + continue; + for(auto reverse : reverses) + { + if(miopen::solver::cumulative_reduction::ForwardContiguousLastDim() + .IsApplicable( + miopen::ExecutionContext(&handle), + miopen::cumulative_reduction::ForwardProblemDescription( + miopen::TensorDescriptor( + miopen_type{}, lengths, input_strides), + miopen::TensorDescriptor( + miopen_type{}, lengths, out_strides), + miopen::TensorDescriptor( + miopen_type{}, lengths, indices_strides), + dim, + op))) + tcs.push_back({lengths, op, dim, exclusive, reverse, contiguous}); + } + } + } + } + } + } + + return tcs; +} + +inline std::vector> GetSmokeTestSize() +{ + return { + {10}, + {65, 100}, + {65}, + {70, 10}, + }; +} + +inline std::vector> GetSmokePerfSize() +{ + return { + {512, 64, 112}, + }; +} + +inline std::vector CumulativeReductionSmokeTestConfigs() +{ + return CumulativeReductionTestConfigs(GetSmokeTestSize()); +} + +inline std::vector CumulativeReductionPerfTestConfigs() +{ + return CumulativeReductionTestConfigs(GetSmokePerfSize()); +} + +inline std::vector CumulativeReductionFullTestConfigs() +{ + std::vector tcs; + + auto smoke_test = CumulativeReductionSmokeTestConfigs(); + auto perf_test = CumulativeReductionPerfTestConfigs(); + + tcs.reserve(smoke_test.size() + perf_test.size()); + for(const auto& test : smoke_test) + tcs.push_back(test); + for(const auto& test : perf_test) + tcs.push_back(test); + + return tcs; +} + +template +struct CumulativeReductionTest : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + cumulative_reduction_config = GetParam(); + auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; + + auto lengths = cumulative_reduction_config.lengths; + + auto input_strides = GetStrides(lengths, cumulative_reduction_config.contiguous); + input = tensor{lengths, input_strides}.generate(gen_value); + + auto out_strides = GetStrides(lengths, true); + output = tensor{lengths, out_strides}; + + auto indices_strides = GetStrides(lengths, true); + indices = tensor{lengths, indices_strides}; + + ref_output = tensor{lengths, out_strides}; + ref_indices = tensor{lengths, indices_strides}; + + input_dev = handle.Write(input.data); + output_dev = handle.Write(output.data); + indices_dev = handle.Write(indices.data); + } + + void RunTest() + { + switch(cumulative_reduction_config.op) + { + case MIOPEN_CUM_MAX: + cpu_cumulative_reduction_forward( + input, + ref_output, + ref_indices, + cumulative_reduction_config.dim, + cumulative_reduction_config.exclusive, + cumulative_reduction_config.reverse); + break; + case MIOPEN_CUM_MIN: + cpu_cumulative_reduction_forward( + input, + ref_output, + ref_indices, + cumulative_reduction_config.dim, + cumulative_reduction_config.exclusive, + cumulative_reduction_config.reverse); + break; + case MIOPEN_CUM_SUM: + cpu_cumulative_reduction_forward( + input, + ref_output, + ref_indices, + cumulative_reduction_config.dim, + cumulative_reduction_config.exclusive, + cumulative_reduction_config.reverse); + break; + case MIOPEN_CUM_PROD: + cpu_cumulative_reduction_forward( + input, + ref_output, + ref_indices, + cumulative_reduction_config.dim, + cumulative_reduction_config.exclusive, + cumulative_reduction_config.reverse); + break; + } + + auto&& handle = get_handle(); + miopenStatus_t status; + + status = miopen::cumulative_reduction::CumulativeReductionForward( + handle, + input.desc, + input_dev.get(), + output.desc, + output_dev.get(), + indices.desc, + indices_dev.get(), + cumulative_reduction_config.dim, + cumulative_reduction_config.exclusive, + cumulative_reduction_config.reverse, + cumulative_reduction_config.op); + EXPECT_EQ(status, miopenStatusSuccess); + output.data = handle.Read(output_dev, output.data.size()); + indices.data = handle.Read(indices_dev, indices.data.size()); + } + + void Verify() + { + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + double tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. + if(std::is_same::value) + tolerance *= 8.0; + + auto error_output = miopen::rms_range(ref_output, output); + auto error_indices = miopen::rms_range(ref_indices, indices); + ASSERT_EQ(miopen::range_distance(ref_output), miopen::range_distance(output)); + ASSERT_EQ(miopen::range_distance(ref_indices), miopen::range_distance(indices)); + EXPECT_LT(error_output, tolerance) + << "Error forward Output beyond tolerance Error: " << error_output + << " Tolerance: " << tolerance; + EXPECT_LT(error_indices, tolerance) + << "Error forward Indices beyond tolerance Error: " << error_indices + << " Tolerance: " << tolerance; + } + + CumulativeReductionTestCase cumulative_reduction_config; + + tensor input; + tensor output; + tensor indices; + + tensor ref_output; + tensor ref_indices; + + miopen::Allocator::ManageDataPtr input_dev; + miopen::Allocator::ManageDataPtr output_dev; + miopen::Allocator::ManageDataPtr indices_dev; +};