diff --git a/docs/reference/index.rst b/docs/reference/index.rst
index c2b74eabee..d3271e5732 100644
--- a/docs/reference/index.rst
+++ b/docs/reference/index.rst
@@ -39,3 +39,4 @@ The MIOpen API library is structured as follows:
   * :doc:`ReLU <../doxygen/html/group___re_l_u>` (experimental)
   * :doc:`Kthvalue <../doxygen/html/group__kthvalue>` (experimental)
   * :doc:`GLU <../doxygen/html/group__glu>` (experimental)
+  * :doc:`Cumulative Reduction <./group___cumulative_reduction>` (experimental)
diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt
index 60d6fe6ce6..a9b6d94f9b 100644
--- a/driver/CMakeLists.txt
+++ b/driver/CMakeLists.txt
@@ -40,6 +40,7 @@ add_executable(MIOpenDriver
     dm_convfp16.cpp
     dm_convfp8.cpp
     dm_convint8.cpp
+    dm_cumulative_reduction.cpp
     dm_dropout.cpp
     dm_fusion.cpp
     dm_gemm.cpp
diff --git a/driver/cumulative_reduction_driver.hpp b/driver/cumulative_reduction_driver.hpp
new file mode 100644
index 0000000000..9a30a3690b
--- /dev/null
+++ b/driver/cumulative_reduction_driver.hpp
@@ -0,0 +1,457 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#pragma once
+
+#include "driver.hpp"
+#include "mloCumulativeReductionHost.hpp"
+#include "tensor_driver.hpp"
+#include "timer.hpp"
+
+#include <../test/ford.hpp>
+#include <../test/verify.hpp>
+
+#include <miopen/miopen.h>
+
+inline std::vector<int> GetStrides(std::vector<int> lengths, bool contiguous)
+{
+    if(!contiguous)
+        std::swap(lengths.front(), lengths.back());
+    std::vector<int> strides(lengths.size());
+    strides.back() = 1;
+    for(int i = lengths.size() - 2; i >= 0; --i)
+        strides[i] = strides[i + 1] * lengths[i + 1];
+    if(!contiguous)
+        std::swap(strides.front(), strides.back());
+    return strides;
+}
+
+template <typename Tgpu, typename Tref>
+class CumulativeReductionDriver : public Driver
+{
+public:
+    CumulativeReductionDriver() : Driver()
+    {
+        miopenCreateTensorDescriptor(&inputDesc);
+        miopenCreateTensorDescriptor(&outputDesc);
+        miopenCreateTensorDescriptor(&indicesDesc);
+
+        data_type = miopen_type<Tgpu>{};
+    }
+
+    int AddCmdLineArgs() override;
+    int ParseCmdLineArgs(int argc, char* argv[]) override;
+    InputFlags& GetInputFlags() override { return inflags; }
+
+    int GetandSetData() override;
+
+    int AllocateBuffersAndCopy() override;
+
+    int RunForwardGPU() override;
+    int RunForwardCPU();
+
+    int RunBackwardGPU() override;
+    int RunBackwardCPU();
+
+    Tref GetTolerance();
+    int VerifyBackward() override;
+    int VerifyForward() override;
+    ~CumulativeReductionDriver() override
+    {
+        miopenDestroyTensorDescriptor(inputDesc);
+        miopenDestroyTensorDescriptor(outputDesc);
+        miopenDestroyTensorDescriptor(indicesDesc);
+    }
+
+private:
+    InputFlags inflags;
+
+    int forw;
+
+    miopenTensorDescriptor_t inputDesc;
+    miopenTensorDescriptor_t outputDesc;
+    miopenTensorDescriptor_t indicesDesc;
+
+    std::unique_ptr<GPUMem> input_dev;
+    std::unique_ptr<GPUMem> output_dev;
+    std::unique_ptr<GPUMem> indices_dev;
+
+    std::vector<Tgpu> input;
+    std::vector<Tgpu> output;
+    std::vector<int64_t> indices;
+
+    std::vector<Tref> output_host;
+    std::vector<int64_t> indices_host;
+
+    int dim;
+    bool exclusive;
+    bool reverse;
+
+    miopenCumOp_t cumOp;
+};
+
+template <typename Tgpu, typename Tref>
+int CumulativeReductionDriver<Tgpu, Tref>::ParseCmdLineArgs(int argc, char* argv[])
+{
+    inflags.Parse(argc, argv);
+
+    if(inflags.GetValueInt("time") == 1)
+    {
+        miopenEnableProfiling(GetHandle(), true);
+    }
+
+    auto inTensorParam = inflags.GetValueTensor("input");
+    auto input_length  = inTensorParam.lengths;
+    if(input_length.empty())
+    {
+        std::cout << "Tensor must not be empty";
+        return miopenStatusBadParm;
+    }
+
+    int contiguous = inflags.GetValueInt("Contiguous");
+    if(contiguous != 0 && contiguous != 1)
+    {
+        std::cerr << "Error Tensor Contiguous should be 0 or 1" << std::endl;
+        return miopenStatusBadParm;
+    }
+
+    std::vector<miopenCumOp_t> cumOpList = {
+        MIOPEN_CUM_MAX, MIOPEN_CUM_MIN, MIOPEN_CUM_SUM, MIOPEN_CUM_PROD};
+    int cumOpInt = inflags.GetValueInt("CumulativeOperation");
+    bool valid   = true;
+    for(auto op : cumOpList)
+        if(cumOpInt != static_cast<int>(op))
+        {
+            valid = false;
+            break;
+        }
+    if(valid)
+    {
+        std::cerr << "Error CumulativeOperation value should be in set {" << cumOpList << "}"
+                  << std::endl;
+        return miopenStatusBadParm;
+    }
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int CumulativeReductionDriver<Tgpu, Tref>::GetandSetData()
+{
+    dim       = inflags.GetValueInt("dim");
+    exclusive = (inflags.GetValueInt("exclusive") != 0);
+    reverse   = (inflags.GetValueInt("reverse") != 0);
+    cumOp     = (miopenCumOp_t)inflags.GetValueInt("CumulativeOperation");
+
+    auto lengths = inflags.GetValueTensor("input").lengths;
+    auto strides = GetStrides(lengths, inflags.GetValueInt("Contiguous") != 0);
+
+    if(SetTensorNd(inputDesc, lengths, strides, data_type) != miopenStatusSuccess)
+        MIOPEN_THROW("Error parsing input tensor: " + inflags.GetValueStr("input") + ".");
+
+    if(SetTensorNd(outputDesc, lengths, data_type) != miopenStatusSuccess)
+        MIOPEN_THROW("Error parsing output tensor");
+
+    if(SetTensorNd(indicesDesc, lengths, miopenInt64) != miopenStatusSuccess)
+        MIOPEN_THROW("Error parsing indices tensor");
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int CumulativeReductionDriver<Tgpu, Tref>::AddCmdLineArgs()
+{
+    inflags.AddInputFlag(
+        "forw", 'F', "1", "Run only Forward CumulativeReduction (Default=1)", "int");
+    inflags.AddTensorFlag("input", 'D', "256x4x256", "input tensor descriptor");
+    inflags.AddInputFlag(
+        "dim", 'd', "0", "The dimension to do the operation over (Default=0)", "int");
+    inflags.AddInputFlag("exclusive",
+                         'e',
+                         "0",
+                         "Enable exclusive calculation. 0 for False, 1 for True (Default=0)",
+                         "int");
+    inflags.AddInputFlag(
+        "reverse",
+        'r',
+        "0",
+        "Reverse the calculation order to back to front. 0 for False, 1 for True (Default=0)",
+        "int");
+    inflags.AddInputFlag("CumulativeOperation",
+                         'O',
+                         "1",
+                         "Operator used. 1 for Max, 2 for Min, 3 for Sum, 4 for Prod (Default=1)",
+                         "int");
+    inflags.AddInputFlag("Contiguous",
+                         'C',
+                         "1",
+                         "Is input tensor contiguous? (Default=1 for contiguous tensor)",
+                         "int");
+    inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int");
+    inflags.AddInputFlag("verify", 'V', "0", "Verify Each Layer (Default=0)", "int");
+    inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int");
+    inflags.AddInputFlag(
+        "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int");
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int CumulativeReductionDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
+{
+    size_t input_sz   = miopen::deref(inputDesc).GetElementSpace();
+    size_t output_sz  = miopen::deref(outputDesc).GetElementSpace();
+    size_t indices_sz = miopen::deref(indicesDesc).GetElementSpace();
+
+    uint32_t ctx = 0;
+
+    input_dev   = std::unique_ptr<GPUMem>(new GPUMem(ctx, input_sz, sizeof(Tgpu)));
+    output_dev  = std::unique_ptr<GPUMem>(new GPUMem(ctx, output_sz, sizeof(Tgpu)));
+    indices_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, indices_sz, sizeof(int64_t)));
+
+    input   = std::vector<Tgpu>(input_sz);
+    output  = std::vector<Tgpu>(output_sz, static_cast<Tgpu>(0.0f));
+    indices = std::vector<int64_t>(indices_sz, static_cast<int64_t>(-1));
+
+    output_host  = std::vector<Tref>(output_sz, static_cast<Tgpu>(0.0f));
+    indices_host = std::vector<int64_t>(indices_sz, static_cast<int64_t>(-1));
+
+    for(int i = 0; i < input_sz; i++)
+        input[i] = prng::gen_A_to_B<Tgpu>(static_cast<Tgpu>(-100), static_cast<Tgpu>(100));
+
+    if(input_dev->ToGPU(GetStream(), input.data()) != 0)
+    {
+        std::cerr << "Error copying (input) to GPU, size: " << input_dev->GetSize() << std::endl;
+        return miopenStatusAllocFailed;
+    }
+
+    if(output_dev->ToGPU(GetStream(), output.data()) != 0)
+    {
+        std::cerr << "Error copying (output) to GPU, size: " << output_dev->GetSize() << std::endl;
+        return miopenStatusAllocFailed;
+    }
+
+    if(indices_dev->ToGPU(GetStream(), indices.data()) != 0)
+    {
+        std::cerr << "Error copying (indices) to GPU, size: " << indices_dev->GetSize()
+                  << std::endl;
+        return miopenStatusAllocFailed;
+    }
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int CumulativeReductionDriver<Tgpu, Tref>::RunForwardGPU()
+{
+    float kernel_total_time = 0;
+    float kernel_first_time = 0;
+
+    Timer t;
+    START_TIME
+
+    for(int i = 0; i < inflags.GetValueInt("iter"); i++)
+    {
+        miopenCumulativeReductionForward(
+            GetHandle(),
+            inputDesc,
+            input_dev->GetMem(),
+            outputDesc,
+            output_dev->GetMem(),
+            indicesDesc,
+            (cumOp == MIOPEN_CUM_MAX || cumOp == MIOPEN_CUM_MIN ? indices_dev->GetMem() : nullptr),
+            dim,
+            exclusive,
+            reverse,
+            cumOp);
+
+        float time = 0.0;
+        miopenGetKernelTime(GetHandle(), &time);
+        kernel_total_time += time;
+        if(i == 0)
+            kernel_first_time = time;
+    }
+
+    if(inflags.GetValueInt("time") == 1)
+    {
+        STOP_TIME
+        int iter = inflags.GetValueInt("iter");
+        if(WALL_CLOCK)
+            std::cout << "Wall-clock Time Forward Cumulative Reduction Elapsed: "
+                      << t.gettime_ms() / iter << " ms" << std::endl;
+
+        float kernel_average_time =
+            iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time;
+        std::cout << "GPU Kernel Time Forward Cumulative Reduction Elapsed: " << kernel_average_time
+                  << " ms" << std::endl;
+    }
+
+    if(output_dev->FromGPU(GetStream(), output.data()) != 0)
+    {
+        std::cerr << "Error copying (output_dev) from GPU, size: " << output_dev->GetSize()
+                  << std::endl;
+        return miopenStatusInternalError;
+    }
+    if(indices_dev->FromGPU(GetStream(), indices.data()) != 0)
+    {
+        std::cerr << "Error copying (indices_dev) from GPU, size: " << indices_dev->GetSize()
+                  << std::endl;
+        return miopenStatusInternalError;
+    }
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int CumulativeReductionDriver<Tgpu, Tref>::RunForwardCPU()
+{
+    int32_t mloStatus = miopenStatusSuccess;
+
+    switch(cumOp)
+    {
+    case MIOPEN_CUM_MAX:
+        mloStatus =
+            mloCumulativeReductionForwardRunHost<MIOPEN_CUM_MAX, Tgpu, Tref>(inputDesc,
+                                                                             outputDesc,
+                                                                             indicesDesc,
+                                                                             input.data(),
+                                                                             output_host.data(),
+                                                                             indices_host.data(),
+                                                                             dim,
+                                                                             exclusive,
+                                                                             reverse);
+        break;
+    case MIOPEN_CUM_MIN:
+        mloStatus =
+            mloCumulativeReductionForwardRunHost<MIOPEN_CUM_MIN, Tgpu, Tref>(inputDesc,
+                                                                             outputDesc,
+                                                                             indicesDesc,
+                                                                             input.data(),
+                                                                             output_host.data(),
+                                                                             indices_host.data(),
+                                                                             dim,
+                                                                             exclusive,
+                                                                             reverse);
+        break;
+    case MIOPEN_CUM_SUM:
+        mloStatus =
+            mloCumulativeReductionForwardRunHost<MIOPEN_CUM_SUM, Tgpu, Tref>(inputDesc,
+                                                                             outputDesc,
+                                                                             indicesDesc,
+                                                                             input.data(),
+                                                                             output_host.data(),
+                                                                             nullptr,
+                                                                             dim,
+                                                                             exclusive,
+                                                                             reverse);
+        break;
+    case MIOPEN_CUM_PROD:
+        mloStatus =
+            mloCumulativeReductionForwardRunHost<MIOPEN_CUM_PROD, Tgpu, Tref>(inputDesc,
+                                                                              outputDesc,
+                                                                              indicesDesc,
+                                                                              input.data(),
+                                                                              output_host.data(),
+                                                                              nullptr,
+                                                                              dim,
+                                                                              exclusive,
+                                                                              reverse);
+        break;
+    default:
+        std::cout << "The CPU version of Cumulative Reduction with Operation code " << cumOp
+                  << " has not been implemented" << std::endl;
+        mloStatus = miopenStatusNotImplemented;
+        break;
+    }
+
+    return mloStatus;
+}
+
+template <typename Tgpu, typename Tref>
+int CumulativeReductionDriver<Tgpu, Tref>::RunBackwardGPU()
+{
+    return miopenStatusNotImplemented;
+}
+
+template <typename Tgpu, typename Tref>
+int CumulativeReductionDriver<Tgpu, Tref>::RunBackwardCPU()
+{
+    return miopenStatusNotImplemented;
+}
+
+template <typename Tgpu, typename Tref>
+Tref CumulativeReductionDriver<Tgpu, Tref>::GetTolerance()
+{
+    // Computation error of fp16 is ~2^13 (=8192) bigger than
+    // the one of fp32 because mantissa is shorter by 13 bits.
+    auto tolerance = std::is_same<Tgpu, float>::value ? 1.5e-6 : 8.2e-3;
+
+    // bf16 mantissa has 7 bits, by 3 bits shorter than fp16.
+    if(std::is_same<Tgpu, bfloat16>::value)
+        tolerance *= 8.0;
+    return tolerance;
+}
+
+template <typename Tgpu, typename Tref>
+int CumulativeReductionDriver<Tgpu, Tref>::VerifyForward()
+{
+    RunForwardCPU();
+    const Tref tolerance = GetTolerance();
+    auto error_output    = miopen::rms_range(output_host, output);
+    auto error_indices   = miopen::rms_range(indices_host, indices);
+
+    if(!std::isfinite(error_output) || error_output > tolerance)
+    {
+        std::cout << "Forward Cumulative Reduction Output FAILED: " << error_output << " > "
+                  << tolerance << std::endl;
+        return EC_VerifyFwd;
+    }
+    else
+    {
+        std::cout << "Forward Cumulative Reduction Output Verifies OK on CPU reference ("
+                  << error_output << " < " << tolerance << ')' << std::endl;
+    }
+
+    if(!std::isfinite(error_indices) || error_indices > tolerance)
+    {
+        std::cout << "Forward Cumulative Reduction Indices FAILED: " << error_indices << " > "
+                  << tolerance << std::endl;
+        return EC_VerifyFwd;
+    }
+    else
+    {
+        std::cout << "Forward Cumulative Reduction Indices Verifies OK on CPU reference ("
+                  << error_indices << " < " << tolerance << ')' << std::endl;
+    }
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int CumulativeReductionDriver<Tgpu, Tref>::VerifyBackward()
+{
+    return miopenStatusNotImplemented;
+}
diff --git a/driver/dm_cumulative_reduction.cpp b/driver/dm_cumulative_reduction.cpp
new file mode 100644
index 0000000000..09da7b45b5
--- /dev/null
+++ b/driver/dm_cumulative_reduction.cpp
@@ -0,0 +1,41 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "registry_driver_maker.hpp"
+#include "cumulative_reduction_driver.hpp"
+
+static Driver* makeDriver(const std::string& base_arg)
+{
+    if(base_arg == "cum")
+        return new CumulativeReductionDriver<float, float>();
+    if(base_arg == "cumfp16")
+        return new CumulativeReductionDriver<float16, float>();
+    if(base_arg == "cumbfp16")
+        return new CumulativeReductionDriver<bfloat16, float>();
+    return nullptr;
+}
+
+REGISTER_DRIVER_MAKER(makeDriver);
diff --git a/driver/driver.hpp b/driver/driver.hpp
index d77d5d02d2..e04b985723 100644
--- a/driver/driver.hpp
+++ b/driver/driver.hpp
@@ -314,7 +314,8 @@ inline void PadBufferSize(size_t& sz, int datatype_sz)
            "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, "
            "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], rope[bfp16|fp16], "
            "prelu[bfp16|fp16], kthvalue[bfp16|fp16], glu[bfp16|fp16], softmarginloss[bfp16|fp16], "
-           "multimarginloss[bfp16|fp16]\n");
+           "multimarginloss[bfp16|fp16], cum[bfp16|fp16]\n");
+
     exit(0); // NOLINT (concurrency-mt-unsafe)
 }
 
@@ -352,7 +353,7 @@ inline std::string ParseBaseArg(int argc, char* argv[])
        arg != "kthvaluebfp16" && arg != "glu" && arg != "glufp16" && arg != "glubfp16" &&
        arg != "softmarginloss" && arg != "softmarginlossfp16" && arg != "softmarginlossbfp16" &&
        arg != "multimarginloss" && arg != "multimarginlossfp16" && arg != "multimarginlossbfp16" &&
-       arg != "--version")
+       arg != "cum" && arg != "cumfp16" && arg != "cumbfp16" && arg != "--version")
     {
         printf("FAILED: Invalid Base Input Argument\n");
         Usage();
diff --git a/driver/mloCumulativeReductionHost.hpp b/driver/mloCumulativeReductionHost.hpp
new file mode 100644
index 0000000000..9d7f7cf574
--- /dev/null
+++ b/driver/mloCumulativeReductionHost.hpp
@@ -0,0 +1,151 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#pragma once
+
+#include <../test/ford.hpp>
+
+#include <miopen/tensor.hpp>
+#include <miopen/tensor_view_utils.hpp>
+#include <miopen/cumulative_reduction/utils.hpp>
+
+#include <limits>
+
+inline constexpr void update() {}
+
+template <typename T, typename... Ts>
+inline constexpr void update(T& a, T b, Ts&... c, Ts... d)
+{
+    a = b;
+    update(c..., d...);
+}
+
+template <typename T, typename... Ts>
+struct reduce_func_base
+{
+    reduce_func_base(){};
+    virtual ~reduce_func_base(){};
+    virtual inline bool isbetter(const T& /*a*/, const T& /*b*/) const { return false; }
+    virtual inline void combine(T& a, T b) const { a = b; }
+    inline constexpr void calculate(T& a, T b, Ts&... c, Ts... d) const
+    {
+        if(!isbetter(a, b))
+        {
+            combine(a, b);
+            update(c..., d...);
+        }
+    }
+};
+
+template <miopenCumOp_t OP, typename T, typename... Ts>
+struct reduce_func : reduce_func_base<T, Ts...>
+{
+    virtual ~reduce_func(){};
+};
+
+template <typename T, typename... Ts>
+struct reduce_func<MIOPEN_CUM_MAX, T, Ts...> : reduce_func_base<T, Ts...>
+{
+    const float START_VAL = -std::numeric_limits<float>::max();
+    inline bool isbetter(const T& a, const T& b) const { return a > b; }
+};
+
+template <typename T, typename... Ts>
+struct reduce_func<MIOPEN_CUM_MIN, T, Ts...> : reduce_func_base<T, Ts...>
+{
+    const float START_VAL = std::numeric_limits<float>::max();
+    inline bool isbetter(const T& a, const T& b) const { return a < b; }
+};
+
+template <typename T, typename... Ts>
+struct reduce_func<MIOPEN_CUM_SUM, T, Ts...> : reduce_func_base<T, Ts...>
+{
+    const float START_VAL = 0.0f;
+    inline void combine(T& a, T b) const { a += b; }
+};
+
+template <typename T, typename... Ts>
+struct reduce_func<MIOPEN_CUM_PROD, T, Ts...> : reduce_func_base<T, Ts...>
+{
+    const float START_VAL = 1.0f;
+    inline void combine(T& a, T b) const { a *= b; }
+};
+
+template <miopenCumOp_t OP, typename Tgpu, typename Tcheck>
+int32_t mloCumulativeReductionForwardRunHost(const miopenTensorDescriptor_t inputDesc,
+                                             const miopenTensorDescriptor_t outputDesc,
+                                             const miopenTensorDescriptor_t indicesDesc,
+                                             const Tgpu* input,
+                                             Tcheck* output_host,
+                                             int64_t* indices_host,
+                                             const int dim,
+                                             const bool exclusive,
+                                             const bool reverse)
+{
+    const int ndims     = miopen::deref(inputDesc).GetNumDims();
+    const auto exec_dim = ((dim % ndims) + ndims) % ndims;
+
+    auto input_tv   = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc));
+    auto output_tv  = miopen::get_inner_expanded_tv<5>(miopen::deref(outputDesc));
+    auto indices_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(indicesDesc));
+
+    auto size       = miopen::deref(inputDesc).GetElementSize();
+    auto inner_size = miopen::deref(inputDesc).GetLengths()[exec_dim];
+    auto outer_size = size / inner_size;
+
+    auto op_worker = reduce_func<OP, float, int64_t>{};
+
+    tensor_view_t<5> ignore_dim_input_tv = input_tv;
+    ignore_dim_input_tv.size[exec_dim]   = 1;
+
+    par_ford(outer_size)([&](int64_t gid) {
+        auto tensor_layout = tensor_layout_t<5>(ignore_dim_input_tv, gid);
+        float cum_val      = op_worker.START_VAL;
+        int64_t cum_idx    = (reverse ? input_tv.size[exec_dim] - 1 : 0);
+
+        ford(inner_size)([&](int64_t idx) {
+            int64_t tmp_idx =
+                (reverse ? input_tv.size[exec_dim] - (idx - exclusive) - 1 : (idx - exclusive));
+            float tmp_val = op_worker.START_VAL;
+            if(0 <= tmp_idx && tmp_idx < inner_size)
+            {
+                tensor_layout.layout[exec_dim] = tmp_idx;
+                tmp_val = static_cast<float>(input[input_tv.get_tensor_view_idx(tensor_layout)]);
+            }
+
+            op_worker.calculate(cum_val, tmp_val, cum_idx, tmp_idx);
+
+            tensor_layout.layout[exec_dim] = (reverse ? input_tv.size[exec_dim] - idx - 1 : idx);
+            if(output_host)
+                output_host[output_tv.get_tensor_view_idx(tensor_layout)] =
+                    static_cast<Tcheck>(cum_val);
+            if(indices_host)
+                indices_host[indices_tv.get_tensor_view_idx(tensor_layout)] = cum_idx;
+        });
+    });
+
+    return miopenStatusSuccess;
+}
diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h
index 67652ab832..9f70a3ef31 100644
--- a/include/miopen/miopen.h
+++ b/include/miopen/miopen.h
@@ -72,6 +72,7 @@
  * @defgroup ReduceCalculation
  * @defgroup RotaryPositionalEmbeddings
  * @defgroup ReLU
+ * @defgroup CumulativeReduction
  *
  */
 
@@ -8176,6 +8177,61 @@ MIOPEN_EXPORT miopenStatus_t miopenMultiMarginLossForward(miopenHandle_t handle,
 // CLOSEOUT LossFunction DOXYGEN GROUP
 #endif // MIOPEN_BETA_API
 
+#ifdef MIOPEN_BETA_API
+
+/*! @ingroup CumulativeReduction
+ * @enum miopenCumOp_t
+ * Cumulative Reduction operation types
+ */
+using miopenCumOp_t = enum {
+    MIOPEN_CUM_MAX = 1,  /*!< the operation is getting the maximun value and index of the reduced
+                        elements */
+    MIOPEN_CUM_MIN = 2,  /*!< the operation is getting the minimum value and index of the reduced
+                        elements */
+    MIOPEN_CUM_SUM  = 3, /*!< the operation is getting the sum of the reduced elements */
+    MIOPEN_CUM_PROD = 4, /*!< the operation is getting the product of the reduced elements */
+};
+#endif // MIOPEN_BETA_API
+
+#ifdef MIOPEN_BETA_API
+
+// CumulativeReduction APIs
+/** @addtogroup CumulativeReduction
+ *
+ *  @{
+ */
+
+/*! @brief Execute a Cumulative Reduction forward layer
+ *
+ * @param handle                   MIOpen handle (input)
+ * @param inputDesc                Tensor descriptor for input tensor (input)
+ * @param input                    Data tensor input (input)
+ * @param outputDesc               Tensor descriptor for output tensor (input)
+ * @param output                   Data tensor output, using `nullptr` if it is not needed (output)
+ * @param indicesDesc              Tensor descriptor for indices tensor (input)
+ * @param indices                  Data tensor indices, using `nullptr` if it is not needed (output)
+ * @param dim                      The dimension to do the operation over (input)
+ * @param exclusive                Exclusive operation (input)
+ * @param reverse                  Reverse the operation order (input)
+ * @param cumOp                    Enumerant specifying the operation used by Cumulative Reduction
+ * (input)
+ */
+MIOPEN_EXPORT miopenStatus_t miopenCumulativeReductionForward(miopenHandle_t handle,
+                                                              miopenTensorDescriptor_t inputDesc,
+                                                              const void* input,
+                                                              miopenTensorDescriptor_t outputDesc,
+                                                              void* output,
+                                                              miopenTensorDescriptor_t indicesDesc,
+                                                              void* indices,
+                                                              int dim,
+                                                              bool exclusive,
+                                                              bool reverse,
+                                                              miopenCumOp_t cumOp);
+
+/** @} */
+// CLOSEOUT CUMULATIVE REDUCTION DOXYGEN GROUP
+#endif // MIOPEN_BETA_API
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 92e4f4264a..0192f2a6b8 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -112,6 +112,8 @@ set( MIOpen_Source
     convolution_api.cpp
     ctc.cpp
     ctc_api.cpp
+    cumulative_reduction/problem_description.cpp
+    cumulative_reduction_api.cpp
     db.cpp
     db_record.cpp
     driver_arguments.cpp
@@ -302,6 +304,8 @@ set( MIOpen_Source
     solver/conv_ck_igemm_fwd_bias_res_add_activ_fused.cpp
     solver/conv_ocl_dir2Dfwd_fused.cpp
     solver/conv_winoRxS_fused.cpp
+    solver/cumulative_reduction/forward_cumulative_reduction_contiguous_lastdim.cpp
+    solver/cumulative_reduction/utils.cpp
     solver/glu/backward_glu.cpp
     solver/glu/forward_glu.cpp
     solver/groupnorm/forward_groupnorm.cpp
@@ -473,6 +477,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         kernels/Conv_Winograd_v30_3_1_metadata.inc
         kernels/MIOpenReduceCalculation.hpp
         kernels/MIOpenReduceExtreme.hpp
+        kernels/MIOpenCumulativeReduction.hpp
         kernels/bfloat16_dev.hpp
         kernels/block_reduce.hpp
         kernels/conv_common.inc
@@ -531,6 +536,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         kernels/MIOpenConvDirUni.cl
         kernels/MIOpenConvDirBatchNormActiv.cl
         kernels/MIOpenConvDirGenFwd.cl
+        kernels/MIOpenCumulativeReduction.cpp
         kernels/MIOpenGLU.cpp
         kernels/MIOpenGroupNorm.cpp
         kernels/MIOpenGetitem.cpp
@@ -669,6 +675,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         adam.cpp
         addlayernorm.cpp
         cat.cpp
+        cumulative_reduction.cpp
         exec_utils.cpp
         groupnorm.cpp
         getitem.cpp
diff --git a/src/cumulative_reduction.cpp b/src/cumulative_reduction.cpp
new file mode 100644
index 0000000000..c6dae2f420
--- /dev/null
+++ b/src/cumulative_reduction.cpp
@@ -0,0 +1,78 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/find_solution.hpp>
+#include <miopen/tensor.hpp>
+#include <miopen/cumulative_reduction.hpp>
+#include <miopen/cumulative_reduction/invoke_params.hpp>
+#include <miopen/cumulative_reduction/solvers.hpp>
+
+namespace miopen {
+namespace cumulative_reduction {
+
+miopenStatus_t CumulativeReductionForward(Handle& handle,
+                                          const TensorDescriptor& inputDesc,
+                                          ConstData_t input,
+                                          const TensorDescriptor& outputDesc,
+                                          Data_t output,
+                                          const TensorDescriptor& indicesDesc,
+                                          Data_t indices,
+                                          const int dim,
+                                          const bool exclusive,
+                                          const bool reverse,
+                                          const miopenCumOp_t cumOp)
+{
+    const auto problem = cumulative_reduction::ForwardProblemDescription{
+        inputDesc, outputDesc, indicesDesc, dim, cumOp};
+
+    const auto invoke_params = [&]() {
+        auto tmp        = cumulative_reduction::InvokeParams{};
+        tmp.type        = InvokeType::Run;
+        tmp.inputDesc   = &inputDesc;
+        tmp.outputDesc  = &outputDesc;
+        tmp.indicesDesc = &indicesDesc;
+        tmp.input       = input;
+        tmp.output      = output;
+        tmp.indices     = indices;
+
+        tmp.dim       = dim;
+        tmp.exclusive = exclusive;
+        tmp.reverse   = reverse;
+
+        return tmp;
+    }();
+
+    const auto algo = AlgorithmName{"CumulativeReductionForward"};
+    const auto solvers =
+        solver::SolverContainer<solver::cumulative_reduction::ForwardContiguousLastDim>{};
+
+    solvers.ExecutePrimitive(handle, problem, algo, invoke_params);
+
+    return miopenStatusSuccess;
+}
+
+} // namespace cumulative_reduction
+} // namespace miopen
diff --git a/src/cumulative_reduction/problem_description.cpp b/src/cumulative_reduction/problem_description.cpp
new file mode 100644
index 0000000000..4ef94ef138
--- /dev/null
+++ b/src/cumulative_reduction/problem_description.cpp
@@ -0,0 +1,72 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/cumulative_reduction/problem_description.hpp>
+#include <miopen/names.hpp>
+
+#include <sstream>
+
+namespace miopen {
+
+namespace cumulative_reduction {
+
+bool checkSameLength(const TensorDescriptor& x, const TensorDescriptor& y)
+{
+    if(x.GetNumDims() != y.GetNumDims())
+        return false;
+    for(int i = 0; i < x.GetNumDims(); ++i)
+    {
+        if(x.GetLengths()[i] != y.GetLengths()[i])
+            return false;
+    }
+    return true;
+}
+
+NetworkConfig ForwardProblemDescription::MakeNetworkConfig() const
+{
+    auto input_dtype  = inputDesc.GetType();
+    auto output_dtype = outputDesc.GetType();
+    auto size         = inputDesc.GetElementSize();
+    auto inner_size   = inputDesc.GetLengths()[dim];
+    auto outer_size   = size / inner_size;
+
+    std::ostringstream ss;
+
+    ss << "cum_reduc_fwd";
+    ss << "idtype" << input_dtype;
+    ss << "odtype" << output_dtype;
+    ss << "outer" << outer_size;
+    ss << "inner" << inner_size;
+    ss << "op" << cumOp;
+    ss << "packed" << IsAllPacked();
+    ss << "dimstride1" << IsAllDimStride1();
+
+    return NetworkConfig{ss.str()};
+}
+
+} // namespace cumulative_reduction
+
+} // namespace miopen
diff --git a/src/cumulative_reduction_api.cpp b/src/cumulative_reduction_api.cpp
new file mode 100644
index 0000000000..80185a75cb
--- /dev/null
+++ b/src/cumulative_reduction_api.cpp
@@ -0,0 +1,108 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/handle.hpp>
+#include <miopen/tensor_ops.hpp>
+#include <miopen/cumulative_reduction.hpp>
+
+static void LogCmdCumulativeReduction(const miopenTensorDescriptor_t inputDesc,
+                                      const miopenTensorDescriptor_t outputDesc,
+                                      const miopenTensorDescriptor_t indicesDesc,
+                                      const int dim,
+                                      const bool exclusive,
+                                      const bool reverse,
+                                      const miopenCumOp_t cumOp,
+                                      const bool is_fwd)
+{
+    if(miopen::IsLoggingCmd())
+    {
+        std::stringstream ss;
+        auto dtype = miopen::deref(inputDesc).GetType();
+        if(dtype == miopenHalf)
+        {
+            ss << "cumulative_reductionfp16";
+        }
+        else if(dtype == miopenFloat)
+        {
+            ss << "cumulative_reductionfp32";
+        }
+        else if(dtype == miopenBFloat16)
+        {
+            ss << "cumulative_reductionbfp16";
+        }
+
+        MIOPEN_LOG_FUNCTION(inputDesc, outputDesc, indicesDesc);
+        ss << " -d " << dim;
+        ss << " --excl " << exclusive;
+        ss << " --rev " << reverse;
+        ss << " --op " << cumOp;
+        ss << " -F " << ((is_fwd) ? "1" : "2");
+
+        MIOPEN_LOG_DRIVER_CMD(ss.str());
+    }
+}
+
+extern "C" miopenStatus_t
+miopenCumulativeReductionForward(miopenHandle_t handle,
+                                 const miopenTensorDescriptor_t inputDesc,
+                                 const void* input,
+                                 const miopenTensorDescriptor_t outputDesc,
+                                 void* output,
+                                 const miopenTensorDescriptor_t indicesDesc,
+                                 void* indices,
+                                 const int dim,
+                                 const bool exclusive,
+                                 const bool reverse,
+                                 const miopenCumOp_t cumOp)
+{
+    MIOPEN_LOG_FUNCTION(handle,
+                        inputDesc,
+                        input,
+                        outputDesc,
+                        output,
+                        indicesDesc,
+                        indices,
+                        dim,
+                        exclusive,
+                        reverse,
+                        cumOp);
+
+    LogCmdCumulativeReduction(
+        inputDesc, outputDesc, indicesDesc, dim, exclusive, reverse, cumOp, true);
+    return miopen::try_([&] {
+        miopen::cumulative_reduction::CumulativeReductionForward(miopen::deref(handle),
+                                                                 miopen::deref(inputDesc),
+                                                                 DataCast(input),
+                                                                 miopen::deref(outputDesc),
+                                                                 DataCast(output),
+                                                                 miopen::deref(indicesDesc),
+                                                                 DataCast(indices),
+                                                                 dim,
+                                                                 exclusive,
+                                                                 reverse,
+                                                                 cumOp);
+    });
+}
diff --git a/src/include/miopen/cumulative_reduction.hpp b/src/include/miopen/cumulative_reduction.hpp
new file mode 100644
index 0000000000..fc1ac43162
--- /dev/null
+++ b/src/include/miopen/cumulative_reduction.hpp
@@ -0,0 +1,50 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/common.hpp>
+
+namespace miopen {
+struct Handle;
+struct TensorDescriptor;
+
+namespace cumulative_reduction {
+
+MIOPEN_INTERNALS_EXPORT miopenStatus_t
+CumulativeReductionForward(Handle& handle,
+                           const TensorDescriptor& inputDesc,
+                           ConstData_t input,
+                           const TensorDescriptor& outputDesc,
+                           Data_t output,
+                           const TensorDescriptor& indicesDesc,
+                           Data_t indices,
+                           int dim,
+                           bool exclusive,
+                           bool reverse,
+                           miopenCumOp_t cumOp);
+
+} // namespace cumulative_reduction
+} // namespace miopen
diff --git a/src/include/miopen/cumulative_reduction/invoke_params.hpp b/src/include/miopen/cumulative_reduction/invoke_params.hpp
new file mode 100644
index 0000000000..fac69df6f0
--- /dev/null
+++ b/src/include/miopen/cumulative_reduction/invoke_params.hpp
@@ -0,0 +1,57 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/invoke_params.hpp>
+#include <miopen/tensor.hpp>
+
+namespace miopen {
+
+namespace cumulative_reduction {
+
+struct InvokeParams : public miopen::InvokeParams
+{
+    InvokeParams() = default;
+
+    const TensorDescriptor* inputDesc   = nullptr;
+    const TensorDescriptor* outputDesc  = nullptr;
+    const TensorDescriptor* indicesDesc = nullptr;
+
+    ConstData_t input = nullptr;
+    Data_t output     = nullptr;
+    Data_t indices    = nullptr;
+
+    int dim        = 0;
+    bool exclusive = false;
+    bool reverse   = false;
+
+    std::size_t GetWorkspaceSize() const { return 0; }
+    Data_t GetWorkspace() const { return nullptr; }
+};
+
+} // namespace cumulative_reduction
+
+} // namespace miopen
diff --git a/src/include/miopen/cumulative_reduction/problem_description.hpp b/src/include/miopen/cumulative_reduction/problem_description.hpp
new file mode 100644
index 0000000000..1cd2cf621c
--- /dev/null
+++ b/src/include/miopen/cumulative_reduction/problem_description.hpp
@@ -0,0 +1,132 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ *all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/miopen.h>
+#include <miopen/problem_description_base.hpp>
+#include <miopen/tensor.hpp>
+
+#include <sstream>
+
+namespace miopen {
+
+struct NetworkConfig;
+
+namespace cumulative_reduction {
+
+bool checkSameLength(const TensorDescriptor& x, const TensorDescriptor& y);
+
+struct ForwardProblemDescription : ProblemDescriptionBase
+{
+    ForwardProblemDescription(const TensorDescriptor& inputDesc_,
+                              const TensorDescriptor& outputDesc_,
+                              const TensorDescriptor& indicesDesc_,
+                              const int& dim_,
+                              const miopenCumOp_t& cumOp_)
+        : inputDesc(inputDesc_),
+          outputDesc(outputDesc_),
+          indicesDesc(indicesDesc_),
+          dim(dim_),
+          cumOp(cumOp_)
+    {
+        if(IsValidDim())
+            dim = (dim < 0 ? dim + inputDesc.GetNumDims() : dim);
+        IsValidIndicesType();
+        IsSameLength();
+    }
+
+    const TensorDescriptor& GetInputDesc() const { return inputDesc; }
+    const TensorDescriptor& GetOutputDesc() const { return outputDesc; }
+    const TensorDescriptor& GetIndicesDesc() const { return indicesDesc; }
+    const int& GetDim() const { return dim; }
+    const miopenCumOp_t& GetCumOp() const { return cumOp; }
+
+    bool IsValidDim() const
+    {
+        const int ndims = inputDesc.GetNumDims();
+        if(dim < -ndims || ndims - 1 < dim)
+        {
+            MIOPEN_THROW(miopenStatusBadParm,
+                         (std::stringstream()
+                          << "Cumulative Reduction: Operating dim value must be in range ["
+                          << -ndims << "," << ndims - 1 << "].")
+                             .str());
+        }
+        return true;
+    }
+
+    bool IsValidIndicesType() const
+    {
+        if(indicesDesc.GetElementSize() > 0 && indicesDesc.GetType() != miopenInt64)
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "Cumulative Reduction: Indices tensor type must be int64.");
+        return true;
+    }
+
+    bool IsSameLength() const
+    {
+        if(outputDesc.GetElementSize() > 0 && !checkSameLength(inputDesc, outputDesc))
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "Cumulative Reduction: Input and Output tensor sizes do not match.");
+        if(indicesDesc.GetElementSize() > 0 && !checkSameLength(inputDesc, indicesDesc))
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "Cumulative Reduction: Input and Indices tensor sizes do not match.");
+        return true;
+    }
+
+    bool IsAllPacked() const
+    {
+        if(!inputDesc.IsPacked() || !outputDesc.IsPacked() || !indicesDesc.IsPacked())
+            return false;
+        return true;
+    }
+
+    bool IsAllDimStride1() const
+    {
+        if(inputDesc.GetStrides()[dim] != 1)
+            return false;
+        if(outputDesc.GetElementSize() > 0 && outputDesc.GetStrides()[dim] != 1)
+            return false;
+        if(indicesDesc.GetElementSize() > 0 && indicesDesc.GetStrides()[dim] != 1)
+            return false;
+        return true;
+    }
+
+    NetworkConfig MakeNetworkConfig() const override;
+
+private:
+    TensorDescriptor inputDesc;
+    TensorDescriptor outputDesc;
+    TensorDescriptor indicesDesc;
+    int dim;
+    miopenCumOp_t cumOp;
+
+    NetworkConfig MakeForwardNetworkConfig() const;
+};
+
+} // namespace cumulative_reduction
+
+} // namespace miopen
diff --git a/src/include/miopen/cumulative_reduction/solvers.hpp b/src/include/miopen/cumulative_reduction/solvers.hpp
new file mode 100644
index 0000000000..1e78b8851d
--- /dev/null
+++ b/src/include/miopen/cumulative_reduction/solvers.hpp
@@ -0,0 +1,59 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/cumulative_reduction/problem_description.hpp>
+#include <miopen/solver.hpp>
+
+namespace miopen {
+
+namespace solver {
+
+namespace cumulative_reduction {
+
+using ForwardSolverBase =
+    NonTunableSolverBase<ExecutionContext, miopen::cumulative_reduction::ForwardProblemDescription>;
+
+struct ForwardContiguousLastDim final : ForwardSolverBase
+{
+    const std::string& SolverDbId() const override
+    {
+        return GetSolverDbId<ForwardContiguousLastDim>();
+    }
+
+    bool IsApplicable(
+        const ExecutionContext& context,
+        const miopen::cumulative_reduction::ForwardProblemDescription& problem) const override;
+    ConvSolution GetSolution(
+        const ExecutionContext& context,
+        const miopen::cumulative_reduction::ForwardProblemDescription& problem) const override;
+};
+
+} // namespace cumulative_reduction
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/include/miopen/cumulative_reduction/utils.hpp b/src/include/miopen/cumulative_reduction/utils.hpp
new file mode 100644
index 0000000000..9137019ff4
--- /dev/null
+++ b/src/include/miopen/cumulative_reduction/utils.hpp
@@ -0,0 +1,45 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/kernel_build_params.hpp>
+#include <miopen/problem_description_base.hpp>
+#include <miopen/tensor.hpp>
+#include <miopen/solver.hpp>
+
+namespace miopen {
+namespace solver {
+namespace cumulative_reduction {
+
+KernelInfo make_hip_kernel(std::vector<size_t> localsize,
+                           std::vector<size_t> gridsize,
+                           std::string kernel_file,
+                           std::string kernel_name,
+                           KernelBuildParameters build_params);
+
+} // namespace cumulative_reduction
+} // namespace solver
+} // namespace miopen
diff --git a/src/kernels/MIOpenCumulativeReduction.cpp b/src/kernels/MIOpenCumulativeReduction.cpp
new file mode 100644
index 0000000000..a6567d441c
--- /dev/null
+++ b/src/kernels/MIOpenCumulativeReduction.cpp
@@ -0,0 +1,144 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#endif
+
+#include "float_types.h"
+#include "MIOpenCumulativeReduction.hpp"
+
+template <CumulativeReductionOp_t OP, uint64_t LOCAL_SIZE, typename... Ts>
+__device__ inline void CumulativeReductionScan(const bool& reverse,
+                                               const uint64_t& lid,
+                                               FLOAT_ACCUM* __restrict__ a,
+                                               Ts* __restrict__... b)
+{
+    // reduction
+    uint64_t stride = 1;
+    while(stride <= LOCAL_SIZE)
+    {
+        uint64_t idx = (lid + 1) * stride * 2 - 1;
+        if(idx < LOCAL_SIZE)
+            reduce_func<OP, FLOAT_ACCUM, Ts...>{}.calculate(
+                !reverse, a[idx], a[idx - stride], b[idx]..., b[idx - stride]...);
+        stride *= 2;
+        __syncthreads();
+    }
+
+    // post scan
+    stride = LOCAL_SIZE / 2;
+    while(stride > 0)
+    {
+        uint64_t idx = (lid + 1) * stride * 2 - 1;
+        if((idx + stride) < LOCAL_SIZE)
+            reduce_func<OP, FLOAT_ACCUM, Ts...>{}.calculate(
+                !reverse, a[idx + stride], a[idx], b[idx + stride]..., b[idx]...);
+        stride /= 2;
+        __syncthreads();
+    }
+}
+
+template <typename TI, typename TO, CumulativeReductionOp_t OP, uint64_t LOCAL_SIZE>
+__device__ void CumulativeReductionForwardContiguousLastDim(const TI* __restrict__ input,
+                                                            TO* __restrict__ output,
+                                                            int64_t* __restrict__ indices,
+                                                            const uint64_t reduce_size,
+                                                            const bool exclusive,
+                                                            const bool reverse)
+{
+    /*
+     * input = packed tensor with stride[last_dim]=1, output: the same as input, indices: the same
+as input
+     * reduce_size = input.size[last_dim]
+     * exclusive: TRUE to exclude input[i] when calculate output[i]
+     * reverse: reverse the operating order
+     *
+     * cumulative dimension = last dim
+     * blockSize = {1, LOCAL_SIZE}
+     * gridSize = {Number of input elements / input.size[last_dim], input.size[last_dim]}
+     */
+
+    __shared__ FLOAT_ACCUM otmp[LOCAL_SIZE];
+    int64_t* itmp = nullptr;
+    if(indices)
+    {
+        __shared__ int64_t _itmp[LOCAL_SIZE];
+        itmp = _itmp;
+    }
+
+    uint64_t lid = threadIdx.y;
+
+    uint64_t xid = blockIdx.x * blockDim.x + threadIdx.x;
+    uint64_t yid = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if(exclusive <= yid && yid < reduce_size)
+    {
+        int64_t idx = (reverse ? reduce_size - static_cast<int64_t>(yid) + exclusive - 1
+                               : static_cast<int64_t>(yid) - exclusive);
+        otmp[lid]   = CVT_FLOAT2ACCUM(input[xid * reduce_size + idx]);
+        if(indices)
+            itmp[lid] = idx;
+    }
+    else
+    {
+        otmp[lid] = reduce_func<OP, FLOAT_ACCUM>{}.START_VAL;
+        if(indices)
+            itmp[lid] = (reverse ? reduce_size - static_cast<int64_t>(yid) + exclusive - 1
+                                 : static_cast<int64_t>(yid) - exclusive);
+    }
+    __syncthreads();
+
+    if(indices)
+        CumulativeReductionScan<OP, LOCAL_SIZE, int64_t>(reverse, lid, otmp, itmp);
+    else
+        CumulativeReductionScan<OP, LOCAL_SIZE>(reverse, lid, otmp);
+
+    if(yid < reduce_size)
+    {
+        int64_t idx =
+            (reverse ? reduce_size - static_cast<int64_t>(yid) - 1 : static_cast<int64_t>(yid));
+        if(output)
+            output[xid * reduce_size + idx] = CVT_ACCUM2FLOAT(otmp[lid]);
+        if(indices)
+            indices[xid * reduce_size + idx] = itmp[lid];
+    }
+}
+
+extern "C" __global__ void CumulativeReductionForwardContiguousLastDim(const INPUT_TYPE* input,
+                                                                       OUTPUT_TYPE* output,
+                                                                       int64_t* indices,
+                                                                       const uint64_t reduce_size,
+                                                                       const bool exclusive,
+                                                                       const bool reverse)
+{
+    // instantiate the kernel
+    CumulativeReductionForwardContiguousLastDim<INPUT_TYPE,
+                                                OUTPUT_TYPE,
+                                                (CumulativeReductionOp_t)OP_TYPE,
+                                                REDUCE_SIZE>(
+        input, output, indices, reduce_size, exclusive, reverse);
+}
diff --git a/src/kernels/MIOpenCumulativeReduction.hpp b/src/kernels/MIOpenCumulativeReduction.hpp
new file mode 100644
index 0000000000..7323642e8d
--- /dev/null
+++ b/src/kernels/MIOpenCumulativeReduction.hpp
@@ -0,0 +1,116 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef GUARD_KERNELS_MIOPEN_CUMULATIVE_REDUCTIONS_HPP
+#define GUARD_KERNELS_MIOPEN_CUMULATIVE_REDUCTIONS_HPP
+
+#include "float_types.h"
+
+enum class CumulativeReductionOp_t
+{
+    Max  = 1,
+    Min  = 2,
+    Sum  = 3,
+    Prod = 4,
+};
+
+#ifndef __HIP_DEVICE_COMPILE__
+static_assert(MIOPEN_CUM_MAX == static_cast<int>(CumulativeReductionOp_t::Max));
+static_assert(MIOPEN_CUM_MIN == static_cast<int>(CumulativeReductionOp_t::Min));
+static_assert(MIOPEN_CUM_SUM == static_cast<int>(CumulativeReductionOp_t::Sum));
+static_assert(MIOPEN_CUM_PROD == static_cast<int>(CumulativeReductionOp_t::Prod));
+#endif
+
+inline constexpr void update() {}
+template <typename T, typename... Ts>
+inline constexpr void update(T& a, T b, Ts&... c, Ts... d)
+{
+    a = b;
+    update(c..., d...);
+}
+
+inline constexpr bool isgreater() { return false; }
+template <typename T, typename... Ts>
+inline constexpr bool isgreater(T& a, T b, Ts&... c, Ts... d)
+{
+    if(a != b)
+        return a > b;
+    return isgreater(c..., d...);
+}
+
+template <typename reduce_func_derivedT, typename T, typename... Ts>
+struct reduce_func_base
+{
+    inline constexpr bool isbetter(const T& /*a*/, const T& /*b*/) { return false; }
+    inline constexpr void combine(T& a, T b) { a = b; }
+    inline constexpr void calculate(const bool keep_greater, T& a, T b, Ts&... c, Ts... d)
+    {
+        auto derived = static_cast<reduce_func_derivedT*>(this);
+        if(!derived->isbetter(a, b))
+        {
+            if(derived->isbetter(a, b) != derived->isbetter(b, a) ||
+               isgreater(c..., d...) != keep_greater)
+                update(c..., d...);
+            derived->combine(a, b);
+        }
+    }
+};
+
+template <CumulativeReductionOp_t OP, typename T, typename... Ts>
+struct reduce_func;
+
+template <typename T, typename... Ts>
+struct reduce_func<CumulativeReductionOp_t::Max, T, Ts...>
+    : reduce_func_base<reduce_func<CumulativeReductionOp_t::Max, T, Ts...>, T, Ts...>
+{
+    const FLOAT_ACCUM START_VAL = -MAX_VAL_ACCUM;
+    inline constexpr bool isbetter(const T& a, const T& b) { return a > b; }
+};
+
+template <typename T, typename... Ts>
+struct reduce_func<CumulativeReductionOp_t::Min, T, Ts...>
+    : reduce_func_base<reduce_func<CumulativeReductionOp_t::Min, T, Ts...>, T, Ts...>
+{
+    const FLOAT_ACCUM START_VAL = MAX_VAL_ACCUM;
+    inline constexpr bool isbetter(const T& a, const T& b) { return a < b; }
+};
+
+template <typename T, typename... Ts>
+struct reduce_func<CumulativeReductionOp_t::Sum, T, Ts...>
+    : reduce_func_base<reduce_func<CumulativeReductionOp_t::Sum, T, Ts...>, T, Ts...>
+{
+    const FLOAT_ACCUM START_VAL = CVT_FP32_2ACCUM(0.0f);
+    inline constexpr void combine(T& a, T b) { a += b; }
+};
+
+template <typename T, typename... Ts>
+struct reduce_func<CumulativeReductionOp_t::Prod, T, Ts...>
+    : reduce_func_base<reduce_func<CumulativeReductionOp_t::Prod, T, Ts...>, T, Ts...>
+{
+    const FLOAT_ACCUM START_VAL = CVT_FP32_2ACCUM(1.0f);
+    inline constexpr void combine(T& a, T b) { a *= b; }
+};
+
+#endif // GUARD_GUARD_KERNELS_MIOPEN_CUMULATIVE_REDUCTIONS_HPP
diff --git a/src/solver.cpp b/src/solver.cpp
index 1f6873d5f7..87ec85a274 100644
--- a/src/solver.cpp
+++ b/src/solver.cpp
@@ -43,6 +43,7 @@
 #include <miopen/softmarginloss/solvers.hpp>
 #include <miopen/softmax/solvers.hpp>
 #include <miopen/multimarginloss/solvers.hpp>
+#include <miopen/cumulative_reduction/solvers.hpp>
 
 #include <miopen/conv_algo_name.hpp>
 #include <miopen/db.hpp>
@@ -688,6 +689,7 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
 
     Register(registry, ++id, Primitive::RoPE, rope::RoPEForward{}.SolverDbId());
     Register(registry, ++id, Primitive::RoPE, rope::RoPEBackward{}.SolverDbId());
+
     Register(registry, ++id, Primitive::ReLU, prelu::MultiWeightsBackward{}.SolverDbId());
     Register(registry, ++id, Primitive::ReLU, prelu::SingleWeightBackward{}.SolverDbId());
     Register(registry, ++id, Primitive::Kthvalue, kthvalue::KthvalueFwd{}.SolverDbId());
@@ -709,6 +711,11 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
              multimarginloss::MultiMarginLossForward{}.SolverDbId());
 
     Register(registry, ++id, Primitive::Mha, mha::MhaCKFlashAttentionV2Forward{}.SolverDbId());
+
+    Register(registry,
+             ++id,
+             Primitive::Reduce,
+             cumulative_reduction::ForwardContiguousLastDim{}.SolverDbId());
     // IMPORTANT: New solvers should be added to the end of the function, and don't leave a white
     // space between this comment and the newly registered solver(s)!
 }
diff --git a/src/solver/cumulative_reduction/forward_cumulative_reduction_contiguous_lastdim.cpp b/src/solver/cumulative_reduction/forward_cumulative_reduction_contiguous_lastdim.cpp
new file mode 100644
index 0000000000..45c312f1a1
--- /dev/null
+++ b/src/solver/cumulative_reduction/forward_cumulative_reduction_contiguous_lastdim.cpp
@@ -0,0 +1,125 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/datatype.hpp>
+#include <miopen/kernel_build_params.hpp>
+#include <miopen/cumulative_reduction/invoke_params.hpp>
+#include <miopen/cumulative_reduction/solvers.hpp>
+#include <miopen/cumulative_reduction/utils.hpp>
+
+#define warpSizeCTX (context.GetStream().GetWavefrontWidth())
+#define LOCAL_SIZE_MAX 256
+#define LOCAL_SIZE_MIN warpSizeCTX
+
+namespace miopen {
+
+namespace solver {
+
+namespace cumulative_reduction {
+
+bool IsImprovementOverROCm(const ExecutionContext& /*context*/,
+                           const miopen::cumulative_reduction::ForwardProblemDescription& problem)
+{
+    if(problem.GetInputDesc().GetLengths()[problem.GetDim()] > LOCAL_SIZE_MAX)
+        return false;
+    return true;
+}
+
+bool ForwardContiguousLastDim::IsApplicable(
+    const ExecutionContext& context,
+    const miopen::cumulative_reduction::ForwardProblemDescription& problem) const
+{
+    if(!IsImprovementOverROCm(context, problem))
+        return false;
+    if(!problem.IsAllPacked())
+        return false;
+    if(!problem.IsAllDimStride1())
+        return false;
+    return true;
+}
+
+ConvSolution ForwardContiguousLastDim::GetSolution(
+    const ExecutionContext& context,
+    const miopen::cumulative_reduction::ForwardProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto dtype        = problem.GetInputDesc().GetType();
+    auto input_dtype  = miopen::GetDataType(problem.GetInputDesc().GetType());
+    auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType());
+    auto cum_op       = problem.GetCumOp();
+
+    auto size       = problem.GetInputDesc().GetElementSize();
+    auto inner_size = problem.GetInputDesc().GetLengths()[problem.GetDim()];
+    auto outer_size = size / inner_size;
+
+    // LOCAL_SIZE must be the smallest power of 2 that greater than inner_size and warpSize
+    auto local_size = LOCAL_SIZE_MIN;
+    while(local_size < inner_size)
+        local_size *= 2;
+
+    auto build_params = KernelBuildParameters{
+        {"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+        {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+        {"MIOPEN_USE_FP64", static_cast<int>(dtype == miopenDouble)},
+        {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+        {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype},
+        {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype},
+        {"OP_TYPE", cum_op},
+        {"REDUCE_SIZE", local_size},
+    };
+
+    {
+        result.construction_params.push_back(
+            make_hip_kernel({1, local_size},
+                            {outer_size, inner_size},
+                            "MIOpenCumulativeReduction.cpp",
+                            "CumulativeReductionForwardContiguousLastDim",
+                            build_params));
+    }
+
+    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            auto params = raw_params.CastTo<miopen::cumulative_reduction::InvokeParams>();
+
+            const int ndims             = deref(params.inputDesc).GetNumDims();
+            const unsigned int true_dim = ((params.dim % ndims) + ndims) % ndims;
+            auto kernel                 = handle_.Run(kernels[0]);
+            kernel(params.input,
+                   params.output,
+                   params.indices,
+                   static_cast<uint64_t>(deref(params.inputDesc).GetLengths()[true_dim]),
+                   params.exclusive,
+                   params.reverse);
+        };
+    };
+
+    return result;
+}
+
+} // namespace cumulative_reduction
+} // namespace solver
+} // namespace miopen
diff --git a/src/solver/cumulative_reduction/utils.cpp b/src/solver/cumulative_reduction/utils.cpp
new file mode 100644
index 0000000000..6ee3b00a15
--- /dev/null
+++ b/src/solver/cumulative_reduction/utils.cpp
@@ -0,0 +1,54 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/mlo_internal.hpp>
+#include <miopen/cumulative_reduction/utils.hpp>
+
+namespace miopen {
+
+namespace solver {
+
+namespace cumulative_reduction {
+
+KernelInfo make_hip_kernel(std::vector<size_t> localsize,
+                           std::vector<size_t> gridsize,
+                           std::string kernel_file,
+                           std::string kernel_name,
+                           KernelBuildParameters build_params)
+{
+    while(localsize.size() < 3)
+        localsize.push_back(1);
+    while(gridsize.size() < 3)
+        gridsize.push_back(1);
+    for(int i = 0; i < localsize.size(); ++i)
+        gridsize[i] = AlignUp(gridsize[i], localsize[i]);
+    return KernelInfo{
+        build_params.GenerateFor(kbp::HIP{}), localsize, gridsize, kernel_file, kernel_name};
+}
+
+} // namespace cumulative_reduction
+} // namespace solver
+} // namespace miopen
diff --git a/test/cpu_cumulative_reduction.hpp b/test/cpu_cumulative_reduction.hpp
new file mode 100644
index 0000000000..922a87af19
--- /dev/null
+++ b/test/cpu_cumulative_reduction.hpp
@@ -0,0 +1,145 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#pragma once
+
+#include "tensor_holder.hpp"
+#include <miopen/tensor_view_utils.hpp>
+#include <miopen/cumulative_reduction/utils.hpp>
+
+#include <limits>
+
+inline constexpr void update() {}
+
+template <typename T, typename... Ts>
+inline constexpr void update(T& a, T b, Ts&... c, Ts... d)
+{
+    a = b;
+    update(c..., d...);
+}
+
+template <typename T, typename... Ts>
+struct reduce_func_base
+{
+    reduce_func_base(){};
+    virtual ~reduce_func_base(){};
+    virtual inline bool isbetter(const T& /*a*/, const T& /*b*/) const { return false; }
+    virtual inline void combine(T& a, T b) const { a = b; }
+    inline constexpr void calculate(T& a, T b, Ts&... c, Ts... d) const
+    {
+        if(!isbetter(a, b))
+        {
+            combine(a, b);
+            update(c..., d...);
+        }
+    }
+};
+
+template <miopenCumOp_t OP, typename T, typename... Ts>
+struct reduce_func : reduce_func_base<T, Ts...>
+{
+    virtual ~reduce_func(){};
+};
+
+template <typename T, typename... Ts>
+struct reduce_func<MIOPEN_CUM_MAX, T, Ts...> : reduce_func_base<T, Ts...>
+{
+    const float START_VAL = -std::numeric_limits<float>::max();
+    inline bool isbetter(const T& a, const T& b) const override { return a > b; }
+};
+
+template <typename T, typename... Ts>
+struct reduce_func<MIOPEN_CUM_MIN, T, Ts...> : reduce_func_base<T, Ts...>
+{
+    const float START_VAL = std::numeric_limits<float>::max();
+    inline bool isbetter(const T& a, const T& b) const override { return a < b; }
+};
+
+template <typename T, typename... Ts>
+struct reduce_func<MIOPEN_CUM_SUM, T, Ts...> : reduce_func_base<T, Ts...>
+{
+    const float START_VAL = 0.0f;
+    inline void combine(T& a, T b) const override { a += b; }
+};
+
+template <typename T, typename... Ts>
+struct reduce_func<MIOPEN_CUM_PROD, T, Ts...> : reduce_func_base<T, Ts...>
+{
+    const float START_VAL = 1.0f;
+    inline void combine(T& a, T b) const override { a *= b; }
+};
+
+template <miopenCumOp_t OP, class T>
+void cpu_cumulative_reduction_forward(const tensor<T> input,
+                                      tensor<T>& ref_output,
+                                      tensor<int64_t>& ref_indices,
+                                      const int dim,
+                                      const bool exclusive,
+                                      const bool reverse,
+                                      const bool has_output  = true,
+                                      const bool has_indices = true)
+{
+    const int ndims     = input.desc.GetNumDims();
+    const auto true_dim = ((dim % ndims) + ndims) % ndims;
+
+    auto input_tv   = miopen::get_inner_expanded_tv<5>(input.desc);
+    auto output_tv  = miopen::get_inner_expanded_tv<5>(ref_output.desc);
+    auto indices_tv = miopen::get_inner_expanded_tv<5>(ref_indices.desc);
+
+    auto size       = input.desc.GetElementSize();
+    auto inner_size = input.desc.GetLengths()[true_dim];
+    auto outer_size = size / inner_size;
+
+    auto op_worker = reduce_func<OP, float, int64_t>{};
+
+    tensor_view_t<5> ignore_dim_input_tv = input_tv;
+    ignore_dim_input_tv.size[true_dim]   = 1;
+
+    par_ford(outer_size)([&](int64_t gid) {
+        auto tensor_layout = tensor_layout_t<5>(ignore_dim_input_tv, gid);
+        float cum_val      = op_worker.START_VAL;
+        int64_t cum_idx    = (reverse ? input_tv.size[true_dim] - 1 : 0);
+
+        ford(inner_size)([&](int64_t idx) {
+            int64_t tmp_idx =
+                (reverse ? input_tv.size[true_dim] - (idx - exclusive) - 1 : (idx - exclusive));
+            float tmp_val = op_worker.START_VAL;
+            if(0 <= tmp_idx && tmp_idx < inner_size)
+            {
+                tensor_layout.layout[true_dim] = tmp_idx;
+                tmp_val = static_cast<float>(input[input_tv.get_tensor_view_idx(tensor_layout)]);
+            }
+
+            op_worker.calculate(cum_val, tmp_val, cum_idx, tmp_idx);
+
+            tensor_layout.layout[true_dim] = (reverse ? input_tv.size[true_dim] - idx - 1 : idx);
+            if(has_output)
+                ref_output[output_tv.get_tensor_view_idx(tensor_layout)] = static_cast<T>(cum_val);
+            if(has_indices)
+                ref_indices[indices_tv.get_tensor_view_idx(tensor_layout)] = cum_idx;
+        });
+    });
+}
diff --git a/test/gtest/cumulative_reduction.cpp b/test/gtest/cumulative_reduction.cpp
new file mode 100644
index 0000000000..e20afcf0b0
--- /dev/null
+++ b/test/gtest/cumulative_reduction.cpp
@@ -0,0 +1,81 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "cumulative_reduction.hpp"
+
+namespace cumulative_reduction {
+using GPU_CumulativeReduction_fwd_FP32  = CumulativeReductionTest<float>;
+using GPU_CumulativeReduction_fwd_FP16  = CumulativeReductionTest<half>;
+using GPU_CumulativeReduction_fwd_BFP16 = CumulativeReductionTest<bfloat16>;
+} // namespace cumulative_reduction
+
+using namespace cumulative_reduction;
+
+TEST_P(GPU_CumulativeReduction_fwd_FP32, Test)
+{
+    RunTest();
+    Verify();
+};
+TEST_P(GPU_CumulativeReduction_fwd_FP16, Test)
+{
+    RunTest();
+    Verify();
+};
+TEST_P(GPU_CumulativeReduction_fwd_BFP16, Test)
+{
+    RunTest();
+    Verify();
+};
+
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_CumulativeReduction_fwd_FP32,
+                         testing::ValuesIn(CumulativeReductionSmokeTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_CumulativeReduction_fwd_FP16,
+                         testing::ValuesIn(CumulativeReductionSmokeTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_CumulativeReduction_fwd_BFP16,
+                         testing::ValuesIn(CumulativeReductionSmokeTestConfigs()));
+
+INSTANTIATE_TEST_SUITE_P(Perf,
+                         GPU_CumulativeReduction_fwd_FP32,
+                         testing::ValuesIn(CumulativeReductionPerfTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Perf,
+                         GPU_CumulativeReduction_fwd_FP16,
+                         testing::ValuesIn(CumulativeReductionPerfTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Perf,
+                         GPU_CumulativeReduction_fwd_BFP16,
+                         testing::ValuesIn(CumulativeReductionPerfTestConfigs()));
+
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_CumulativeReduction_fwd_FP32,
+                         testing::ValuesIn(CumulativeReductionFullTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_CumulativeReduction_fwd_FP16,
+                         testing::ValuesIn(CumulativeReductionFullTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_CumulativeReduction_fwd_BFP16,
+                         testing::ValuesIn(CumulativeReductionFullTestConfigs()));
diff --git a/test/gtest/cumulative_reduction.hpp b/test/gtest/cumulative_reduction.hpp
new file mode 100644
index 0000000000..b89c683512
--- /dev/null
+++ b/test/gtest/cumulative_reduction.hpp
@@ -0,0 +1,305 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "cpu_cumulative_reduction.hpp"
+#include "get_handle.hpp"
+#include "random.hpp"
+#include "tensor_holder.hpp"
+#include "verify.hpp"
+
+#include <gtest/gtest.h>
+#include <miopen/miopen.h>
+#include <miopen/cumulative_reduction.hpp>
+#include <miopen/cumulative_reduction/solvers.hpp>
+
+#define FLOAT_ACCUM float
+
+inline std::ostream& operator<<(std::ostream& os, const std::vector<size_t>& v)
+{
+    os << '{';
+    for(int i = 0; i < v.size(); ++i)
+    {
+        if(i != 0)
+            os << ',';
+        os << v[i];
+    }
+    os << '}';
+    return os;
+}
+
+struct CumulativeReductionTestCase
+{
+    std::vector<size_t> lengths;
+    miopenCumOp_t op;
+    int dim;
+    bool exclusive;
+    bool reverse;
+    bool contiguous;
+
+    friend std::ostream& operator<<(std::ostream& os, const CumulativeReductionTestCase& tc)
+    {
+        return os << " Lengths:" << tc.lengths << " CumOp:" << tc.op << " Dim:" << tc.dim
+                  << " Exclusive:" << (tc.exclusive ? "True" : "False")
+                  << " Reverse:" << (tc.reverse ? "True" : "False")
+                  << " Contiguous:" << (tc.contiguous ? "True" : "False");
+    }
+};
+
+inline std::vector<size_t> GetStrides(std::vector<size_t> lengths, bool contiguous)
+{
+    if(!contiguous)
+        std::swap(lengths.front(), lengths.back());
+    std::vector<size_t> strides(lengths.size());
+    strides.back() = 1;
+    for(int i = lengths.size() - 2; i >= 0; --i)
+        strides[i] = strides[i + 1] * lengths[i + 1];
+    if(!contiguous)
+        std::swap(strides.front(), strides.back());
+    return strides;
+}
+
+inline std::vector<CumulativeReductionTestCase>
+CumulativeReductionTestConfigs(const std::vector<std::vector<size_t>>& SizeList)
+{
+    std::vector<CumulativeReductionTestCase> tcs;
+
+    std::vector<miopenCumOp_t> ops = {
+        MIOPEN_CUM_MAX, MIOPEN_CUM_MIN, MIOPEN_CUM_SUM, MIOPEN_CUM_PROD};
+    std::vector<size_t> dims      = {-1, 0};
+    std::vector<bool> exclusives  = {false, true};
+    std::vector<bool> reverses    = {false, true};
+    std::vector<bool> contiguouss = {true, false};
+
+    auto&& handle = get_handle();
+    for(const auto& lengths : SizeList)
+    {
+        auto out_strides     = GetStrides(lengths, true);
+        auto indices_strides = GetStrides(lengths, true);
+        for(auto contiguous : contiguouss)
+        {
+            auto input_strides = GetStrides(lengths, contiguous);
+            for(auto op : ops)
+            {
+                for(auto dim : dims)
+                {
+                    for(auto exclusive : exclusives)
+                    {
+                        if(exclusive && (op == MIOPEN_CUM_MAX || op == MIOPEN_CUM_MIN))
+                            continue;
+                        for(auto reverse : reverses)
+                        {
+                            if(miopen::solver::cumulative_reduction::ForwardContiguousLastDim()
+                                   .IsApplicable(
+                                       miopen::ExecutionContext(&handle),
+                                       miopen::cumulative_reduction::ForwardProblemDescription(
+                                           miopen::TensorDescriptor(
+                                               miopen_type<float>{}, lengths, input_strides),
+                                           miopen::TensorDescriptor(
+                                               miopen_type<float>{}, lengths, out_strides),
+                                           miopen::TensorDescriptor(
+                                               miopen_type<int64_t>{}, lengths, indices_strides),
+                                           dim,
+                                           op)))
+                                tcs.push_back({lengths, op, dim, exclusive, reverse, contiguous});
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    return tcs;
+}
+
+inline std::vector<std::vector<size_t>> GetSmokeTestSize()
+{
+    return {
+        {10},
+        {65, 100},
+        {65},
+        {70, 10},
+    };
+}
+
+inline std::vector<std::vector<size_t>> GetSmokePerfSize()
+{
+    return {
+        {512, 64, 112},
+    };
+}
+
+inline std::vector<CumulativeReductionTestCase> CumulativeReductionSmokeTestConfigs()
+{
+    return CumulativeReductionTestConfigs(GetSmokeTestSize());
+}
+
+inline std::vector<CumulativeReductionTestCase> CumulativeReductionPerfTestConfigs()
+{
+    return CumulativeReductionTestConfigs(GetSmokePerfSize());
+}
+
+inline std::vector<CumulativeReductionTestCase> CumulativeReductionFullTestConfigs()
+{
+    std::vector<CumulativeReductionTestCase> tcs;
+
+    auto smoke_test = CumulativeReductionSmokeTestConfigs();
+    auto perf_test  = CumulativeReductionPerfTestConfigs();
+
+    tcs.reserve(smoke_test.size() + perf_test.size());
+    for(const auto& test : smoke_test)
+        tcs.push_back(test);
+    for(const auto& test : perf_test)
+        tcs.push_back(test);
+
+    return tcs;
+}
+
+template <typename T>
+struct CumulativeReductionTest : public ::testing::TestWithParam<CumulativeReductionTestCase>
+{
+protected:
+    void SetUp() override
+    {
+        auto&& handle               = get_handle();
+        cumulative_reduction_config = GetParam();
+        auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 100); };
+
+        auto lengths = cumulative_reduction_config.lengths;
+
+        auto input_strides = GetStrides(lengths, cumulative_reduction_config.contiguous);
+        input              = tensor<T>{lengths, input_strides}.generate(gen_value);
+
+        auto out_strides = GetStrides(lengths, true);
+        output           = tensor<T>{lengths, out_strides};
+
+        auto indices_strides = GetStrides(lengths, true);
+        indices              = tensor<int64_t>{lengths, indices_strides};
+
+        ref_output  = tensor<T>{lengths, out_strides};
+        ref_indices = tensor<int64_t>{lengths, indices_strides};
+
+        input_dev   = handle.Write(input.data);
+        output_dev  = handle.Write(output.data);
+        indices_dev = handle.Write(indices.data);
+    }
+
+    void RunTest()
+    {
+        switch(cumulative_reduction_config.op)
+        {
+        case MIOPEN_CUM_MAX:
+            cpu_cumulative_reduction_forward<MIOPEN_CUM_MAX, T>(
+                input,
+                ref_output,
+                ref_indices,
+                cumulative_reduction_config.dim,
+                cumulative_reduction_config.exclusive,
+                cumulative_reduction_config.reverse);
+            break;
+        case MIOPEN_CUM_MIN:
+            cpu_cumulative_reduction_forward<MIOPEN_CUM_MIN, T>(
+                input,
+                ref_output,
+                ref_indices,
+                cumulative_reduction_config.dim,
+                cumulative_reduction_config.exclusive,
+                cumulative_reduction_config.reverse);
+            break;
+        case MIOPEN_CUM_SUM:
+            cpu_cumulative_reduction_forward<MIOPEN_CUM_SUM, T>(
+                input,
+                ref_output,
+                ref_indices,
+                cumulative_reduction_config.dim,
+                cumulative_reduction_config.exclusive,
+                cumulative_reduction_config.reverse);
+            break;
+        case MIOPEN_CUM_PROD:
+            cpu_cumulative_reduction_forward<MIOPEN_CUM_PROD, T>(
+                input,
+                ref_output,
+                ref_indices,
+                cumulative_reduction_config.dim,
+                cumulative_reduction_config.exclusive,
+                cumulative_reduction_config.reverse);
+            break;
+        }
+
+        auto&& handle = get_handle();
+        miopenStatus_t status;
+
+        status = miopen::cumulative_reduction::CumulativeReductionForward(
+            handle,
+            input.desc,
+            input_dev.get(),
+            output.desc,
+            output_dev.get(),
+            indices.desc,
+            indices_dev.get(),
+            cumulative_reduction_config.dim,
+            cumulative_reduction_config.exclusive,
+            cumulative_reduction_config.reverse,
+            cumulative_reduction_config.op);
+        EXPECT_EQ(status, miopenStatusSuccess);
+        output.data  = handle.Read<T>(output_dev, output.data.size());
+        indices.data = handle.Read<int64_t>(indices_dev, indices.data.size());
+    }
+
+    void Verify()
+    {
+        // Computation error of fp16 is ~2^13 (=8192) bigger than
+        // the one of fp32 because mantissa is shorter by 13 bits.
+        double tolerance = std::is_same<T, float>::value ? 1.5e-6 : 8.2e-3;
+
+        // bf16 mantissa has 7 bits, by 3 bits shorter than fp16.
+        if(std::is_same<T, bfloat16>::value)
+            tolerance *= 8.0;
+
+        auto error_output  = miopen::rms_range(ref_output, output);
+        auto error_indices = miopen::rms_range(ref_indices, indices);
+        ASSERT_EQ(miopen::range_distance(ref_output), miopen::range_distance(output));
+        ASSERT_EQ(miopen::range_distance(ref_indices), miopen::range_distance(indices));
+        EXPECT_LT(error_output, tolerance)
+            << "Error forward Output beyond tolerance Error: " << error_output
+            << " Tolerance: " << tolerance;
+        EXPECT_LT(error_indices, tolerance)
+            << "Error forward Indices beyond tolerance Error: " << error_indices
+            << " Tolerance: " << tolerance;
+    }
+
+    CumulativeReductionTestCase cumulative_reduction_config;
+
+    tensor<T> input;
+    tensor<T> output;
+    tensor<int64_t> indices;
+
+    tensor<T> ref_output;
+    tensor<int64_t> ref_indices;
+
+    miopen::Allocator::ManageDataPtr input_dev;
+    miopen::Allocator::ManageDataPtr output_dev;
+    miopen::Allocator::ManageDataPtr indices_dev;
+};