From 31942666ec763dea7f7befef9b308118c83201f2 Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Tue, 17 Dec 2024 14:22:55 -0500
Subject: [PATCH] Address inspect tool, check module cmakelists, warnings and
 spell check

- missing includes
- prevent max/min being expanded as macros
- minor spelling correction (indicies -> indices)
- remove pragma once in cpp file
- resolve implicit type conversions in the RFA conversions to single and double, and in other places
- add dual license
- remove unnecessary command for macos ci
- use HPX_UNROLL instead of vanilla pragma
- clang-17 cannot unroll these loops, so only emit HPX_UNROLL when HPX_CLANG_VERSION is not defined

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 .github/workflows/macos_debug_fetch_hwloc.yml |   1 -
 libs/core/algorithms/CMakeLists.txt           |   3 +
 .../detail/reduce_deterministic.hpp           |   3 +
 .../hpx/parallel/algorithms/detail/rfa.hpp    | 159 ++++++++++++------
 .../unit/algorithms/reduce_deterministic.cpp  |  14 +-
 5 files changed, 125 insertions(+), 55 deletions(-)
diff --git a/.github/workflows/macos_debug_fetch_hwloc.yml b/.github/workflows/macos_debug_fetch_hwloc.yml
index f778a6b117d7..caec2af3ac25 100644
--- a/.github/workflows/macos_debug_fetch_hwloc.yml
+++ b/.github/workflows/macos_debug_fetch_hwloc.yml
@@ -19,7 +19,6 @@ jobs:
       run: |
           brew install --overwrite python-tk && \
           brew install --overwrite boost gperftools ninja autoconf automake && \
-          autoreconf -f -i \
           brew upgrade cmake
     - name: Configure
       shell: bash
diff --git a/libs/core/algorithms/CMakeLists.txt b/libs/core/algorithms/CMakeLists.txt
index 6fcfed897e2f..9090345722df 100644
--- a/libs/core/algorithms/CMakeLists.txt
+++ b/libs/core/algorithms/CMakeLists.txt
@@ -37,7 +37,9 @@ set(algorithms_headers
     hpx/parallel/algorithms/detail/parallel_stable_sort.hpp
     hpx/parallel/algorithms/detail/pivot.hpp
     hpx/parallel/algorithms/detail/reduce.hpp
+    hpx/parallel/algorithms/detail/reduce_deterministic.hpp
     hpx/parallel/algorithms/detail/replace.hpp
+    hpx/parallel/algorithms/detail/rfa.hpp
     hpx/parallel/algorithms/detail/rotate.hpp
     hpx/parallel/algorithms/detail/sample_sort.hpp
     hpx/parallel/algorithms/detail/search.hpp
@@ -72,6 +74,7 @@ set(algorithms_headers
     hpx/parallel/algorithms/partition.hpp
     hpx/parallel/algorithms/reduce_by_key.hpp
     hpx/parallel/algorithms/reduce.hpp
+    hpx/parallel/algorithms/reduce_deterministic.hpp
     hpx/parallel/algorithms/remove_copy.hpp
     hpx/parallel/algorithms/remove.hpp
     hpx/parallel/algorithms/replace.hpp
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
index 87069858f492..b37730889172 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
@@ -13,6 +13,7 @@
 #include <hpx/parallel/util/loop.hpp>
 
 #include <cstddef>
+#include <cstring>
 #include <limits>
 #include <type_traits>
 #include <utility>
@@ -32,6 +33,8 @@ namespace hpx::parallel::detail {
             sequential_reduce_deterministic_t, ExPolicy&&, InIterB first,
             InIterE last, T init, Reduce&& r)
         {
+            /// TODO: Put constraint on Reduce to be a binary plus operator
+            (void) r;
             hpx::parallel::detail::rfa::RFA_bins<T> bins;
             bins.initialize_bins();
             std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins));
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
index 302f823fab71..fa9142cdf80b 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
@@ -1,3 +1,34 @@
+//  Copyright (c) 2024 Shreyas Atre
+//
+//  SPDX-License-Identifier: BSL-1.0
+//  Distributed under the Boost Software License, Version 1.0. (See accompanying
+//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// ---------------------------------------------------------------------------
+// This file has been taken from
+// https://github.com/maddyscientist/reproducible_floating_sums commit
+// b5a065741d4ea459437ca004b508de9dcb6a3e52. The boost copyright has been added
+// to this file in accordance with the dual license terms for the Reproducible
+// Floating-Point Summations and conformance with the HPX policy
+// https://github.com/maddyscientist/reproducible_floating_sums/blob/feature/cuda/LICENSE.md
+// ---------------------------------------------------------------------------
+//
+/// Copyright 2022 Richard Barnes, Peter Ahrens, James Demmel
+/// Permission is hereby granted, free of charge, to any person obtaining a copy
+/// of this software and associated documentation files (the "Software"), to deal
+/// in the Software without restriction, including without limitation the rights
+/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+/// copies of the Software, and to permit persons to whom the Software is
+/// furnished to do so, subject to the following conditions:
+/// The above copyright notice and this permission notice shall be included in
+/// all copies or substantial portions of the Software.
+/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+/// SOFTWARE.
 //Reproducible Floating Point Accumulations via Binned Floating Point
 //Adapted to C++ by Richard Barnes from ReproBLAS v2.1.0.
 //ReproBLAS by Peter Ahrens, Hong Diep Nguyen, and James Demmel.
@@ -26,6 +57,10 @@
 #include <cmath>
 #include <cstdint>
 #include <limits>
+#include <type_traits>
+#include <vector>
+
+#include <hpx/config.hpp>
 
 namespace hpx::parallel::detail::rfa {
     template <typename F>
@@ -179,7 +214,7 @@ namespace hpx::parallel::detail::rfa {
         static constexpr int FOLD = FOLD_;
 
     private:
-        std::array<ftype, 2 * FOLD> data = {0};
+        std::array<ftype, 2 * FOLD> data = {{0}};
 
         ///Floating-point precision bin width
         static constexpr auto BIN_WIDTH =
@@ -351,21 +386,21 @@ namespace hpx::parallel::detail::rfa {
 
         ///Get index of float-point precision
         ///The index of a non-binned type is the smallest index a binned type would
-        ///need to have to sum it reproducibly. Higher indicies correspond to smaller
+        ///need to have to sum it reproducibly. Higher indices correspond to smaller
         ///bins.
         static inline constexpr int binned_dindex(const ftype x)
         {
             int exp = EXP(x);
             if (exp == 0)
             {
-                if (x == 0.0)
+                if (x == static_cast<ftype>(0.0))
                 {
                     return MAXINDEX;
                 }
                 else
                 {
                     std::frexp(x, &exp);
-                    return std::max((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX);
+                    return (std::max)((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX);
                 }
             }
             return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH;
@@ -373,7 +408,7 @@ namespace hpx::parallel::detail::rfa {
 
         ///Get index of manually specified binned double precision
         ///The index of a binned type is the bin that it corresponds to. Higher
-        ///indicies correspond to smaller bins.
+        ///indices correspond to smaller bins.
         inline int binned_index() const
         {
             return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) -
@@ -416,7 +451,9 @@ namespace hpx::parallel::detail::rfa {
                 int shift = binned_index() - X_index;
                 if (shift > 0)
                 {
-#pragma unroll
+#if !defined(HPX_CLANG_VERSION)
+                    HPX_UNROLL
+#endif
                     for (int i = FOLD - 1; i >= 1; i--)
                     {
                         if (i < shift)
@@ -425,7 +462,9 @@ namespace hpx::parallel::detail::rfa {
                         carry(i * inccarY) = carry((i - shift) * inccarY);
                     }
                     const ftype* const bins = binned_bins(X_index);
-#pragma unroll
+#if !defined(HPX_CLANG_VERSION)
+                    HPX_UNROLL
+#endif
                     for (int j = 0; j < FOLD; j++)
                     {
                         if (j >= shift)
@@ -457,16 +496,19 @@ namespace hpx::parallel::detail::rfa {
             if (binned_index0())
             {
                 M = primary(0);
-                ftype qd = x * COMPRESSION;
+                ftype qd = x * static_cast<ftype>(COMPRESSION);
                 auto& ql = get_bits(qd);
                 ql |= 1;
                 qd += M;
                 primary(0) = qd;
                 M -= qd;
-                M *= EXPANSION * 0.5;
+                auto temp_m = (double) (((double) EXPANSION) * 0.5);
+                M *= static_cast<ftype>(temp_m);
                 x += M;
                 x += M;
-#pragma unroll
+#if !defined(HPX_CLANG_VERSION)
+                HPX_UNROLL
+#endif
                 for (int i = 1; i < FOLD - 1; i++)
                 {
                     M = primary(i * incpriY);
@@ -485,7 +527,9 @@ namespace hpx::parallel::detail::rfa {
             {
                 ftype qd = x;
                 auto& ql = get_bits(qd);
-#pragma unroll
+#if !defined(HPX_CLANG_VERSION)
+                HPX_UNROLL
+#endif
                 for (int i = 0; i < FOLD - 1; i++)
                 {
                     M = primary(i * incpriY);
@@ -550,7 +594,7 @@ namespace hpx::parallel::detail::rfa {
             int i = 0;
 
             if (ISNANINF(primary(0)))
-                return primary(0);
+                return (double) primary(0);
             if (ISZERO(primary(0)))
                 return 0.0;
 
@@ -564,29 +608,36 @@ namespace hpx::parallel::detail::rfa {
             {
                 scale_down = std::ldexp(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH));
                 scale_up = std::ldexp(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH));
-                scaled = std::max(
-                    std::min(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0);
+                scaled = (std::max)(
+                    (std::min)(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0);
                 if (X_index == 0)
                 {
-                    Y += carry(0) * ((bins[0] / 6.0) * scale_down * EXPANSION);
-                    Y += carry(inccarX) * ((bins[1] / 6.0) * scale_down);
-                    Y += (primary(0) - bins[0]) * scale_down * EXPANSION;
+                    Y += ((double) carry(0)) *
+                        ((((double) bins[0]) / 6.0) * scale_down * EXPANSION);
+                    Y += ((double) carry(inccarX)) *
+                        ((((double) bins[1]) / 6.0) * scale_down);
+                    Y += ((double) primary(0) - (double) bins[0]) * scale_down *
+                        EXPANSION;
                     i = 2;
                 }
                 else
                 {
-                    Y += carry(0) * ((bins[0] / 6.0) * scale_down);
+                    Y += ((double) carry(0)) *
+                        (((double) bins[0] / 6.0) * scale_down);
                     i = 1;
                 }
                 for (; i < scaled; i++)
                 {
-                    Y += carry(i * inccarX) * ((bins[i] / 6.0) * scale_down);
-                    Y +=
-                        (primary((i - 1) * incpriX) - bins[i - 1]) * scale_down;
+                    Y += ((double) carry(i * inccarX)) *
+                        (((double) bins[i] / 6.0) * scale_down);
+                    Y += ((double) primary((i - 1) * incpriX) -
+                             (double) (bins[i - 1])) *
+                        scale_down;
                 }
                 if (i == FOLD)
                 {
-                    Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]) *
+                    Y += ((double) primary((FOLD - 1) * incpriX) -
+                             (double) (bins[FOLD - 1])) *
                         scale_down;
                     return Y * scale_up;
                 }
@@ -597,20 +648,23 @@ namespace hpx::parallel::detail::rfa {
                 Y *= scale_up;
                 for (; i < FOLD; i++)
                 {
-                    Y += carry(i * inccarX) * (bins[i] / 6.0);
-                    Y += primary((i - 1) * incpriX) - bins[i - 1];
+                    Y += ((double) carry(i * inccarX)) *
+                        ((double) bins[i] / 6.0);
+                    Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]);
                 }
-                Y += primary((FOLD - 1) * incpriX) - bins[FOLD - 1];
+                Y += ((double) primary((FOLD - 1) * incpriX) -
+                    ((double) bins[FOLD - 1]));
             }
             else
             {
-                Y += carry(0) * (bins[0] / 6.0);
+                Y += ((double) carry(0)) * ((double) bins[0] / 6.0);
                 for (i = 1; i < FOLD; i++)
                 {
-                    Y += carry(i * inccarX) * (bins[i] / 6.0);
-                    Y += (primary((i - 1) * incpriX) - bins[i - 1]);
+                    Y += ((double) carry(i * inccarX)) *
+                        ((double) bins[i] / 6.0);
+                    Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]);
                 }
-                Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]);
+                Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]);
             }
             return Y;
         }
@@ -627,7 +681,7 @@ namespace hpx::parallel::detail::rfa {
             if (ISNANINF(primary(0)))
                 return primary(0);
             if (ISZERO(primary(0)))
-                return 0.0;
+                return 0.0f;
 
             //Note that the following order of summation is in order of decreasing
             //exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and
@@ -636,20 +690,22 @@ namespace hpx::parallel::detail::rfa {
             const auto* const bins = binned_bins(X_index);
             if (X_index == 0)
             {
-                Y += (double) carry(0) * (double) (bins[0] / 6.0) *
+                Y += (double) carry(0) * (double) (((double) bins[0]) / 6.0) *
                     (double) EXPANSION;
-                Y += (double) carry(inccarX) * (double) (bins[1] / 6.0);
+                Y += (double) carry(inccarX) *
+                    (double) (((double) bins[1]) / 6.0);
                 Y += (double) (primary(0) - bins[0]) * (double) EXPANSION;
                 i = 2;
             }
             else
             {
-                Y += (double) carry(0) * (double) (bins[0] / 6.0);
+                Y += (double) carry(0) * (double) (((double) bins[0]) / 6.0);
                 i = 1;
             }
             for (; i < FOLD; i++)
             {
-                Y += (double) carry(i * inccarX) * (double) (bins[i] / 6.0);
+                Y += (double) carry(i * inccarX) *
+                    (double) (((double) bins[i]) / 6.0);
                 Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]);
             }
             Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]);
@@ -694,8 +750,10 @@ namespace hpx::parallel::detail::rfa {
             if (shift > 0)
             {
                 const auto* const bins = binned_bins(Y_index);
-                //shift Y upwards and add X to Y
-#pragma unroll
+//shift Y upwards and add X to Y
+#if !defined(HPX_CLANG_VERSION)
+                HPX_UNROLL
+#endif
                 for (int i = FOLD - 1; i >= 1; i--)
                 {
                     if (i < shift)
@@ -705,7 +763,9 @@ namespace hpx::parallel::detail::rfa {
                     carry(i * inccarY) =
                         x.carry(i * inccarX) + carry((i - shift) * inccarY);
                 }
-#pragma unroll
+#if !defined(HPX_CLANG_VERSION)
+                HPX_UNROLL
+#endif
                 for (int i = 0; i < FOLD; i++)
                 {
                     if (i == shift)
@@ -717,8 +777,10 @@ namespace hpx::parallel::detail::rfa {
             else if (shift < 0)
             {
                 const auto* const bins = binned_bins(X_index);
-                //shift X upwards and add X to Y
-#pragma unroll
+//shift X upwards and add X to Y
+#if !defined(HPX_CLANG_VERSION)
+                HPX_UNROLL
+#endif
                 for (int i = 0; i < FOLD; i++)
                 {
                     if (i < -shift)
@@ -731,8 +793,10 @@ namespace hpx::parallel::detail::rfa {
             else if (shift == 0)
             {
                 const auto* const bins = binned_bins(X_index);
-                // add X to Y
-#pragma unroll
+// add X to Y
+#if !defined(HPX_CLANG_VERSION)
+                HPX_UNROLL
+#endif
                 for (int i = 0; i < FOLD; i++)
                 {
                     primary(i * incpriY) += x.primary(i * incpriX) - bins[i];
@@ -771,7 +835,7 @@ namespace hpx::parallel::detail::rfa {
         }
 
         ///Return the endurance of the binned fp
-        constexpr int endurance() const
+        constexpr size_t endurance() const
         {
             return ENDURANCE;
         }
@@ -867,11 +931,11 @@ namespace hpx::parallel::detail::rfa {
         {
             if (std::is_same_v<ftype, float>)
             {
-                return binned_conv_single(1, 1);
+                return static_cast<ftype>(binned_conv_single(1, 1));
             }
             else
             {
-                return binned_conv_double(1, 1);
+                return static_cast<ftype>(binned_conv_double(1, 1));
             }
         }
 
@@ -888,7 +952,8 @@ namespace hpx::parallel::detail::rfa {
         {
             const double X = std::abs(max_abs_val);
             const double S = std::abs(binned_sum);
-            return static_cast<ftype>(max(X, std::ldexp(0.5, MIN_EXP - 1)) *
+            return static_cast<ftype>(
+                (std::max)(X, std::ldexp(0.5, MIN_EXP - 1)) *
                     std::ldexp(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N +
                 ((7.0 * EPSILON) /
                     (1.0 - 6.0 * std::sqrt(static_cast<double>(EPSILON)) -
@@ -973,7 +1038,7 @@ namespace hpx::parallel::detail::rfa {
             T max_abs_val = input[0];
             for (size_t i = 0; i < N; i++)
             {
-                max_abs_val = max(max_abs_val, std::abs(input[i]));
+                max_abs_val = (std::max)(max_abs_val, std::abs(input[i]));
             }
             add(input, N, max_abs_val);
         }
@@ -1142,4 +1207,4 @@ namespace hpx::parallel::detail::rfa {
         }
     };
 
-}    // namespace hpx::parallel::detail::rfa
\ No newline at end of file
+}    // namespace hpx::parallel::detail::rfa
diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
index 92dd2e7f3dc2..87e050b999df 100644
--- a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
+++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
@@ -4,8 +4,6 @@
 //  Distributed under the Boost Software License, Version 1.0. (See accompanying
 //  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 
-#pragma once
-
 #include <hpx/init.hpp>
 #include <hpx/modules/testing.hpp>
 #include <hpx/parallel/algorithms/detail/rfa.hpp>
@@ -19,6 +17,7 @@
 #include <limits>
 #include <numeric>
 #include <random>
+#include <string>
 #include <vector>
 
 #include "test_utils.hpp"
@@ -27,11 +26,12 @@ int seed = std::random_device{}();
 std::mt19937 gen(seed);
 
 template <typename T>
-T get_rand(
-    T LO = std::numeric_limits<T>::min(), T HI = std::numeric_limits<T>::max())
+T get_rand(T LO = (std::numeric_limits<T>::min)(),
+    T HI = (std::numeric_limits<T>::max)())
 {
     return LO +
-        static_cast<T>(std::rand()) / (static_cast<T>(RAND_MAX / (HI - LO)));
+        static_cast<T>(std::rand()) /
+        (static_cast<T>(static_cast<T>((RAND_MAX)) / (HI - LO)));
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -75,8 +75,8 @@ void test_reduce1(IteratorTag)
     FloatTypeNonDeterministic r3 = std::accumulate(
         nondeterministic.begin(), nondeterministic.end(), val_non_det);
 
-    HPX_TEST_EQ(r1, r3);
-    HPX_TEST_EQ(r2, r3);
+    HPX_TEST_EQ(static_cast<FloatTypeNonDeterministic>(r1), r3);
+    HPX_TEST_EQ(static_cast<FloatTypeNonDeterministic>(r2), r3);
 }
 
 template <typename IteratorTag, typename FloatTypeDeterministic,