From b2a656b9739a8d5dcb6351f427cf731eb89e3fd2 Mon Sep 17 00:00:00 2001
From: Thomas Gazzola <thomas.gazzola@total.com>
Date: Wed, 28 Jul 2021 16:46:26 -0700
Subject: [PATCH 1/2] Kahan 2x2 determinant computation reduces numerical
 imprecision.

---
 src/fixedSizeSquareMatrixOpsImpl.hpp | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/fixedSizeSquareMatrixOpsImpl.hpp b/src/fixedSizeSquareMatrixOpsImpl.hpp
index f006d866..2afa8367 100644
--- a/src/fixedSizeSquareMatrixOpsImpl.hpp
+++ b/src/fixedSizeSquareMatrixOpsImpl.hpp
@@ -88,7 +88,18 @@ struct SquareMatrixOps< 2 >
   static auto determinant( MATRIX const & matrix )
   {
     checkSizes< 2, 2 >( matrix );
-    return matrix[ 0 ][ 0 ] * matrix[ 1 ][ 1 ] - matrix[ 0 ][ 1 ] * matrix[ 1 ][ 0 ];
+
+    // Aliases for matrix [[a, b],[c, d]] for improved readability
+    auto const & a = matrix[0][0];
+    auto const & b = matrix[0][1];
+    auto const & c = matrix[1][0];
+    auto const & d = matrix[1][1];
+
+    // Using the more precise Kahan method to compute the 2x2 determinant.
+    auto const w = b * c;
+    auto const e = fma( -b, c, w );
+    auto const f = fma( a, d, -w );
+    return f + e;
   }
 
   /**

From 96b1434af55c1f650a035a890a9625acdfd83f42 Mon Sep 17 00:00:00 2001
From: Thomas Gazzola <thomas.gazzola@total.com>
Date: Thu, 29 Jul 2021 14:49:59 -0700
Subject: [PATCH 2/2] Defining the fma function in LvArray::math. I've built an
 integer dummy fallback in order to prevent integers to be cast to double (I
 had issues).

---
 src/fixedSizeSquareMatrixOpsImpl.hpp | 12 ++------
 src/math.hpp                         | 42 +++++++++++++++++++++++++++-
 2 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/src/fixedSizeSquareMatrixOpsImpl.hpp b/src/fixedSizeSquareMatrixOpsImpl.hpp
index 2afa8367..21d86e0d 100644
--- a/src/fixedSizeSquareMatrixOpsImpl.hpp
+++ b/src/fixedSizeSquareMatrixOpsImpl.hpp
@@ -89,17 +89,9 @@ struct SquareMatrixOps< 2 >
   {
     checkSizes< 2, 2 >( matrix );
 
-    // Aliases for matrix [[a, b],[c, d]] for improved readability
-    auto const & a = matrix[0][0];
-    auto const & b = matrix[0][1];
-    auto const & c = matrix[1][0];
-    auto const & d = matrix[1][1];
-
     // Using the more precise Kahan method to compute the 2x2 determinant.
-    auto const w = b * c;
-    auto const e = fma( -b, c, w );
-    auto const f = fma( a, d, -w );
-    return f + e;
+    auto const tmp = matrix[0][1] * matrix[1][0];
+    return math::fma( -matrix[0][1], matrix[1][0], tmp ) + math::fma( matrix[0][0], matrix[1][1], -tmp );
   }
 
   /**
diff --git a/src/math.hpp b/src/math.hpp
index f832e0fa..faea4f18 100644
--- a/src/math.hpp
+++ b/src/math.hpp
@@ -28,7 +28,7 @@ namespace LvArray
 {
 
 /**
- * @brief Contains protable wrappers around cmath functions and some cuda specific functions.
+ * @brief Contains portable wrappers around cmath functions and some cuda specific functions.
  */
 namespace math
 {
@@ -317,6 +317,46 @@ max( T const a, T const b )
 #endif
 }
 
+/**
+ * @return Returns @p x * @p y + @p z.
+ * @tparam T A floating point type.
+ * @param x The first number.
+ * @param y The second number.
+ * @param z The third number.
+ * @note fma stands for fused multiply add.
+ *
+ * The function computes the result without losing precision in any intermediate result.
+ * Integer arguments cast to double.
+ * We want to avoid that by providing a version dedicated to integers.
+ */
+template< typename T >
+LVARRAY_HOST_DEVICE inline constexpr
+std::enable_if_t< std::is_floating_point< T >::value, T >
+fma( T const x, T const y, T const z )
+{
+#if defined(__CUDA_ARCH__)
+  return ::fma( x, y, z );
+#else
+  return std::fma( x, y, z );
+#endif
+}
+
+/**
+ * @return Returns @p x * @p y + @p z.
+ * @tparam T A floating point type.
+ * @param x The first number.
+ * @param y The second number.
+ * @param z The third number.
+ * @note This is a dummy implementation for integers (in order to not cast to doubles).
+ */
+template< typename T >
+LVARRAY_HOST_DEVICE inline constexpr
+std::enable_if_t< std::is_integral< T >::value, T >
+fma( T const x, T const y, T const z )
+{
+  return x * y + z;
+}
+
 #if defined( LVARRAY_USE_CUDA )
 
 /// @copydoc max( T, T )