Merge pull request #3935 from kif/3934_opencl_order_fp_operations

Fix order of floating point operations
silx-kit · Sep 29, 2023 · 4bd043f · 4bd043f
2 parents 6949b8b + cef3ab6
commit 4bd043f
Showing 1 changed file with 7 additions and 0 deletions.
diff --git a/src/silx/resources/opencl/doubleword.cl b/src/silx/resources/opencl/doubleword.cl
@@ -29,6 +29,7 @@
  *
  * We use the trick to declare some variable "volatile" to enforce the actual
  * precision reduction of those variables.
+ * This has to be used in combination with #pragma clang fp contract(on)
 */
 
 #ifndef X87_VOLATILE
@@ -37,6 +38,7 @@
 
 //Algorithm 1, p23, theorem 1.1.12. Requires e_x > e_y, valid if |x| > |y| 
 inline fp2 fast_fp_plus_fp(fp x, fp y){
+    #pragma clang fp contract(on)
     X87_VOLATILE fp s = x + y;
     X87_VOLATILE fp z = s - x;
     fp e = y - z;
@@ -45,6 +47,7 @@ inline fp2 fast_fp_plus_fp(fp x, fp y){
 
 //Algorithm 2, p24, same as fast_fp_plus_fp without the condition on e_x and e_y
 inline fp2 fp_plus_fp(fp x, fp y){
+    #pragma clang fp contract(on)
     X87_VOLATILE fp s = x + y;
     X87_VOLATILE fp xp = s - y;
     X87_VOLATILE fp yp = s - xp;
@@ -62,6 +65,7 @@ inline fp2 fp_times_fp(fp x, fp y){
 
 //Algorithm 7, p38: Addition of a FP to a DW. 10flop bounds:2u²+5u³
 inline fp2 dw_plus_fp(fp2 x, fp y){
+    #pragma clang fp contract(on)
     fp2 s = fp_plus_fp(x.s0, y);
     X87_VOLATILE fp v = x.s1 + s.s1;
     return fast_fp_plus_fp(s.s0, v);
@@ -83,13 +87,15 @@ inline fp2 dw_times_fp(fp2 x, fp y){
 
 //Algorithm 14, p52: Multiplication DW*DW, 8 flops bounds:6u²
 inline fp2 dw_times_dw(fp2 x, fp2 y){
+    #pragma clang fp contract(on)
     fp2 c = fp_times_fp(x.s0, y.s0);
     X87_VOLATILE fp l = fma(x.s1, y.s0, x.s0 * y.s1);
     return fast_fp_plus_fp(c.s0, c.s1 + l);
 }
 
 //Algorithm 17, p55: Division DW / FP, 10flops bounds: 3.5u²
 inline fp2 dw_div_fp(fp2 x, fp y){
+    #pragma clang fp contract(on)
     X87_VOLATILE fp th = x.s0 / y;
     fp2 pi = fp_times_fp(th, y);
     fp2 d = x - pi;
@@ -100,6 +106,7 @@ inline fp2 dw_div_fp(fp2 x, fp y){
 
 //Derived from algorithm 20, p64: Inversion 1/ DW, 22 flops
 inline fp2 inv_dw(fp2 y){
+    #pragma clang fp contract(on)
     X87_VOLATILE fp th = one/y.s0;
     X87_VOLATILE fp rh = fma(-y.s0, th, one);
     X87_VOLATILE fp rl = -y.s1 * th;