Skip to content

Commit

Permalink
upgraded approximations
Browse files Browse the repository at this point in the history
  • Loading branch information
Geolm committed Jan 29, 2024
1 parent fed9464 commit a8049cd
Show file tree
Hide file tree
Showing 4 changed files with 107 additions and 35 deletions.
77 changes: 74 additions & 3 deletions extra/simd_approx_math.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,20 @@ static simd_vector simd_approx_cos(simd_vector a);
// max error : 2.682209015e-07, ~2.5x faster than simd_sin
static simd_vector simd_approx_sin(simd_vector a);

// max error : 0.000068, ~2.8x faster than simd_acos
// max error : 6.520748138e-05, ~2.8x faster than simd_acos
static simd_vector simd_approx_acos(simd_vector x);

// max relative error : 0.001726, ~3.7x faster than simd_exp
// max error : 6.520736497e-05, ~2.8x faster than simd_acos
static simd_vector simd_approx_asin(simd_vector x);

// max relative error : 1.727835275e-03, ~3.7x faster than simd_exp
static simd_vector simd_approx_exp(simd_vector x);

// max error : 2.321995254e-07
static simd_vector simd_approx_exp2(simd_vector x);

static simd_vector simd_approx_log2(simd_vector x);


//----------------------------------------------------------------------------------------------------------------------
// range reduction from hlslpp, polynomial computed with lolremez
Expand Down Expand Up @@ -83,7 +91,70 @@ static inline simd_vector simd_approx_acos(simd_vector x)
}

//----------------------------------------------------------------------------------------------------------------------
// based on https://stackoverflow.com/questions/47025373/fastest-implementation-of-the-natural-exponential-function-using-sse
// based on https://developer.download.nvidia.com/cg/asin.html
static inline simd_vector simd_approx_asin(simd_vector x)
{
simd_vector negate = simd_select(simd_splat_zero(), simd_splat(1.f), simd_cmp_lt(x, simd_splat_zero()));
x = simd_abs(x);
simd_vector result = simd_polynomial4(x, (float[]){-0.0187293f, 0.0742610f, -0.2121144f, 1.5707288f});
result = simd_sub(simd_splat(SIMD_MATH_PI2), simd_mul(simd_sqrt(simd_sub(simd_splat(1.f), x)), result));
return simd_sub(result, simd_mul(simd_mul(simd_splat(2.f), result), negate));
}

//----------------------------------------------------------------------------------------------------------------------
// based on https://github.com/redorav/hlslpp/blob/master/include/hlsl%2B%2B_vector_float8.h
static simd_vector simd_approx_exp2(simd_vector x)
{
simd_vector invalid_mask = simd_isnan(x);
simd_vector input_is_infinity = simd_cmp_eq(x, simd_splat_positive_infinity());
simd_vector equal_to_zero = simd_cmp_eq(x, simd_splat_zero());
simd_vector one = simd_splat(1.f);

// clamp values
x = simd_clamp(x, simd_splat(-127.f), simd_splat(127.f));

simd_vector ipart = simd_floor(x);
simd_vector fpart = simd_sub(x, ipart);

simd_vectori i = simd_shift_left_i(simd_add_i(simd_convert_from_float(ipart), simd_splat_i(127)), 23);
simd_vector expipart = simd_cast_from_int(i);

// minimax polynomial fit of 2^x, in range [-0.5, 0.5[
simd_vector expfpart = simd_polynomial6(fpart, (float[]) {1.8775767e-3f, 8.9893397e-3f, 5.5826318e-2f, 2.4015361e-1f, 6.9315308e-1f, 1.f});

simd_vector result = simd_mul(expipart, expfpart);
result = simd_select(result, one, equal_to_zero);
result = simd_or(result, invalid_mask);
result = simd_select(result, simd_splat_positive_infinity(), input_is_infinity); // +inf arg will be +inf
return result;
}

//----------------------------------------------------------------------------------------------------------------------
// based on https://github.com/redorav/hlslpp/blob/master/include/hlsl%2B%2B_vector_float8.h
static simd_vector simd_approx_log2(simd_vector x)
{
simd_vector one = simd_splat(1.f);
simd_vectori i = simd_cast_from_float(x);
simd_vector e = simd_convert_from_int(simd_sub_i(simd_shift_right_i( simd_and_i(i, simd_splat_i(0x7F800000)), 23), simd_splat_i(127)));
simd_vector m = simd_or(simd_cast_from_int(simd_and_i(i, simd_splat_i(0x007FFFFF))), one);

// minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
simd_vector p = simd_polynomial6(m, (float[]) {-3.4436006e-2f, 3.1821337e-1f, -1.2315303f, 2.5988452f, -3.3241990f, 3.1157899f});

// this effectively increases the polynomial degree by one, but ensures that log2(1) == 0
simd_vector result = simd_fmad(p, simd_sub(m, one), e);

// we can't compute a logarithm beyond this value, so we'll mark it as -infinity to indicate close to 0
result = simd_select(result, simd_splat_negative_infinity(), simd_cmp_le(result, simd_splat(-127.f)));

// check for negative values and return NaN
result = simd_select(result, simd_splat_nan(), simd_cmp_lt(x, simd_splat_zero()));

return result;
}

//----------------------------------------------------------------------------------------------------------------------
// based on based on https://stackoverflow.com/questions/47025373/fastest-implementation-of-the-natural-exponential-function-using-sse
static inline simd_vector simd_approx_exp(simd_vector x)
{
simd_vector c0 = simd_splat(0.3371894346f);
Expand Down
30 changes: 11 additions & 19 deletions extra/simd_color.h
Original file line number Diff line number Diff line change
@@ -1,38 +1,30 @@
#ifndef __SIMD__COLOR__H__
#define __SIMD__COLOR__H__

#include "../simd.h"
#include "simd_math.h"

//----------------------------------------------------------------------------------------------------------------------
//
// Color manipulation functions

static simd_vector simd_approx_srgb_to_linear(simd_vector value); // max error : 0.000079
static simd_vector simd_approx_linear_to_srgb(simd_vector value); // max error : 0.003851 (enough for 8bits value)
static simd_vector simd_srgb_to_linear(simd_vector value); // max error : 4.267692566e-05
static simd_vector simd_linear_to_srgb(simd_vector value); // max error : 1.192092896e-07


//----------------------------------------------------------------------------------------------------------------------
static inline simd_vector simd_approx_linear_to_srgb(simd_vector value)
static inline simd_vector simd_linear_to_srgb(simd_vector value)
{
// range [0.000000; 0.042500]
// f(x) = 0.003309 + 12.323890*x^1 + -307.151428*x^2 + 3512.375244*x^3 + -3457.260986*x^4 + 240.709824*x^5

// Degree 5 approximation of f(x) = 1.055 * pow(x, 1/2.4) - 0.055
// on interval [ 0.042500, 1 ]
// p(x)=((((3.7378898*x-1.1148316e+1)*x+1.2821273e+1)*x-7.416023)*x+2.887472)*x+1.2067896e-1
// Estimated max error: 2.9748487e-3
simd_vector above_threshold = simd_cmp_gt(value, simd_splat(.0425f));
simd_vector result = simd_select(simd_splat(240.709824f), simd_splat(3.7378898f), above_threshold);
result = simd_fmad(result, value, simd_select(simd_splat(-3457.260986f), simd_splat(-11.148315f), above_threshold));
result = simd_fmad(result, value, simd_select(simd_splat(3512.375244f), simd_splat(12.821273f), above_threshold));
result = simd_fmad(result, value, simd_select(simd_splat(-307.151428f), simd_splat(-7.4160228f), above_threshold));
result = simd_fmad(result, value, simd_select(simd_splat(12.323890f), simd_splat(2.8874719f), above_threshold));
result = simd_fmad(result, value, simd_select(simd_splat(0.003309f), simd_splat(0.12067895f), above_threshold));
simd_vector small_value = simd_cmp_lt(value, simd_splat(0.0031308f));
simd_vector result_small = simd_mul(value, simd_splat(12.92f));
simd_vector value_1_3 = simd_cbrt(value);
simd_vector result = simd_mul(value_1_3, simd_sqrt(simd_sqrt(value_1_3)));
result = simd_fmad(simd_splat(1.055f), result, simd_splat(-0.055f));
result = simd_select(result, result_small, small_value);
return result;
}

//----------------------------------------------------------------------------------------------------------------------
static inline simd_vector simd_approx_srgb_to_linear(simd_vector value)
static inline simd_vector simd_srgb_to_linear(simd_vector value)
{
simd_vector big_value = simd_cmp_ge(value, simd_splat(0.04045f));
simd_vector result0 = simd_mul(value, simd_splat(1.f / 12.92f));
Expand Down
13 changes: 7 additions & 6 deletions tests/test_simd.c
Original file line number Diff line number Diff line change
Expand Up @@ -426,19 +426,20 @@ int main(int argc, char * argv[])
GREATEST_MAIN_BEGIN();

RUN_SUITE(load);
RUN_SUITE(interlace_deinterlace);
RUN_SUITE(export);
RUN_SUITE(rounding);
RUN_SUITE(arithmetic);
RUN_SUITE(sqrt_and_rcp);
RUN_SUITE(abs_neg_min_max);
RUN_SUITE(horizontal);
RUN_TEST(sort);
RUN_SUITE(float_extraction);
RUN_SUITE(horizontal);
RUN_SUITE(trigonometry);
RUN_SUITE(exponentiation);
RUN_SUITE(approximations);
RUN_SUITE(color_space);
RUN_SUITE(arithmetic);
RUN_SUITE(sqrt_and_rcp);
RUN_SUITE(abs_neg_min_max);
RUN_SUITE(export);
RUN_SUITE(collision_2d);
RUN_SUITE(interlace_deinterlace);

GREATEST_MAIN_END();
}
22 changes: 15 additions & 7 deletions tests/test_simd_math.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,12 +137,9 @@ SUITE(trigonometry)
printf(".");
RUN_TESTp(generic_test, sinf, simd_sin, -10.f, 10.f, FLT_EPSILON, false, "simd_sin");
RUN_TESTp(generic_test, cosf, simd_cos, -10.f, 10.f, FLT_EPSILON, false, "simd_cos");
RUN_TESTp(generic_test, sinf, simd_approx_sin, -10.f, 10.f, 2.e-06f, false, "simd_approx_sin");
RUN_TESTp(generic_test, cosf, simd_approx_cos, -10.f, 10.f, 2.e-06f, false, "simd_approx_cos");
RUN_TESTp(generic_test, acosf, simd_acos, -1.f, 1.f, 1.e-06f, false, "simd_acos");
RUN_TESTp(generic_test, asinf, simd_asin, -1.f, 1.f, 1.e-06f, false, "simd_asin");
RUN_TESTp(generic_test, atanf, simd_atan, -10.f, 10.f, 1.e-04f, false, "simd_atan");
RUN_TESTp(generic_test, acosf, simd_approx_acos, -1.f, 1.f, 1.e-04f, false, "simd_approx_arcos");
RUN_TESTp(generic_test2, atan2_xy, simd_atan2, 3.e-07f, false, "simd_atan2");
RUN_TEST(sinuscosinus);
}
Expand All @@ -151,17 +148,28 @@ SUITE(exponentiation)
{
printf(".");
RUN_TESTp(generic_test, logf, simd_log, FLT_EPSILON, 1000.f, 1.e-06f, true, "simd_log");
RUN_TESTp(generic_test, log2f, simd_log2, FLT_EPSILON, 1.e20f, 3.e-07f, true, "simd_log2");
RUN_TESTp(generic_test, log2f, simd_log2, FLT_EPSILON, 1.e20f, 3.e07f, true, "simd_log2");
RUN_TESTp(generic_test, expf, simd_exp, -87.f, 87.f, 1.e-06f, true, "simd_exp");
RUN_TESTp(generic_test, exp2f, simd_exp2, -126.f, 126.f, 2.e-07f, true, "simd_exp2");
RUN_TESTp(generic_test, expf, simd_approx_exp, -87.f, 87.f, 2.e-03f, true, "simd_approx_exp");
RUN_TESTp(generic_test, cbrtf, simd_cbrt, -100.f, 100.f, 2.e-07f, true, "simd_cbrt");
}

SUITE(approximations)
{
printf(".");
RUN_TESTp(generic_test, sinf, simd_approx_sin, -10.f, 10.f, 2.e-06f, false, "simd_approx_sin");
RUN_TESTp(generic_test, cosf, simd_approx_cos, -10.f, 10.f, 2.e-06f, false, "simd_approx_cos");
RUN_TESTp(generic_test, acosf, simd_approx_acos, -1.f, 1.f, 1.e-04f, false, "simd_approx_acos");
RUN_TESTp(generic_test, asinf, simd_approx_asin, -1.f, 1.f, 1.e-04f, false, "simd_approx_asin");
RUN_TESTp(generic_test, expf, simd_approx_exp, -87.f, 87.f, 2.e-03f, true, "simd_approx_exp");
RUN_TESTp(generic_test, exp2f, simd_approx_exp2, -126.f, 126.f, 3.e-07f, true, "simd_approx_exp2");
RUN_TESTp(generic_test, log2f, simd_approx_log2, FLT_EPSILON, 1.e20f, 3.e-07f, true, "simd_approx_log2");
}

SUITE(color_space)
{
printf(".");
RUN_TESTp(generic_test, srgb_to_linear, simd_approx_srgb_to_linear, 0.f, 1.f, 1.e-04f, false, "simd_approx_srgb_to_linear");
RUN_TESTp(generic_test, linear_to_srgb, simd_approx_linear_to_srgb, 0.f, 1.f, 4.e-03f, false, "simd_approx_linear_to_srgb");
RUN_TESTp(generic_test, srgb_to_linear, simd_srgb_to_linear, 0.f, 1.f, 1.e-04f, false, "simd_srgb_to_linear");
RUN_TESTp(generic_test, linear_to_srgb, simd_linear_to_srgb, 0.f, 1.f, 4.e-03f, false, "simd_linear_to_srgb");
}

0 comments on commit a8049cd

Please sign in to comment.