Skip to content

Commit

Permalink
Merge pull request #269 from ebassi/madd-avx
Browse files Browse the repository at this point in the history
Add AVX implementation of graphene_simd4f_madd()
  • Loading branch information
ebassi authored Aug 15, 2024
2 parents e44377a + b185f55 commit db2b756
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 20 deletions.
10 changes: 10 additions & 0 deletions include/graphene-config.h.meson
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ extern "C" {
# if defined(GRAPHENE_USE_SSE)
# include <xmmintrin.h>
# include <emmintrin.h>
#
# if defined(_M_IX86_FP)
# if _M_IX86_FP >= 2
# define GRAPHENE_USE_SSE4_1
Expand All @@ -67,9 +68,18 @@ extern "C" {
# elif defined(_MSC_VER)
# define GRAPHENE_USE_SSE4_1
# endif
#
/* Enable the AVX code paths when the compiler is targeting AVX; __AVX__ is
 * predefined by the compiler in that case. */
# if defined(__AVX__)
#  define GRAPHENE_USE_AVX
# endif
#
# if defined(GRAPHENE_USE_SSE4_1)
# include <smmintrin.h>
# endif
#
# if defined(GRAPHENE_USE_AVX)
# include <immintrin.h>
# endif
/* SSE backend: graphene_simd4f_t is an alias of the compiler's __m128 type */
typedef __m128 graphene_simd4f_t;
# elif defined(GRAPHENE_USE_ARM_NEON)
# if defined (_MSC_VER) && (_MSC_VER < 1920) && defined (_M_ARM64)
Expand Down
73 changes: 53 additions & 20 deletions include/graphene-simd4f.h
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,11 @@ graphene_simd4f_t graphene_simd4f_ceil (const graphene_simd4f_t
GRAPHENE_AVAILABLE_IN_1_12
graphene_simd4f_t graphene_simd4f_floor (const graphene_simd4f_t s);

/* Multiply-add: the result is (a * b) + c. Public API, available since 1.0. */
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_madd (const graphene_simd4f_t a,
const graphene_simd4f_t b,
const graphene_simd4f_t c);

#if !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_SSE)

/* SSE2 implementation of SIMD 4f */
Expand Down Expand Up @@ -504,6 +509,18 @@ typedef GRAPHENE_ALIGN16 union {
}))
# endif

/* graphene_simd4f_madd(a,b,c) evaluates to (a * b) + c.
 *
 * _mm_fmadd_ps() is an FMA intrinsic, and FMA is a separate instruction set
 * extension from AVX: targeting AVX alone does not make it available. Use it
 * only when the compiler also reports FMA support through __FMA__; otherwise
 * fall back to a separate SSE multiply followed by an add.
 */
# if defined(GRAPHENE_USE_AVX) && defined(__FMA__)
#  define graphene_simd4f_madd(a,b,c) \
  (__extension__ ({ \
    (graphene_simd4f_t) _mm_fmadd_ps ((a), (b), (c)); \
  }))
# else
#  define graphene_simd4f_madd(a,b,c) \
  (__extension__ ({ \
    (graphene_simd4f_t) _mm_add_ps (_mm_mul_ps ((a), (b)), (c)); \
  }))
# endif

/* On MSVC, we use static inlines */
# elif defined (_MSC_VER) /* Visual Studio SSE intrinsics */

Expand Down Expand Up @@ -835,6 +852,20 @@ _simd4f_floor (const graphene_simd4f_t s)
#endif
}

#define graphene_simd4f_madd(a,b,c) _simd4f_madd(a,b,c)

/* Returns (a * b) + c.
 *
 * _mm_fmadd_ps() needs the FMA extension, which AVX alone does not provide.
 * GCC-compatible front ends define __FMA__ when FMA is enabled. MSVC has no
 * __FMA__ macro, so for it __AVX2__ (set by /arch:AVX2) is used as the
 * indicator instead.
 * NOTE(review): confirm the __AVX2__ criterion matches the MSVC targets this
 * project supports.
 */
static inline graphene_simd4f_t
_simd4f_madd (const graphene_simd4f_t a,
              const graphene_simd4f_t b,
              const graphene_simd4f_t c)
{
#if defined(GRAPHENE_USE_AVX) && \
    (defined(__FMA__) || (defined(__AVX2__) && !defined(__clang__)))
  return _mm_fmadd_ps (a, b, c);
#else
  /* No FMA: plain SSE multiply, then add */
  return _mm_add_ps (_mm_mul_ps (a, b), c);
#endif
}

#else /* SSE intrinsics-not GCC or Visual Studio */

# error "Need GCC-compatible or Visual Studio compiler for SSE extensions."
Expand Down Expand Up @@ -1158,6 +1189,11 @@ typedef int graphene_simd4i_t __attribute__((vector_size (16)));
(graphene_simd4f_t) graphene_simd4f_init (__floor_x, __floor_y, __floor_z, __floor_w); \
}))

/* (a * b) + c, built from graphene_simd4f_mul() and graphene_simd4f_add()
 * and kept as one expression; no dedicated multiply-add instruction is
 * requested here. */
# define graphene_simd4f_madd(a,b,c) \
(__extension__ ({ \
(graphene_simd4f_t) graphene_simd4f_add (graphene_simd4f_mul ((a), (b)), (c)); \
}))

#elif !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_ARM_NEON)

/* ARM Neon implementation of SIMD4f */
Expand Down Expand Up @@ -1498,6 +1534,11 @@ typedef float32x2_t graphene_simd2f_t;
(graphene_simd4f_t) graphene_simd4f_init (__floor_x, __floor_y, __floor_z, __floor_w); \
}))

/* (a * b) + c, composed from this section's graphene_simd4f_mul() and
 * graphene_simd4f_add() rather than from a dedicated multiply-accumulate
 * intrinsic. */
# define graphene_simd4f_madd(a,b,c) \
(__extension__ ({ \
(graphene_simd4f_t) graphene_simd4f_add (graphene_simd4f_mul ((a), (b)), (c)); \
}))

#elif defined _MSC_VER /* Visual Studio ARM */

# define graphene_simd4f_init(x,y,z,w) _simd4f_init(x,y,z,w)
Expand Down Expand Up @@ -1840,6 +1881,16 @@ _simd4f_floor (const graphene_simd4f_t s)
return graphene_simd4f_init (__floor_x, __floor_y, __floor_z, __floor_w);
}

# define graphene_simd4f_madd(a,b,c) _simd4f_madd(a,b,c)

/* Multiply-add helper behind graphene_simd4f_madd(): returns (a * b) + c,
 * composed from graphene_simd4f_mul() and graphene_simd4f_add(). */
static inline graphene_simd4f_t
_simd4f_madd (const graphene_simd4f_t a,
              const graphene_simd4f_t b,
              const graphene_simd4f_t c)
{
  const graphene_simd4f_t product = graphene_simd4f_mul (a, b);

  return graphene_simd4f_add (product, c);
}

#else /* ARM NEON intrinsics-not GCC or Visual Studio */

# error "Need GCC-compatible or Visual Studio compiler for ARM NEON extensions."
Expand Down Expand Up @@ -1956,33 +2007,15 @@ _simd4f_floor (const graphene_simd4f_t s)
(graphene_simd4f_ceil ((s)))
/* Each macro forwards to the function of the same name: a function-like
 * macro is not expanded again inside its own replacement, so the inner
 * identifier refers to the real function. */
#define graphene_simd4f_floor(s) \
(graphene_simd4f_floor ((s)))
#define graphene_simd4f_madd(a,b,c) \
(graphene_simd4f_madd ((a), (b), (c)))

#else
# error "Unsupported simd4f implementation."
#endif

/* Generic operations, inlined */

/**
 * graphene_simd4f_madd:
 * @m1: a #graphene_simd4f_t
 * @m2: a #graphene_simd4f_t
 * @a: a #graphene_simd4f_t
 *
 * Adds @a to the product of @m1 and @m2, i.e. computes `(m1 * m2) + a`
 * using graphene_simd4f_mul() and graphene_simd4f_add().
 *
 * Returns: the result vector
 *
 * Since: 1.0
 */
/* NOTE(review): the backend sections visible earlier also define
 * graphene_simd4f_madd as a macro; confirm this generic definition is not
 * compiled together with them. */
static inline graphene_simd4f_t
graphene_simd4f_madd (const graphene_simd4f_t m1,
const graphene_simd4f_t m2,
const graphene_simd4f_t a)
{
/* Multiply first, then add the third operand to the product */
return graphene_simd4f_add (graphene_simd4f_mul (m1, m2), a);
}

/**
* graphene_simd4f_sum:
* @v: a #graphene_simd4f_t
Expand Down
28 changes: 28 additions & 0 deletions src/graphene-simd4f.c
Original file line number Diff line number Diff line change
Expand Up @@ -1073,6 +1073,26 @@ graphene_simd4f_t
return graphene_simd4f_floor (s);
}

/**
 * graphene_simd4f_madd:
 * @a: a #graphene_simd4f_t
 * @b: a #graphene_simd4f_t
 * @c: a #graphene_simd4f_t
 *
 * Adds @c to the product of @a and @b.
 *
 * Returns: the result vector
 *
 * Since: 1.0
 */
graphene_simd4f_t
(graphene_simd4f_madd) (const graphene_simd4f_t a,
const graphene_simd4f_t b,
const graphene_simd4f_t c)
{
/* The parenthesized name in the definition above is not followed directly
 * by '(' and so is not treated as a function-like macro invocation; the
 * call below is, presumably resolving to the header's
 * graphene_simd4f_madd() macro. */
return graphene_simd4f_madd (a, b, c);
}

#else /* GRAPHENE_USE_SCALAR */

graphene_simd4f_t
Expand Down Expand Up @@ -1516,4 +1536,12 @@ graphene_simd4f_t
return graphene_simd4f_init (floorf (s.x), floorf (s.y), floorf (s.z), floorf (s.w));
}

graphene_simd4f_t
(graphene_simd4f_madd) (const graphene_simd4f_t a,
const graphene_simd4f_t b,
const graphene_simd4f_t c)
{
return graphene_simd4f_add (graphene_simd4f_mul (a, b), c);
}

#endif /* GRAPHENE_USE_SCALAR */

0 comments on commit db2b756

Please sign in to comment.