NEON/AVX SIMD library, in a single header file.
This is a multiplatform, "vector size agnostic" SIMD intrinsics library. There are already libraries that translate intrinsics, SSE2Neon for example, but the idea behind this library is a little different: with the same code, use 256-bit AVX on my Intel-based computer and 128-bit NEON on my M1 Mac.
- extra/simd_math.h : math functions (sin/cos, for example)
- extra/simd_2d_collision.h : 2D collision functions, with batched collision tests to make full use of the SIMD registers
A simple function to compute an AABB (axis-aligned bounding box) from a list of points. Not the most optimized implementation, but it works with any SIMD vector width:
#include "simd.h"
typedef struct {float x, y, z;} point;
void simd_compute_aabb(const point* points, int num_points, point* aabb_min, point* aabb_max)
{
int num_vec = num_points / simd_vector_width;
int remaining_points = num_points - (num_vec * simd_vector_width);
simd_vector min_x = simd_splat(FLT_MAX); simd_vector min_y = min_x; simd_vector min_z = min_x;
simd_vector max_x = simd_splat(-FLT_MAX); simd_vector max_y = max_x; simd_vector max_z = max_x;
for(int i=0; i<num_vec; ++i)
{
simd_vector x, y, z;
simd_load_xyz_unorder((const float*)points, &x, &y, &z);
min_x = simd_min(min_x, x);
min_y = simd_min(min_y, y);
min_z = simd_min(min_z, z);
max_x = simd_max(max_x, x);
max_y = simd_max(max_y, y);
max_z = simd_max(max_z, z);
points += simd_vector_width;
}
aabb_min->x = simd_hmin(min_x);
aabb_min->y = simd_hmin(min_y);
aabb_min->z = simd_hmin(min_z);
aabb_max->x = simd_hmax(max_x);
aabb_max->y = simd_hmax(max_y);
aabb_max->z = simd_hmax(max_z);
for(int i=0; i<remaining_points; ++i)
{
aabb_min->x = min(aabb_min->x, points[i].x);
aabb_min->y = min(aabb_min->y, points[i].y);
aabb_min->z = min(aabb_min->z, points[i].z);
aabb_max->x = max(aabb_max->x, points[i].x);
aabb_max->y = max(aabb_max->y, points[i].y);
aabb_max->z = max(aabb_max->z, points[i].z);
}
}
The idea of the library is to not assume a specific SIMD vector width (4 for SSE/NEON, 8 for AVX, and so on) but to use the simd_vector_width variable instead. As a result, the library does not contain lane-shuffling functions, and horizontal operations (which are not optimal anyway) are limited to a few whole-vector reductions. The API is based on AVX and NEON; it is not a direct AVX-to-NEON translation or vice versa.
simd_vector is a typedef for the platform's native SIMD vector type (AVX or NEON).
// returns a vector loaded from the pointer [array] of floats
simd_vector simd_load(const float* array);
// loads a partial array of [count] floats, fills the rest with [unload_value]
simd_vector simd_load_partial(const float* array, int count, float unload_value);
// stores a vector [a] at the pointer [array]
void simd_store(float* array, simd_vector a);
// stores [count] floats from vector [a] at pointer [array]
void simd_store_partial(float* array, simd_vector a, int count);
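For example, here is a small sketch (not part of the library; scale_array is just an illustrative name) that uses the partial load/store pair to handle the tail of an array instead of a scalar loop:

#include "simd.h"

// multiplies every float in [data] by [factor], handling the tail with partial load/store
void scale_array(float* data, int count, float factor)
{
    simd_vector vfactor = simd_splat(factor);
    int i = 0;
    for(; i + simd_vector_width <= count; i += simd_vector_width)
        simd_store(data + i, simd_mul(simd_load(data + i), vfactor));

    int remaining = count - i;
    if (remaining > 0)
    {
        // lanes beyond [remaining] are filled with 0.f and simply not written back
        simd_vector v = simd_mul(simd_load_partial(data + i, remaining, 0.f), vfactor);
        simd_store_partial(data + i, v, remaining);
    }
}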
// loads 2-channel data from [array] and deinterleaves it into [x] and [y].
// reads simd_vector_width*2 floats. preserves order.
void simd_load_xy(const float* array, simd_vector* x, simd_vector* y);
// loads 2-channel data from [array] and deinterleaves it into [x] and [y].
// reads simd_vector_width*2 floats. does not preserve order; faster on AVX than the ordered version.
void simd_load_xy_unorder(const float* array, simd_vector* x, simd_vector* y);
// loads 3-channel data from [array] and deinterleaves it into [x], [y] and [z].
// reads simd_vector_width*3 floats. preserves order.
void simd_load_xyz(const float* array, simd_vector* x, simd_vector* y, simd_vector* z);
// loads 3-channel data from [array] and deinterleaves it into [x], [y] and [z].
// reads simd_vector_width*3 floats. does not preserve order; faster on AVX than the ordered version.
void simd_load_xyz_unorder(const float* array, simd_vector* x, simd_vector* y, simd_vector* z);
// loads 4-channel data from [array] and deinterleaves it into [x], [y], [z] and [w].
// reads simd_vector_width*4 floats. preserves order.
void simd_load_xyzw(const float* array, simd_vector* x, simd_vector* y, simd_vector* z, simd_vector* w);
// loads 4-channel data from [array] and deinterleaves it into [x], [y], [z] and [w].
// reads simd_vector_width*4 floats. does not preserve order; faster on AVX than the ordered version.
void simd_load_xyzw_unorder(const float* array, simd_vector* x, simd_vector* y, simd_vector* z, simd_vector* w);
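For example, a minimal sketch (vec2 and vec2_lengths are illustrative, not part of the library) using the ordered 2-channel load to compute the length of interleaved 2D vectors:

#include "simd.h"

typedef struct {float x, y;} vec2;

// writes the length of each 2D vector into [lengths]
// assumes [count] is a multiple of simd_vector_width to keep the sketch short
void vec2_lengths(const vec2* vectors, int count, float* lengths)
{
    for(int i=0; i<count; i += simd_vector_width)
    {
        simd_vector x, y;
        // ordered load so that lengths[i+j] matches vectors[i+j]
        simd_load_xy((const float*)(vectors + i), &x, &y);
        simd_vector len = simd_sqrt(simd_fmad(x, x, simd_mul(y, y)));
        simd_store(lengths + i, len);
    }
}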
// returns a vector with all lanes set to [value]
simd_vector simd_splat(float value);
// returns a vector with all lanes set to zero
simd_vector simd_splat_zero(void);
// exports a vector to int16_t; the floats must already be in the range [-32768.f; 32767.f]
void simd_export_int16(simd_vector input, int16_t* output);
// exports 4 vectors to int8_t; the floats must already be in the range [-128.f; 127.f]
void simd_export_int8(simd_vector a, simd_vector b, simd_vector c, simd_vector d, int8_t* output);
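For example, a small sketch of the int16_t export (quantize_to_int16 is an illustrative name, and it assumes the export writes simd_vector_width values per call), scaling floats already normalized to [-1;1] into the expected range first:

#include <stdint.h>
#include "simd.h"

// quantizes [count] floats in [-1;1] to int16_t
// assumes [count] is a multiple of simd_vector_width
void quantize_to_int16(const float* input, int count, int16_t* output)
{
    simd_vector scale = simd_splat(32767.f);
    for(int i=0; i<count; i += simd_vector_width)
    {
        // bring the values into the range expected by the export
        simd_vector v = simd_mul(simd_load(input + i), scale);
        simd_export_int16(v, output + i);
    }
}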
// returns a+b
simd_vector simd_add(simd_vector a, simd_vector b);
// returns a-b
simd_vector simd_sub(simd_vector a, simd_vector b);
// returns a*b
simd_vector simd_mul(simd_vector a, simd_vector b);
// returns a/b
simd_vector simd_div(simd_vector a, simd_vector b);
// returns 1/a
simd_vector simd_rcp(simd_vector a);
// returns 1/sqrt(a)
simd_vector simd_rsqrt(simd_vector a);
// returns sqrt(a)
simd_vector simd_sqrt(simd_vector a);
// returns the absolute value of a
simd_vector simd_abs(simd_vector a);
// returns a*b+c (fused multiply-add)
// with AVX it is emulated if the __FMA__ macro is not defined
simd_vector simd_fmad(simd_vector a, simd_vector b, simd_vector c);
// returns -a
simd_vector simd_neg(simd_vector a);
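For example, a sketch (normalize_vectors is illustrative, not part of the library) that normalizes 3D vectors stored as separate x/y/z arrays using the fused multiply-add and the reciprocal square root:

#include "simd.h"

// normalizes 3D vectors stored as separate x/y/z arrays (structure of arrays)
// assumes [count] is a multiple of simd_vector_width and no zero-length vectors
void normalize_vectors(float* x, float* y, float* z, int count)
{
    for(int i=0; i<count; i += simd_vector_width)
    {
        simd_vector vx = simd_load(x + i);
        simd_vector vy = simd_load(y + i);
        simd_vector vz = simd_load(z + i);

        // squared length via fused multiply-add, then reciprocal square root
        simd_vector sq_len = simd_fmad(vx, vx, simd_fmad(vy, vy, simd_mul(vz, vz)));
        simd_vector inv_len = simd_rsqrt(sq_len);

        simd_store(x + i, simd_mul(vx, inv_len));
        simd_store(y + i, simd_mul(vy, inv_len));
        simd_store(z + i, simd_mul(vz, inv_len));
    }
}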
All comparison functions return a vector with the comparison result in each lane: NaN for true and zero for false. These result vectors can be used with simd_select and simd_any (see the sketch after the list below).
// greater than comparison
simd_vector simd_cmp_gt(simd_vector a, simd_vector b);
// greater or equal comparison
simd_vector simd_cmp_ge(simd_vector a, simd_vector b);
// less than comparison
simd_vector simd_cmp_lt(simd_vector a, simd_vector b);
// less or equal comparison
simd_vector simd_cmp_le(simd_vector a, simd_vector b);
// equal comparison
simd_vector simd_cmp_eq(simd_vector a, simd_vector b);
// not equal comparison
simd_vector simd_cmp_neq(simd_vector a, simd_vector b);
// returns a vector with values from [a] or [b] depending on the [mask]
// the mask can be obtained from a comparison
simd_vector simd_select(simd_vector a, simd_vector b, simd_vector mask);
// returns 1 if abs(a-b) < epsilon for all lanes, otherwise 0
int simd_equal(simd_vector a, simd_vector b, simd_vector epsilon);
// returns 1 if any of the lanes is true (NAN), otherwise 0
int simd_any(simd_vector a);
// returns 1 if all lanes are true (NAN), otherwise returns 0
int simd_all(simd_vector a);
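For example, the sketch referenced above (clamp_negative_to_zero is an illustrative helper, not part of the library; it assumes simd_select picks from [b] where the mask is true):

#include "simd.h"

// replaces negative lanes of [a] with zero and reports whether any lane was negative
simd_vector clamp_negative_to_zero(simd_vector a, int* had_negative)
{
    simd_vector zero = simd_splat_zero();
    simd_vector is_negative = simd_cmp_lt(a, zero);   // true (NaN) where a < 0, zero elsewhere
    *had_negative = simd_any(is_negative);
    // assumed semantics: select [zero] where the mask is true, keep [a] elsewhere
    return simd_select(a, zero, is_negative);
}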
// returns the fractional part of [a]
simd_vector simd_fract(simd_vector a);
// returns the largest integer not greater than [a]
simd_vector simd_floor(simd_vector a);
// returns the smallest integer not less than [a]
simd_vector simd_ceil(simd_vector a);
// returns [a] rounded to the nearest integer
simd_vector simd_round(simd_vector a);
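For example, a small sketch (snap_to_grid is illustrative, not part of the library) that snaps values to the nearest multiple of a step using simd_round:

#include "simd.h"

// snaps each lane of [a] to the nearest multiple of [step] (e.g. grid snapping)
// [step] must be non-zero
simd_vector snap_to_grid(simd_vector a, float step)
{
    simd_vector vstep = simd_splat(step);
    return simd_mul(simd_round(simd_div(a, vstep)), vstep);
}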
// returns the bitwise OR of a and b
simd_vector simd_or(simd_vector a, simd_vector b);
// returns the bitwise AND of a and b
simd_vector simd_and(simd_vector a, simd_vector b);
// returns a AND NOT b (bitwise)
simd_vector simd_andnot(simd_vector a, simd_vector b);
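For example, a sketch of a branchless ReLU (relu is illustrative, not part of the library) that combines a comparison mask with simd_and; it assumes the true mask is all bits set, which is what the underlying AVX/NEON comparisons produce:

#include "simd.h"

// keeps positive lanes of [a], zeroes the rest
// ANDing with the comparison mask either keeps the lane (all-ones mask) or clears it to 0.f
simd_vector relu(simd_vector a)
{
    return simd_and(a, simd_cmp_gt(a, simd_splat_zero()));
}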
// returns the minimum value of all lanes in the vector
float simd_hmin(simd_vector a);
// returns the maximum value of all lanes in the vector
float simd_hmax(simd_vector a);
// returns the sum of all lanes
float simd_hsum(simd_vector a);
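For example, a sketch of a dot product (dot_product is illustrative, not part of the library) that accumulates in vector lanes and does a single horizontal reduction at the end:

#include "simd.h"

// returns the dot product of two float arrays of arbitrary length
float dot_product(const float* a, const float* b, int count)
{
    simd_vector sum = simd_splat_zero();
    int i = 0;
    for(; i + simd_vector_width <= count; i += simd_vector_width)
        sum = simd_fmad(simd_load(a + i), simd_load(b + i), sum);

    // handle the tail with partial loads, padding with 0.f so it does not change the sum
    int remaining = count - i;
    if (remaining > 0)
        sum = simd_fmad(simd_load_partial(a + i, remaining, 0.f),
                        simd_load_partial(b + i, remaining, 0.f), sum);

    // single horizontal sum at the very end
    return simd_hsum(sum);
}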
// returns a sorted vector in ascending order
simd_vector simd_sort(simd_vector input);
// reverses the order of the vector
simd_vector simd_reverse(simd_vector a);
// returns a vector with minimum values from a and b
simd_vector simd_min(simd_vector a, simd_vector b);
// returns a vector with maximum values from a and b
simd_vector simd_max(simd_vector a, simd_vector b);
// returns a vector with values clamped between [range_min] and [range_max]
simd_vector simd_clamp(simd_vector a, simd_vector range_min, simd_vector range_max);
// returns [a] with values clamped to [0;1]
simd_vector simd_saturate(simd_vector a);
// returns the linear interpolation between [a] and [b], with [t] values in the [0;1] range
simd_vector simd_lerp(simd_vector a, simd_vector b, simd_vector t);
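For example, a sketch (blend_arrays is illustrative, not part of the library) that blends two arrays with per-element factors clamped to [0;1]:

#include "simd.h"

// blends [a] and [b] into [out] using per-element factors [t], clamped to [0;1]
// assumes [count] is a multiple of simd_vector_width to keep the sketch short
void blend_arrays(const float* a, const float* b, const float* t, float* out, int count)
{
    for(int i=0; i<count; i += simd_vector_width)
    {
        simd_vector factor = simd_saturate(simd_load(t + i));
        simd_vector result = simd_lerp(simd_load(a + i), simd_load(b + i), factor);
        simd_store(out + i, result);
    }
}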