Skip to content

Commit

Permalink
Merge pull request OpenMathLib#4330 from bartoldeman/asum-init-mask
Browse files Browse the repository at this point in the history
Use _mm_set1_epi{32,64x} to init mask in x86-64 [cz]asum
  • Loading branch information
martin-frbg authored Nov 20, 2023
2 parents 864c65b + c34e2cf commit 2ea65ba
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 13 deletions.
10 changes: 4 additions & 6 deletions kernel/x86_64/casum_microk_skylakex-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@
 #ifdef __NVCOMPILER
 #define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
 #endif
-#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && (__clang_major__ >= 9 &&__clang_major__ !=17)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2309)))
+#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203))

-#if (!(defined(__NVCOMPILER) ))
-//&& NVCOMPVERS < 2309))
+#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203))

 #define HAVE_CASUM_KERNEL 1

Expand All @@ -21,15 +20,14 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x)

 if (n2 < 64) {
 __m128 accum_10, accum_11, accum_12, accum_13;
-__m128 abs_mask1 = abs_mask1;
+__m128 abs_mask1;

 accum_10 = _mm_setzero_ps();
 accum_11 = _mm_setzero_ps();
 accum_12 = _mm_setzero_ps();
 accum_13 = _mm_setzero_ps();

-abs_mask1 = (__m128)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1);
-abs_mask1 = (__m128)_mm_srli_epi32((__m128i) abs_mask1, 1);
+abs_mask1 = (__m128)_mm_set1_epi32(0x7fffffff);

 _mm_prefetch(&x1[0], _MM_HINT_T0);

Expand Down
11 changes: 4 additions & 7 deletions kernel/x86_64/zasum_microk_skylakex-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@
 #ifdef __NVCOMPILER
 #define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
 #endif
-#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && ( __clang_major__ >= 9 && __clang_major__ != 17)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2309)))
+#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203))

-#if (!(defined(__NVCOMPILER) ))
-//&& NVCOMPVERS < 2309))
+#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203))

 #define HAVE_ZASUM_KERNEL 1

Expand All @@ -22,16 +21,14 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x)

 if (n2 < 32) {
 __m128d accum_10, accum_11, accum_12, accum_13;
-__m128d abs_mask1 = abs_mask1;
+__m128d abs_mask1;

 accum_10 = _mm_setzero_pd();
 accum_11 = _mm_setzero_pd();
 accum_12 = _mm_setzero_pd();
 accum_13 = _mm_setzero_pd();

-// abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff);
-abs_mask1 = (__m128d)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1);
-abs_mask1 = (__m128d)_mm_srli_epi64((__m128i) abs_mask1, 1);
+abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff);

 _mm_prefetch(&x1[0], _MM_HINT_T0);
 if (n2 >= 16){
Expand Down

0 comments on commit 2ea65ba

Please sign in to comment.