Skip to content

Commit

Permalink
Use _mm_set1_epi{32,64x} to init mask in x86-64 [cz]asum
Browse files Browse the repository at this point in the history
for skylake kernels. This is the same method as used in [sd]asum.
_mm_set1_epi64x was commented out for zasum, but it has the advantage
of avoiding possible undefined behaviour (reading an uninitialized
variable), which NVHPC and icx optimize out. The new code works
fine with those compilers.

For GCC 12.3 the generated code is identical: no matter which method
you use, the compiler optimizes the code into a compile-time
constant. There is therefore no performance benefit to using
_mm_cmpeq_epi8, since the corresponding instruction (VPCMPEQB) isn't
actually generated!
  • Loading branch information
bartoldeman committed Nov 19, 2023
1 parent 22aa401 commit c34e2cf
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 13 deletions.
10 changes: 4 additions & 6 deletions kernel/x86_64/casum_microk_skylakex-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@
#ifdef __NVCOMPILER
#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
#endif
#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && (__clang_major__ >= 9 &&__clang_major__ !=17)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2309)))
#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203))

#if (!(defined(__NVCOMPILER) ))
//&& NVCOMPVERS < 2309))
#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203))

#define HAVE_CASUM_KERNEL 1

Expand All @@ -21,15 +20,14 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x)

if (n2 < 64) {
__m128 accum_10, accum_11, accum_12, accum_13;
__m128 abs_mask1 = abs_mask1;
__m128 abs_mask1;

accum_10 = _mm_setzero_ps();
accum_11 = _mm_setzero_ps();
accum_12 = _mm_setzero_ps();
accum_13 = _mm_setzero_ps();

abs_mask1 = (__m128)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1);
abs_mask1 = (__m128)_mm_srli_epi32((__m128i) abs_mask1, 1);
abs_mask1 = (__m128)_mm_set1_epi32(0x7fffffff);

_mm_prefetch(&x1[0], _MM_HINT_T0);

Expand Down
11 changes: 4 additions & 7 deletions kernel/x86_64/zasum_microk_skylakex-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@
#ifdef __NVCOMPILER
#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
#endif
#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && ( __clang_major__ >= 9 && __clang_major__ != 17)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2309)))
#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203))

#if (!(defined(__NVCOMPILER) ))
//&& NVCOMPVERS < 2309))
#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203))

#define HAVE_ZASUM_KERNEL 1

Expand All @@ -22,16 +21,14 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x)

if (n2 < 32) {
__m128d accum_10, accum_11, accum_12, accum_13;
__m128d abs_mask1 = abs_mask1;
__m128d abs_mask1;

accum_10 = _mm_setzero_pd();
accum_11 = _mm_setzero_pd();
accum_12 = _mm_setzero_pd();
accum_13 = _mm_setzero_pd();

// abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff);
abs_mask1 = (__m128d)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1);
abs_mask1 = (__m128d)_mm_srli_epi64((__m128i) abs_mask1, 1);
abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff);

_mm_prefetch(&x1[0], _MM_HINT_T0);
if (n2 >= 16){
Expand Down

0 comments on commit c34e2cf

Please sign in to comment.