From c34e2cf380a757621f166e6ce8ea8755155158f2 Mon Sep 17 00:00:00 2001 From: Bart Oldeman Date: Sun, 19 Nov 2023 21:21:23 +0000 Subject: [PATCH] Use _mm_set1_epi{32,64x} to init mask in x86-64 [cz]asum for skylake kernels. This is the same method as used in [sd]asum. _mm_set1_epi64x was commented out for zasum, but has the advantage of avoiding possible undefined behaviour (using an uninitialized variable), which NVHPC and icx optimize out. The new code works fine with those compilers. For GCC 12.3 the generated code is identical: no matter which method you use, the compiler optimizes the code into a compile-time constant, so there is no performance benefit in using _mm_cmpeq_epi8, since the corresponding instruction (VPCMPEQB) isn't actually generated! --- kernel/x86_64/casum_microk_skylakex-2.c | 10 ++++------ kernel/x86_64/zasum_microk_skylakex-2.c | 11 ++++------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c index 5d37fd5414..10b70ff201 100644 --- a/kernel/x86_64/casum_microk_skylakex-2.c +++ b/kernel/x86_64/casum_microk_skylakex-2.c @@ -2,10 +2,9 @@ #ifdef __NVCOMPILER #define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) #endif -#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && (__clang_major__ >= 9 &&__clang_major__ !=17)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2309))) +#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203)) -#if (!(defined(__NVCOMPILER) )) -//&& NVCOMPVERS < 2309)) +#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203)) #define HAVE_CASUM_KERNEL 1 @@ -21,15 +20,14 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x) if (n2 < 64) { __m128 accum_10, accum_11, accum_12, accum_13; - __m128 abs_mask1 = abs_mask1; + __m128 abs_mask1; accum_10 = _mm_setzero_ps(); accum_11 = 
_mm_setzero_ps(); accum_12 = _mm_setzero_ps(); accum_13 = _mm_setzero_ps(); - abs_mask1 = (__m128)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1); - abs_mask1 = (__m128)_mm_srli_epi32((__m128i) abs_mask1, 1); + abs_mask1 = (__m128)_mm_set1_epi32(0x7fffffff); _mm_prefetch(&x1[0], _MM_HINT_T0); diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c index 7260922e79..f6bc8e37b2 100644 --- a/kernel/x86_64/zasum_microk_skylakex-2.c +++ b/kernel/x86_64/zasum_microk_skylakex-2.c @@ -2,10 +2,9 @@ #ifdef __NVCOMPILER #define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) #endif -#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && ( __clang_major__ >= 9 && __clang_major__ != 17)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2309))) +#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203)) -#if (!(defined(__NVCOMPILER) )) -//&& NVCOMPVERS < 2309)) +#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203)) #define HAVE_ZASUM_KERNEL 1 @@ -22,16 +21,14 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x) if (n2 < 32) { __m128d accum_10, accum_11, accum_12, accum_13; - __m128d abs_mask1 = abs_mask1; + __m128d abs_mask1; accum_10 = _mm_setzero_pd(); accum_11 = _mm_setzero_pd(); accum_12 = _mm_setzero_pd(); accum_13 = _mm_setzero_pd(); - // abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff); - abs_mask1 = (__m128d)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1); - abs_mask1 = (__m128d)_mm_srli_epi64((__m128i) abs_mask1, 1); + abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff); _mm_prefetch(&x1[0], _MM_HINT_T0); if (n2 >= 16){