From be5e18c6f94716c6265f9e2cd58517fa138abcf5 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 24 Feb 2024 23:55:43 +0100
Subject: [PATCH 1/2] Add kernel definitions for CSUM and ZSUM

---
 kernel/x86_64/KERNEL          | 2 ++
 kernel/x86_64/KERNEL.SKYLAKEX | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL
index f8278c3b4c..ec4290e823 100644
--- a/kernel/x86_64/KERNEL
+++ b/kernel/x86_64/KERNEL
@@ -489,5 +489,7 @@ XGEMM3MKERNEL = xgemm3m_kernel_2x2.S
 SSUMKERNEL = ../arm/sum.c
 DSUMKERNEL = ../arm/sum.c
+CSUMKERNEL = zsum_sse.S
+ZSUMKERNEL = zsum_sse2.S
 
 SOMATCOPY_RT = omatcopy_rt.c

diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX
index 548e5dcfcf..7e946ef2ea 100644
--- a/kernel/x86_64/KERNEL.SKYLAKEX
+++ b/kernel/x86_64/KERNEL.SKYLAKEX
@@ -46,3 +46,5 @@ ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c
 
 CASUMKERNEL = casum.c
 ZASUMKERNEL = zasum.c
+CSUMKERNEL = csum.c
+ZSUMKERNEL = zsum.c

From 8f8ef3492a3eb753f8e6e1201dfe4c7107c5d779 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 24 Feb 2024 23:57:50 +0100
Subject: [PATCH 2/2] Add CSUM and ZSUM kernels (trivially derived from their
 existing ASUM counterparts)

---
 kernel/x86_64/csum.c                   | 129 ++++++++++++
 kernel/x86_64/csum_microk_skylakex-2.c | 289 +++++++++++++++++++++++++
 kernel/x86_64/zsum.c                   | 129 ++++++++++++
 kernel/x86_64/zsum_microk_skylakex-2.c | 279 ++++++++++++++++++++++++
 kernel/x86_64/zsum_sse.S               | 299 +++++++++++++++++++++++++
 kernel/x86_64/zsum_sse2.S              | 283 +++++++++++++++++++++++++
 6 files changed, 1408 insertions(+)
 create mode 100644 kernel/x86_64/csum.c
 create mode 100644 kernel/x86_64/csum_microk_skylakex-2.c
 create mode 100644 kernel/x86_64/zsum.c
 create mode 100644 kernel/x86_64/zsum_microk_skylakex-2.c
 create mode 100644 kernel/x86_64/zsum_sse.S
 create mode 100644 kernel/x86_64/zsum_sse2.S

diff --git a/kernel/x86_64/csum.c b/kernel/x86_64/csum.c
new file mode 100644
index 0000000000..e85b5cae12
--- /dev/null
+++ b/kernel/x86_64/csum.c
@@ -0,0 +1,129 @@
+#include "common.h"
+
+#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
+#include "csum_microk_skylakex-2.c"
+#endif
+
+#ifndef HAVE_CSUM_KERNEL
+static FLOAT csum_kernel(BLASLONG n, FLOAT *x)
+{
+
+	BLASLONG i=0;
+	BLASLONG n_8 = n & -8;
+	FLOAT *x1 = x;
+	FLOAT sum0 = 0.0;
+	FLOAT sum1 = 0.0;
+	FLOAT sum2 = 0.0;
+	FLOAT sum3 = 0.0;
+	FLOAT sum4 = 0.0;
+
+	while (i < n_8) {
+		sum0 += x1[0];
+		sum1 += x1[1];
+		sum2 += x1[2];
+		sum3 += x1[3];
+
+		sum0 += x1[4];
+		sum1 += x1[5];
+		sum2 += x1[6];
+		sum3 += x1[7];
+
+		x1+=8;
+		i+=4;
+	}
+
+	while (i < n) {
+		sum4 += (x1[0] + x1[1]);
+		x1 += 2;
+		i++;
+	}
+
+	return sum0+sum1+sum2+sum3+sum4;
+}
+
+#endif
+
+static FLOAT sum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i = 0;
+	BLASLONG ip = 0;
+	BLASLONG inc_x2;
+	FLOAT sumf = 0.0;
+
+	if (n <= 0 || inc_x <= 0) return(sumf);
+	if (inc_x == 1) {
+		sumf = csum_kernel(n, x);
+	}
+	else {
+		inc_x2 = 2 * inc_x;
+
+		while (i < n) {
+			sumf += x[ip] + x[ip + 1];
+			ip += inc_x2;
+			i++;
+		}
+	}
+
+	return(sumf);
+}
+
+#if defined(SMP)
+static int sum_thread_function(BLASLONG n,
+	BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2,
+	FLOAT *x, BLASLONG inc_x,
+	FLOAT * dummy3, BLASLONG dummy4,
+	FLOAT * result, BLASLONG dummy5)
+{
+	*(FLOAT *) result = sum_compute(n, x, inc_x);
+	return 0;
+}
+
+extern int blas_level1_thread_with_return_value(int mode,
+	BLASLONG m, BLASLONG n, BLASLONG k, void * alpha,
+	void *a, BLASLONG lda,
+	void *b, BLASLONG ldb,
+	void *c, BLASLONG ldc,
+	int (*function)(),
+	int nthread);
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+#if defined(SMP)
+	int nthreads;
+	FLOAT dummy_alpha[2];
+#endif
+	FLOAT sumf = 0.0;
+
+#if defined(SMP)
+	int num_cpu = num_cpu_avail(1);
+	if (n <= 10000 || inc_x <= 0)
+		nthreads = 1;
+	else
+		nthreads = num_cpu < n/10000 ? num_cpu : n/10000;
+
+	if (nthreads == 1) {
+		sumf = sum_compute(n, x, inc_x);
+	}
+	else {
+		int mode, i;
+		char result[MAX_CPU_NUMBER * sizeof(double) *2];
+		FLOAT *ptr;
+#if !defined(DOUBLE)
+		mode = BLAS_SINGLE | BLAS_COMPLEX;
+#else
+		mode = BLAS_DOUBLE | BLAS_COMPLEX;
+#endif
+		blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x,
+			NULL, 0, result, 0, (int (*)(void))sum_thread_function, nthreads);
+		ptr = (FLOAT *)result;
+		for (i = 0; i < nthreads; i++) {
+			sumf += (*ptr);
+			ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2);
+		}
+	}
+#else
+	sumf = sum_compute(n, x, inc_x);
+#endif
+	return(sumf);
+}
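
Note (not part of the patch): ?sum is OpenBLAS's BLAS-like extension that computes
the same reduction as ?asum, only without taking absolute values; for the complex
variants the real and imaginary parts are simply added, with no conjugation. A
scalar model of the CSUM case, with the name csum_ref invented here purely for
illustration:

	/* reference model of CSUM: x holds n complex values as 2*n interleaved floats */
	static float csum_ref(int n, const float *x)
	{
		float s = 0.0f;
		for (int i = 0; i < 2 * n; i++)
			s += x[i];	/* casum would add fabsf(x[i]) instead */
		return s;
	}
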
diff --git a/kernel/x86_64/csum_microk_skylakex-2.c b/kernel/x86_64/csum_microk_skylakex-2.c
new file mode 100644
index 0000000000..ec882efa13
--- /dev/null
+++ b/kernel/x86_64/csum_microk_skylakex-2.c
@@ -0,0 +1,289 @@
+/* need a new enough GCC for avx512 support */
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203))
+
+#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203))
+
+#define HAVE_CSUM_KERNEL 1
+
+#include <immintrin.h>
+
+#include <stdint.h>
+
+static FLOAT csum_kernel(BLASLONG n, FLOAT *x)
+{
+	FLOAT *x1 = x;
+	FLOAT sumf=0.0;
+	BLASLONG n2 = n + n;
+
+	if (n2 < 64) {
+		__m128 accum_10, accum_11, accum_12, accum_13;
+
+		accum_10 = _mm_setzero_ps();
+		accum_11 = _mm_setzero_ps();
+		accum_12 = _mm_setzero_ps();
+		accum_13 = _mm_setzero_ps();
+
+		_mm_prefetch(&x1[0], _MM_HINT_T0);
+
+		if (n2 >= 32){
+			__m128 x00 = _mm_loadu_ps(&x1[ 0]);
+			__m128 x01 = _mm_loadu_ps(&x1[ 4]);
+			__m128 x02 = _mm_loadu_ps(&x1[ 8]);
+			__m128 x03 = _mm_loadu_ps(&x1[12]);
+
+			_mm_prefetch(&x1[16], _MM_HINT_T0);
+			__m128 x04 = _mm_loadu_ps(&x1[16]);
+			__m128 x05 = _mm_loadu_ps(&x1[20]);
+			__m128 x06 = _mm_loadu_ps(&x1[24]);
+			__m128 x07 = _mm_loadu_ps(&x1[28]);
+
+			accum_10 = _mm_add_ps(accum_10, x00);
+			accum_11 = _mm_add_ps(accum_11, x01);
+			accum_12 = _mm_add_ps(accum_12, x02);
+			accum_13 = _mm_add_ps(accum_13, x03);
+
+			accum_10 = _mm_add_ps(accum_10, x04);
+			accum_11 = _mm_add_ps(accum_11, x05);
+			accum_12 = _mm_add_ps(accum_12, x06);
+			accum_13 = _mm_add_ps(accum_13, x07);
+
+			n2 -= 32;
+			x1 += 32;
+		}
+
+		if (n2 >= 16) {
+			__m128 x00 = _mm_loadu_ps(&x1[ 0]);
+			__m128 x01 = _mm_loadu_ps(&x1[ 4]);
+			__m128 x02 = _mm_loadu_ps(&x1[ 8]);
+			__m128 x03 = _mm_loadu_ps(&x1[12]);
+
+			accum_10 = _mm_add_ps(accum_10, x00);
+			accum_11 = _mm_add_ps(accum_11, x01);
+			accum_12 = _mm_add_ps(accum_12, x02);
+			accum_13 = _mm_add_ps(accum_13, x03);
+
+			n2 -= 16;
+			x1 += 16;
+		}
+
+		if (n2 >= 8) {
+			__m128 x00 = _mm_loadu_ps(&x1[ 0]);
+			__m128 x01 = _mm_loadu_ps(&x1[ 4]);
+			accum_10 = _mm_add_ps(accum_10, x00);
+			accum_11 = _mm_add_ps(accum_11, x01);
+
+			n2 -= 8;
+			x1 += 8;
+		}
+
+		if (n2 >= 4) {
+			__m128 x00 = _mm_loadu_ps(&x1[ 0]);
+			accum_10 = _mm_add_ps(accum_10, x00);
+
+			n2 -= 4;
+			x1 += 4;
+		}
+
+		if (n2) {
+			sumf += (x1[0] + x1[1]);
+		}
+
+		accum_10 = _mm_add_ps(accum_10, accum_11);
+		accum_12 = _mm_add_ps(accum_12, accum_13);
+		accum_10 = _mm_add_ps(accum_10, accum_12);
+
+		accum_10 = _mm_hadd_ps(accum_10, accum_10);
+		accum_10 = _mm_hadd_ps(accum_10, accum_10);
+
+		sumf += accum_10[0];
+	}
+	else {
+		__m512 accum_0, accum_1, accum_2, accum_3;
+		__m512 x00, x01, x02, x03, x04, x05, x06, x07;
+
+		accum_0 = _mm512_setzero_ps();
+		accum_1 = _mm512_setzero_ps();
+		accum_2 = _mm512_setzero_ps();
+		accum_3 = _mm512_setzero_ps();
+
+		// alignment has side-effect when the size of input array is not large enough
+		if (n2 < 256) {
+			if (n2 >= 128) {
+				x00 = _mm512_loadu_ps(&x1[  0]);
+				x01 = _mm512_loadu_ps(&x1[ 16]);
+				x02 = _mm512_loadu_ps(&x1[ 32]);
+				x03 = _mm512_loadu_ps(&x1[ 48]);
+				x04 = _mm512_loadu_ps(&x1[ 64]);
+				x05 = _mm512_loadu_ps(&x1[ 80]);
+				x06 = _mm512_loadu_ps(&x1[ 96]);
+				x07 = _mm512_loadu_ps(&x1[112]);
+
+				accum_0 = _mm512_add_ps(accum_0, x00);
+				accum_1 = _mm512_add_ps(accum_1, x01);
+				accum_2 = _mm512_add_ps(accum_2, x02);
+				accum_3 = _mm512_add_ps(accum_3, x03);
+
+				accum_0 = _mm512_add_ps(accum_0, x04);
+				accum_1 = _mm512_add_ps(accum_1, x05);
+				accum_2 = _mm512_add_ps(accum_2, x06);
+				accum_3 = _mm512_add_ps(accum_3, x07);
+
+				n2 -= 128;
+				x1 += 128;
+			}
+
+			if (n2 >= 64) {
+				x00 = _mm512_loadu_ps(&x1[ 0]);
+				x01 = _mm512_loadu_ps(&x1[16]);
+				x02 = _mm512_loadu_ps(&x1[32]);
+				x03 = _mm512_loadu_ps(&x1[48]);
+				accum_0 = _mm512_add_ps(accum_0, x00);
+				accum_1 = _mm512_add_ps(accum_1, x01);
+				accum_2 = _mm512_add_ps(accum_2, x02);
+				accum_3 = _mm512_add_ps(accum_3, x03);
+
+				n2 -= 64;
+				x1 += 64;
+			}
+
+			if (n2 >= 32) {
+				x00 = _mm512_loadu_ps(&x1[ 0]);
+				x01 = _mm512_loadu_ps(&x1[16]);
+				accum_0 = _mm512_add_ps(accum_0, x00);
+				accum_1 = _mm512_add_ps(accum_1, x01);
+
+				n2 -= 32;
+				x1 += 32;
+			}
+
+			if (n2 >= 16) {
+				x00 = _mm512_loadu_ps(&x1[ 0]);
+				accum_0 = _mm512_add_ps(accum_0, x00);
+
+				n2 -= 16;
+				x1 += 16;
+			}
+
+			if (n2) {
+				uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16 - n2));
+				x00 = _mm512_maskz_loadu_ps(*((__mmask16*) &tail_mask16), &x1[ 0]);
+				accum_0 = _mm512_add_ps(accum_0, x00);
+			}
+			accum_0 = _mm512_add_ps(accum_0, accum_1);
+			accum_2 = _mm512_add_ps(accum_2, accum_3);
+			accum_0 = _mm512_add_ps(accum_0, accum_2);
+
+			sumf = _mm512_reduce_add_ps(accum_0);
+		}
+		// n2 >= 256, doing alignment
+		else {
+
+			int align_header = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 2) & 0xf;
+
+			if (0 != align_header) {
+				uint16_t align_mask16 = (((uint16_t)0xffff) >> (16 - align_header));
+				x00 = _mm512_maskz_loadu_ps(*((__mmask16*) &align_mask16), &x1[0]);
+				accum_0 = _mm512_add_ps(accum_0, x00);
+
+				n2 -= align_header;
+				x1 += align_header;
+			}
+
+			x00 = _mm512_load_ps(&x1[  0]);
+			x01 = _mm512_load_ps(&x1[ 16]);
+			x02 = _mm512_load_ps(&x1[ 32]);
+			x03 = _mm512_load_ps(&x1[ 48]);
+			x04 = _mm512_load_ps(&x1[ 64]);
+			x05 = _mm512_load_ps(&x1[ 80]);
+			x06 = _mm512_load_ps(&x1[ 96]);
+			x07 = _mm512_load_ps(&x1[112]);
+
+			n2 -= 128;
+			x1 += 128;
+
+			while (n2 >= 128) {
+
+				accum_0 = _mm512_add_ps(accum_0, x00);
+				x00 = _mm512_load_ps(&x1[  0]);
+				accum_1 = _mm512_add_ps(accum_1, x01);
+				x01 = _mm512_load_ps(&x1[ 16]);
+				accum_2 = _mm512_add_ps(accum_2, x02);
+				x02 = _mm512_load_ps(&x1[ 32]);
+				accum_3 = _mm512_add_ps(accum_3, x03);
+				x03 = _mm512_load_ps(&x1[ 48]);
+
+				accum_0 = _mm512_add_ps(accum_0, x04);
+				x04 = _mm512_load_ps(&x1[ 64]);
+				accum_1 = _mm512_add_ps(accum_1, x05);
+				x05 = _mm512_load_ps(&x1[ 80]);
+				accum_2 = _mm512_add_ps(accum_2, x06);
+				x06 = _mm512_load_ps(&x1[ 96]);
+				accum_3 = _mm512_add_ps(accum_3, x07);
+				x07 = _mm512_load_ps(&x1[112]);
+
+				n2 -= 128;
+				x1 += 128;
+			}
+
+			accum_0 = _mm512_add_ps(accum_0, x00);
+			accum_1 = _mm512_add_ps(accum_1, x01);
+			accum_2 = _mm512_add_ps(accum_2, x02);
+			accum_3 = _mm512_add_ps(accum_3, x03);
+
+			accum_0 = _mm512_add_ps(accum_0, x04);
+			accum_1 = _mm512_add_ps(accum_1, x05);
+			accum_2 = _mm512_add_ps(accum_2, x06);
+			accum_3 = _mm512_add_ps(accum_3, x07);
+
+			if (n2 >= 64) {
+				x00 = _mm512_load_ps(&x1[ 0]);
+				x01 = _mm512_load_ps(&x1[16]);
+				x02 = _mm512_load_ps(&x1[32]);
+				x03 = _mm512_load_ps(&x1[48]);
+				accum_0 = _mm512_add_ps(accum_0, x00);
+				accum_1 = _mm512_add_ps(accum_1, x01);
+				accum_2 = _mm512_add_ps(accum_2, x02);
+				accum_3 = _mm512_add_ps(accum_3, x03);
+
+				n2 -= 64;
+				x1 += 64;
+			}
+
+			if (n2 >= 32) {
+				x00 = _mm512_load_ps(&x1[ 0]);
+				x01 = _mm512_load_ps(&x1[16]);
+				accum_0 = _mm512_add_ps(accum_0, x00);
+				accum_1 = _mm512_add_ps(accum_1, x01);
+
+				n2 -= 32;
+				x1 += 32;
+			}
+
+			if (n2 >= 16) {
+				x00 = _mm512_load_ps(&x1[ 0]);
+				accum_0 = _mm512_add_ps(accum_0, x00);
+
+				n2 -= 16;
+				x1 += 16;
+			}
+
+			if (n2) {
+				uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16 - n2));
+				x00 = _mm512_maskz_load_ps(*((__mmask16*) &tail_mask16), &x1[ 0]);
+				accum_0 = _mm512_add_ps(accum_0, x00);
+			}
+
+			accum_0 = _mm512_add_ps(accum_0, accum_1);
+			accum_2 = _mm512_add_ps(accum_2, accum_3);
+			accum_0 = _mm512_add_ps(accum_0, accum_2);
+			sumf = _mm512_reduce_add_ps(accum_0);
+		}
+	}
+
+	return sumf;
+}
+#endif
+#endif
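
Note (not part of the patch): both AVX512 microkernels handle the final sub-vector
remainder with a zero-masking load instead of a scalar cleanup loop, so the tail is
summed with the same vector reduction as the body. A minimal, self-contained sketch
of that idiom, assuming AVX512F and a helper name (tail_sum_ps) invented here:

	#include <immintrin.h>

	/* sums the last n2 (0 <= n2 < 16) floats; masked-off lanes load as 0.0f */
	static float tail_sum_ps(const float *p, int n2)
	{
		__mmask16 m = (__mmask16)((1u << n2) - 1);	/* enable the low n2 lanes */
		return _mm512_reduce_add_ps(_mm512_maskz_loadu_ps(m, p));
	}
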
diff --git a/kernel/x86_64/zsum.c b/kernel/x86_64/zsum.c
new file mode 100644
index 0000000000..5973c12538
--- /dev/null
+++ b/kernel/x86_64/zsum.c
@@ -0,0 +1,129 @@
+#include "common.h"
+
+#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
+#include "zsum_microk_skylakex-2.c"
+#endif
+
+#ifndef HAVE_ZSUM_KERNEL
+static FLOAT zsum_kernel(BLASLONG n, FLOAT *x)
+{
+
+	BLASLONG i=0;
+	BLASLONG n_8 = n & -8;
+	FLOAT *x1 = x;
+	FLOAT sum0 = 0.0;
+	FLOAT sum1 = 0.0;
+	FLOAT sum2 = 0.0;
+	FLOAT sum3 = 0.0;
+	FLOAT sum4 = 0.0;
+
+	while (i < n_8) {
+		sum0 += x1[0];
+		sum1 += x1[1];
+		sum2 += x1[2];
+		sum3 += x1[3];
+
+		sum0 += x1[4];
+		sum1 += x1[5];
+		sum2 += x1[6];
+		sum3 += x1[7];
+
+		x1+=8;
+		i+=4;
+	}
+
+	while (i < n) {
+		sum4 += x1[0] + x1[1];
+		x1 += 2;
+		i++;
+	}
+
+	return sum0+sum1+sum2+sum3+sum4;
+}
+
+#endif
+
+static FLOAT sum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i = 0;
+	BLASLONG ip = 0;
+	BLASLONG inc_x2;
+	FLOAT sumf = 0.0;
+
+	if (n <= 0 || inc_x <= 0) return(sumf);
+	if (inc_x == 1) {
+		sumf = zsum_kernel(n, x);
+	}
+	else {
+		inc_x2 = 2 * inc_x;
+
+		while (i < n) {
+			sumf += x[ip] + x[ip + 1];
+			ip += inc_x2;
+			i++;
+		}
+	}
+
+	return(sumf);
+}
+
+#if defined(SMP)
+static int sum_thread_function(BLASLONG n,
+	BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2,
+	FLOAT *x, BLASLONG inc_x,
+	FLOAT * dummy3, BLASLONG dummy4,
+	FLOAT * result, BLASLONG dummy5)
+{
+	*(FLOAT *) result = sum_compute(n, x, inc_x);
+	return 0;
+}
+
+extern int blas_level1_thread_with_return_value(int mode,
+	BLASLONG m, BLASLONG n, BLASLONG k, void * alpha,
+	void *a, BLASLONG lda,
+	void *b, BLASLONG ldb,
+	void *c, BLASLONG ldc,
+	int (*function)(),
+	int nthread);
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+#if defined(SMP)
+	int nthreads;
+	FLOAT dummy_alpha[2];
+#endif
+	FLOAT sumf = 0.0;
+
+#if defined(SMP)
+	int num_cpu = num_cpu_avail(1);
+	if (n <= 10000 || inc_x <= 0)
+		nthreads = 1;
+	else
+		nthreads = num_cpu < n/10000 ? num_cpu : n/10000;
+
+	if (nthreads == 1) {
+		sumf = sum_compute(n, x, inc_x);
+	}
+	else {
+		int mode, i;
+		char result[MAX_CPU_NUMBER * sizeof(double) *2];
+		FLOAT *ptr;
+#if !defined(DOUBLE)
+		mode = BLAS_SINGLE | BLAS_COMPLEX;
+#else
+		mode = BLAS_DOUBLE | BLAS_COMPLEX;
+#endif
+		blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x,
+			NULL, 0, result, 0, (int (*)(void))sum_thread_function, nthreads);
+		ptr = (FLOAT *)result;
+		for (i = 0; i < nthreads; i++) {
+			sumf += (*ptr);
+			ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2);
+		}
+	}
+#else
+	sumf = sum_compute(n, x, inc_x);
+#endif
+	return(sumf);
+}

diff --git a/kernel/x86_64/zsum_microk_skylakex-2.c b/kernel/x86_64/zsum_microk_skylakex-2.c
new file mode 100644
index 0000000000..0bca7ce6d7
--- /dev/null
+++ b/kernel/x86_64/zsum_microk_skylakex-2.c
@@ -0,0 +1,279 @@
+/* need a new enough GCC for avx512 support */
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203))
+
+#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203))
+
+#define HAVE_ZSUM_KERNEL 1
+
+#include <immintrin.h>
+
+#include <stdint.h>
+
+static FLOAT zsum_kernel(BLASLONG n, FLOAT *x)
+{
+	FLOAT *x1 = x;
+	FLOAT sumf=0.0;
+	BLASLONG n2 = n + n;
+
+
+	if (n2 < 32) {
+		__m128d accum_10, accum_11, accum_12, accum_13;
+
+		accum_10 = _mm_setzero_pd();
+		accum_11 = _mm_setzero_pd();
+		accum_12 = _mm_setzero_pd();
+		accum_13 = _mm_setzero_pd();
+
+		_mm_prefetch(&x1[0], _MM_HINT_T0);
+		if (n2 >= 16){
+			__m128d x00 = _mm_loadu_pd(&x1[ 0]);
+			__m128d x01 = _mm_loadu_pd(&x1[ 2]);
+			__m128d x02 = _mm_loadu_pd(&x1[ 4]);
+			__m128d x03 = _mm_loadu_pd(&x1[ 6]);
+
+			_mm_prefetch(&x1[8], _MM_HINT_T0);
+			__m128d x04 = _mm_loadu_pd(&x1[ 8]);
+			__m128d x05 = _mm_loadu_pd(&x1[10]);
+			__m128d x06 = _mm_loadu_pd(&x1[12]);
+			__m128d x07 = _mm_loadu_pd(&x1[14]);
+
+			accum_10 = _mm_add_pd(accum_10, x00);
+			accum_11 = _mm_add_pd(accum_11, x01);
+			accum_12 = _mm_add_pd(accum_12, x02);
+			accum_13 = _mm_add_pd(accum_13, x03);
+
+			accum_10 = _mm_add_pd(accum_10, x04);
+			accum_11 = _mm_add_pd(accum_11, x05);
+			accum_12 = _mm_add_pd(accum_12, x06);
+			accum_13 = _mm_add_pd(accum_13, x07);
+
+			x1 += 16;
+			n2 -= 16;
+		}
+
+		if (n2 >= 8) {
+			__m128d x00 = _mm_loadu_pd(&x1[ 0]);
+			__m128d x01 = _mm_loadu_pd(&x1[ 2]);
+			__m128d x02 = _mm_loadu_pd(&x1[ 4]);
+			__m128d x03 = _mm_loadu_pd(&x1[ 6]);
+
+			accum_10 = _mm_add_pd(accum_10, x00);
+			accum_11 = _mm_add_pd(accum_11, x01);
+			accum_12 = _mm_add_pd(accum_12, x02);
+			accum_13 = _mm_add_pd(accum_13, x03);
+
+			n2 -= 8;
+			x1 += 8;
+		}
+
+		if (n2 >= 4) {
+			__m128d x00 = _mm_loadu_pd(&x1[ 0]);
+			__m128d x01 = _mm_loadu_pd(&x1[ 2]);
+			accum_10 = _mm_add_pd(accum_10, x00);
+			accum_11 = _mm_add_pd(accum_11, x01);
+
+			n2 -= 4;
+			x1 += 4;
+		}
+
+		if (n2) {
+			__m128d x00 = _mm_loadu_pd(&x1[ 0]);
+			accum_10 = _mm_add_pd(accum_10, x00);
+		}
+
+		accum_10 = _mm_add_pd(accum_10, accum_11);
+		accum_12 = _mm_add_pd(accum_12, accum_13);
+		accum_10 = _mm_add_pd(accum_10, accum_12);
+
+		accum_10 = _mm_hadd_pd(accum_10, accum_10);
+
+		sumf = accum_10[0];
+	}
+	else {
+		__m512d accum_0, accum_1, accum_2, accum_3;
+		__m512d x00, x01, x02, x03, x04, x05, x06, x07;
+
+		accum_0 = _mm512_setzero_pd();
+		accum_1 = _mm512_setzero_pd();
+		accum_2 = _mm512_setzero_pd();
+		accum_3 = _mm512_setzero_pd();
+
+		// alignment has side-effect when the size of input array is not large enough
+		if (n2 < 128) {
+			if (n2 >= 64) {
+				x00 = _mm512_loadu_pd(&x1[ 0]);
+				x01 = _mm512_loadu_pd(&x1[ 8]);
+				x02 = _mm512_loadu_pd(&x1[16]);
+				x03 = _mm512_loadu_pd(&x1[24]);
+				x04 = _mm512_loadu_pd(&x1[32]);
+				x05 = _mm512_loadu_pd(&x1[40]);
+				x06 = _mm512_loadu_pd(&x1[48]);
+				x07 = _mm512_loadu_pd(&x1[56]);
+
+				accum_0 = _mm512_add_pd(accum_0, x00);
+				accum_1 = _mm512_add_pd(accum_1, x01);
+				accum_2 = _mm512_add_pd(accum_2, x02);
+				accum_3 = _mm512_add_pd(accum_3, x03);
+
+				accum_0 = _mm512_add_pd(accum_0, x04);
+				accum_1 = _mm512_add_pd(accum_1, x05);
+				accum_2 = _mm512_add_pd(accum_2, x06);
+				accum_3 = _mm512_add_pd(accum_3, x07);
+
+				n2 -= 64;
+				x1 += 64;
+			}
+
+			if (n2 >= 32) {
+				x00 = _mm512_loadu_pd(&x1[ 0]);
+				x01 = _mm512_loadu_pd(&x1[ 8]);
+				x02 = _mm512_loadu_pd(&x1[16]);
+				x03 = _mm512_loadu_pd(&x1[24]);
+				accum_0 = _mm512_add_pd(accum_0, x00);
+				accum_1 = _mm512_add_pd(accum_1, x01);
+				accum_2 = _mm512_add_pd(accum_2, x02);
+				accum_3 = _mm512_add_pd(accum_3, x03);
+
+				n2 -= 32;
+				x1 += 32;
+			}
+
+			if (n2 >= 16) {
+				x00 = _mm512_loadu_pd(&x1[ 0]);
+				x01 = _mm512_loadu_pd(&x1[ 8]);
+				accum_0 = _mm512_add_pd(accum_0, x00);
+				accum_1 = _mm512_add_pd(accum_1, x01);
+
+				n2 -= 16;
+				x1 += 16;
+			}
+
+			if (n2 >= 8) {
+				x00 = _mm512_loadu_pd(&x1[ 0]);
+				accum_0 = _mm512_add_pd(accum_0, x00);
+
+				n2 -= 8;
+				x1 += 8;
+			}
+
+			if (n2) {
+				unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 - n2));
+				x00 = _mm512_maskz_loadu_pd(*((__mmask8*) &tail_mask8), &x1[ 0]);
+				accum_0 = _mm512_add_pd(accum_0, x00);
+			}
+			accum_0 = _mm512_add_pd(accum_0, accum_1);
+			accum_2 = _mm512_add_pd(accum_2, accum_3);
+			accum_0 = _mm512_add_pd(accum_0, accum_2);
+			sumf = _mm512_reduce_add_pd(accum_0);
+		}
+		// n2 >= 128, doing alignment
+		else {
+
+			int align_header = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 3) & 0x7;
+
+			if (0 != align_header) {
+				unsigned char align_mask8 = (((unsigned char)0xff) >> (8 - align_header));
+				x00 = _mm512_maskz_loadu_pd(*((__mmask8*) &align_mask8), &x1[0]);
+				accum_0 = _mm512_add_pd(accum_0, x00);
+
+				n2 -= align_header;
+				x1 += align_header;
+			}
+
+			x00 = _mm512_load_pd(&x1[ 0]);
+			x01 = _mm512_load_pd(&x1[ 8]);
+			x02 = _mm512_load_pd(&x1[16]);
+			x03 = _mm512_load_pd(&x1[24]);
+			x04 = _mm512_load_pd(&x1[32]);
+			x05 = _mm512_load_pd(&x1[40]);
+			x06 = _mm512_load_pd(&x1[48]);
+			x07 = _mm512_load_pd(&x1[56]);
+
+			n2 -= 64;
+			x1 += 64;
+
+			while (n2 >= 64) {
+				accum_0 = _mm512_add_pd(accum_0, x00);
+				x00 = _mm512_load_pd(&x1[ 0]);
+				accum_1 = _mm512_add_pd(accum_1, x01);
+				x01 = _mm512_load_pd(&x1[ 8]);
+				accum_2 = _mm512_add_pd(accum_2, x02);
+				x02 = _mm512_load_pd(&x1[16]);
+				accum_3 = _mm512_add_pd(accum_3, x03);
+				x03 = _mm512_load_pd(&x1[24]);
+
+				accum_0 = _mm512_add_pd(accum_0, x04);
+				x04 = _mm512_load_pd(&x1[32]);
+				accum_1 = _mm512_add_pd(accum_1, x05);
+				x05 = _mm512_load_pd(&x1[40]);
+				accum_2 = _mm512_add_pd(accum_2, x06);
+				x06 = _mm512_load_pd(&x1[48]);
+				accum_3 = _mm512_add_pd(accum_3, x07);
+				x07 = _mm512_load_pd(&x1[56]);
+
+				n2 -= 64;
+				x1 += 64;
+			}
+
+			accum_0 = _mm512_add_pd(accum_0, x00);
+			accum_1 = _mm512_add_pd(accum_1, x01);
+			accum_2 = _mm512_add_pd(accum_2, x02);
+			accum_3 = _mm512_add_pd(accum_3, x03);
+
+			accum_0 = _mm512_add_pd(accum_0, x04);
+			accum_1 = _mm512_add_pd(accum_1, x05);
+			accum_2 = _mm512_add_pd(accum_2, x06);
+			accum_3 = _mm512_add_pd(accum_3, x07);
+
+			if (n2 >= 32) {
+				x00 = _mm512_load_pd(&x1[ 0]);
+				x01 = _mm512_load_pd(&x1[ 8]);
+				x02 = _mm512_load_pd(&x1[16]);
+				x03 = _mm512_load_pd(&x1[24]);
+				accum_0 = _mm512_add_pd(accum_0, x00);
+				accum_1 = _mm512_add_pd(accum_1, x01);
+				accum_2 = _mm512_add_pd(accum_2, x02);
+				accum_3 = _mm512_add_pd(accum_3, x03);
+
+				n2 -= 32;
+				x1 += 32;
+			}
+
+			if (n2 >= 16) {
+				x00 = _mm512_load_pd(&x1[ 0]);
+				x01 = _mm512_load_pd(&x1[ 8]);
+				accum_0 = _mm512_add_pd(accum_0, x00);
+				accum_1 = _mm512_add_pd(accum_1, x01);
+
+				n2 -= 16;
+				x1 += 16;
+			}
+
+			if (n2 >= 8) {
+				x00 = _mm512_load_pd(&x1[ 0]);
+				accum_0 = _mm512_add_pd(accum_0, x00);
+
+				n2 -= 8;
+				x1 += 8;
+			}
+
+			if (n2) {
+				unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 - n2));
+				x00 = _mm512_maskz_load_pd(*((__mmask8*) &tail_mask8), &x1[ 0]);
+				accum_0 = _mm512_add_pd(accum_0, x00);
+			}
+
+			accum_0 = _mm512_add_pd(accum_0, accum_1);
+			accum_2 = _mm512_add_pd(accum_2, accum_3);
+			accum_0 = _mm512_add_pd(accum_0, accum_2);
+			sumf = _mm512_reduce_add_pd(accum_0);
+		}
+	}
+
+	return sumf;
+}
+#endif
+#endif

diff --git a/kernel/x86_64/zsum_sse.S b/kernel/x86_64/zsum_sse.S
new file mode 100644
index 0000000000..b679b42b0e
--- /dev/null
+++ b/kernel/x86_64/zsum_sse.S
@@ -0,0 +1,299 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or       */
+/* without modification, are permitted provided that the following  */
+/* conditions are met:                                              */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above        */
+/*      copyright notice, this list of conditions and the following */
+/*      disclaimer.                                                 */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above     */
+/*      copyright notice, this list of conditions and the following */
+/*      disclaimer in the documentation and/or other materials      */
+/*      provided with the distribution.                             */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/*    The views and conclusions contained in the software and        */
+/*    documentation are those of the authors and should not be       */
+/*    interpreted as representing official policies, either          */
+/*    expressed or implied, of The University of Texas at Austin.    */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define M	ARG1	/* rdi */
+#define X	ARG2	/* rsi */
+#define INCX	ARG3	/* rdx */
+
+#define I	%rax
+
+#include "l1param.h"
+
+	PROLOGUE
+	PROFCODE
+
+	SAVEREGISTERS
+
+	pxor	%xmm0, %xmm0
+	testq	M, M
+	jle	.L999
+	testq	INCX, INCX
+	jle	.L999
+
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+
+	salq	$ZBASE_SHIFT, INCX
+
+	cmpq	$2 * SIZE, INCX
+	jne	.L100
+
+	subq	$-32 * SIZE, X
+	addq	M, M
+
+	cmpq	$3, M
+	jle	.L18
+
+	testq	$4, X
+	je	.L05
+	movss	-32 * SIZE(X), %xmm0
+	addq	$SIZE, X
+	decq	M
+	jle	.L998
+	ALIGN_3
+
+.L05:
+	testq	$8, X
+	je	.L10
+
+#ifdef movsd
+	xorps	%xmm1, %xmm1
+#endif
+	movsd	-32 * SIZE(X), %xmm1
+	addq	$2 * SIZE, X
+	subq	$2, M
+	jle	.L998
+	ALIGN_3
+
+.L10:
+	movq	M, I
+	sarq	$5, I
+	jle	.L14
+
+	movaps	-32 * SIZE(X), %xmm4
+	movaps	-28 * SIZE(X), %xmm5
+	movaps	-24 * SIZE(X), %xmm6
+	movaps	-20 * SIZE(X), %xmm7
+
+	movaps	-16 * SIZE(X), %xmm8
+	movaps	-12 * SIZE(X), %xmm9
+	movaps	 -8 * SIZE(X), %xmm10
+	movaps	 -4 * SIZE(X), %xmm11
+	decq	I
+	jle	.L12
+	ALIGN_3
+
+.L11:
+#ifdef PREFETCH
+	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
+#endif
+
+	addps	%xmm4, %xmm0
+	movaps	 0 * SIZE(X), %xmm4
+
+	addps	%xmm5, %xmm1
+	movaps	 4 * SIZE(X), %xmm5
+
+	addps	%xmm6, %xmm2
+	movaps	 8 * SIZE(X), %xmm6
+
+	addps	%xmm7, %xmm3
+	movaps	12 * SIZE(X), %xmm7
+
+#if defined(PREFETCH) && !defined(FETCH128)
+	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
+#endif
+
+	addps	%xmm8, %xmm0
+	movaps	16 * SIZE(X), %xmm8
+
+	addps	%xmm9, %xmm1
+	movaps	20 * SIZE(X), %xmm9
+
+	addps	%xmm10, %xmm2
+	movaps	24 * SIZE(X), %xmm10
+
+	addps	%xmm11, %xmm3
+	movaps	28 * SIZE(X), %xmm11
+
+	subq	$-32 * SIZE, X
+	decq	I
+	jg	.L11
+	ALIGN_3
+
+.L12:
+	addps	%xmm4, %xmm0
+	addps	%xmm5, %xmm1
+
+	addps	%xmm6, %xmm2
+	addps	%xmm7, %xmm3
+
+	addps	%xmm8, %xmm0
+	addps	%xmm9, %xmm1
+
+	addps	%xmm10, %xmm2
+	addps	%xmm11, %xmm3
+
+	addq	$32 * SIZE, X
+	ALIGN_3
+
+.L14:
+	testq	$31, M
+	jle	.L998
+
+.L15:
+	testq	$16, M
+	je	.L16
+
+	movaps	-32 * SIZE(X), %xmm4
+	addps	%xmm4, %xmm0
+
+	movaps	-28 * SIZE(X), %xmm5
+	addps	%xmm5, %xmm1
+
+	movaps	-24 * SIZE(X), %xmm4
+	addps	%xmm4, %xmm0
+
+	movaps	-20 * SIZE(X), %xmm5
+	addps	%xmm5, %xmm1
+
+	addq	$16 * SIZE, X
+	ALIGN_3
+
+.L16:
+	testq	$8, M
+	je	.L17
+
+	movaps	-32 * SIZE(X), %xmm4
+	addps	%xmm4, %xmm0
+
+	movaps	-28 * SIZE(X), %xmm5
+	addps	%xmm5, %xmm1
+
+	addq	$8 * SIZE, X
+	ALIGN_3
+
+.L17:
+	testq	$4, M
+	je	.L18
+
+	movaps	-32 * SIZE(X), %xmm6
+	addps	%xmm6, %xmm2
+	addq	$4 * SIZE, X
+	ALIGN_3
+
+.L18:
+	testq	$2, M
+	je	.L19
+
+#ifdef movsd
+	xorps	%xmm7, %xmm7
+#endif
+	movsd	-32 * SIZE(X), %xmm7
+	addps	%xmm7, %xmm3
+	addq	$2 * SIZE, X
+	ALIGN_3
+
+.L19:
+	testq	$1, M
+	je	.L998
+
+	movss	-32 * SIZE(X), %xmm6
+	addps	%xmm6, %xmm2
+	jmp	.L998
+	ALIGN_4
+
+.L100:
+	movq	M, I
+	sarq	$2, I
+	jle	.L105
+	ALIGN_4
+
+.L101:
+	movsd	(X), %xmm4
+	addq	INCX, X
+	movhps	(X), %xmm4
+	addq	INCX, X
+
+	addps	%xmm4, %xmm0
+
+	movsd	(X), %xmm5
+	addq	INCX, X
+	movhps	(X), %xmm5
+	addq	INCX, X
+
+	addps	%xmm5, %xmm1
+
+	decq	I
+	jg	.L101
+	ALIGN_4
+
+.L105:
+#ifdef movsd
+	xorps	%xmm4, %xmm4
+#endif
+	andq	$3, M
+	jle	.L998
+	ALIGN_4
+
+.L106:
+	movsd	(X), %xmm4
+	addps	%xmm4, %xmm0
+	addq	INCX, X
+	decq	M
+	jg	.L106
+	ALIGN_4
+
+.L998:
+	addps	%xmm1, %xmm0
+	addps	%xmm3, %xmm2
+	addps	%xmm2, %xmm0
+
+#ifndef HAVE_SSE3
+	movhlps	%xmm0, %xmm1
+	addps	%xmm1, %xmm0
+
+	movaps	%xmm0, %xmm1
+	shufps	$1, %xmm0, %xmm0
+	addss	%xmm1, %xmm0
+#else
+	haddps	%xmm0, %xmm0
+	haddps	%xmm0, %xmm0
+#endif
+	ALIGN_4
+
+.L999:
+	RESTOREREGISTERS
+
+	ret
+
+	EPILOGUE

diff --git a/kernel/x86_64/zsum_sse2.S b/kernel/x86_64/zsum_sse2.S
new file mode 100644
index 0000000000..6f667164dc
--- /dev/null
+++ b/kernel/x86_64/zsum_sse2.S
@@ -0,0 +1,283 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or       */
+/* without modification, are permitted provided that the following  */
+/* conditions are met:                                              */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above        */
+/*      copyright notice, this list of conditions and the following */
+/*      disclaimer.                                                 */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above     */
+/*      copyright notice, this list of conditions and the following */
+/*      disclaimer in the documentation and/or other materials      */
+/*      provided with the distribution.                             */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/*    The views and conclusions contained in the software and        */
+/*    documentation are those of the authors and should not be       */
+/*    interpreted as representing official policies, either          */
+/*    expressed or implied, of The University of Texas at Austin.    */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define M	ARG1	/* rdi */
+#define X	ARG2	/* rsi */
+#define INCX	ARG3	/* rdx */
+
+#define I	%rax
+
+#include "l1param.h"
+
+	PROLOGUE
+	PROFCODE
+
+	SAVEREGISTERS
+
+	xorps	%xmm0, %xmm0
+	testq	M, M
+	jle	.L999
+	testq	INCX, INCX
+	jle	.L999
+
+	xorps	%xmm1, %xmm1
+	xorps	%xmm2, %xmm2
+	xorps	%xmm3, %xmm3
+
+	salq	$ZBASE_SHIFT, INCX
+
+	cmpq	$2 * SIZE, INCX
+	jne	.L40
+
+	subq	$-16 * SIZE, X
+	addq	M, M
+
+	testq	$SIZE, X
+	je	.L05
+
+#ifdef movsd
+	xorps	%xmm0, %xmm0
+#endif
+	movsd	-16 * SIZE(X), %xmm0
+	addq	$SIZE, X
+
+	subq	$1, M
+	jle	.L999
+	ALIGN_3
+
+.L05:
+	movq	M, I
+	sarq	$4, I
+	jle	.L20
+
+	movaps	-16 * SIZE(X), %xmm4
+	movaps	-14 * SIZE(X), %xmm5
+	movaps	-12 * SIZE(X), %xmm6
+	movaps	-10 * SIZE(X), %xmm7
+
+	movaps	 -8 * SIZE(X), %xmm8
+	movaps	 -6 * SIZE(X), %xmm9
+	movaps	 -4 * SIZE(X), %xmm10
+	movaps	 -2 * SIZE(X), %xmm11
+
+	decq	I
+	jle	.L11
+	ALIGN_4
+
+.L10:
+#ifdef PREFETCH
+	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
+#endif
+
+	addpd	%xmm4, %xmm0
+	movaps	 0 * SIZE(X), %xmm4
+
+	addpd	%xmm5, %xmm1
+	movaps	 2 * SIZE(X), %xmm5
+
+	addpd	%xmm6, %xmm2
+	movaps	 4 * SIZE(X), %xmm6
+
+	addpd	%xmm7, %xmm3
+	movaps	 6 * SIZE(X), %xmm7
+
+#if defined(PREFETCH) && !defined(FETCH128)
+	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
+#endif
+
+	addpd	%xmm8, %xmm0
+	movaps	 8 * SIZE(X), %xmm8
+
+	addpd	%xmm9, %xmm1
+	movaps	10 * SIZE(X), %xmm9
+
+	addpd	%xmm10, %xmm2
+	movaps	12 * SIZE(X), %xmm10
+
+	addpd	%xmm11, %xmm3
+	movaps	14 * SIZE(X), %xmm11
+
+	subq	$-16 * SIZE, X
+	decq	I
+	jg	.L10
+	ALIGN_4
+
+.L11:
+	addpd	%xmm4, %xmm0
+	addpd	%xmm5, %xmm1
+	addpd	%xmm6, %xmm2
+	addpd	%xmm7, %xmm3
+
+	addpd	%xmm8, %xmm0
+	addpd	%xmm9, %xmm1
+	addpd	%xmm10, %xmm2
+	addpd	%xmm11, %xmm3
+
+	subq	$-16 * SIZE, X
+	ALIGN_3
+
+.L20:
+	andq	$15, M
+	jle	.L998
+
+	testq	$8, M
+	je	.L21
+
+	movaps	-16 * SIZE(X), %xmm4
+	movaps	-14 * SIZE(X), %xmm5
+	movaps	-12 * SIZE(X), %xmm6
+	movaps	-10 * SIZE(X), %xmm7
+
+	addpd	%xmm4, %xmm0
+	addpd	%xmm5, %xmm1
+	addpd	%xmm6, %xmm2
+	addpd	%xmm7, %xmm3
+	addq	$8 * SIZE, X
+	ALIGN_3
+
+.L21:
+	testq	$4, M
+	je	.L22
+
+	movaps	-16 * SIZE(X), %xmm4
+	movaps	-14 * SIZE(X), %xmm5
+
+	addpd	%xmm4, %xmm0
+	addpd	%xmm5, %xmm1
+
+	addq	$4 * SIZE, X
+	ALIGN_3
+
+.L22:
+	testq	$2, M
+	je	.L23
+
+	movaps	-16 * SIZE(X), %xmm6
+	addpd	%xmm6, %xmm3
+	addq	$2 * SIZE, X
+
+.L23:
+	testq	$1, M
+	je	.L998
+
+#ifdef movsd
+	xorps	%xmm4, %xmm4
+#endif
+	movsd	-16 * SIZE(X), %xmm4
+	addsd	%xmm4, %xmm0
+	jmp	.L998
+	ALIGN_3
+
+.L40:
+	movq	M, I
+	sarq	$2, I
+	jle	.L60
+	ALIGN_4
+
+.L50:
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
+	prefetcht0	PREFETCHSIZE * SIZE(X)
+#endif
+
+#ifdef PENTIUM4
+	prefetchnta	PREFETCHSIZE * SIZE(X)
+#endif
+
+	movsd	0 * SIZE(X), %xmm4
+	movhpd	1 * SIZE(X), %xmm4
+	addq	INCX, X
+	addpd	%xmm4, %xmm0
+
+	movsd	0 * SIZE(X), %xmm5
+	movhpd	1 * SIZE(X), %xmm5
+	addq	INCX, X
+	addpd	%xmm5, %xmm1
+
+	movsd	0 * SIZE(X), %xmm6
+	movhpd	1 * SIZE(X), %xmm6
+	addq	INCX, X
+	addpd	%xmm6, %xmm2
+
+	movsd	0 * SIZE(X), %xmm7
+	movhpd	1 * SIZE(X), %xmm7
+	addq	INCX, X
+	addpd	%xmm7, %xmm3
+
+	decq	I
+	jg	.L50
+	ALIGN_4
+
+.L60:
+	andq	$3, M
+	jle	.L998
+	ALIGN_4
+
+.L61:
+	movsd	0 * SIZE(X), %xmm4
+	movhpd	1 * SIZE(X), %xmm4
+	addpd	%xmm4, %xmm0
+	addq	INCX, X
+	decq	M
+	jg	.L61
+	ALIGN_4
+
+.L998:
+	addpd	%xmm1, %xmm0
+	addpd	%xmm3, %xmm2
+	addpd	%xmm2, %xmm0
+
+#ifndef HAVE_SSE3
+	movhlps	%xmm0, %xmm1
+	addsd	%xmm1, %xmm0
+#else
+	haddpd	%xmm0, %xmm0
+#endif
+	ALIGN_4
+
+.L999:
+	RESTOREREGISTERS
+
+	ret
+
+	EPILOGUE
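
A quick smoke test for the new kernels (not part of the patch), assuming the
cblas_scsum/cblas_dzsum prototypes that OpenBLAS ships for its ?sum extension;
the negative entries make the difference from ?asum visible:

	#include <stdio.h>
	#include <cblas.h>

	int main(void)
	{
		float  cx[8] = {1, 2, 3, 4, -5, -6, -7, -8};	/* 4 single-precision complex */
		double zx[8] = {1, 2, 3, 4, -5, -6, -7, -8};	/* 4 double-precision complex */

		/* expected: -16 (plain sum); casum/zasum would give 36 */
		printf("csum = %f\n", cblas_scsum(4, cx, 1));
		printf("zsum = %f\n", cblas_dzsum(4, zx, 1));
		return 0;
	}
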