From 99ab109c689ddaf16e564ed95b3300d645e2f808 Mon Sep 17 00:00:00 2001
From: KlausT
Date: Thu, 8 Jan 2015 07:29:34 +0100
Subject: [PATCH] x11_simd changes

---
 x11/cuda_x11_simd512.cu | 105 +++++++++++++++++++---------------------
 x11/simd_functions.cu   |  58 ++++++++++------------
 2 files changed, 74 insertions(+), 89 deletions(-)

diff --git a/x11/cuda_x11_simd512.cu b/x11/cuda_x11_simd512.cu
index f3937bf138..96776b9e53 100644
--- a/x11/cuda_x11_simd512.cu
+++ b/x11/cuda_x11_simd512.cu
@@ -19,16 +19,15 @@
 uint4 *d_temp4[8];
 // texture bound to d_temp4[thr_id], for read access in Compaction kernel
 texture<uint4, 1, cudaReadModeElementType> texRef1D_128;
-__constant__ uint8_t c_perm[8][8] = {
-	{ 2, 3, 6, 7, 0, 1, 4, 5 },
-	{ 6, 7, 2, 3, 4, 5, 0, 1 },
-	{ 7, 6, 5, 4, 3, 2, 1, 0 },
-	{ 1, 0, 3, 2, 5, 4, 7, 6 },
-	{ 0, 1, 4, 5, 6, 7, 2, 3 },
-	{ 6, 7, 2, 3, 0, 1, 4, 5 },
-	{ 6, 7, 0, 1, 4, 5, 2, 3 },
-	{ 4, 5, 2, 3, 6, 7, 0, 1 }
-};
+__constant__ uint8_t c_perm0[8] = { 2, 3, 6, 7, 0, 1, 4, 5 };
+__constant__ uint8_t c_perm1[8] = { 6, 7, 2, 3, 4, 5, 0, 1 };
+__constant__ uint8_t c_perm2[8] = { 7, 6, 5, 4, 3, 2, 1, 0 };
+__constant__ uint8_t c_perm3[8] = { 1, 0, 3, 2, 5, 4, 7, 6 };
+__constant__ uint8_t c_perm4[8] = { 0, 1, 4, 5, 6, 7, 2, 3 };
+__constant__ uint8_t c_perm5[8] = { 6, 7, 2, 3, 0, 1, 4, 5 };
+__constant__ uint8_t c_perm6[8] = { 6, 7, 0, 1, 4, 5, 2, 3 };
+__constant__ uint8_t c_perm7[8] = { 4, 5, 2, 3, 6, 7, 0, 1 };
+
 __constant__ uint32_t c_IV_512[32] = {
 	0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc,
 	0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
@@ -275,7 +274,7 @@ __device__ __forceinline__ void FFT_16(int *y) {
 }

 __device__ __forceinline__
-void FFT_128_full(int y[128])
+void FFT_128_full(int *y)
 {
 	int i;

@@ -287,12 +286,12 @@ void FFT_128_full(int y[128])
 	/*if (i & 7)*/ y[i] = REDUCE(y[i]*c_FFT128_8_16_Twiddle[i*8+(threadIdx.x&7)]);

#pragma unroll 8
-	for (i=0; i<8; i++)
-		FFT_16(y+2*i); // eight sequential FFT16's, each one executed in parallel by 8 threads
+	for (i=0; i<16; i+=2)
+		FFT_16(y+i); // eight sequential FFT16's, each one executed in parallel by 8 threads
 }

 __device__ __forceinline__
-void FFT_256_halfzero(int y[256])
+void FFT_256_halfzero(int *y)
 {
 	/*
 	 * FFT_256 using w=41 as 256th root of unity.
@@ -306,8 +305,8 @@ void FFT_256_halfzero(int y[256])
 	for (int i=0; i<8; i++)
 		y[16+i] = REDUCE(y[i] * c_FFT256_2_128_Twiddle[8*i+(threadIdx.x&7)]);
#pragma unroll 8
-	for (int i=8; i<16; i++)
-		y[16+i] = 0;
+	for (int i=24; i<32; i++)
+		y[i] = 0;

 	/* handle X^255 with an additional butterfly */
 	if ((threadIdx.x&7) == 7)
@@ -324,7 +323,7 @@
 /***************************************************/

 __device__ __forceinline__
-void Expansion(const uint32_t *data, uint4 *g_temp4)
+void Expansion(const uint32_t *const __restrict__ data, uint4 *const __restrict__ g_temp4)
 {
 	/* Message Expansion using Number Theoretical Transform similar to FFT */
 	int expanded[32];
@@ -341,9 +340,6 @@ void Expansion(const uint32_t *data, uint4 *g_temp4)

 	// store w matrices in global memory

-#define mul_185(x) ( (x)*185 )
-#define mul_233(x) ( (x)*233 )
-
 	uint4 vec0;
 	int P, Q, P1, Q1, P2, Q2;
 	bool even = (threadIdx.x & 1) == 0;
@@ -357,16 +353,16 @@ void Expansion(const uint32_t *data, uint4 *g_temp4)

 	P1 = expanded[ 0]; P2 = __shfl(expanded[ 2], (threadIdx.x-1)&7, 8); P = even ? P1 : P2;
 	Q1 = expanded[16]; Q2 = __shfl(expanded[18], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2;
-	vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8);
+	vec0.x = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm0[threadIdx.x&7], 8);
 	P1 = expanded[ 8]; P2 = __shfl(expanded[10], (threadIdx.x-1)&7, 8); P = even ? P1 : P2;
 	Q1 = expanded[24]; Q2 = __shfl(expanded[26], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2;
-	vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8);
+	vec0.y = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm0[threadIdx.x&7], 8);
 	P1 = expanded[ 4]; P2 = __shfl(expanded[ 6], (threadIdx.x-1)&7, 8); P = even ? P1 : P2;
 	Q1 = expanded[20]; Q2 = __shfl(expanded[22], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2;
-	vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8);
+	vec0.z = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm0[threadIdx.x&7], 8);
 	P1 = expanded[12]; P2 = __shfl(expanded[14], (threadIdx.x-1)&7, 8); P = even ? P1 : P2;
 	Q1 = expanded[28]; Q2 = __shfl(expanded[30], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2;
-	vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8);
+	vec0.w = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm0[threadIdx.x&7], 8);
 	g_temp4[threadIdx.x&7] = vec0;

 	// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
@@ -378,16 +374,16 @@ void Expansion(const uint32_t *data, uint4 *g_temp4)

 	P1 = expanded[ 1]; P2 = __shfl(expanded[ 3], (threadIdx.x-1)&7, 8); P = even ? P1 : P2;
 	Q1 = expanded[17]; Q2 = __shfl(expanded[19], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2;
-	vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8);
+	vec0.x = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm1[threadIdx.x&7], 8);
 	P1 = expanded[ 9]; P2 = __shfl(expanded[11], (threadIdx.x-1)&7, 8); P = even ? P1 : P2;
 	Q1 = expanded[25]; Q2 = __shfl(expanded[27], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2;
-	vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8);
+	vec0.y = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm1[threadIdx.x&7], 8);
 	P1 = expanded[ 5]; P2 = __shfl(expanded[ 7], (threadIdx.x-1)&7, 8); P = even ? P1 : P2;
 	Q1 = expanded[21]; Q2 = __shfl(expanded[23], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2;
-	vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8);
+	vec0.z = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm1[threadIdx.x&7], 8);
 	P1 = expanded[13]; P2 = __shfl(expanded[15], (threadIdx.x-1)&7, 8); P = even ? P1 : P2;
 	Q1 = expanded[29]; Q2 = __shfl(expanded[31], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2;
-	vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8);
+	vec0.w = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm1[threadIdx.x&7], 8);
 	g_temp4[8+(threadIdx.x&7)] = vec0;

 	// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
@@ -401,16 +397,16 @@ void Expansion(const uint32_t *data, uint4 *g_temp4)

 	P1 = hi?expanded[ 1]:expanded[ 0]; P2 = __shfl(hi?expanded[ 3]:expanded[ 2], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2;
 	Q1 = hi?expanded[17]:expanded[16]; Q2 = __shfl(hi?expanded[19]:expanded[18], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2;
-	vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8);
+	vec0.x = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm2[threadIdx.x&7], 8);
 	P1 = hi?expanded[ 9]:expanded[ 8]; P2 = __shfl(hi?expanded[11]:expanded[10], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2;
 	Q1 = hi?expanded[25]:expanded[24]; Q2 = __shfl(hi?expanded[27]:expanded[26], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2;
-	vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8);
+	vec0.y = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm2[threadIdx.x&7], 8);
 	P1 = hi?expanded[ 5]:expanded[ 4]; P2 = __shfl(hi?expanded[ 7]:expanded[ 6], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2;
 	Q1 = hi?expanded[21]:expanded[20]; Q2 = __shfl(hi?expanded[23]:expanded[22], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2;
-	vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8);
+	vec0.z = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm2[threadIdx.x&7], 8);
 	P1 = hi?expanded[13]:expanded[12]; P2 = __shfl(hi?expanded[15]:expanded[14], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2;
 	Q1 = hi?expanded[29]:expanded[28]; Q2 = __shfl(hi?expanded[31]:expanded[30], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2;
-	vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8);
+	vec0.w = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm2[threadIdx.x&7], 8);
 	g_temp4[16+(threadIdx.x&7)] = vec0;

 	// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
@@ -424,16 +420,16 @@ void Expansion(const uint32_t *data, uint4 *g_temp4)

 	P1 = lo?expanded[ 1]:expanded[ 0]; P2 = __shfl(lo?expanded[ 3]:expanded[ 2], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2;
 	Q1 = lo?expanded[17]:expanded[16]; Q2 = __shfl(lo?expanded[19]:expanded[18], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2;
-	vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8);
+	vec0.x = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm3[threadIdx.x&7], 8);
 	P1 = lo?expanded[ 9]:expanded[ 8]; P2 = __shfl(lo?expanded[11]:expanded[10], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2;
 	Q1 = lo?expanded[25]:expanded[24]; Q2 = __shfl(lo?expanded[27]:expanded[26], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2;
-	vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8);
+	vec0.y = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm3[threadIdx.x&7], 8);
 	P1 = lo?expanded[ 5]:expanded[ 4]; P2 = __shfl(lo?expanded[ 7]:expanded[ 6], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2;
 	Q1 = lo?expanded[21]:expanded[20]; Q2 = __shfl(lo?expanded[23]:expanded[22], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2;
-	vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8);
+	vec0.z = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm3[threadIdx.x&7], 8);
 	P1 = lo?expanded[13]:expanded[12]; P2 = __shfl(lo?expanded[15]:expanded[14], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2;
 	Q1 = lo?expanded[29]:expanded[28]; Q2 = __shfl(lo?expanded[31]:expanded[30], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2;
-	vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8);
+	vec0.w = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm3[threadIdx.x&7], 8);
 	g_temp4[24+(threadIdx.x&7)] = vec0;

 	// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
@@ -451,19 +447,19 @@ void Expansion(const uint32_t *data, uint4 *g_temp4)

 	P1 = sel?expanded[0]:expanded[1]; Q1 = __shfl(P1, threadIdx.x^1, 8);
 	Q2 = sel?expanded[2]:expanded[3]; P2 = __shfl(Q2, threadIdx.x^1, 8);
 	P = even? P1 : P2; Q = even? Q1 : Q2;
-	vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8);
+	vec0.x = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm4[threadIdx.x&7], 8);
 	P1 = sel?expanded[8]:expanded[9]; Q1 = __shfl(P1, threadIdx.x^1, 8);
 	Q2 = sel?expanded[10]:expanded[11]; P2 = __shfl(Q2, threadIdx.x^1, 8);
 	P = even? P1 : P2; Q = even? Q1 : Q2;
-	vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8);
+	vec0.y = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm4[threadIdx.x&7], 8);
 	P1 = sel?expanded[4]:expanded[5]; Q1 = __shfl(P1, threadIdx.x^1, 8);
 	Q2 = sel?expanded[6]:expanded[7]; P2 = __shfl(Q2, threadIdx.x^1, 8);
 	P = even? P1 : P2; Q = even? Q1 : Q2;
-	vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8);
+	vec0.z = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm4[threadIdx.x&7], 8);
 	P1 = sel?expanded[12]:expanded[13]; Q1 = __shfl(P1, threadIdx.x^1, 8);
 	Q2 = sel?expanded[14]:expanded[15]; P2 = __shfl(Q2, threadIdx.x^1, 8);
 	P = even? P1 : P2; Q = even? Q1 : Q2;
-	vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8);
+	vec0.w = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm4[threadIdx.x&7], 8);
 	g_temp4[32+(threadIdx.x&7)] = vec0;

@@ -475,19 +471,19 @@ void Expansion(const uint32_t *data, uint4 *g_temp4)

 	P1 = sel?expanded[1]:expanded[0]; Q1 = __shfl(P1, threadIdx.x^1, 8);
 	Q2 = sel?expanded[3]:expanded[2]; P2 = __shfl(Q2, threadIdx.x^1, 8);
 	P = even? P1 : P2; Q = even? Q1 : Q2;
-	vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8);
+	vec0.x = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm5[threadIdx.x&7], 8);
 	P1 = sel?expanded[9]:expanded[8]; Q1 = __shfl(P1, threadIdx.x^1, 8);
 	Q2 = sel?expanded[11]:expanded[10]; P2 = __shfl(Q2, threadIdx.x^1, 8);
 	P = even? P1 : P2; Q = even? Q1 : Q2;
-	vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8);
+	vec0.y = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm5[threadIdx.x&7], 8);
 	P1 = sel?expanded[5]:expanded[4]; Q1 = __shfl(P1, threadIdx.x^1, 8);
 	Q2 = sel?expanded[7]:expanded[6]; P2 = __shfl(Q2, threadIdx.x^1, 8);
 	P = even? P1 : P2; Q = even? Q1 : Q2;
-	vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8);
+	vec0.z = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm5[threadIdx.x&7], 8);
 	P1 = sel?expanded[13]:expanded[12]; Q1 = __shfl(P1, threadIdx.x^1, 8);
 	Q2 = sel?expanded[15]:expanded[14]; P2 = __shfl(Q2, threadIdx.x^1, 8);
 	P = even? P1 : P2; Q = even? Q1 : Q2;
-	vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8);
+	vec0.w = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm5[threadIdx.x&7], 8);
 	g_temp4[40+(threadIdx.x&7)] = vec0;

@@ -502,19 +498,19 @@ void Expansion(const uint32_t *data, uint4 *g_temp4)

 	t = __shfl(expanded[17],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[16]; Q1 = __shfl(P1, threadIdx.x^1, 8);
 	t = __shfl(expanded[19],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[18]; P2 = __shfl(Q2, threadIdx.x^1, 8);
 	P = even? P1 : P2; Q = even? Q1 : Q2;
-	vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8);
+	vec0.x = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm6[threadIdx.x&7], 8);
 	t = __shfl(expanded[25],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[24]; Q1 = __shfl(P1, threadIdx.x^1, 8);
 	t = __shfl(expanded[27],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[26]; P2 = __shfl(Q2, threadIdx.x^1, 8);
 	P = even? P1 : P2; Q = even? Q1 : Q2;
-	vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8);
+	vec0.y = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm6[threadIdx.x&7], 8);
 	t = __shfl(expanded[21],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[20]; Q1 = __shfl(P1, threadIdx.x^1, 8);
 	t = __shfl(expanded[23],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[22]; P2 = __shfl(Q2, threadIdx.x^1, 8);
 	P = even? P1 : P2; Q = even? Q1 : Q2;
-	vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8);
+	vec0.z = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm6[threadIdx.x&7], 8);
 	t = __shfl(expanded[29],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[28]; Q1 = __shfl(P1, threadIdx.x^1, 8);
 	t = __shfl(expanded[31],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[30]; P2 = __shfl(Q2, threadIdx.x^1, 8);
 	P = even? P1 : P2; Q = even? Q1 : Q2;
-	vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8);
+	vec0.w = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm6[threadIdx.x&7], 8);
 	g_temp4[48+(threadIdx.x&7)] = vec0;

@@ -528,29 +524,26 @@ void Expansion(const uint32_t *data, uint4 *g_temp4)

 	t = __shfl(expanded[16],(threadIdx.x+4)&7,8); P1 = sel?expanded[17]:t; Q1 = __shfl(P1, threadIdx.x^1, 8);
 	t = __shfl(expanded[18],(threadIdx.x+4)&7,8); Q2 = sel?expanded[19]:t; P2 = __shfl(Q2, threadIdx.x^1, 8);
 	P = even? P1 : P2; Q = even? Q1 : Q2;
-	vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8);
+	vec0.x = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm7[threadIdx.x&7], 8);
 	t = __shfl(expanded[24],(threadIdx.x+4)&7,8); P1 = sel?expanded[25]:t; Q1 = __shfl(P1, threadIdx.x^1, 8);
 	t = __shfl(expanded[26],(threadIdx.x+4)&7,8); Q2 = sel?expanded[27]:t; P2 = __shfl(Q2, threadIdx.x^1, 8);
 	P = even? P1 : P2; Q = even? Q1 : Q2;
-	vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8);
+	vec0.y = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm7[threadIdx.x&7], 8);
 	t = __shfl(expanded[20],(threadIdx.x+4)&7,8); P1 = sel?expanded[21]:t; Q1 = __shfl(P1, threadIdx.x^1, 8);
 	t = __shfl(expanded[22],(threadIdx.x+4)&7,8); Q2 = sel?expanded[23]:t; P2 = __shfl(Q2, threadIdx.x^1, 8);
 	P = even? P1 : P2; Q = even? Q1 : Q2;
-	vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8);
+	vec0.z = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm7[threadIdx.x&7], 8);
 	t = __shfl(expanded[28],(threadIdx.x+4)&7,8); P1 = sel?expanded[29]:t; Q1 = __shfl(P1, threadIdx.x^1, 8);
 	t = __shfl(expanded[30],(threadIdx.x+4)&7,8); Q2 = sel?expanded[31]:t; P2 = __shfl(Q2, threadIdx.x^1, 8);
 	P = even? P1 : P2; Q = even? Q1 : Q2;
-	vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8);
+	vec0.w = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm7[threadIdx.x&7], 8);
 	g_temp4[56+(threadIdx.x&7)] = vec0;
-
-#undef mul_185
-#undef mul_233
 }

 /***************************************************/
 __global__ void __launch_bounds__(TPB, 4)
-x11_simd512_gpu_expand_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_temp4)
+x11_simd512_gpu_expand_64(uint32_t threads, uint32_t startNounce, const uint64_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector, uint4 *const __restrict__ g_temp4)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x)/8;
 	if (thread < threads)
diff --git a/x11/simd_functions.cu b/x11/simd_functions.cu
index 9ee9f55f05..fdf00e2615 100644
--- a/x11/simd_functions.cu
+++ b/x11/simd_functions.cu
@@ -1053,8 +1053,8 @@ static __constant__ uint32_t d_cw0[8][8] = {
 	0x213E50F0, 0x39173EDF, 0xA9485B0E, 0xEEA82EF9, 0x14F55771, 0xFAF15546, 0x3D6DD9B3, 0xAB73B92E,
 	0x582A48FD, 0xEEA81892, 0x4F7EAA01, 0xAF10A88F, 0x11581720, 0x34C124DB, 0xD1C0AB73, 0x1E5AF0D3
 };
-__device__ __forceinline__ void Round8_0_final(uint32_t *A,
-	int r, int s, int t, int u) {
+__device__ __forceinline__ void Round8_0_final(uint32_t *A, int r, int s, int t, int u)
+{


 	STEP8_IF_0(d_cw0[0], r, s, A, &A[8], &A[16], &A[24]);
@@ -1076,8 +1076,8 @@ static __constant__ uint32_t d_cw1[8][8] = {
 	0xF4702B5C, 0xC293FC63, 0xDA6CB2AD, 0x45601FCC, 0xA439E1A6, 0x4E0C0D02, 0xED3621F7, 0xAB73BE3D,
 	0x0E74D4A4, 0xF754CF95, 0xD84136EC, 0x3124AB73, 0x39D03B42, 0x0E74BCCB, 0x0F2DBD84, 0x41C35C80
 };
-__device__ __forceinline__ void Round8_1_final(uint32_t *A,
-	int r, int s, int t, int u) {
+__device__ __forceinline__ void Round8_1_final(uint32_t *A, int r, int s, int t, int u)
+{


 	STEP8_IF_8(d_cw1[0], r, s, A, &A[8], &A[16], &A[24]);
@@ -1099,8 +1099,8 @@ static __constant__ uint32_t d_cw2[8][8] = {
 	0xFC5C03A4, 0x48D0B730, 0x2AC7D539, 0xD70B28F5, 0x53BCAC44, 0x3FB6C04A, 0x14EFEB11, 0xDB982468,
 	0x9A1065F0, 0xB0D14F2F, 0x8D5272AE, 0xC4D73B29, 0x91DF6E21, 0x949A6B66, 0x303DCFC3, 0x5932A6CE
 };
-__device__ __forceinline__ void Round8_2_final(uint32_t *A,
-	int r, int s, int t, int u) {
+__device__ __forceinline__ void Round8_2_final(uint32_t *A, int r, int s, int t, int u)
+{


 	STEP8_IF_16(d_cw2[0], r, s, A, &A[8], &A[16], &A[24]);
@@ -1122,10 +1122,8 @@ static __constant__ uint32_t d_cw3[8][8] = {
 	0x975568AB, 0x6994966C, 0xF1700E90, 0xD3672C99, 0xCC1F33E1, 0xFC5C03A4, 0x452CBAD4, 0x4E46B1BA,
 	0xF1700E90, 0xB2A34D5D, 0xD0AC2F54, 0x5760A8A0, 0x8C697397, 0x624C9DB4, 0xE85617AA, 0x95836A7D
 };
-__device__ __forceinline__ void Round8_3_final(uint32_t *A,
-	int r, int s, int t, int u) {
-
-
+__device__ __forceinline__ void Round8_3_final(uint32_t *A, int r, int s, int t, int u)
+{
 	STEP8_IF_24(d_cw3[0], r, s, A, &A[8], &A[16], &A[24]);
 	STEP8_IF_25(d_cw3[1], s, t, &A[24], A, &A[8], &A[16]);
 	STEP8_IF_26(d_cw3[2], t, u, &A[16], &A[24], A, &A[8]);
@@ -1143,8 +1141,7 @@ __device__ __forceinline__ void Round8_3_final(uint32_t *A,
 #define expanded_vector(x) __ldg(&g_fft4[x])
 #endif

-__device__ __forceinline__ void Round8_0(uint32_t *const __restrict__ A, const int thr_offset,
-	int r, int s, int t, int u, const uint4 *const __restrict__ g_fft4)
+__device__ __forceinline__ void Round8_0(uint32_t *const __restrict__ A, const int thr_offset, int r, int s, int t, int u, const uint4 *const __restrict__ g_fft4)
 {
 	uint32_t w[8];
 	uint4 hv1, hv2;
@@ -1177,8 +1174,7 @@ __device__ __forceinline__ void Round8_0(uint32_t *const __restrict__ A, const i
 }


-__device__ __forceinline__ void Round8_1(uint32_t *const __restrict__ A, const int thr_offset,
-	int r, int s, int t, int u, const uint4 *const __restrict__ g_fft4)
+__device__ __forceinline__ void Round8_1(uint32_t *const __restrict__ A, const int thr_offset, int r, int s, int t, int u, const uint4 *const __restrict__ g_fft4)
 {
 	uint32_t w[8];
 	uint4 hv1, hv2;
@@ -1211,8 +1207,7 @@ __device__ __forceinline__ void Round8_1(uint32_t *const __restrict__ A, const i
 }


-__device__ __forceinline__ void Round8_2(uint32_t *const __restrict__ A, const int thr_offset,
-	int r, int s, int t, int u, const uint4 *const __restrict__ g_fft4)
+__device__ __forceinline__ void Round8_2(uint32_t *const __restrict__ A, const int thr_offset, int r, int s, int t, int u, const uint4 *const __restrict__ g_fft4)
 {
 	uint32_t w[8];
 	uint4 hv1, hv2;
@@ -1245,8 +1240,7 @@ __device__ __forceinline__ void Round8_2(uint32_t *const __restrict__ A, const i
 }


-__device__ __forceinline__ void Round8_3(uint32_t *const __restrict__ A, const int thr_offset,
-	int r, int s, int t, int u, const uint4 *const __restrict__ g_fft4)
+__device__ __forceinline__ void Round8_3(uint32_t *const __restrict__ A, const int thr_offset, int r, int s, int t, int u, const uint4 *const __restrict__ g_fft4)
 {
 	uint32_t w[8];
 	uint4 hv1, hv2;
@@ -1338,21 +1332,20 @@ __device__ __forceinline__ void Compression2(const int texture_id, const uint4 *
 	for (i=0; i < 32; i++) state[threadIdx.x+blockDim.x*i] = A[i];
 }

-__device__ __forceinline__ void SIMD_Compress_Final(uint32_t *A, const uint32_t *M) {
+__device__ __forceinline__ void SIMD_Compress_Final(uint32_t *const __restrict__ A)
+{
 	uint32_t IV[4][8];
 	int i;
#pragma unroll 8
-	for(i=0; i<8; i++) {
+	for(i=0; i<8; i++)
+	{
 		IV[0][i] = A[i];
 		IV[1][i] = (&A[8])[i];
 		IV[2][i] = (&A[16])[i];
 		IV[3][i] = (&A[24])[i];
 	}
-#pragma unroll 8
-	for(i=0; i<8; i++) {
-		A[i] ^= M[i];
-		(&A[8])[i] ^= M[8+i];
-	}
+	A[0] ^= 512;
+
 	Round8_0_final(A, 3, 23, 17, 27);
 	Round8_1_final(A, 28, 19, 22, 7);
 	Round8_2_final(A, 29, 9, 15, 5);
@@ -1363,17 +1356,16 @@
 	STEP8_IF_35(IV[3], 25, 4, &A[8], &A[16], &A[24], A);
 }

-__device__ __forceinline__ void Final(uint32_t *hashval, const int texture_id, uint4 *g_fft4, uint32_t *g_state) {
+__device__ __forceinline__ void Final(uint32_t *const __restrict__ hashval, const int texture_id, const uint4 *const __restrict__ g_fft4, const uint32_t *const __restrict__ g_state)
+{
 	uint32_t A[32];
 	int i;
-	uint32_t *state = &g_state[blockIdx.x * (blockDim.x*32)];
+	const uint32_t *state = &g_state[blockIdx.x * (blockDim.x*32)];
#pragma unroll 32
-	for (i=0; i < 32; i++) A[i] = state[threadIdx.x+blockDim.x*i];
-	uint32_t buffer[16];
-	buffer[0] = 512;
-#pragma unroll 15
-	for (i=1; i < 16; i++) buffer[i] = 0;
-	SIMD_Compress_Final(A, buffer);
+	for (i=0; i < 32; i++)
+		A[i] = state[threadIdx.x+blockDim.x*i];
+
+	SIMD_Compress_Final(A);
#pragma unroll 16
 	for (i=0; i < 16; i++) hashval[i] = A[i];
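
The two loop rewrites in cuda_x11_simd512.cu are pure re-indexings: FFT_128_full still issues eight FFT_16 calls at offsets 0, 2, ..., 14, and FFT_256_halfzero still zeroes y[24]..y[31]. A minimal standalone check of that equivalence (a hypothetical harness, not part of the patch or of the ccminer tree):

// check_loops.cpp -- hypothetical harness, not part of the patch.
// Confirms the rewritten loops touch exactly the same offsets as before.
#include <cassert>

int main()
{
	// FFT_128_full: old form FFT_16(y + 2*i) for i in [0,8)
	//               new form FFT_16(y + i)   for i in [0,16) step 2
	int old_offsets[8], new_offsets[8], n = 0;
	for (int i = 0; i < 8; i++)
		old_offsets[i] = 2 * i;
	for (int i = 0; i < 16; i += 2)
		new_offsets[n++] = i;
	assert(n == 8);
	for (int i = 0; i < 8; i++)
		assert(old_offsets[i] == new_offsets[i]);

	// FFT_256_halfzero: old form y[16+i] = 0 for i in [8,16)
	//                   new form y[i]    = 0 for i in [24,32)
	bool zeroed_old[32] = {}, zeroed_new[32] = {};
	for (int i = 8; i < 16; i++)
		zeroed_old[16 + i] = true;
	for (int i = 24; i < 32; i++)
		zeroed_new[i] = true;
	for (int i = 0; i < 32; i++)
		assert(zeroed_old[i] == zeroed_new[i]);
	return 0;
}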
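The SIMD_Compress_Final change in simd_functions.cu drops the 16-word message buffer that Final() used to build: that buffer held only the input bit length (512) in word 0 and zeros elsewhere, so XORing it into the state collapses to the single A[0] ^= 512 the patch keeps. A minimal standalone check (again a hypothetical harness, not part of the patch):

// check_final_block.cpp -- hypothetical harness, not part of the patch.
// Shows the full 16-word XOR of the length block equals the single XOR.
#include <cstdint>
#include <cstdio>

int main()
{
	uint32_t a_old[16], a_new[16];
	for (int i = 0; i < 16; i++)
		a_old[i] = a_new[i] = 0x9fecc2aeu + 0x11581720u * i; // arbitrary state words

	// old path: explicit padding/length block, full 16-word XOR
	uint32_t buffer[16] = { 512 }; // words 1..15 zero-initialized
	for (int i = 0; i < 16; i++)
		a_old[i] ^= buffer[i];

	// new path: only word 0 can change
	a_new[0] ^= 512;

	for (int i = 0; i < 16; i++)
		if (a_old[i] != a_new[i]) {
			printf("mismatch at word %d\n", i);
			return 1;
		}
	printf("old and new final-block handling agree\n");
	return 0;
}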