Skip to content

Commit

Permalink
x11_simd changes
Browse files Browse the repository at this point in the history
  • Loading branch information
KlausT committed Jan 8, 2015
1 parent 9690f45 commit 99ab109
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 89 deletions.
105 changes: 49 additions & 56 deletions x11/cuda_x11_simd512.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,15 @@ uint4 *d_temp4[8];
// texture bound to d_temp4[thr_id], for read access in Compaction kernel
texture<uint4, 1, cudaReadModeElementType> texRef1D_128;

__constant__ uint8_t c_perm[8][8] = {
{ 2, 3, 6, 7, 0, 1, 4, 5 },
{ 6, 7, 2, 3, 4, 5, 0, 1 },
{ 7, 6, 5, 4, 3, 2, 1, 0 },
{ 1, 0, 3, 2, 5, 4, 7, 6 },
{ 0, 1, 4, 5, 6, 7, 2, 3 },
{ 6, 7, 2, 3, 0, 1, 4, 5 },
{ 6, 7, 0, 1, 4, 5, 2, 3 },
{ 4, 5, 2, 3, 6, 7, 0, 1 }
};
__constant__ uint8_t c_perm0[8] = { 2, 3, 6, 7, 0, 1, 4, 5 };
__constant__ uint8_t c_perm1[8] = { 6, 7, 2, 3, 4, 5, 0, 1 };
__constant__ uint8_t c_perm2[8] = { 7, 6, 5, 4, 3, 2, 1, 0 };
__constant__ uint8_t c_perm3[8] = { 1, 0, 3, 2, 5, 4, 7, 6 };
__constant__ uint8_t c_perm4[8] = { 0, 1, 4, 5, 6, 7, 2, 3 };
__constant__ uint8_t c_perm5[8] = { 6, 7, 2, 3, 0, 1, 4, 5 };
__constant__ uint8_t c_perm6[8] = { 6, 7, 0, 1, 4, 5, 2, 3 };
__constant__ uint8_t c_perm7[8] = { 4, 5, 2, 3, 6, 7, 0, 1 };


__constant__ uint32_t c_IV_512[32] = {
0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
Expand Down Expand Up @@ -275,7 +274,7 @@ __device__ __forceinline__ void FFT_16(int *y) {
}

__device__ __forceinline__
void FFT_128_full(int y[128])
void FFT_128_full(int *y)
{
int i;

Expand All @@ -287,12 +286,12 @@ void FFT_128_full(int y[128])
/*if (i & 7)*/ y[i] = REDUCE(y[i]*c_FFT128_8_16_Twiddle[i*8+(threadIdx.x&7)]);

#pragma unroll 8
for (i=0; i<8; i++)
FFT_16(y+2*i); // eight sequential FFT16's, each one executed in parallel by 8 threads
for (i=0; i<16; i+=2)
FFT_16(y+i); // eight sequential FFT16's, each one executed in parallel by 8 threads
}

__device__ __forceinline__
void FFT_256_halfzero(int y[256])
void FFT_256_halfzero(int *y)
{
/*
* FFT_256 using w=41 as 256th root of unity.
Expand All @@ -306,8 +305,8 @@ void FFT_256_halfzero(int y[256])
for (int i=0; i<8; i++)
y[16+i] = REDUCE(y[i] * c_FFT256_2_128_Twiddle[8*i+(threadIdx.x&7)]);
#pragma unroll 8
for (int i=8; i<16; i++)
y[16+i] = 0;
for (int i=24; i<32; i++)
y[i] = 0;

/* handle X^255 with an additional butterfly */
if ((threadIdx.x&7) == 7)
Expand All @@ -324,7 +323,7 @@ void FFT_256_halfzero(int y[256])
/***************************************************/

__device__ __forceinline__
void Expansion(const uint32_t *data, uint4 *g_temp4)
void Expansion(const uint32_t *const __restrict__ data, uint4 *const __restrict__ g_temp4)
{
/* Message Expansion using Number Theoretical Transform similar to FFT */
int expanded[32];
Expand All @@ -341,9 +340,6 @@ void Expansion(const uint32_t *data, uint4 *g_temp4)

// store w matrices in global memory

#define mul_185(x) ( (x)*185 )
#define mul_233(x) ( (x)*233 )

uint4 vec0;
int P, Q, P1, Q1, P2, Q2;
bool even = (threadIdx.x & 1) == 0;
Expand All @@ -357,16 +353,16 @@ void Expansion(const uint32_t *data, uint4 *g_temp4)

P1 = expanded[ 0]; P2 = __shfl(expanded[ 2], (threadIdx.x-1)&7, 8); P = even ? P1 : P2;
Q1 = expanded[16]; Q2 = __shfl(expanded[18], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2;
vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8);
vec0.x = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm0[threadIdx.x&7], 8);
P1 = expanded[ 8]; P2 = __shfl(expanded[10], (threadIdx.x-1)&7, 8); P = even ? P1 : P2;
Q1 = expanded[24]; Q2 = __shfl(expanded[26], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2;
vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8);
vec0.y = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm0[threadIdx.x&7], 8);
P1 = expanded[ 4]; P2 = __shfl(expanded[ 6], (threadIdx.x-1)&7, 8); P = even ? P1 : P2;
Q1 = expanded[20]; Q2 = __shfl(expanded[22], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2;
vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8);
vec0.z = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm0[threadIdx.x&7], 8);
P1 = expanded[12]; P2 = __shfl(expanded[14], (threadIdx.x-1)&7, 8); P = even ? P1 : P2;
Q1 = expanded[28]; Q2 = __shfl(expanded[30], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2;
vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8);
vec0.w = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm0[threadIdx.x&7], 8);
g_temp4[threadIdx.x&7] = vec0;

// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
Expand All @@ -378,16 +374,16 @@ void Expansion(const uint32_t *data, uint4 *g_temp4)

P1 = expanded[ 1]; P2 = __shfl(expanded[ 3], (threadIdx.x-1)&7, 8); P = even ? P1 : P2;
Q1 = expanded[17]; Q2 = __shfl(expanded[19], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2;
vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8);
vec0.x = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm1[threadIdx.x&7], 8);
P1 = expanded[ 9]; P2 = __shfl(expanded[11], (threadIdx.x-1)&7, 8); P = even ? P1 : P2;
Q1 = expanded[25]; Q2 = __shfl(expanded[27], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2;
vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8);
vec0.y = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm1[threadIdx.x&7], 8);
P1 = expanded[ 5]; P2 = __shfl(expanded[ 7], (threadIdx.x-1)&7, 8); P = even ? P1 : P2;
Q1 = expanded[21]; Q2 = __shfl(expanded[23], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2;
vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8);
vec0.z = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm1[threadIdx.x&7], 8);
P1 = expanded[13]; P2 = __shfl(expanded[15], (threadIdx.x-1)&7, 8); P = even ? P1 : P2;
Q1 = expanded[29]; Q2 = __shfl(expanded[31], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2;
vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8);
vec0.w = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm1[threadIdx.x&7], 8);
g_temp4[8+(threadIdx.x&7)] = vec0;

// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
Expand All @@ -401,16 +397,16 @@ void Expansion(const uint32_t *data, uint4 *g_temp4)

P1 = hi?expanded[ 1]:expanded[ 0]; P2 = __shfl(hi?expanded[ 3]:expanded[ 2], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2;
Q1 = hi?expanded[17]:expanded[16]; Q2 = __shfl(hi?expanded[19]:expanded[18], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2;
vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8);
vec0.x = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm2[threadIdx.x&7], 8);
P1 = hi?expanded[ 9]:expanded[ 8]; P2 = __shfl(hi?expanded[11]:expanded[10], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2;
Q1 = hi?expanded[25]:expanded[24]; Q2 = __shfl(hi?expanded[27]:expanded[26], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2;
vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8);
vec0.y = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm2[threadIdx.x&7], 8);
P1 = hi?expanded[ 5]:expanded[ 4]; P2 = __shfl(hi?expanded[ 7]:expanded[ 6], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2;
Q1 = hi?expanded[21]:expanded[20]; Q2 = __shfl(hi?expanded[23]:expanded[22], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2;
vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8);
vec0.z = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm2[threadIdx.x&7], 8);
P1 = hi?expanded[13]:expanded[12]; P2 = __shfl(hi?expanded[15]:expanded[14], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2;
Q1 = hi?expanded[29]:expanded[28]; Q2 = __shfl(hi?expanded[31]:expanded[30], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2;
vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8);
vec0.w = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm2[threadIdx.x&7], 8);
g_temp4[16+(threadIdx.x&7)] = vec0;

// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
Expand All @@ -424,16 +420,16 @@ void Expansion(const uint32_t *data, uint4 *g_temp4)

P1 = lo?expanded[ 1]:expanded[ 0]; P2 = __shfl(lo?expanded[ 3]:expanded[ 2], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2;
Q1 = lo?expanded[17]:expanded[16]; Q2 = __shfl(lo?expanded[19]:expanded[18], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2;
vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8);
vec0.x = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm3[threadIdx.x&7], 8);
P1 = lo?expanded[ 9]:expanded[ 8]; P2 = __shfl(lo?expanded[11]:expanded[10], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2;
Q1 = lo?expanded[25]:expanded[24]; Q2 = __shfl(lo?expanded[27]:expanded[26], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2;
vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8);
vec0.y = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm3[threadIdx.x&7], 8);
P1 = lo?expanded[ 5]:expanded[ 4]; P2 = __shfl(lo?expanded[ 7]:expanded[ 6], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2;
Q1 = lo?expanded[21]:expanded[20]; Q2 = __shfl(lo?expanded[23]:expanded[22], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2;
vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8);
vec0.z = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm3[threadIdx.x&7], 8);
P1 = lo?expanded[13]:expanded[12]; P2 = __shfl(lo?expanded[15]:expanded[14], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2;
Q1 = lo?expanded[29]:expanded[28]; Q2 = __shfl(lo?expanded[31]:expanded[30], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2;
vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8);
vec0.w = __shfl((int)__byte_perm(185*P, 185*Q , 0x5410), c_perm3[threadIdx.x&7], 8);
g_temp4[24+(threadIdx.x&7)] = vec0;

// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
Expand All @@ -451,19 +447,19 @@ void Expansion(const uint32_t *data, uint4 *g_temp4)
P1 = sel?expanded[0]:expanded[1]; Q1 = __shfl(P1, threadIdx.x^1, 8);
Q2 = sel?expanded[2]:expanded[3]; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8);
vec0.x = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm4[threadIdx.x&7], 8);
P1 = sel?expanded[8]:expanded[9]; Q1 = __shfl(P1, threadIdx.x^1, 8);
Q2 = sel?expanded[10]:expanded[11]; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8);
vec0.y = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm4[threadIdx.x&7], 8);
P1 = sel?expanded[4]:expanded[5]; Q1 = __shfl(P1, threadIdx.x^1, 8);
Q2 = sel?expanded[6]:expanded[7]; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8);
vec0.z = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm4[threadIdx.x&7], 8);
P1 = sel?expanded[12]:expanded[13]; Q1 = __shfl(P1, threadIdx.x^1, 8);
Q2 = sel?expanded[14]:expanded[15]; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8);
vec0.w = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm4[threadIdx.x&7], 8);

g_temp4[32+(threadIdx.x&7)] = vec0;

Expand All @@ -475,19 +471,19 @@ void Expansion(const uint32_t *data, uint4 *g_temp4)
P1 = sel?expanded[1]:expanded[0]; Q1 = __shfl(P1, threadIdx.x^1, 8);
Q2 = sel?expanded[3]:expanded[2]; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8);
vec0.x = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm5[threadIdx.x&7], 8);
P1 = sel?expanded[9]:expanded[8]; Q1 = __shfl(P1, threadIdx.x^1, 8);
Q2 = sel?expanded[11]:expanded[10]; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8);
vec0.y = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm5[threadIdx.x&7], 8);
P1 = sel?expanded[5]:expanded[4]; Q1 = __shfl(P1, threadIdx.x^1, 8);
Q2 = sel?expanded[7]:expanded[6]; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8);
vec0.z = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm5[threadIdx.x&7], 8);
P1 = sel?expanded[13]:expanded[12]; Q1 = __shfl(P1, threadIdx.x^1, 8);
Q2 = sel?expanded[15]:expanded[14]; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8);
vec0.w = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm5[threadIdx.x&7], 8);

g_temp4[40+(threadIdx.x&7)] = vec0;

Expand All @@ -502,19 +498,19 @@ void Expansion(const uint32_t *data, uint4 *g_temp4)
t = __shfl(expanded[17],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[16]; Q1 = __shfl(P1, threadIdx.x^1, 8);
t = __shfl(expanded[19],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[18]; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8);
vec0.x = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm6[threadIdx.x&7], 8);
t = __shfl(expanded[25],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[24]; Q1 = __shfl(P1, threadIdx.x^1, 8);
t = __shfl(expanded[27],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[26]; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8);
vec0.y = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm6[threadIdx.x&7], 8);
t = __shfl(expanded[21],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[20]; Q1 = __shfl(P1, threadIdx.x^1, 8);
t = __shfl(expanded[23],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[22]; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8);
vec0.z = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm6[threadIdx.x&7], 8);
t = __shfl(expanded[29],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[28]; Q1 = __shfl(P1, threadIdx.x^1, 8);
t = __shfl(expanded[31],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[30]; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8);
vec0.w = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm6[threadIdx.x&7], 8);

g_temp4[48+(threadIdx.x&7)] = vec0;

Expand All @@ -528,29 +524,26 @@ void Expansion(const uint32_t *data, uint4 *g_temp4)
t = __shfl(expanded[16],(threadIdx.x+4)&7,8); P1 = sel?expanded[17]:t; Q1 = __shfl(P1, threadIdx.x^1, 8);
t = __shfl(expanded[18],(threadIdx.x+4)&7,8); Q2 = sel?expanded[19]:t; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8);
vec0.x = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm7[threadIdx.x&7], 8);
t = __shfl(expanded[24],(threadIdx.x+4)&7,8); P1 = sel?expanded[25]:t; Q1 = __shfl(P1, threadIdx.x^1, 8);
t = __shfl(expanded[26],(threadIdx.x+4)&7,8); Q2 = sel?expanded[27]:t; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8);
vec0.y = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm7[threadIdx.x&7], 8);
t = __shfl(expanded[20],(threadIdx.x+4)&7,8); P1 = sel?expanded[21]:t; Q1 = __shfl(P1, threadIdx.x^1, 8);
t = __shfl(expanded[22],(threadIdx.x+4)&7,8); Q2 = sel?expanded[23]:t; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8);
vec0.z = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm7[threadIdx.x&7], 8);
t = __shfl(expanded[28],(threadIdx.x+4)&7,8); P1 = sel?expanded[29]:t; Q1 = __shfl(P1, threadIdx.x^1, 8);
t = __shfl(expanded[30],(threadIdx.x+4)&7,8); Q2 = sel?expanded[31]:t; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8);
vec0.w = __shfl((int)__byte_perm(233*P, 233*Q , 0x5410), c_perm7[threadIdx.x&7], 8);

g_temp4[56+(threadIdx.x&7)] = vec0;

#undef mul_185
#undef mul_233
}

/***************************************************/
__global__ void __launch_bounds__(TPB, 4)
x11_simd512_gpu_expand_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_temp4)
x11_simd512_gpu_expand_64(uint32_t threads, uint32_t startNounce, const uint64_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector, uint4 *const __restrict__ g_temp4)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x)/8;
if (thread < threads)
Expand Down
Loading

0 comments on commit 99ab109

Please sign in to comment.