From 3b210695040903702482ec14e8fce421bd171b92 Mon Sep 17 00:00:00 2001 From: Christian Buchner Date: Sat, 14 Jun 2014 01:43:28 +0200 Subject: [PATCH] bump to revision V1.1 with Killer Groestl --- JHA/cuda_jha_keccak512.cu | 2 - JHA/jackpotcoin.cu | 16 +- README.txt | 13 +- bitslice_transformations_quad.cu | 418 +++++++++++++++++++ ccminer.vcxproj | 12 + ccminer.vcxproj.filters | 6 + configure.ac | 2 +- cpu-miner.c | 6 +- cpuminer-config.h | 4 +- cuda_groestlcoin.cu | 537 ++++++------------------- cuda_myriadgroestl.cu | 665 ++++++++++--------------------- cuda_nist5.cu | 1 - groestl_functions_quad.cu | 315 +++++++++++++++ groestlcoin.cpp | 227 +++++------ heavy/cuda_blake512.cu | 2 - heavy/cuda_combine.cu | 2 - heavy/cuda_groestl512.cu | 2 - heavy/cuda_hefty1.cu | 2 - heavy/cuda_keccak512.cu | 1 - heavy/cuda_sha256.cu | 1 - myriadgroestl.cpp | 10 +- quark/animecoin.cu | 1 - quark/cuda_bmw512.cu | 4 - quark/cuda_jh512.cu | 2 - quark/cuda_quark_blake512.cu | 4 - quark/cuda_quark_checkhash.cu | 2 - quark/cuda_quark_groestl512.cu | 414 +++++-------------- quark/cuda_quark_keccak512.cu | 2 - quark/cuda_skein512.cu | 1 - quark/quarkcoin.cu | 1 - x11/cuda_x11_cubehash512.cu | 2 - x11/cuda_x11_echo.cu | 2 - x11/cuda_x11_luffa512.cu | 2 - x11/cuda_x11_shavite512.cu | 2 - x11/x11.cu | 5 +- 35 files changed, 1301 insertions(+), 1387 deletions(-) create mode 100644 bitslice_transformations_quad.cu create mode 100644 groestl_functions_quad.cu diff --git a/JHA/cuda_jha_keccak512.cu b/JHA/cuda_jha_keccak512.cu index 2906d47b79..be5b61ac57 100644 --- a/JHA/cuda_jha_keccak512.cu +++ b/JHA/cuda_jha_keccak512.cu @@ -567,8 +567,6 @@ __host__ void jackpot_keccak512_cpu_hash(int thr_id, int threads, uint32_t start // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - jackpot_keccak512_gpu_hash<<>>(threads, startNounce, (uint64_t*)d_hash); MyStreamSynchronize(NULL, order, thr_id); } diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu index e67661322a..7d6d4cf5b7 100644 --- a/JHA/jackpotcoin.cu +++ b/JHA/jackpotcoin.cu @@ -101,14 +101,12 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, { const uint32_t first_nonce = pdata[19]; - // TODO: entfernen für eine Release! Ist nur zum Testen! if (opt_benchmark) ((uint32_t*)ptarget)[7] = 0x0000ff; const uint32_t Htarg = ptarget[7]; const int throughput = 256*4096*4; // 100; - //const int throughput = 256*256*2+100; // 100; static bool init[8] = {0,0,0,0,0,0,0,0}; if (!init[thr_id]) @@ -167,16 +165,18 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); } - // Runde 2 (ohne Gröstl) + // Runde 3 (komplett) // jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01) jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], d_branch1Nonces[thr_id], &nrm1, - d_branch3Nonces[thr_id], &nrm3, + d_branch2Nonces[thr_id], &nrm2, order++); - // verfolge den skein-pfad weiter - quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + if (nrm1+nrm2 == nrm3) { + quark_groestl512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); + } // jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01) jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], @@ -226,7 +226,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) { pdata[19] = foundNonce; - *hashes_done = (foundNonce - first_nonce + 1)/4; + *hashes_done = (foundNonce - first_nonce + 1)/2; //applog(LOG_INFO, "GPU #%d: result for nonce $%08X does validate on CPU (%d rounds)!", thr_id, foundNonce, rounds); return 1; } else { @@ -238,6 +238,6 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); - *hashes_done = (pdata[19] - first_nonce + 1)/4; + *hashes_done = (pdata[19] - first_nonce + 1)/2; return 0; } diff --git a/README.txt b/README.txt index 09a33b6bdc..aace22699e 100644 --- a/README.txt +++ b/README.txt @@ -1,5 +1,5 @@ -ccMiner release 1.0 (May 10th 2014) - "Did anyone say X11?" +ccMiner release 1.1 (June 14th 2014) - "Killer Groestl!" ------------------------------------------------------------- *************************************************************** @@ -30,13 +30,12 @@ FugueCoin GroestlCoin & Myriad-Groestl JackpotCoin QuarkCoin family & AnimeCoin +TalkCoin +DarkCoin and other X11 coins where some of these coins have a VERY NOTABLE nVidia advantage over competing AMD (OpenCL) implementations. -X11 algo is being worked on. It will be released when we -have achieved a nice nVidia advantage. - We did not take a big effort on improving usability, so please set your parameters carefuly. @@ -140,6 +139,12 @@ features. >>> RELEASE HISTORY <<< + June 14th 2014 released Killer Groestl quad version which I deem + sufficiently hard to port over to AMD. It isn't + the fastest option for Compute 3.5 and 5.0 cards, + but it is still much faster than the table based + versions. + May 10th 2014 added X11, but without the bells & whistles (no killer Groestl, SIMD hash quite slow still) diff --git a/bitslice_transformations_quad.cu b/bitslice_transformations_quad.cu new file mode 100644 index 0000000000..f4a5f2a325 --- /dev/null +++ b/bitslice_transformations_quad.cu @@ -0,0 +1,418 @@ + +__device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *output) +{ + int n = threadIdx.x % 4; + uint32_t other[8]; +#pragma unroll 8 + for (int i = 0; i < 8; i++) { + input[i] =__shfl((int)input[i], n ^ (3*(n >=1 && n <=2)), 4); + other[i] =__shfl((int)input[i], (threadIdx.x + 1) % 4, 4); + input[i] = __shfl((int)input[i], threadIdx.x & 2, 4); + other[i] = __shfl((int)other[i], threadIdx.x & 2, 4); + if (threadIdx.x & 1) { + input[i] = __byte_perm(input[i], 0, 0x1032); + other[i] = __byte_perm(other[i], 0, 0x1032); + } + output[i] = 0; + } + + output[ 0] |= (input[ 0] & 0x00000001); + output[ 0] |= ((other[ 0] & 0x00000001) << 1); + output[ 0] |= ((input[ 1] & 0x00000001) << 2); + output[ 0] |= ((other[ 1] & 0x00000001) << 3); + output[ 0] |= ((input[ 2] & 0x00000001) << 4); + output[ 0] |= ((other[ 2] & 0x00000001) << 5); + output[ 0] |= ((input[ 3] & 0x00000001) << 6); + output[ 0] |= ((other[ 3] & 0x00000001) << 7); + output[ 0] |= ((input[ 4] & 0x00000001) << 8); + output[ 0] |= ((other[ 4] & 0x00000001) << 9); + output[ 0] |= ((input[ 5] & 0x00000001) <<10); + output[ 0] |= ((other[ 5] & 0x00000001) <<11); + output[ 0] |= ((input[ 6] & 0x00000001) <<12); + output[ 0] |= ((other[ 6] & 0x00000001) <<13); + output[ 0] |= ((input[ 7] & 0x00000001) <<14); + output[ 0] |= ((other[ 7] & 0x00000001) <<15); + output[ 0] |= ((input[ 0] & 0x00000100) << 8); + output[ 0] |= ((other[ 0] & 0x00000100) << 9); + output[ 0] |= ((input[ 1] & 0x00000100) <<10); + output[ 0] |= ((other[ 1] & 0x00000100) <<11); + output[ 0] |= ((input[ 2] & 0x00000100) <<12); + output[ 0] |= ((other[ 2] & 0x00000100) <<13); + output[ 0] |= ((input[ 3] & 0x00000100) <<14); + output[ 0] |= ((other[ 3] & 0x00000100) <<15); + output[ 0] |= ((input[ 4] & 0x00000100) <<16); + output[ 0] |= ((other[ 4] & 0x00000100) <<17); + output[ 0] |= ((input[ 5] & 0x00000100) <<18); + output[ 0] |= ((other[ 5] & 0x00000100) <<19); + output[ 0] |= ((input[ 6] & 0x00000100) <<20); + output[ 0] |= ((other[ 6] & 0x00000100) <<21); + output[ 0] |= ((input[ 7] & 0x00000100) <<22); + output[ 0] |= ((other[ 7] & 0x00000100) <<23); + output[ 1] |= ((input[ 0] & 0x00000002) >> 1); + output[ 1] |= (other[ 0] & 0x00000002); + output[ 1] |= ((input[ 1] & 0x00000002) << 1); + output[ 1] |= ((other[ 1] & 0x00000002) << 2); + output[ 1] |= ((input[ 2] & 0x00000002) << 3); + output[ 1] |= ((other[ 2] & 0x00000002) << 4); + output[ 1] |= ((input[ 3] & 0x00000002) << 5); + output[ 1] |= ((other[ 3] & 0x00000002) << 6); + output[ 1] |= ((input[ 4] & 0x00000002) << 7); + output[ 1] |= ((other[ 4] & 0x00000002) << 8); + output[ 1] |= ((input[ 5] & 0x00000002) << 9); + output[ 1] |= ((other[ 5] & 0x00000002) <<10); + output[ 1] |= ((input[ 6] & 0x00000002) <<11); + output[ 1] |= ((other[ 6] & 0x00000002) <<12); + output[ 1] |= ((input[ 7] & 0x00000002) <<13); + output[ 1] |= ((other[ 7] & 0x00000002) <<14); + output[ 1] |= ((input[ 0] & 0x00000200) << 7); + output[ 1] |= ((other[ 0] & 0x00000200) << 8); + output[ 1] |= ((input[ 1] & 0x00000200) << 9); + output[ 1] |= ((other[ 1] & 0x00000200) <<10); + output[ 1] |= ((input[ 2] & 0x00000200) <<11); + output[ 1] |= ((other[ 2] & 0x00000200) <<12); + output[ 1] |= ((input[ 3] & 0x00000200) <<13); + output[ 1] |= ((other[ 3] & 0x00000200) <<14); + output[ 1] |= ((input[ 4] & 0x00000200) <<15); + output[ 1] |= ((other[ 4] & 0x00000200) <<16); + output[ 1] |= ((input[ 5] & 0x00000200) <<17); + output[ 1] |= ((other[ 5] & 0x00000200) <<18); + output[ 1] |= ((input[ 6] & 0x00000200) <<19); + output[ 1] |= ((other[ 6] & 0x00000200) <<20); + output[ 1] |= ((input[ 7] & 0x00000200) <<21); + output[ 1] |= ((other[ 7] & 0x00000200) <<22); + output[ 2] |= ((input[ 0] & 0x00000004) >> 2); + output[ 2] |= ((other[ 0] & 0x00000004) >> 1); + output[ 2] |= (input[ 1] & 0x00000004); + output[ 2] |= ((other[ 1] & 0x00000004) << 1); + output[ 2] |= ((input[ 2] & 0x00000004) << 2); + output[ 2] |= ((other[ 2] & 0x00000004) << 3); + output[ 2] |= ((input[ 3] & 0x00000004) << 4); + output[ 2] |= ((other[ 3] & 0x00000004) << 5); + output[ 2] |= ((input[ 4] & 0x00000004) << 6); + output[ 2] |= ((other[ 4] & 0x00000004) << 7); + output[ 2] |= ((input[ 5] & 0x00000004) << 8); + output[ 2] |= ((other[ 5] & 0x00000004) << 9); + output[ 2] |= ((input[ 6] & 0x00000004) <<10); + output[ 2] |= ((other[ 6] & 0x00000004) <<11); + output[ 2] |= ((input[ 7] & 0x00000004) <<12); + output[ 2] |= ((other[ 7] & 0x00000004) <<13); + output[ 2] |= ((input[ 0] & 0x00000400) << 6); + output[ 2] |= ((other[ 0] & 0x00000400) << 7); + output[ 2] |= ((input[ 1] & 0x00000400) << 8); + output[ 2] |= ((other[ 1] & 0x00000400) << 9); + output[ 2] |= ((input[ 2] & 0x00000400) <<10); + output[ 2] |= ((other[ 2] & 0x00000400) <<11); + output[ 2] |= ((input[ 3] & 0x00000400) <<12); + output[ 2] |= ((other[ 3] & 0x00000400) <<13); + output[ 2] |= ((input[ 4] & 0x00000400) <<14); + output[ 2] |= ((other[ 4] & 0x00000400) <<15); + output[ 2] |= ((input[ 5] & 0x00000400) <<16); + output[ 2] |= ((other[ 5] & 0x00000400) <<17); + output[ 2] |= ((input[ 6] & 0x00000400) <<18); + output[ 2] |= ((other[ 6] & 0x00000400) <<19); + output[ 2] |= ((input[ 7] & 0x00000400) <<20); + output[ 2] |= ((other[ 7] & 0x00000400) <<21); + output[ 3] |= ((input[ 0] & 0x00000008) >> 3); + output[ 3] |= ((other[ 0] & 0x00000008) >> 2); + output[ 3] |= ((input[ 1] & 0x00000008) >> 1); + output[ 3] |= (other[ 1] & 0x00000008); + output[ 3] |= ((input[ 2] & 0x00000008) << 1); + output[ 3] |= ((other[ 2] & 0x00000008) << 2); + output[ 3] |= ((input[ 3] & 0x00000008) << 3); + output[ 3] |= ((other[ 3] & 0x00000008) << 4); + output[ 3] |= ((input[ 4] & 0x00000008) << 5); + output[ 3] |= ((other[ 4] & 0x00000008) << 6); + output[ 3] |= ((input[ 5] & 0x00000008) << 7); + output[ 3] |= ((other[ 5] & 0x00000008) << 8); + output[ 3] |= ((input[ 6] & 0x00000008) << 9); + output[ 3] |= ((other[ 6] & 0x00000008) <<10); + output[ 3] |= ((input[ 7] & 0x00000008) <<11); + output[ 3] |= ((other[ 7] & 0x00000008) <<12); + output[ 3] |= ((input[ 0] & 0x00000800) << 5); + output[ 3] |= ((other[ 0] & 0x00000800) << 6); + output[ 3] |= ((input[ 1] & 0x00000800) << 7); + output[ 3] |= ((other[ 1] & 0x00000800) << 8); + output[ 3] |= ((input[ 2] & 0x00000800) << 9); + output[ 3] |= ((other[ 2] & 0x00000800) <<10); + output[ 3] |= ((input[ 3] & 0x00000800) <<11); + output[ 3] |= ((other[ 3] & 0x00000800) <<12); + output[ 3] |= ((input[ 4] & 0x00000800) <<13); + output[ 3] |= ((other[ 4] & 0x00000800) <<14); + output[ 3] |= ((input[ 5] & 0x00000800) <<15); + output[ 3] |= ((other[ 5] & 0x00000800) <<16); + output[ 3] |= ((input[ 6] & 0x00000800) <<17); + output[ 3] |= ((other[ 6] & 0x00000800) <<18); + output[ 3] |= ((input[ 7] & 0x00000800) <<19); + output[ 3] |= ((other[ 7] & 0x00000800) <<20); + output[ 4] |= ((input[ 0] & 0x00000010) >> 4); + output[ 4] |= ((other[ 0] & 0x00000010) >> 3); + output[ 4] |= ((input[ 1] & 0x00000010) >> 2); + output[ 4] |= ((other[ 1] & 0x00000010) >> 1); + output[ 4] |= (input[ 2] & 0x00000010); + output[ 4] |= ((other[ 2] & 0x00000010) << 1); + output[ 4] |= ((input[ 3] & 0x00000010) << 2); + output[ 4] |= ((other[ 3] & 0x00000010) << 3); + output[ 4] |= ((input[ 4] & 0x00000010) << 4); + output[ 4] |= ((other[ 4] & 0x00000010) << 5); + output[ 4] |= ((input[ 5] & 0x00000010) << 6); + output[ 4] |= ((other[ 5] & 0x00000010) << 7); + output[ 4] |= ((input[ 6] & 0x00000010) << 8); + output[ 4] |= ((other[ 6] & 0x00000010) << 9); + output[ 4] |= ((input[ 7] & 0x00000010) <<10); + output[ 4] |= ((other[ 7] & 0x00000010) <<11); + output[ 4] |= ((input[ 0] & 0x00001000) << 4); + output[ 4] |= ((other[ 0] & 0x00001000) << 5); + output[ 4] |= ((input[ 1] & 0x00001000) << 6); + output[ 4] |= ((other[ 1] & 0x00001000) << 7); + output[ 4] |= ((input[ 2] & 0x00001000) << 8); + output[ 4] |= ((other[ 2] & 0x00001000) << 9); + output[ 4] |= ((input[ 3] & 0x00001000) <<10); + output[ 4] |= ((other[ 3] & 0x00001000) <<11); + output[ 4] |= ((input[ 4] & 0x00001000) <<12); + output[ 4] |= ((other[ 4] & 0x00001000) <<13); + output[ 4] |= ((input[ 5] & 0x00001000) <<14); + output[ 4] |= ((other[ 5] & 0x00001000) <<15); + output[ 4] |= ((input[ 6] & 0x00001000) <<16); + output[ 4] |= ((other[ 6] & 0x00001000) <<17); + output[ 4] |= ((input[ 7] & 0x00001000) <<18); + output[ 4] |= ((other[ 7] & 0x00001000) <<19); + output[ 5] |= ((input[ 0] & 0x00000020) >> 5); + output[ 5] |= ((other[ 0] & 0x00000020) >> 4); + output[ 5] |= ((input[ 1] & 0x00000020) >> 3); + output[ 5] |= ((other[ 1] & 0x00000020) >> 2); + output[ 5] |= ((input[ 2] & 0x00000020) >> 1); + output[ 5] |= (other[ 2] & 0x00000020); + output[ 5] |= ((input[ 3] & 0x00000020) << 1); + output[ 5] |= ((other[ 3] & 0x00000020) << 2); + output[ 5] |= ((input[ 4] & 0x00000020) << 3); + output[ 5] |= ((other[ 4] & 0x00000020) << 4); + output[ 5] |= ((input[ 5] & 0x00000020) << 5); + output[ 5] |= ((other[ 5] & 0x00000020) << 6); + output[ 5] |= ((input[ 6] & 0x00000020) << 7); + output[ 5] |= ((other[ 6] & 0x00000020) << 8); + output[ 5] |= ((input[ 7] & 0x00000020) << 9); + output[ 5] |= ((other[ 7] & 0x00000020) <<10); + output[ 5] |= ((input[ 0] & 0x00002000) << 3); + output[ 5] |= ((other[ 0] & 0x00002000) << 4); + output[ 5] |= ((input[ 1] & 0x00002000) << 5); + output[ 5] |= ((other[ 1] & 0x00002000) << 6); + output[ 5] |= ((input[ 2] & 0x00002000) << 7); + output[ 5] |= ((other[ 2] & 0x00002000) << 8); + output[ 5] |= ((input[ 3] & 0x00002000) << 9); + output[ 5] |= ((other[ 3] & 0x00002000) <<10); + output[ 5] |= ((input[ 4] & 0x00002000) <<11); + output[ 5] |= ((other[ 4] & 0x00002000) <<12); + output[ 5] |= ((input[ 5] & 0x00002000) <<13); + output[ 5] |= ((other[ 5] & 0x00002000) <<14); + output[ 5] |= ((input[ 6] & 0x00002000) <<15); + output[ 5] |= ((other[ 6] & 0x00002000) <<16); + output[ 5] |= ((input[ 7] & 0x00002000) <<17); + output[ 5] |= ((other[ 7] & 0x00002000) <<18); + output[ 6] |= ((input[ 0] & 0x00000040) >> 6); + output[ 6] |= ((other[ 0] & 0x00000040) >> 5); + output[ 6] |= ((input[ 1] & 0x00000040) >> 4); + output[ 6] |= ((other[ 1] & 0x00000040) >> 3); + output[ 6] |= ((input[ 2] & 0x00000040) >> 2); + output[ 6] |= ((other[ 2] & 0x00000040) >> 1); + output[ 6] |= (input[ 3] & 0x00000040); + output[ 6] |= ((other[ 3] & 0x00000040) << 1); + output[ 6] |= ((input[ 4] & 0x00000040) << 2); + output[ 6] |= ((other[ 4] & 0x00000040) << 3); + output[ 6] |= ((input[ 5] & 0x00000040) << 4); + output[ 6] |= ((other[ 5] & 0x00000040) << 5); + output[ 6] |= ((input[ 6] & 0x00000040) << 6); + output[ 6] |= ((other[ 6] & 0x00000040) << 7); + output[ 6] |= ((input[ 7] & 0x00000040) << 8); + output[ 6] |= ((other[ 7] & 0x00000040) << 9); + output[ 6] |= ((input[ 0] & 0x00004000) << 2); + output[ 6] |= ((other[ 0] & 0x00004000) << 3); + output[ 6] |= ((input[ 1] & 0x00004000) << 4); + output[ 6] |= ((other[ 1] & 0x00004000) << 5); + output[ 6] |= ((input[ 2] & 0x00004000) << 6); + output[ 6] |= ((other[ 2] & 0x00004000) << 7); + output[ 6] |= ((input[ 3] & 0x00004000) << 8); + output[ 6] |= ((other[ 3] & 0x00004000) << 9); + output[ 6] |= ((input[ 4] & 0x00004000) <<10); + output[ 6] |= ((other[ 4] & 0x00004000) <<11); + output[ 6] |= ((input[ 5] & 0x00004000) <<12); + output[ 6] |= ((other[ 5] & 0x00004000) <<13); + output[ 6] |= ((input[ 6] & 0x00004000) <<14); + output[ 6] |= ((other[ 6] & 0x00004000) <<15); + output[ 6] |= ((input[ 7] & 0x00004000) <<16); + output[ 6] |= ((other[ 7] & 0x00004000) <<17); + output[ 7] |= ((input[ 0] & 0x00000080) >> 7); + output[ 7] |= ((other[ 0] & 0x00000080) >> 6); + output[ 7] |= ((input[ 1] & 0x00000080) >> 5); + output[ 7] |= ((other[ 1] & 0x00000080) >> 4); + output[ 7] |= ((input[ 2] & 0x00000080) >> 3); + output[ 7] |= ((other[ 2] & 0x00000080) >> 2); + output[ 7] |= ((input[ 3] & 0x00000080) >> 1); + output[ 7] |= (other[ 3] & 0x00000080); + output[ 7] |= ((input[ 4] & 0x00000080) << 1); + output[ 7] |= ((other[ 4] & 0x00000080) << 2); + output[ 7] |= ((input[ 5] & 0x00000080) << 3); + output[ 7] |= ((other[ 5] & 0x00000080) << 4); + output[ 7] |= ((input[ 6] & 0x00000080) << 5); + output[ 7] |= ((other[ 6] & 0x00000080) << 6); + output[ 7] |= ((input[ 7] & 0x00000080) << 7); + output[ 7] |= ((other[ 7] & 0x00000080) << 8); + output[ 7] |= ((input[ 0] & 0x00008000) << 1); + output[ 7] |= ((other[ 0] & 0x00008000) << 2); + output[ 7] |= ((input[ 1] & 0x00008000) << 3); + output[ 7] |= ((other[ 1] & 0x00008000) << 4); + output[ 7] |= ((input[ 2] & 0x00008000) << 5); + output[ 7] |= ((other[ 2] & 0x00008000) << 6); + output[ 7] |= ((input[ 3] & 0x00008000) << 7); + output[ 7] |= ((other[ 3] & 0x00008000) << 8); + output[ 7] |= ((input[ 4] & 0x00008000) << 9); + output[ 7] |= ((other[ 4] & 0x00008000) <<10); + output[ 7] |= ((input[ 5] & 0x00008000) <<11); + output[ 7] |= ((other[ 5] & 0x00008000) <<12); + output[ 7] |= ((input[ 6] & 0x00008000) <<13); + output[ 7] |= ((other[ 6] & 0x00008000) <<14); + output[ 7] |= ((input[ 7] & 0x00008000) <<15); + output[ 7] |= ((other[ 7] & 0x00008000) <<16); +} + +__device__ __forceinline__ void from_bitslice_quad(uint32_t *input, uint32_t *output) +{ +#pragma unroll 8 + for (int i=0; i < 16; i+=2) output[i] = 0; + + output[ 0] |= ((input[ 0] & 0x00000100) >> 8); + output[ 0] |= ((input[ 1] & 0x00000100) >> 7); + output[ 0] |= ((input[ 2] & 0x00000100) >> 6); + output[ 0] |= ((input[ 3] & 0x00000100) >> 5); + output[ 0] |= ((input[ 4] & 0x00000100) >> 4); + output[ 0] |= ((input[ 5] & 0x00000100) >> 3); + output[ 0] |= ((input[ 6] & 0x00000100) >> 2); + output[ 0] |= ((input[ 7] & 0x00000100) >> 1); + output[ 0] |= ((input[ 0] & 0x01000000) >>16); + output[ 0] |= ((input[ 1] & 0x01000000) >>15); + output[ 0] |= ((input[ 2] & 0x01000000) >>14); + output[ 0] |= ((input[ 3] & 0x01000000) >>13); + output[ 0] |= ((input[ 4] & 0x01000000) >>12); + output[ 0] |= ((input[ 5] & 0x01000000) >>11); + output[ 0] |= ((input[ 6] & 0x01000000) >>10); + output[ 0] |= ((input[ 7] & 0x01000000) >> 9); + output[ 2] |= ((input[ 0] & 0x00000200) >> 9); + output[ 2] |= ((input[ 1] & 0x00000200) >> 8); + output[ 2] |= ((input[ 2] & 0x00000200) >> 7); + output[ 2] |= ((input[ 3] & 0x00000200) >> 6); + output[ 2] |= ((input[ 4] & 0x00000200) >> 5); + output[ 2] |= ((input[ 5] & 0x00000200) >> 4); + output[ 2] |= ((input[ 6] & 0x00000200) >> 3); + output[ 2] |= ((input[ 7] & 0x00000200) >> 2); + output[ 2] |= ((input[ 0] & 0x02000000) >>17); + output[ 2] |= ((input[ 1] & 0x02000000) >>16); + output[ 2] |= ((input[ 2] & 0x02000000) >>15); + output[ 2] |= ((input[ 3] & 0x02000000) >>14); + output[ 2] |= ((input[ 4] & 0x02000000) >>13); + output[ 2] |= ((input[ 5] & 0x02000000) >>12); + output[ 2] |= ((input[ 6] & 0x02000000) >>11); + output[ 2] |= ((input[ 7] & 0x02000000) >>10); + output[ 4] |= ((input[ 0] & 0x00000400) >>10); + output[ 4] |= ((input[ 1] & 0x00000400) >> 9); + output[ 4] |= ((input[ 2] & 0x00000400) >> 8); + output[ 4] |= ((input[ 3] & 0x00000400) >> 7); + output[ 4] |= ((input[ 4] & 0x00000400) >> 6); + output[ 4] |= ((input[ 5] & 0x00000400) >> 5); + output[ 4] |= ((input[ 6] & 0x00000400) >> 4); + output[ 4] |= ((input[ 7] & 0x00000400) >> 3); + output[ 4] |= ((input[ 0] & 0x04000000) >>18); + output[ 4] |= ((input[ 1] & 0x04000000) >>17); + output[ 4] |= ((input[ 2] & 0x04000000) >>16); + output[ 4] |= ((input[ 3] & 0x04000000) >>15); + output[ 4] |= ((input[ 4] & 0x04000000) >>14); + output[ 4] |= ((input[ 5] & 0x04000000) >>13); + output[ 4] |= ((input[ 6] & 0x04000000) >>12); + output[ 4] |= ((input[ 7] & 0x04000000) >>11); + output[ 6] |= ((input[ 0] & 0x00000800) >>11); + output[ 6] |= ((input[ 1] & 0x00000800) >>10); + output[ 6] |= ((input[ 2] & 0x00000800) >> 9); + output[ 6] |= ((input[ 3] & 0x00000800) >> 8); + output[ 6] |= ((input[ 4] & 0x00000800) >> 7); + output[ 6] |= ((input[ 5] & 0x00000800) >> 6); + output[ 6] |= ((input[ 6] & 0x00000800) >> 5); + output[ 6] |= ((input[ 7] & 0x00000800) >> 4); + output[ 6] |= ((input[ 0] & 0x08000000) >>19); + output[ 6] |= ((input[ 1] & 0x08000000) >>18); + output[ 6] |= ((input[ 2] & 0x08000000) >>17); + output[ 6] |= ((input[ 3] & 0x08000000) >>16); + output[ 6] |= ((input[ 4] & 0x08000000) >>15); + output[ 6] |= ((input[ 5] & 0x08000000) >>14); + output[ 6] |= ((input[ 6] & 0x08000000) >>13); + output[ 6] |= ((input[ 7] & 0x08000000) >>12); + output[ 8] |= ((input[ 0] & 0x00001000) >>12); + output[ 8] |= ((input[ 1] & 0x00001000) >>11); + output[ 8] |= ((input[ 2] & 0x00001000) >>10); + output[ 8] |= ((input[ 3] & 0x00001000) >> 9); + output[ 8] |= ((input[ 4] & 0x00001000) >> 8); + output[ 8] |= ((input[ 5] & 0x00001000) >> 7); + output[ 8] |= ((input[ 6] & 0x00001000) >> 6); + output[ 8] |= ((input[ 7] & 0x00001000) >> 5); + output[ 8] |= ((input[ 0] & 0x10000000) >>20); + output[ 8] |= ((input[ 1] & 0x10000000) >>19); + output[ 8] |= ((input[ 2] & 0x10000000) >>18); + output[ 8] |= ((input[ 3] & 0x10000000) >>17); + output[ 8] |= ((input[ 4] & 0x10000000) >>16); + output[ 8] |= ((input[ 5] & 0x10000000) >>15); + output[ 8] |= ((input[ 6] & 0x10000000) >>14); + output[ 8] |= ((input[ 7] & 0x10000000) >>13); + output[10] |= ((input[ 0] & 0x00002000) >>13); + output[10] |= ((input[ 1] & 0x00002000) >>12); + output[10] |= ((input[ 2] & 0x00002000) >>11); + output[10] |= ((input[ 3] & 0x00002000) >>10); + output[10] |= ((input[ 4] & 0x00002000) >> 9); + output[10] |= ((input[ 5] & 0x00002000) >> 8); + output[10] |= ((input[ 6] & 0x00002000) >> 7); + output[10] |= ((input[ 7] & 0x00002000) >> 6); + output[10] |= ((input[ 0] & 0x20000000) >>21); + output[10] |= ((input[ 1] & 0x20000000) >>20); + output[10] |= ((input[ 2] & 0x20000000) >>19); + output[10] |= ((input[ 3] & 0x20000000) >>18); + output[10] |= ((input[ 4] & 0x20000000) >>17); + output[10] |= ((input[ 5] & 0x20000000) >>16); + output[10] |= ((input[ 6] & 0x20000000) >>15); + output[10] |= ((input[ 7] & 0x20000000) >>14); + output[12] |= ((input[ 0] & 0x00004000) >>14); + output[12] |= ((input[ 1] & 0x00004000) >>13); + output[12] |= ((input[ 2] & 0x00004000) >>12); + output[12] |= ((input[ 3] & 0x00004000) >>11); + output[12] |= ((input[ 4] & 0x00004000) >>10); + output[12] |= ((input[ 5] & 0x00004000) >> 9); + output[12] |= ((input[ 6] & 0x00004000) >> 8); + output[12] |= ((input[ 7] & 0x00004000) >> 7); + output[12] |= ((input[ 0] & 0x40000000) >>22); + output[12] |= ((input[ 1] & 0x40000000) >>21); + output[12] |= ((input[ 2] & 0x40000000) >>20); + output[12] |= ((input[ 3] & 0x40000000) >>19); + output[12] |= ((input[ 4] & 0x40000000) >>18); + output[12] |= ((input[ 5] & 0x40000000) >>17); + output[12] |= ((input[ 6] & 0x40000000) >>16); + output[12] |= ((input[ 7] & 0x40000000) >>15); + output[14] |= ((input[ 0] & 0x00008000) >>15); + output[14] |= ((input[ 1] & 0x00008000) >>14); + output[14] |= ((input[ 2] & 0x00008000) >>13); + output[14] |= ((input[ 3] & 0x00008000) >>12); + output[14] |= ((input[ 4] & 0x00008000) >>11); + output[14] |= ((input[ 5] & 0x00008000) >>10); + output[14] |= ((input[ 6] & 0x00008000) >> 9); + output[14] |= ((input[ 7] & 0x00008000) >> 8); + output[14] |= ((input[ 0] & 0x80000000) >>23); + output[14] |= ((input[ 1] & 0x80000000) >>22); + output[14] |= ((input[ 2] & 0x80000000) >>21); + output[14] |= ((input[ 3] & 0x80000000) >>20); + output[14] |= ((input[ 4] & 0x80000000) >>19); + output[14] |= ((input[ 5] & 0x80000000) >>18); + output[14] |= ((input[ 6] & 0x80000000) >>17); + output[14] |= ((input[ 7] & 0x80000000) >>16); + +#pragma unroll 8 + for (int i = 0; i < 16; i+=2) { + if (threadIdx.x & 1) output[i] = __byte_perm(output[i], 0, 0x1032); + output[i] = __byte_perm(output[i], __shfl((int)output[i], (threadIdx.x+1)%4, 4), 0x7610); + output[i+1] = __shfl((int)output[i], (threadIdx.x+2)%4, 4); + if ((threadIdx.x % 4) != 0) output[i] = output[i+1] = 0; + } +} diff --git a/ccminer.vcxproj b/ccminer.vcxproj index ddca98a9ad..36d4fd5fbe 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -287,6 +287,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" + + true + true + true + true + -Xptxas "-abi=no -v" %(AdditionalOptions) -Xptxas "-abi=no -v" %(AdditionalOptions) @@ -311,6 +317,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" -Xptxas "-abi=no -v" %(AdditionalOptions) -Xptxas "-abi=no -v" %(AdditionalOptions) + + true + true + true + true + -Xptxas "-abi=no -v" %(AdditionalOptions) -Xptxas "-abi=no -v" %(AdditionalOptions) diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 8b7a596809..57c2998a89 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -340,5 +340,11 @@ Source Files\CUDA\x11 + + Source Files\CUDA + + + Source Files\CUDA + \ No newline at end of file diff --git a/configure.ac b/configure.ac index e33571afc4..80438fd5de 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [2014.05.10]) +AC_INIT([ccminer], [2014.06.14]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 5eb2d7d041..854802cbf5 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -917,8 +917,8 @@ static void *miner_thread(void *userdata) goto out; } - if (opt_benchmark) - if (++rounds == 1) exit(0); +// if (opt_benchmark) +// if (++rounds == 1) exit(0); /* record scanhash elapsed time */ gettimeofday(&tv_end, NULL); @@ -1469,7 +1469,7 @@ static void signal_handler(int sig) } #endif -#define PROGRAM_VERSION "1.0" +#define PROGRAM_VERSION "1.1" int main(int argc, char *argv[]) { struct thr_info *thr; diff --git a/cpuminer-config.h b/cpuminer-config.h index 0dca3af1d2..03114b9ab0 100644 --- a/cpuminer-config.h +++ b/cpuminer-config.h @@ -152,7 +152,7 @@ #define PACKAGE_NAME "ccminer" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "ccminer 2014.05.10" +#define PACKAGE_STRING "ccminer 2014.06.14" /* Define to the one symbol short name of this package. */ #undef PACKAGE_TARNAME @@ -161,7 +161,7 @@ #undef PACKAGE_URL /* Define to the version of this package. */ -#define PACKAGE_VERSION "2014.05.10" +#define PACKAGE_VERSION "2014.06.14" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be diff --git a/cuda_groestlcoin.cu b/cuda_groestlcoin.cu index b33ff9ebc1..095d3260d7 100644 --- a/cuda_groestlcoin.cu +++ b/cuda_groestlcoin.cu @@ -1,4 +1,4 @@ -// Auf Groestlcoin spezialisierte Version von Groestl +// Auf Groestlcoin spezialisierte Version von Groestl inkl. Bitslice #include #include "cuda_runtime.h" @@ -7,9 +7,6 @@ #include #include -// it's unfortunate that this is a compile time constant. -#define MAXWELL_OR_FERMI 1 - // aus cpu-miner.c extern int device_map[8]; @@ -18,456 +15,152 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t // Folgende Definitionen später durch header ersetzen typedef unsigned char uint8_t; +typedef unsigned short uint16_t; typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; // diese Struktur wird in der Init Funktion angefordert -static cudaDeviceProp props; +static cudaDeviceProp props[8]; +// globaler Speicher für alle HeftyHashes aller Threads __constant__ uint32_t pTarget[8]; // Single GPU extern uint32_t *d_resultNonce[8]; __constant__ uint32_t groestlcoin_gpu_msg[32]; -#define SPH_C32(x) ((uint32_t)(x ## U)) -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) - -#define PC32up(j, r) ((uint32_t)((j) + (r))) -#define PC32dn(j, r) 0 -#define QC32up(j, r) 0xFFFFFFFF -#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24))) - -#define B32_0(x) __byte_perm(x, 0, 0x4440) -//((x) & 0xFF) -#define B32_1(x) __byte_perm(x, 0, 0x4441) -//(((x) >> 8) & 0xFF) -#define B32_2(x) __byte_perm(x, 0, 0x4442) -//(((x) >> 16) & 0xFF) -#define B32_3(x) __byte_perm(x, 0, 0x4443) -//((x) >> 24) - -#if MAXWELL_OR_FERMI -#define USE_SHARED 1 -// Maxwell and Fermi cards get the best speed with SHARED access it seems. -#if USE_SHARED -#define T0up(x) (*((uint32_t*)mixtabs + ( (x)))) -#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x)))) -#define T1up(x) (*((uint32_t*)mixtabs + (512+(x)))) -#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x)))) -#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x)))) -#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x)))) -#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x)))) -#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x)))) -#else -#define T0up(x) tex1Dfetch(t0up1, x) -#define T0dn(x) tex1Dfetch(t0dn1, x) -#define T1up(x) tex1Dfetch(t1up1, x) -#define T1dn(x) tex1Dfetch(t1dn1, x) -#define T2up(x) tex1Dfetch(t2up1, x) -#define T2dn(x) tex1Dfetch(t2dn1, x) -#define T3up(x) tex1Dfetch(t3up1, x) -#define T3dn(x) tex1Dfetch(t3dn1, x) -#endif -#else -#define USE_SHARED 1 -// a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5! -#define T0up(x) (*((uint32_t*)mixtabs + ( (x)))) -#define T0dn(x) tex1Dfetch(t0dn1, x) -#define T1up(x) tex1Dfetch(t1up1, x) -#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x)))) -#define T2up(x) tex1Dfetch(t2up1, x) -#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x)))) -#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x)))) -#define T3dn(x) tex1Dfetch(t3dn1, x) -#endif - -texture t0up1; -texture t0dn1; -texture t1up1; -texture t1dn1; -texture t2up1; -texture t2dn1; -texture t3up1; -texture t3dn1; - -extern uint32_t T0up_cpu[]; -extern uint32_t T0dn_cpu[]; -extern uint32_t T1up_cpu[]; -extern uint32_t T1dn_cpu[]; -extern uint32_t T2up_cpu[]; -extern uint32_t T2dn_cpu[]; -extern uint32_t T3up_cpu[]; -extern uint32_t T3dn_cpu[]; - -#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) - - -__device__ __forceinline__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs) +// 64 Register Variante für Compute 3.0 +#include "groestl_functions_quad.cu" +#include "bitslice_transformations_quad.cu" + +#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) + +__global__ void __launch_bounds__(256, 4) + groestlcoin_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *resNounce) { - uint32_t t[32]; - -//#pragma unroll 14 - for(int r=0;r<14;r++) - { - switch(r) - { - case 0: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break; - case 1: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break; - case 2: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break; - case 3: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break; - case 4: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break; - case 5: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break; - case 6: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break; - case 7: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break; - case 8: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break; - case 9: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break; - case 10: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break; - case 11: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break; - case 12: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break; - case 13: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break; - } - - // RBTT -#pragma unroll 16 - for(int k=0;k<32;k+=2) - { - uint32_t t0_0 = B32_0(a[(k ) & 0x1f]), t9_0 = B32_0(a[(k + 9) & 0x1f]); - uint32_t t2_1 = B32_1(a[(k + 2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]); - uint32_t t4_2 = B32_2(a[(k + 4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]); - uint32_t t6_3 = B32_3(a[(k + 6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]); - - t[k + 0] = T0up( t0_0 ) ^ T1up( t2_1 ) ^ T2up( t4_2 ) ^ T3up( t6_3 ) ^ - T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 ); + // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen + int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4; + if (thread < threads) + { + // GROESTL + uint32_t paddedInput[8]; +#pragma unroll 8 + for(int k=0;k<8;k++) paddedInput[k] = groestlcoin_gpu_msg[4*k+threadIdx.x%4]; - t[k + 1] = T0dn( t0_0 ) ^ T1dn( t2_1 ) ^ T2dn( t4_2 ) ^ T3dn( t6_3 ) ^ - T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 ); - } -#pragma unroll 32 - for(int k=0;k<32;k++) - a[k] = t[k]; - } -} + uint32_t nounce = startNounce + thread; + if ((threadIdx.x % 4) == 3) + paddedInput[4] = SWAB32(nounce); // 4*4+3 = 19 -__device__ __forceinline__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs) -{ -//#pragma unroll 14 - for(int r=0;r<14;r++) - { - uint32_t t[32]; - - switch(r) - { - case 0: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break; - case 1: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break; - case 2: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break; - case 3: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break; - case 4: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} break; - case 5: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break; - case 6: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break; - case 7: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break; - case 8: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break; - case 9: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break; - case 10: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break; - case 11: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break; - case 12: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break; - case 13: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break; - } - - // RBTT -#pragma unroll 16 - for(int k=0;k<32;k+=2) + uint32_t msgBitsliced[8]; + to_bitslice_quad(paddedInput, msgBitsliced); + + uint32_t state[8]; + for (int round=0; round<2; round++) { - uint32_t t2_0 = B32_0(a[(k + 2) & 0x1f]), t1_0 = B32_0(a[(k + 1) & 0x1f]); - uint32_t t6_1 = B32_1(a[(k + 6) & 0x1f]), t5_1 = B32_1(a[(k + 5) & 0x1f]); - uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2 = B32_2(a[(k + 9) & 0x1f]); - uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]); - - t[k + 0] = T0up( t2_0 ) ^ T1up( t6_1 ) ^ T2up( t10_2 ) ^ T3up( t22_3 ) ^ - T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn( t9_2 ) ^ T3dn( t13_3 ); + groestl512_progressMessage_quad(state, msgBitsliced); + + if (round < 1) + { + // Verkettung zweier Runden inclusive Padding. + msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + ((threadIdx.x%4)==3)*0x2000); + msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341); + msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341); + msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341); + msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341); + msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341); + msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341); + msgBitsliced[ 7] = __byte_perm(state[ 7], 0x00800100, 0x4341 + ((threadIdx.x%4)==0)*0x0010); + } + } - t[k + 1] = T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^ - T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up( t9_2 ) ^ T3up( t13_3 ); + // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash + uint32_t out_state[16]; + from_bitslice_quad(state, out_state); + + if (threadIdx.x % 4 == 0) + { + int i, position = -1; + bool rc = true; + + #pragma unroll 8 + for (i = 7; i >= 0; i--) { + if (out_state[i] > pTarget[i]) { + if(position < i) { + position = i; + rc = false; + } + } + if (out_state[i] < pTarget[i]) { + if(position < i) { + position = i; + rc = true; + } + } + } + + if(rc == true) + if(resNounce[0] > nounce) + resNounce[0] = nounce; } -#pragma unroll 32 - for(int k=0;k<32;k++) - a[k] = t[k]; } } -#if USE_SHARED -__global__ void /* __launch_bounds__(256) */ -#else -__global__ void -#endif - - groestlcoin_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce) -{ -#if USE_SHARED - extern __shared__ char mixtabs[]; - - if (threadIdx.x < 256) - { - *((uint32_t*)mixtabs + ( threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x); - *((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x); - *((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x); - *((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x); - *((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x); - *((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x); - *((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x); - *((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x); - } - - __syncthreads(); -#endif - - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - // GROESTL - uint32_t message[32]; - uint32_t state[32]; - -#pragma unroll 32 - for(int k=0;k<32;k++) message[k] = groestlcoin_gpu_msg[k]; - - uint32_t nounce = startNounce + thread; - message[19] = SWAB32(nounce); - -#pragma unroll 32 - for(int u=0;u<32;u++) state[u] = message[u]; - state[31] ^= 0x20000; - - // Perm -#if USE_SHARED - groestlcoin_perm_P(state, mixtabs); - state[31] ^= 0x20000; - groestlcoin_perm_Q(message, mixtabs); -#else - groestlcoin_perm_P(state, NULL); - state[31] ^= 0x20000; - groestlcoin_perm_Q(message, NULL); -#endif -#pragma unroll 32 - for(int u=0;u<32;u++) state[u] ^= message[u]; - -#pragma unroll 32 - for(int u=0;u<32;u++) message[u] = state[u]; - -#if USE_SHARED - groestlcoin_perm_P(message, mixtabs); -#else - groestlcoin_perm_P(message, NULL); -#endif - -#pragma unroll 32 - for(int u=0;u<32;u++) state[u] ^= message[u]; - - //// - //// 2. Runde groestl - //// -#pragma unroll 16 - for(int k=0;k<16;k++) message[k] = state[k + 16]; -#pragma unroll 14 - for(int k=1;k<15;k++) - message[k+16] = 0; - - message[16] = 0x80; - message[31] = 0x01000000; - -#pragma unroll 32 - for(int u=0;u<32;u++) - state[u] = message[u]; - state[31] ^= 0x20000; - - // Perm -#if USE_SHARED - groestlcoin_perm_P(state, mixtabs); - state[31] ^= 0x20000; - groestlcoin_perm_Q(message, mixtabs); -#else - groestlcoin_perm_P(state, NULL); - state[31] ^= 0x20000; - groestlcoin_perm_Q(message, NULL); -#endif - -#pragma unroll 32 - for(int u=0;u<32;u++) state[u] ^= message[u]; - -#pragma unroll 32 - for(int u=0;u<32;u++) message[u] = state[u]; - -#if USE_SHARED - groestlcoin_perm_P(message, mixtabs); -#else - groestlcoin_perm_P(message, NULL); -#endif - -#pragma unroll 32 - for(int u=0;u<32;u++) state[u] ^= message[u]; - - // kopiere Ergebnis - int i, position = -1; - bool rc = true; - -#pragma unroll 8 - for (i = 7; i >= 0; i--) { - if (state[i+16] > pTarget[i]) { - if(position < i) { - position = i; - rc = false; - } - } - if (state[i+16] < pTarget[i]) { - if(position < i) { - position = i; - rc = true; - } - } - } - - if(rc == true) - if(resNounce[0] > nounce) - resNounce[0] = nounce; - } -} - -#define texDef(texname, texmem, texsource, texsize) \ - unsigned int *texmem; \ - cudaMalloc(&texmem, texsize); \ - cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ - texname.normalized = 0; \ - texname.filterMode = cudaFilterModePoint; \ - texname.addressMode[0] = cudaAddressModeClamp; \ - { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ - cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \ // Setup-Funktionen __host__ void groestlcoin_cpu_init(int thr_id, int threads) { - cudaSetDevice(device_map[thr_id]); - - cudaGetDeviceProperties(&props, device_map[thr_id]); - - // Texturen mit obigem Makro initialisieren - texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256); - texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256); - texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256); - texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256); - texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256); - texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256); - texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256); - texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256); - - // Speicher für Gewinner-Nonce belegen - cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); + cudaSetDevice(device_map[thr_id]); + + cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]); + + // Speicher für Gewinner-Nonce belegen + cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); } __host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn) { - // Nachricht expandieren und setzen - uint32_t msgBlock[32]; + // Nachricht expandieren und setzen + uint32_t msgBlock[32]; - memset(msgBlock, 0, sizeof(uint32_t) * 32); - memcpy(&msgBlock[0], data, 80); + memset(msgBlock, 0, sizeof(uint32_t) * 32); + memcpy(&msgBlock[0], data, 80); - // Erweitere die Nachricht auf den Nachrichtenblock (padding) - // Unsere Nachricht hat 80 Byte - msgBlock[20] = 0x80; - msgBlock[31] = 0x01000000; + // Erweitere die Nachricht auf den Nachrichtenblock (padding) + // Unsere Nachricht hat 80 Byte + msgBlock[20] = 0x80; + msgBlock[31] = 0x01000000; - // groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird - // auf der GPU ausgeführt) + // groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird + // auf der GPU ausgeführt) - // Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch) - cudaMemcpyToSymbol( groestlcoin_gpu_msg, - msgBlock, - 128); + // Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch) + cudaMemcpyToSymbol( groestlcoin_gpu_msg, + msgBlock, + 128); - cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); - cudaMemcpyToSymbol( pTarget, - pTargetIn, - sizeof(uint32_t) * 8 ); + cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); + cudaMemcpyToSymbol( pTarget, + pTargetIn, + sizeof(uint32_t) * 8 ); } __host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce) { - // Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern, - // alle anderen mit 512 Threads. - int threadsperblock = (props.major >= 3) ? 768 : 512; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - // Größe des dynamischen Shared Memory Bereichs -#if USE_SHARED - size_t shared_size = 8 * 256 * sizeof(uint32_t); -#else - size_t shared_size = 0; -#endif - -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - //fprintf(stderr, "ThrID: %d\n", thr_id); - cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); - groestlcoin_gpu_hash<<>>(threads, startNounce, d_resultNonce[thr_id]); - - // Strategisches Sleep Kommando zur Senkung der CPU Last - MyStreamSynchronize(NULL, 0, thr_id); - - cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + int threadsperblock = 256; + + // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle + // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl + int factor = 4; + + // berechne wie viele Thread Blocks wir brauchen + dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock)); + dim3 block(threadsperblock); + + // Größe des dynamischen Shared Memory Bereichs + size_t shared_size = 0; + + cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); + groestlcoin_gpu_hash_quad<<>>(threads, startNounce, d_resultNonce[thr_id]); + + // Strategisches Sleep Kommando zur Senkung der CPU Last + MyStreamSynchronize(NULL, 0, thr_id); + + cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); } diff --git a/cuda_myriadgroestl.cu b/cuda_myriadgroestl.cu index 062db46c21..3a992e0302 100644 --- a/cuda_myriadgroestl.cu +++ b/cuda_myriadgroestl.cu @@ -1,4 +1,4 @@ -// Auf Myriadcoin spezialisierte Version von Groestl +// Auf Myriadcoin spezialisierte Version von Groestl inkl. Bitslice #include #include "cuda_runtime.h" @@ -7,9 +7,6 @@ #include #include -// it's unfortunate that this is a compile time constant. -#define MAXWELL_OR_FERMI 1 - // aus cpu-miner.c extern int device_map[8]; @@ -22,30 +19,49 @@ typedef unsigned short uint16_t; typedef unsigned int uint32_t; // diese Struktur wird in der Init Funktion angefordert -static cudaDeviceProp props; +static cudaDeviceProp props[8]; +// globaler Speicher für alle HeftyHashes aller Threads __constant__ uint32_t pTarget[8]; // Single GPU +uint32_t *d_outputHashes[8]; extern uint32_t *d_resultNonce[8]; __constant__ uint32_t myriadgroestl_gpu_msg[32]; // muss expandiert werden __constant__ uint32_t myr_sha256_gpu_constantTable[64]; +__constant__ uint32_t myr_sha256_gpu_constantTable2[64]; __constant__ uint32_t myr_sha256_gpu_hashTable[8]; uint32_t myr_sha256_cpu_hashTable[] = { - 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; uint32_t myr_sha256_cpu_constantTable[] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, }; +uint32_t myr_sha256_cpu_w2Table[] = { + 0x80000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000200, + 0x80000000, 0x01400000, 0x00205000, 0x00005088, 0x22000800, 0x22550014, 0x05089742, 0xa0000020, + 0x5a880000, 0x005c9400, 0x0016d49d, 0xfa801f00, 0xd33225d0, 0x11675959, 0xf6e6bfda, 0xb30c1549, + 0x08b2b050, 0x9d7c4c27, 0x0ce2a393, 0x88e6e1ea, 0xa52b4335, 0x67a16f49, 0xd732016f, 0x4eeb2e91, + 0x5dbf55e5, 0x8eee2335, 0xe2bc5ec2, 0xa83f4394, 0x45ad78f7, 0x36f3d0cd, 0xd99c05e8, 0xb0511dc7, + 0x69bc7ac4, 0xbd11375b, 0xe3ba71e5, 0x3b209ff2, 0x18feee17, 0xe25ad9e7, 0x13375046, 0x0515089d, + 0x4f0d0f04, 0x2627484e, 0x310128d2, 0xc668b434, 0x420841cc, 0x62d311b8, 0xe59ba771, 0x85a7a484 }; + +// 64 Register Variante für Compute 3.0 +#include "groestl_functions_quad.cu" +#include "bitslice_transformations_quad.cu" + +#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) + #if __CUDA_ARCH__ < 350 // Kepler (Compute 3.0) #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) @@ -53,452 +69,219 @@ uint32_t myr_sha256_cpu_constantTable[] = { // Kepler (Compute 3.5) #define ROTR32(x, n) __funnelshift_r( (x), (x), (n) ) #endif -#define R(x, n) ((x) >> (n)) -#define Ch(x, y, z) ((x & (y ^ z)) ^ z) -#define Maj(x, y, z) ((x & (y | z)) | (y & z)) -#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22)) -#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25)) -#define s0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ R(x, 3)) -#define s1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ R(x, 10)) - -#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) +#define R(x, n) ((x) >> (n)) +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22)) +#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25)) +#define s0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ R(x, 3)) +#define s1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ R(x, 10)) __device__ void myriadgroestl_gpu_sha256(uint32_t *message) { - uint32_t W1[16]; - uint32_t W2[16]; + uint32_t W1[16]; + uint32_t W2[16]; - // Initialisiere die register a bis h mit der Hash-Tabelle - uint32_t regs[8]; - uint32_t hash[8]; + // Initialisiere die register a bis h mit der Hash-Tabelle + uint32_t regs[8]; + uint32_t hash[8]; - // pre + // pre #pragma unroll 8 - for (int k=0; k < 8; k++) - { - regs[k] = myr_sha256_gpu_hashTable[k]; - hash[k] = regs[k]; - } - + for (int k=0; k < 8; k++) + { + regs[k] = myr_sha256_gpu_hashTable[k]; + hash[k] = regs[k]; + } + #pragma unroll 16 - for(int k=0;k<16;k++) - W1[k] = SWAB32(message[k]); + for(int k=0;k<16;k++) + W1[k] = SWAB32(message[k]); // Progress W1 #pragma unroll 16 - for(int j=0;j<16;j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j] + W1[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - - #pragma unroll 7 - for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; - regs[0] = T1 + T2; - regs[4] += T1; - } + for(int j=0;j<16;j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j] + W1[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; + regs[0] = T1 + T2; + regs[4] += T1; + } // Progress W2...W3 -#pragma unroll 3 - for(int k=0;k<3;k++) - { +////// PART 1 #pragma unroll 2 - for(int j=0;j<2;j++) - W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; + for(int j=0;j<2;j++) + W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; #pragma unroll 5 - for(int j=2;j<7;j++) - W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; + for(int j=2;j<7;j++) + W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; #pragma unroll 8 - for(int j=7;j<15;j++) - W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; + for(int j=7;j<15;j++) + W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; - W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; + W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; - // Rundenfunktion + // Rundenfunktion #pragma unroll 16 - for(int j=0;j<16;j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - - #pragma unroll 7 - for (int l=6; l >= 0; l--) regs[l+1] = regs[l]; - regs[0] = T1 + T2; - regs[4] += T1; - } - -#pragma unroll 16 - for(int j=0;j<16;j++) - W1[j] = W2[j]; - } + for(int j=0;j<16;j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16] + W2[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int l=6; l >= 0; l--) regs[l+1] = regs[l]; + regs[0] = T1 + T2; + regs[4] += T1; + } -#pragma unroll 8 - for(int k=0;k<8;k++) - hash[k] += regs[k]; +////// PART 2 +#pragma unroll 2 + for(int j=0;j<2;j++) + W1[j] = s1(W2[14+j]) + W2[9+j] + s0(W2[1+j]) + W2[j]; +#pragma unroll 5 + for(int j=2;j<7;j++) + W1[j] = s1(W1[j-2]) + W2[9+j] + s0(W2[1+j]) + W2[j]; - ///// - ///// Zweite Runde (wegen Msg-Padding) - ///// #pragma unroll 8 - for(int k=0;k<8;k++) - regs[k] = hash[k]; + for(int j=7;j<15;j++) + W1[j] = s1(W1[j-2]) + W1[j-7] + s0(W2[1+j]) + W2[j]; - W1[0] = SWAB32(0x80); -#pragma unroll 14 - for(int k=1;k<15;k++) - W1[k] = 0; - W1[15] = 512; + W1[15] = s1(W1[13]) + W1[8] + s0(W1[0]) + W2[15]; -// Progress W1 + // Rundenfunktion #pragma unroll 16 - for(int j=0;j<16;j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j] + W1[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - - #pragma unroll 7 - for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; - regs[0] = T1 + T2; - regs[4] += T1; - } + for(int j=0;j<16;j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 32] + W1[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int l=6; l >= 0; l--) regs[l+1] = regs[l]; + regs[0] = T1 + T2; + regs[4] += T1; + } -// Progress W2...W3 -#pragma unroll 3 - for(int k=0;k<3;k++) - { +////// PART 3 #pragma unroll 2 - for(int j=0;j<2;j++) - W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; + for(int j=0;j<2;j++) + W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; #pragma unroll 5 - for(int j=2;j<7;j++) - W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; + for(int j=2;j<7;j++) + W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; #pragma unroll 8 - for(int j=7;j<15;j++) - W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; + for(int j=7;j<15;j++) + W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; - W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; + W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; - // Rundenfunktion + // Rundenfunktion #pragma unroll 16 - for(int j=0;j<16;j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - - #pragma unroll 7 - for (int l=6; l >= 0; l--) regs[l+1] = regs[l]; - regs[0] = T1 + T2; - regs[4] += T1; - } + for(int j=0;j<16;j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 48] + W2[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int l=6; l >= 0; l--) regs[l+1] = regs[l]; + regs[0] = T1 + T2; + regs[4] += T1; + } -#pragma unroll 16 - for(int j=0;j<16;j++) - W1[j] = W2[j]; - } +#pragma unroll 8 + for(int k=0;k<8;k++) + hash[k] += regs[k]; + ///// + ///// Zweite Runde (wegen Msg-Padding) + ///// #pragma unroll 8 - for(int k=0;k<8;k++) - hash[k] += regs[k]; + for(int k=0;k<8;k++) + regs[k] = hash[k]; - //// FERTIG +// Progress W1 +#pragma unroll 64 + for(int j=0;j<64;j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable2[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; + regs[0] = T1 + T2; + regs[4] += T1; + } #pragma unroll 8 - for(int k=0;k<8;k++) - message[k] = SWAB32(hash[k]); -} + for(int k=0;k<8;k++) + hash[k] += regs[k]; -#define SPH_C32(x) ((uint32_t)(x ## U)) -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) - -#define PC32up(j, r) ((uint32_t)((j) + (r))) -#define PC32dn(j, r) 0 -#define QC32up(j, r) 0xFFFFFFFF -#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24))) - -#define B32_0(x) __byte_perm(x, 0, 0x4440) -//((x) & 0xFF) -#define B32_1(x) __byte_perm(x, 0, 0x4441) -//(((x) >> 8) & 0xFF) -#define B32_2(x) __byte_perm(x, 0, 0x4442) -//(((x) >> 16) & 0xFF) -#define B32_3(x) __byte_perm(x, 0, 0x4443) -//((x) >> 24) - -#if MAXWELL_OR_FERMI -#define USE_SHARED 1 -// Maxwell and Fermi cards get the best speed with SHARED access it seems. -#if USE_SHARED -#define T0up(x) (*((uint32_t*)mixtabs + ( (x)))) -#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x)))) -#define T1up(x) (*((uint32_t*)mixtabs + (512+(x)))) -#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x)))) -#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x)))) -#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x)))) -#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x)))) -#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x)))) -#else -#define T0up(x) tex1Dfetch(t0up1, x) -#define T0dn(x) tex1Dfetch(t0dn1, x) -#define T1up(x) tex1Dfetch(t1up1, x) -#define T1dn(x) tex1Dfetch(t1dn1, x) -#define T2up(x) tex1Dfetch(t2up1, x) -#define T2dn(x) tex1Dfetch(t2dn1, x) -#define T3up(x) tex1Dfetch(t3up1, x) -#define T3dn(x) tex1Dfetch(t3dn1, x) -#endif -#else -#define USE_SHARED 1 -// a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5! -#define T0up(x) (*((uint32_t*)mixtabs + ( (x)))) -#define T0dn(x) tex1Dfetch(t0dn1, x) -#define T1up(x) tex1Dfetch(t1up1, x) -#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x)))) -#define T2up(x) tex1Dfetch(t2up1, x) -#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x)))) -#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x)))) -#define T3dn(x) tex1Dfetch(t3dn1, x) -#endif + //// FERTIG -texture t0up1; -texture t0dn1; -texture t1up1; -texture t1dn1; -texture t2up1; -texture t2dn1; -texture t3up1; -texture t3dn1; +#pragma unroll 8 + for(int k=0;k<8;k++) + message[k] = SWAB32(hash[k]); +} -extern uint32_t T0up_cpu[]; -extern uint32_t T0dn_cpu[]; -extern uint32_t T1up_cpu[]; -extern uint32_t T1dn_cpu[]; -extern uint32_t T2up_cpu[]; -extern uint32_t T2dn_cpu[]; -extern uint32_t T3up_cpu[]; -extern uint32_t T3dn_cpu[]; +__global__ void __launch_bounds__(256, 4) + myriadgroestl_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *hashBuffer) +{ + // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen + int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4; + if (thread < threads) + { + // GROESTL + uint32_t paddedInput[8]; +#pragma unroll 8 + for(int k=0;k<8;k++) paddedInput[k] = myriadgroestl_gpu_msg[4*k+threadIdx.x%4]; -#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) + uint32_t nounce = startNounce + thread; + if ((threadIdx.x % 4) == 3) + paddedInput[4] = SWAB32(nounce); // 4*4+3 = 19 + uint32_t msgBitsliced[8]; + to_bitslice_quad(paddedInput, msgBitsliced); -__device__ __forceinline__ void myriadgroestl_perm_P(uint32_t *a, char *mixtabs) -{ - uint32_t t[32]; - -//#pragma unroll 14 - for(int r=0;r<14;r++) - { - switch(r) - { - case 0: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break; - case 1: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break; - case 2: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break; - case 3: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break; - case 4: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break; - case 5: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break; - case 6: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break; - case 7: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break; - case 8: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break; - case 9: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break; - case 10: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break; - case 11: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break; - case 12: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break; - case 13: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break; - } + uint32_t state[8]; - // RBTT -#pragma unroll 16 - for(int k=0;k<32;k+=2) - { - uint32_t t0_0 = B32_0(a[(k ) & 0x1f]), t9_0 = B32_0(a[(k + 9) & 0x1f]); - uint32_t t2_1 = B32_1(a[(k + 2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]); - uint32_t t4_2 = B32_2(a[(k + 4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]); - uint32_t t6_3 = B32_3(a[(k + 6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]); - - t[k + 0] = T0up( t0_0 ) ^ T1up( t2_1 ) ^ T2up( t4_2 ) ^ T3up( t6_3 ) ^ - T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 ); + groestl512_progressMessage_quad(state, msgBitsliced); - t[k + 1] = T0dn( t0_0 ) ^ T1dn( t2_1 ) ^ T2dn( t4_2 ) ^ T3dn( t6_3 ) ^ - T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 ); - } -#pragma unroll 32 - for(int k=0;k<32;k++) - a[k] = t[k]; - } -} + uint32_t out_state[16]; + from_bitslice_quad(state, out_state); -__device__ __forceinline__ void myriadgroestl_perm_Q(uint32_t *a, char *mixtabs) -{ -//#pragma unroll 14 - for(int r=0;r<14;r++) - { - uint32_t t[32]; - - switch(r) - { - case 0: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break; - case 1: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break; - case 2: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break; - case 3: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break; - case 4: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} break; - case 5: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break; - case 6: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break; - case 7: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break; - case 8: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break; - case 9: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break; - case 10: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break; - case 11: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break; - case 12: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break; - case 13: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break; - } - - // RBTT -#pragma unroll 16 - for(int k=0;k<32;k+=2) + if ((threadIdx.x & 0x03) == 0) { - uint32_t t2_0 = B32_0(a[(k + 2) & 0x1f]), t1_0 = B32_0(a[(k + 1) & 0x1f]); - uint32_t t6_1 = B32_1(a[(k + 6) & 0x1f]), t5_1 = B32_1(a[(k + 5) & 0x1f]); - uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2 = B32_2(a[(k + 9) & 0x1f]); - uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]); - - t[k + 0] = T0up( t2_0 ) ^ T1up( t6_1 ) ^ T2up( t10_2 ) ^ T3up( t22_3 ) ^ - T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn( t9_2 ) ^ T3dn( t13_3 ); - - t[k + 1] = T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^ - T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up( t9_2 ) ^ T3up( t13_3 ); + uint32_t *outpHash = &hashBuffer[16 * thread]; +#pragma unroll 16 + for(int k=0;k<16;k++) outpHash[k] = out_state[k]; } -#pragma unroll 32 - for(int k=0;k<32;k++) - a[k] = t[k]; } } -__global__ void -myriadgroestl_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce) +__global__ void + myriadgroestl_gpu_hash_quad2(int threads, uint32_t startNounce, uint32_t *resNounce, uint32_t *hashBuffer) { -#if USE_SHARED - extern __shared__ char mixtabs[]; - - if (threadIdx.x < 256) - { - *((uint32_t*)mixtabs + ( threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x); - *((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x); - *((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x); - *((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x); - *((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x); - *((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x); - *((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x); - *((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x); - } - - __syncthreads(); -#endif - int thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - // GROESTL - uint32_t message[32]; - uint32_t state[32]; - -#pragma unroll 32 - for(int k=0;k<32;k++) message[k] = myriadgroestl_gpu_msg[k]; - - uint32_t nounce = startNounce + thread; - message[19] = SWAB32(nounce); - -#pragma unroll 32 - for(int u=0;u<32;u++) state[u] = message[u]; - state[31] ^= 0x20000; - - // Perm -#if USE_SHARED - myriadgroestl_perm_P(state, mixtabs); - state[31] ^= 0x20000; - myriadgroestl_perm_Q(message, mixtabs); -#else - myriadgroestl_perm_P(state, NULL); - state[31] ^= 0x20000; - myriadgroestl_perm_Q(message, NULL); -#endif -#pragma unroll 32 - for(int u=0;u<32;u++) state[u] ^= message[u]; - -#pragma unroll 32 - for(int u=0;u<32;u++) message[u] = state[u]; - -#if USE_SHARED - myriadgroestl_perm_P(message, mixtabs); -#else - myriadgroestl_perm_P(message, NULL); -#endif - -#pragma unroll 32 - for(int u=0;u<32;u++) state[u] ^= message[u]; + uint32_t nounce = startNounce + thread; uint32_t out_state[16]; + uint32_t *inpHash = &hashBuffer[16 * thread]; #pragma unroll 16 - for(int u=0;u<16;u++) out_state[u] = state[u+16]; + for (int i=0; i < 16; i++) + out_state[i] = inpHash[i]; + myriadgroestl_gpu_sha256(out_state); int i, position = -1; @@ -526,43 +309,35 @@ myriadgroestl_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce) } } -#define texDef(texname, texmem, texsource, texsize) \ - unsigned int *texmem; \ - cudaMalloc(&texmem, texsize); \ - cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ - texname.normalized = 0; \ - texname.filterMode = cudaFilterModePoint; \ - texname.addressMode[0] = cudaAddressModeClamp; \ - { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ - cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \ - // Setup-Funktionen __host__ void myriadgroestl_cpu_init(int thr_id, int threads) { - cudaSetDevice(device_map[thr_id]); - - cudaMemcpyToSymbol( myr_sha256_gpu_hashTable, - myr_sha256_cpu_hashTable, - sizeof(uint32_t) * 8 ); - - cudaMemcpyToSymbol( myr_sha256_gpu_constantTable, - myr_sha256_cpu_constantTable, - sizeof(uint32_t) * 64 ); - - cudaGetDeviceProperties(&props, device_map[thr_id]); - - // Texturen mit obigem Makro initialisieren - texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256); - texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256); - texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256); - texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256); - texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256); - texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256); - texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256); - texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256); + cudaSetDevice(device_map[thr_id]); + + cudaMemcpyToSymbol( myr_sha256_gpu_hashTable, + myr_sha256_cpu_hashTable, + sizeof(uint32_t) * 8 ); + + cudaMemcpyToSymbol( myr_sha256_gpu_constantTable, + myr_sha256_cpu_constantTable, + sizeof(uint32_t) * 64 ); + + // zweite CPU-Tabelle bauen und auf die GPU laden + uint32_t temp[64]; + for(int i=0;i<64;i++) + temp[i] = myr_sha256_cpu_w2Table[i] + myr_sha256_cpu_constantTable[i]; + + cudaMemcpyToSymbol( myr_sha256_gpu_constantTable2, + temp, + sizeof(uint32_t) * 64 ); + + cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]); // Speicher für Gewinner-Nonce belegen cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); + + // Speicher für temporäreHashes + cudaMalloc(&d_outputHashes[thr_id], 16*sizeof(uint32_t)*threads); } __host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn) @@ -594,25 +369,23 @@ __host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn __host__ void myriadgroestl_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce) { - // Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern, - // alle anderen mit 512 Threads. - int threadsperblock = (props.major >= 3) ? 768 : 512; + int threadsperblock = 256; - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); + // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle + // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl + const int factor=4; - // Größe des dynamischen Shared Memory Bereichs -#if USE_SHARED - size_t shared_size = 8 * 256 * sizeof(uint32_t); -#else - size_t shared_size = 0; -#endif + // Größe des dynamischen Shared Memory Bereichs + size_t shared_size = 0; -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - //fprintf(stderr, "ThrID: %d\n", thr_id); cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); - myriadgroestl_gpu_hash<<>>(threads, startNounce, d_resultNonce[thr_id]); + // berechne wie viele Thread Blocks wir brauchen + dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock)); + dim3 block(threadsperblock); + + myriadgroestl_gpu_hash_quad<<>>(threads, startNounce, d_outputHashes[thr_id]); + dim3 grid2((threads + threadsperblock-1)/threadsperblock); + myriadgroestl_gpu_hash_quad2<<>>(threads, startNounce, d_resultNonce[thr_id], d_outputHashes[thr_id]); // Strategisches Sleep Kommando zur Senkung der CPU Last MyStreamSynchronize(NULL, 0, thr_id); diff --git a/cuda_nist5.cu b/cuda_nist5.cu index 0296b4e99a..4e37f6968c 100644 --- a/cuda_nist5.cu +++ b/cuda_nist5.cu @@ -85,7 +85,6 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata, { const uint32_t first_nonce = pdata[19]; - // TODO: entfernen für eine Release! Ist nur zum Testen! if (opt_benchmark) ((uint32_t*)ptarget)[7] = 0x0000ff; diff --git a/groestl_functions_quad.cu b/groestl_functions_quad.cu new file mode 100644 index 0000000000..745a843d54 --- /dev/null +++ b/groestl_functions_quad.cu @@ -0,0 +1,315 @@ + +__device__ __forceinline__ void G256_Mul2(uint32_t *regs) +{ + uint32_t tmp = regs[7]; + regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] ^ tmp; + regs[3] = regs[2] ^ tmp; + regs[2] = regs[1]; + regs[1] = regs[0] ^ tmp; + regs[0] = tmp; +} + +__device__ __forceinline__ void G256_AddRoundConstantQ_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, int round) +{ + x0 = ~x0; + x1 = ~x1; + x2 = ~x2; + x3 = ~x3; + x4 = ~x4; + x5 = ~x5; + x6 = ~x6; + x7 = ~x7; + + if ((threadIdx.x & 0x03) == 3) { + x0 ^= ((- (round & 0x01) ) & 0xFFFF0000); + x1 ^= ((-((round & 0x02)>>1)) & 0xFFFF0000); + x2 ^= ((-((round & 0x04)>>2)) & 0xFFFF0000); + x3 ^= ((-((round & 0x08)>>3)) & 0xFFFF0000); + x4 ^= 0xAAAA0000; + x5 ^= 0xCCCC0000; + x6 ^= 0xF0F00000; + x7 ^= 0xFF000000; + } +} + +__device__ __forceinline__ void G256_AddRoundConstantP_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, int round) +{ + if ((threadIdx.x & 0x03) == 0) + { + x4 ^= 0xAAAA; + x5 ^= 0xCCCC; + x6 ^= 0xF0F0; + x7 ^= 0xFF00; + + x0 ^= ((- (round & 0x01) ) & 0xFFFF); + x1 ^= ((-((round & 0x02)>>1)) & 0xFFFF); + x2 ^= ((-((round & 0x04)>>2)) & 0xFFFF); + x3 ^= ((-((round & 0x08)>>3)) & 0xFFFF); + } +} + +__device__ __forceinline__ void G16mul_quad(uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, + uint32_t &y3, uint32_t &y2, uint32_t &y1, uint32_t &y0) +{ + uint32_t t0,t1,t2; + + t0 = ((x2 ^ x0) ^ (x3 ^ x1)) & ((y2 ^ y0) ^ (y3 ^ y1)); + t1 = ((x2 ^ x0) & (y2 ^ y0)) ^ t0; + t2 = ((x3 ^ x1) & (y3 ^ y1)) ^ t0 ^ t1; + + t0 = (x2^x3) & (y2^y3); + x3 = (x3 & y3) ^ t0 ^ t1; + x2 = (x2 & y2) ^ t0 ^ t2; + + t0 = (x0^x1) & (y0^y1); + x1 = (x1 & y1) ^ t0 ^ t1; + x0 = (x0 & y0) ^ t0 ^ t2; +} + +__device__ __forceinline__ void G256_inv_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0) +{ + uint32_t t0,t1,t2,t3,t4,t5,t6,a,b; + + t3 = x7; + t2 = x6; + t1 = x5; + t0 = x4; + + G16mul_quad(t3, t2, t1, t0, x3, x2, x1, x0); + + a = (x4 ^ x0); + t0 ^= a; + t2 ^= (x7 ^ x3) ^ (x5 ^ x1); + t1 ^= (x5 ^ x1) ^ a; + t3 ^= (x6 ^ x2) ^ a; + + b = t0 ^ t1; + t4 = (t2 ^ t3) & b; + a = t4 ^ t3 ^ t1; + t5 = (t3 & t1) ^ a; + t6 = (t2 & t0) ^ a ^ (t2 ^ t0); + + t4 = (t5 ^ t6) & b; + t1 = (t6 & t1) ^ t4; + t0 = (t5 & t0) ^ t4; + + t4 = (t5 ^ t6) & (t2^t3); + t3 = (t6 & t3) ^ t4; + t2 = (t5 & t2) ^ t4; + + G16mul_quad(x3, x2, x1, x0, t1, t0, t3, t2); + + G16mul_quad(x7, x6, x5, x4, t1, t0, t3, t2); +} + +__device__ __forceinline__ void transAtoX_quad(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &x4, uint32_t &x5, uint32_t &x6, uint32_t &x7) +{ + uint32_t t0, t1; + t0 = x0 ^ x1 ^ x2; + t1 = x5 ^ x6; + x2 = t0 ^ t1 ^ x7; + x6 = t0 ^ x3 ^ x6; + x3 = x0 ^ x1 ^ x3 ^ x4 ^ x7; + x4 = x0 ^ x4 ^ t1; + x2 = t0 ^ t1 ^ x7; + x1 = x0 ^ x1 ^ t1; + x7 = x0 ^ t1 ^ x7; + x5 = x0 ^ t1; +} + +__device__ __forceinline__ void transXtoA_quad(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &x4, uint32_t &x5, uint32_t &x6, uint32_t &x7) +{ + uint32_t t0,t2,t3,t5; + + x1 ^= x4; + t0 = x1 ^ x6; + x1 ^= x5; + + t2 = x0 ^ x2; + x2 = x3 ^ x5; + t2 ^= x2 ^ x6; + x2 ^= x7; + t3 = x4 ^ x2 ^ x6; + + t5 = x0 ^ x6; + x4 = x3 ^ x7; + x0 = x3 ^ x5; + + x6 = t0; + x3 = t2; + x7 = t3; + x5 = t5; +} + +__device__ __forceinline__ void sbox_quad(uint32_t *r) +{ + transAtoX_quad(r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7]); + + G256_inv_quad(r[2], r[4], r[1], r[7], r[3], r[0], r[5], r[6]); + + transXtoA_quad(r[7], r[1], r[4], r[2], r[6], r[5], r[0], r[3]); + + r[0] = ~r[0]; + r[1] = ~r[1]; + r[5] = ~r[5]; + r[6] = ~r[6]; +} + +__device__ __forceinline__ void G256_ShiftBytesP_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0) +{ + uint32_t t0,t1; + + int tpos = threadIdx.x & 0x03; + int shift1 = tpos << 1; + int shift2 = shift1+1 + ((tpos == 3)<<2); + + t0 = __byte_perm(x0, 0, 0x1010)>>shift1; + t1 = __byte_perm(x0, 0, 0x3232)>>shift2; + x0 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x1, 0, 0x1010)>>shift1; + t1 = __byte_perm(x1, 0, 0x3232)>>shift2; + x1 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x2, 0, 0x1010)>>shift1; + t1 = __byte_perm(x2, 0, 0x3232)>>shift2; + x2 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x3, 0, 0x1010)>>shift1; + t1 = __byte_perm(x3, 0, 0x3232)>>shift2; + x3 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x4, 0, 0x1010)>>shift1; + t1 = __byte_perm(x4, 0, 0x3232)>>shift2; + x4 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x5, 0, 0x1010)>>shift1; + t1 = __byte_perm(x5, 0, 0x3232)>>shift2; + x5 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x6, 0, 0x1010)>>shift1; + t1 = __byte_perm(x6, 0, 0x3232)>>shift2; + x6 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x7, 0, 0x1010)>>shift1; + t1 = __byte_perm(x7, 0, 0x3232)>>shift2; + x7 = __byte_perm(t0, t1, 0x5410); +} + +__device__ __forceinline__ void G256_ShiftBytesQ_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0) +{ + uint32_t t0,t1; + + int tpos = threadIdx.x & 0x03; + int shift1 = (1-(tpos>>1)) + ((tpos & 0x01)<<2); + int shift2 = shift1+2 + ((tpos == 1)<<2); + + t0 = __byte_perm(x0, 0, 0x1010)>>shift1; + t1 = __byte_perm(x0, 0, 0x3232)>>shift2; + x0 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x1, 0, 0x1010)>>shift1; + t1 = __byte_perm(x1, 0, 0x3232)>>shift2; + x1 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x2, 0, 0x1010)>>shift1; + t1 = __byte_perm(x2, 0, 0x3232)>>shift2; + x2 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x3, 0, 0x1010)>>shift1; + t1 = __byte_perm(x3, 0, 0x3232)>>shift2; + x3 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x4, 0, 0x1010)>>shift1; + t1 = __byte_perm(x4, 0, 0x3232)>>shift2; + x4 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x5, 0, 0x1010)>>shift1; + t1 = __byte_perm(x5, 0, 0x3232)>>shift2; + x5 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x6, 0, 0x1010)>>shift1; + t1 = __byte_perm(x6, 0, 0x3232)>>shift2; + x6 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x7, 0, 0x1010)>>shift1; + t1 = __byte_perm(x7, 0, 0x3232)>>shift2; + x7 = __byte_perm(t0, t1, 0x5410); +} + +__device__ __forceinline__ void G256_MixFunction_quad(uint32_t *r) +{ +#define SHIFT64_16(hi, lo) __byte_perm(lo, hi, 0x5432) +#define A(v, u) __shfl((int)r[v], ((threadIdx.x+u)&0x03), 4) +#define S(idx, l) SHIFT64_16( A(idx, (l+1)), A(idx, l) ) + +#define DOUBLE_ODD(i, bc) ( S(i, (bc)) ^ A(i, (bc) + 1) ) +#define DOUBLE_EVEN(i, bc) ( S(i, (bc)) ^ A(i, (bc) ) ) + +#define SINGLE_ODD(i, bc) ( S(i, (bc)) ) +#define SINGLE_EVEN(i, bc) ( A(i, (bc)) ) + uint32_t b[8]; + +#pragma unroll 8 + for(int i=0;i<8;i++) + b[i] = DOUBLE_ODD(i, 1) ^ DOUBLE_EVEN(i, 3); + + G256_Mul2(b); +#pragma unroll 8 + for(int i=0;i<8;i++) + b[i] = b[i] ^ DOUBLE_ODD(i, 3) ^ DOUBLE_ODD(i, 4) ^ SINGLE_ODD(i, 6); + + G256_Mul2(b); +#pragma unroll 8 + for(int i=0;i<8;i++) + r[i] = b[i] ^ DOUBLE_EVEN(i, 2) ^ DOUBLE_EVEN(i, 3) ^ SINGLE_EVEN(i, 5); + +#undef S +#undef A +#undef SHIFT64_16 +#undef t +#undef X +} + +__device__ __forceinline__ void groestl512_perm_P_quad(uint32_t *r) +{ + for(int round=0;round<14;round++) + { + G256_AddRoundConstantP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], round); + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + } +} + +__device__ __forceinline__ void groestl512_perm_Q_quad(uint32_t *r) +{ + for(int round=0;round<14;round++) + { + G256_AddRoundConstantQ_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], round); + sbox_quad(r); + G256_ShiftBytesQ_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + } +} + +__device__ __forceinline__ void groestl512_progressMessage_quad(uint32_t *state, uint32_t *message) +{ +#pragma unroll 8 + for(int u=0;u<8;u++) state[u] = message[u]; + + if ((threadIdx.x & 0x03) == 3) state[ 1] ^= 0x00008000; + groestl512_perm_P_quad(state); + if ((threadIdx.x & 0x03) == 3) state[ 1] ^= 0x00008000; + groestl512_perm_Q_quad(message); +#pragma unroll 8 + for(int u=0;u<8;u++) state[u] ^= message[u]; +#pragma unroll 8 + for(int u=0;u<8;u++) message[u] = state[u]; + groestl512_perm_P_quad(message); +#pragma unroll 8 + for(int u=0;u<8;u++) state[u] ^= message[u]; +} diff --git a/groestlcoin.cpp b/groestlcoin.cpp index c8b785056e..dc3b0feabe 100644 --- a/groestlcoin.cpp +++ b/groestlcoin.cpp @@ -15,163 +15,118 @@ void sha256func(unsigned char *hash, const unsigned char *data, int len) { - uint32_t S[16], T[16]; - int i, r; - - sha256_init(S); - for (r = len; r > -9; r -= 64) { - if (r < 64) - memset(T, 0, 64); - memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r)); - if (r >= 0 && r < 64) - ((unsigned char *)T)[r] = 0x80; - for (i = 0; i < 16; i++) - T[i] = be32dec(T + i); - if (r < 56) - T[15] = 8 * len; - sha256_transform(S, T, 0); - } - /* - memcpy(S + 8, sha256d_hash1 + 8, 32); - sha256_init(T); - sha256_transform(T, S, 0); - */ - for (i = 0; i < 8; i++) - be32enc((uint32_t *)hash + i, T[i]); + uint32_t S[16], T[16]; + int i, r; + + sha256_init(S); + for (r = len; r > -9; r -= 64) { + if (r < 64) + memset(T, 0, 64); + memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r)); + if (r >= 0 && r < 64) + ((unsigned char *)T)[r] = 0x80; + for (i = 0; i < 16; i++) + T[i] = be32dec(T + i); + if (r < 56) + T[15] = 8 * len; + sha256_transform(S, T, 0); + } + /* + memcpy(S + 8, sha256d_hash1 + 8, 32); + sha256_init(T); + sha256_transform(T, S, 0); + */ + for (i = 0; i < 8; i++) + be32enc((uint32_t *)hash + i, T[i]); } static void groestlhash(void *state, const void *input) { - // Tryout GPU-groestl + // Tryout GPU-groestl sph_groestl512_context ctx_groestl[2]; static unsigned char pblank[1]; - int ii; uint32_t mask = 8; uint32_t zero = 0; - //these uint512 in the c++ source of the client are backed by an array of uint32 - uint32_t hashA[16], hashB[16]; + //these uint512 in the c++ source of the client are backed by an array of uint32 + uint32_t hashA[16], hashB[16]; sph_groestl512_init(&ctx_groestl[0]); sph_groestl512 (&ctx_groestl[0], input, 80); //6 - sph_groestl512_close(&ctx_groestl[0], hashA); //7 + sph_groestl512_close(&ctx_groestl[0], hashA); //7 - sph_groestl512_init(&ctx_groestl[1]); - sph_groestl512 (&ctx_groestl[1], hashA, 64); //6 + sph_groestl512_init(&ctx_groestl[1]); + sph_groestl512 (&ctx_groestl[1], hashA, 64); //6 sph_groestl512_close(&ctx_groestl[1], hashB); //7 - memcpy(state, hashB, 32); + memcpy(state, hashB, 32); } - +extern bool opt_benchmark; extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) -{ - uint32_t start_nonce = pdata[19]++; - const uint32_t Htarg = ptarget[7]; - const uint32_t throughPut = 4096 * 128; - //const uint32_t throughPut = 1; - int i; - uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t)); - - // init - static bool init[8] = { false, false, false, false, false, false, false, false }; - if(!init[thr_id]) - { - groestlcoin_cpu_init(thr_id, throughPut); - init[thr_id] = true; - } - - // Endian Drehung ist notwendig - //char testdata[] = {"\x70\x00\x00\x00\x5d\x38\x5b\xa1\x14\xd0\x79\x97\x0b\x29\xa9\x41\x8f\xd0\x54\x9e\x7d\x68\xa9\x5c\x7f\x16\x86\x21\xa3\x14\x20\x10\x00\x00\x00\x00\x57\x85\x86\xd1\x49\xfd\x07\xb2\x2f\x3a\x8a\x34\x7c\x51\x6d\xe7\x05\x2f\x03\x4d\x2b\x76\xff\x68\xe0\xd6\xec\xff\x9b\x77\xa4\x54\x89\xe3\xfd\x51\x17\x32\x01\x1d\xf0\x73\x10\x00"}; - //pdata = (uint32_t*)testdata; - uint32_t endiandata[32]; - for (int kk=0; kk < 32; kk++) - be32enc(&endiandata[kk], pdata[kk]); - - // Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt) - groestlcoin_cpu_setBlock(thr_id, endiandata, (void*)ptarget); - - do { - // GPU - uint32_t foundNounce = 0xFFFFFFFF; - - groestlcoin_cpu_hash(thr_id, throughPut, pdata[19], outputHash, &foundNounce); - - /* - { - for(i=0;i<<>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); else if (BLOCKSIZE == 84) diff --git a/heavy/cuda_combine.cu b/heavy/cuda_combine.cu index c9036f3515..e2a8b721a2 100644 --- a/heavy/cuda_combine.cu +++ b/heavy/cuda_combine.cu @@ -141,8 +141,6 @@ void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *h // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - combine_gpu_hash<<>>(threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], d_nonceVector[thr_id]); // da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden diff --git a/heavy/cuda_groestl512.cu b/heavy/cuda_groestl512.cu index bf86105a51..5b1b26745a 100644 --- a/heavy/cuda_groestl512.cu +++ b/heavy/cuda_groestl512.cu @@ -824,8 +824,6 @@ __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce) // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - if (BLOCKSIZE == 84) groestl512_gpu_hash<84><<>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); else if (BLOCKSIZE == 80) diff --git a/heavy/cuda_hefty1.cu b/heavy/cuda_hefty1.cu index db5fca3321..4b60818c15 100644 --- a/heavy/cuda_hefty1.cu +++ b/heavy/cuda_hefty1.cu @@ -416,8 +416,6 @@ __host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce) size_t shared_size = 0; #endif -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - hefty_gpu_hash<<>>(threads, startNounce, (void*)d_heftyHashes[thr_id]); // Strategisches Sleep Kommando zur Senkung der CPU Last diff --git a/heavy/cuda_keccak512.cu b/heavy/cuda_keccak512.cu index 958579392c..eb69e3bc3b 100644 --- a/heavy/cuda_keccak512.cu +++ b/heavy/cuda_keccak512.cu @@ -279,7 +279,6 @@ __host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce) // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); if (BLOCKSIZE==84) keccak512_gpu_hash<84><<>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); else if (BLOCKSIZE==80) diff --git a/heavy/cuda_sha256.cu b/heavy/cuda_sha256.cu index 404a2a29b1..f520778b51 100644 --- a/heavy/cuda_sha256.cu +++ b/heavy/cuda_sha256.cu @@ -271,7 +271,6 @@ __host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce) // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); if (BLOCKSIZE == 84) sha256_gpu_hash<84><<>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); else if (BLOCKSIZE == 80) { diff --git a/myriadgroestl.cpp b/myriadgroestl.cpp index 6ccecce599..45d9745102 100644 --- a/myriadgroestl.cpp +++ b/myriadgroestl.cpp @@ -35,17 +35,19 @@ static void myriadhash(void *state, const void *input) memcpy(state, hashB, 32); } - +extern bool opt_benchmark; extern "C" int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) { - uint32_t start_nonce = pdata[19]++; + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x000000ff; + + uint32_t start_nonce = pdata[19]++; const uint32_t throughPut = 128 * 1024; -// const uint32_t throughPut = 1; + uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t)); - // TODO: entfernen für eine Release! Ist nur zum Testen! if (opt_benchmark) ((uint32_t*)ptarget)[7] = 0x0000ff; diff --git a/quark/animecoin.cu b/quark/animecoin.cu index fa771aa9e3..6d395be0d9 100644 --- a/quark/animecoin.cu +++ b/quark/animecoin.cu @@ -175,7 +175,6 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata, { const uint32_t first_nonce = pdata[19]; - // TODO: entfernen für eine Release! Ist nur zum Testen! if (opt_benchmark) ((uint32_t*)ptarget)[7] = 0x00000f; diff --git a/quark/cuda_bmw512.cu b/quark/cuda_bmw512.cu index 7c706f228c..760d028883 100644 --- a/quark/cuda_bmw512.cu +++ b/quark/cuda_bmw512.cu @@ -447,8 +447,6 @@ __host__ void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNo // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - quark_bmw512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); MyStreamSynchronize(NULL, order, thr_id); } @@ -464,8 +462,6 @@ __host__ void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNo // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - quark_bmw512_gpu_hash_80<<>>(threads, startNounce, (uint64_t*)d_hash); MyStreamSynchronize(NULL, order, thr_id); } diff --git a/quark/cuda_jh512.cu b/quark/cuda_jh512.cu index 779af44f67..c55ac1a977 100644 --- a/quark/cuda_jh512.cu +++ b/quark/cuda_jh512.cu @@ -350,8 +350,6 @@ __host__ void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNou // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - quark_jh512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); MyStreamSynchronize(NULL, order, thr_id); } diff --git a/quark/cuda_quark_blake512.cu b/quark/cuda_quark_blake512.cu index 80e500eea3..eabef09fc2 100644 --- a/quark/cuda_quark_blake512.cu +++ b/quark/cuda_quark_blake512.cu @@ -406,8 +406,6 @@ __host__ void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t start // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - quark_blake512_gpu_hash_64<<>>(threads, startNounce, d_nonceVector, (uint64_t*)d_outputHash); // Strategisches Sleep Kommando zur Senkung der CPU Last @@ -425,8 +423,6 @@ __host__ void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t start // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - quark_blake512_gpu_hash_80<<>>(threads, startNounce, d_outputHash); // Strategisches Sleep Kommando zur Senkung der CPU Last diff --git a/quark/cuda_quark_checkhash.cu b/quark/cuda_quark_checkhash.cu index c4052f2336..8502bf4336 100644 --- a/quark/cuda_quark_checkhash.cu +++ b/quark/cuda_quark_checkhash.cu @@ -89,8 +89,6 @@ __host__ uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t star // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - quark_check_gpu_hash_64<<>>(threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]); // Strategisches Sleep Kommando zur Senkung der CPU Last diff --git a/quark/cuda_quark_groestl512.cu b/quark/cuda_quark_groestl512.cu index 3d71853208..dbe23c8f9f 100644 --- a/quark/cuda_quark_groestl512.cu +++ b/quark/cuda_quark_groestl512.cu @@ -1,4 +1,4 @@ -// Auf QuarkCoin spezialisierte Version von Groestl +// Auf QuarkCoin spezialisierte Version von Groestl inkl. Bitslice #include #include "cuda_runtime.h" @@ -7,9 +7,6 @@ #include #include -// it's unfortunate that this is a compile time constant. -#define MAXWELL_OR_FERMI 1 - // aus cpu-miner.c extern int device_map[8]; @@ -18,353 +15,137 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t // Folgende Definitionen später durch header ersetzen typedef unsigned char uint8_t; +typedef unsigned short uint16_t; typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; // diese Struktur wird in der Init Funktion angefordert static cudaDeviceProp props[8]; -#define SPH_C32(x) ((uint32_t)(x ## U)) -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) - -#define PC32up(j, r) ((uint32_t)((j) + (r))) -#define PC32dn(j, r) 0 -#define QC32up(j, r) 0xFFFFFFFF -#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24))) - -#define B32_0(x) __byte_perm(x, 0, 0x4440) -//((x) & 0xFF) -#define B32_1(x) __byte_perm(x, 0, 0x4441) -//(((x) >> 8) & 0xFF) -#define B32_2(x) __byte_perm(x, 0, 0x4442) -//(((x) >> 16) & 0xFF) -#define B32_3(x) __byte_perm(x, 0, 0x4443) -//((x) >> 24) - -#if MAXWELL_OR_FERMI -#define USE_SHARED 1 -// Maxwell and Fermi cards get the best speed with SHARED access it seems. -#if USE_SHARED -#define T0up(x) (*((uint32_t*)mixtabs + ( (x)))) -#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x)))) -#define T1up(x) (*((uint32_t*)mixtabs + (512+(x)))) -#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x)))) -#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x)))) -#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x)))) -#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x)))) -#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x)))) -#else -#define T0up(x) tex1Dfetch(t0up1, x) -#define T0dn(x) tex1Dfetch(t0dn1, x) -#define T1up(x) tex1Dfetch(t1up1, x) -#define T1dn(x) tex1Dfetch(t1dn1, x) -#define T2up(x) tex1Dfetch(t2up1, x) -#define T2dn(x) tex1Dfetch(t2dn1, x) -#define T3up(x) tex1Dfetch(t3up1, x) -#define T3dn(x) tex1Dfetch(t3dn1, x) -#endif -#else -#define USE_SHARED 1 -// a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5! -#define T0up(x) (*((uint32_t*)mixtabs + ( (x)))) -#define T0dn(x) tex1Dfetch(t0dn1, x) -#define T1up(x) tex1Dfetch(t1up1, x) -#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x)))) -#define T2up(x) tex1Dfetch(t2up1, x) -#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x)))) -#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x)))) -#define T3dn(x) tex1Dfetch(t3dn1, x) -#endif - -texture t0up1; -texture t0dn1; -texture t1up1; -texture t1dn1; -texture t2up1; -texture t2dn1; -texture t3up1; -texture t3dn1; - -extern uint32_t T0up_cpu[]; -extern uint32_t T0dn_cpu[]; -extern uint32_t T1up_cpu[]; -extern uint32_t T1dn_cpu[]; -extern uint32_t T2up_cpu[]; -extern uint32_t T2dn_cpu[]; -extern uint32_t T3up_cpu[]; -extern uint32_t T3dn_cpu[]; - -__device__ __forceinline__ void quark_groestl512_perm_P(uint32_t *a, char *mixtabs) -{ - uint32_t t[32]; +// 64 Register Variante für Compute 3.0 +#include "groestl_functions_quad.cu" +#include "bitslice_transformations_quad.cu" -//#pragma unroll 14 - for(int r=0;r<14;r++) +__global__ void __launch_bounds__(256, 4) + quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector) +{ + // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen + int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2; + if (thread < threads) { - switch(r) - { - case 0: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 0); break; - case 1: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 1); break; - case 2: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 2); break; - case 3: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 3); break; - case 4: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 4); break; - case 5: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 5); break; - case 6: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 6); break; - case 7: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 7); break; - case 8: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 8); break; - case 9: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 9); break; - case 10: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 10); break; - case 11: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 11); break; - case 12: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 12); break; - case 13: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 13); break; - } + // GROESTL + uint32_t message[8]; + uint32_t state[8]; - // RBTT -#pragma unroll 16 - for(int k=0;k<32;k+=2) - { - uint32_t t0_0 = B32_0(a[(k ) & 0x1f]), t9_0 = B32_0(a[(k + 9) & 0x1f]); - uint32_t t2_1 = B32_1(a[(k + 2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]); - uint32_t t4_2 = B32_2(a[(k + 4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]); - uint32_t t6_3 = B32_3(a[(k + 6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]); - - t[k + 0] = T0up( t0_0 ) ^ T1up( t2_1 ) ^ T2up( t4_2 ) ^ T3up( t6_3 ) ^ - T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 ); - - t[k + 1] = T0dn( t0_0 ) ^ T1dn( t2_1 ) ^ T2dn( t4_2 ) ^ T3dn( t6_3 ) ^ - T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 ); - } -#pragma unroll 32 - for(int k=0;k<32;k++) - a[k] = t[k]; - } -} + uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); -__device__ __forceinline__ void quark_groestl512_perm_Q(uint32_t *a, char *mixtabs) -{ -//#pragma unroll 14 - for(int r=0;r<14;r++) - { - uint32_t t[32]; + int hashPosition = nounce - startNounce; + uint32_t *inpHash = &g_hash[hashPosition<<4]; - switch(r) - { - case 0: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 0); a[(k*2)+1] ^= QC32dn(k<< 4, 0);} break; - case 1: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 1); a[(k*2)+1] ^= QC32dn(k<< 4, 1);} break; - case 2: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 2); a[(k*2)+1] ^= QC32dn(k<< 4, 2);} break; - case 3: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 3); a[(k*2)+1] ^= QC32dn(k<< 4, 3);} break; - case 4: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 4); a[(k*2)+1] ^= QC32dn(k<< 4, 4);} break; - case 5: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 5); a[(k*2)+1] ^= QC32dn(k<< 4, 5);} break; - case 6: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 6); a[(k*2)+1] ^= QC32dn(k<< 4, 6);} break; - case 7: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 7); a[(k*2)+1] ^= QC32dn(k<< 4, 7);} break; - case 8: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 8); a[(k*2)+1] ^= QC32dn(k<< 4, 8);} break; - case 9: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 9); a[(k*2)+1] ^= QC32dn(k<< 4, 9);} break; - case 10: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 10); a[(k*2)+1] ^= QC32dn(k<< 4, 10);} break; - case 11: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 11); a[(k*2)+1] ^= QC32dn(k<< 4, 11);} break; - case 12: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 12); a[(k*2)+1] ^= QC32dn(k<< 4, 12);} break; - case 13: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 13); a[(k*2)+1] ^= QC32dn(k<< 4, 13);} break; - } +#pragma unroll 4 + for(int k=0;k<4;k++) message[k] = inpHash[(k<<2) + (threadIdx.x&0x03)]; +#pragma unroll 4 + for(int k=4;k<8;k++) message[k] = 0; - // RBTT -#pragma unroll 16 - for(int k=0;k<32;k+=2) + if ((threadIdx.x&0x03) == 0) message[4] = 0x80; + if ((threadIdx.x&0x03) == 3) message[7] = 0x01000000; + + uint32_t msgBitsliced[8]; + to_bitslice_quad(message, msgBitsliced); + + groestl512_progressMessage_quad(state, msgBitsliced); + + // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash + uint32_t *outpHash = &g_hash[hashPosition<<4]; + uint32_t hash[16]; + from_bitslice_quad(state, hash); + + if ((threadIdx.x & 0x03) == 0) { - uint32_t t2_0 = B32_0(a[(k + 2) & 0x1f]), t1_0 = B32_0(a[(k + 1) & 0x1f]); - uint32_t t6_1 = B32_1(a[(k + 6) & 0x1f]), t5_1 = B32_1(a[(k + 5) & 0x1f]); - uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2 = B32_2(a[(k + 9) & 0x1f]); - uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]); - - t[k + 0] = T0up( t2_0 ) ^ T1up( t6_1 ) ^ T2up( t10_2 ) ^ T3up( t22_3 ) ^ - T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn( t9_2 ) ^ T3dn( t13_3 ); - - t[k + 1] = T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^ - T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up( t9_2 ) ^ T3up( t13_3 ); +#pragma unroll 16 + for(int k=0;k<16;k++) outpHash[k] = hash[k]; } -#pragma unroll 32 - for(int k=0;k<32;k++) - a[k] = t[k]; } } -__global__ void quark_groestl512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector) -{ -#if USE_SHARED - extern __shared__ char mixtabs[]; - - if (threadIdx.x < 256) - { - *((uint32_t*)mixtabs + ( threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x); - *((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x); - *((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x); - *((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x); - *((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x); - *((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x); - *((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x); - *((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x); - } - - __syncthreads(); -#endif - int thread = (blockDim.x * blockIdx.x + threadIdx.x); +__global__ void __launch_bounds__(256, 4) + quark_doublegroestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x)>>2; if (thread < threads) { // GROESTL - uint32_t message[32]; - uint32_t state[32]; + uint32_t message[8]; + uint32_t state[8]; uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); int hashPosition = nounce - startNounce; - uint32_t *inpHash = &g_hash[16 * hashPosition]; + uint32_t *inpHash = &g_hash[hashPosition<<4]; -#pragma unroll 16 - for(int k=0;k<16;k++) message[k] = inpHash[k]; -#pragma unroll 14 - for(int k=1;k<15;k++) - message[k+16] = 0; - - message[16] = 0x80; - message[31] = 0x01000000; - -#pragma unroll 32 - for(int u=0;u<32;u++) state[u] = message[u]; - state[31] ^= 0x20000; - - // Perm -#if USE_SHARED - quark_groestl512_perm_P(state, mixtabs); - state[31] ^= 0x20000; - quark_groestl512_perm_Q(message, mixtabs); -#else - quark_groestl512_perm_P(state, NULL); - state[31] ^= 0x20000; - quark_groestl512_perm_Q(message, NULL); -#endif -#pragma unroll 32 - for(int u=0;u<32;u++) state[u] ^= message[u]; - -#pragma unroll 32 - for(int u=0;u<32;u++) message[u] = state[u]; - -#if USE_SHARED - quark_groestl512_perm_P(message, mixtabs); -#else - quark_groestl512_perm_P(message, NULL); -#endif - -#pragma unroll 32 - for(int u=0;u<32;u++) state[u] ^= message[u]; - // Erzeugten Hash rausschreiben - - uint32_t *outpHash = &g_hash[16 * hashPosition]; +#pragma unroll 4 + for(int k=0;k<4;k++) message[k] = inpHash[(k<<2)+(threadIdx.x&0x03)]; +#pragma unroll 4 + for(int k=4;k<8;k++) message[k] = 0; + + if ((threadIdx.x&0x03) == 0) message[4] = 0x80; + if ((threadIdx.x&0x03) == 3) message[7] = 0x01000000; + + uint32_t msgBitsliced[8]; + to_bitslice_quad(message, msgBitsliced); + + for (int round=0; round<2; round++) + { + groestl512_progressMessage_quad(state, msgBitsliced); + + if (round < 1) + { + // Verkettung zweier Runden inclusive Padding. + msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + (((threadIdx.x%4)==3)<<13)); + msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341); + msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341); + msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341); + msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341); + msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341); + msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341); + msgBitsliced[ 7] = __byte_perm(state[ 7], 0x00800100, 0x4341 + (((threadIdx.x%4)==0)<<4)); + } + } + // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash + uint32_t *outpHash = &g_hash[hashPosition<<4]; + uint32_t hash[16]; + from_bitslice_quad(state, hash); + + if ((threadIdx.x & 0x03) == 0) + { #pragma unroll 16 - for(int k=0;k<16;k++) outpHash[k] = state[k+16]; + for(int k=0;k<16;k++) outpHash[k] = hash[k]; + } } } -#define texDef(texname, texmem, texsource, texsize) \ - unsigned int *texmem; \ - cudaMalloc(&texmem, texsize); \ - cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ - texname.normalized = 0; \ - texname.filterMode = cudaFilterModePoint; \ - texname.addressMode[0] = cudaAddressModeClamp; \ - { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ - cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \ - // Setup-Funktionen __host__ void quark_groestl512_cpu_init(int thr_id, int threads) { cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]); - -// Texturen mit obigem Makro initialisieren - texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256); - texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256); - texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256); - texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256); - texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256); - texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256); - texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256); - texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256); } __host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { - // Compute 3.5 und 5.x Geräte am besten mit 768 Threads ansteuern, - // alle anderen mit 512 Threads. - int threadsperblock = ((props[thr_id].major == 3 && props[thr_id].minor == 5) || props[thr_id].major > 3) ? 768 : 512; + int threadsperblock = 256; + + // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle + // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl + const int factor = 4; // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock)); dim3 block(threadsperblock); // Größe des dynamischen Shared Memory Bereichs -#if USE_SHARED - size_t shared_size = 8 * 256 * sizeof(uint32_t); -#else size_t shared_size = 0; -#endif -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - //fprintf(stderr, "ThrID: %d\n", thr_id); - quark_groestl512_gpu_hash_64<<>>(threads, startNounce, d_hash, d_nonceVector); + quark_groestl512_gpu_hash_64_quad<<>>(threads, startNounce, d_hash, d_nonceVector); // Strategisches Sleep Kommando zur Senkung der CPU Last MyStreamSynchronize(NULL, order, thr_id); @@ -372,25 +153,20 @@ __host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t sta __host__ void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { - // Compute 3.5 und 5.x Geräte am besten mit 768 Threads ansteuern, - // alle anderen mit 512 Threads. - int threadsperblock = ((props[thr_id].major == 3 && props[thr_id].minor == 5) || props[thr_id].major > 3) ? 768 : 512; + int threadsperblock = 256; + + // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle + // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl + const int factor = 4; // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock)); dim3 block(threadsperblock); // Größe des dynamischen Shared Memory Bereichs -#if USE_SHARED - size_t shared_size = 8 * 256 * sizeof(uint32_t); -#else size_t shared_size = 0; -#endif -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - //fprintf(stderr, "ThrID: %d\n", thr_id); - quark_groestl512_gpu_hash_64<<>>(threads, startNounce, d_hash, d_nonceVector); - quark_groestl512_gpu_hash_64<<>>(threads, startNounce, d_hash, d_nonceVector); + quark_doublegroestl512_gpu_hash_64_quad<<>>(threads, startNounce, d_hash, d_nonceVector); // Strategisches Sleep Kommando zur Senkung der CPU Last MyStreamSynchronize(NULL, order, thr_id); diff --git a/quark/cuda_quark_keccak512.cu b/quark/cuda_quark_keccak512.cu index c55a7a1289..775d5e2ee4 100644 --- a/quark/cuda_quark_keccak512.cu +++ b/quark/cuda_quark_keccak512.cu @@ -175,8 +175,6 @@ __host__ void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t star // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - quark_keccak512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); MyStreamSynchronize(NULL, order, thr_id); } diff --git a/quark/cuda_skein512.cu b/quark/cuda_skein512.cu index dc4030a607..d32e9c0bbd 100644 --- a/quark/cuda_skein512.cu +++ b/quark/cuda_skein512.cu @@ -442,7 +442,6 @@ __host__ void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t start // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); quark_skein512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); // Strategisches Sleep Kommando zur Senkung der CPU Last diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu index acfe731946..0bc46d9fc0 100644 --- a/quark/quarkcoin.cu +++ b/quark/quarkcoin.cu @@ -157,7 +157,6 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata, { const uint32_t first_nonce = pdata[19]; - // TODO: entfernen für eine Release! Ist nur zum Testen! if (opt_benchmark) ((uint32_t*)ptarget)[7] = 0x0000ff; diff --git a/x11/cuda_x11_cubehash512.cu b/x11/cuda_x11_cubehash512.cu index 8769b4ef6e..602e7c4197 100644 --- a/x11/cuda_x11_cubehash512.cu +++ b/x11/cuda_x11_cubehash512.cu @@ -307,8 +307,6 @@ __host__ void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t star // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - x11_cubehash512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); MyStreamSynchronize(NULL, order, thr_id); } diff --git a/x11/cuda_x11_echo.cu b/x11/cuda_x11_echo.cu index 34e0b028d4..ce96728ee9 100644 --- a/x11/cuda_x11_echo.cu +++ b/x11/cuda_x11_echo.cu @@ -225,8 +225,6 @@ __host__ void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNou // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - x11_echo512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); MyStreamSynchronize(NULL, order, thr_id); } diff --git a/x11/cuda_x11_luffa512.cu b/x11/cuda_x11_luffa512.cu index b1a03c971a..c1ca6c0da3 100644 --- a/x11/cuda_x11_luffa512.cu +++ b/x11/cuda_x11_luffa512.cu @@ -376,8 +376,6 @@ __host__ void x11_luffa512_cpu_hash_64(int thr_id, int threads, uint32_t startNo // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - x11_luffa512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); MyStreamSynchronize(NULL, order, thr_id); } diff --git a/x11/cuda_x11_shavite512.cu b/x11/cuda_x11_shavite512.cu index 9d217e85cb..b3fd9258ed 100644 --- a/x11/cuda_x11_shavite512.cu +++ b/x11/cuda_x11_shavite512.cu @@ -1372,8 +1372,6 @@ __host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t start // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - x11_shavite512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); MyStreamSynchronize(NULL, order, thr_id); } diff --git a/x11/x11.cu b/x11/x11.cu index 77b6a72358..5b22e4f20b 100644 --- a/x11/x11.cu +++ b/x11/x11.cu @@ -162,11 +162,8 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, { const uint32_t first_nonce = pdata[19]; - // TODO: entfernen für eine Release! Ist nur zum Testen! - if (opt_benchmark) { + if (opt_benchmark) ((uint32_t*)ptarget)[7] = 0x0000ff; - pdata[17] = 0; - } const uint32_t Htarg = ptarget[7];