From 1f8c05eb4c5bcd5ea53500d2e8163cb7b729462d Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 23 Jun 2018 13:13:17 +0200 Subject: [PATCH] upgrade cryptolight algo for AEON support they also forked to variant 1 system... --- ccminer.cpp | 2 +- crypto/cryptolight-core.cu | 76 ++++++++++++++++++++++++++++++++++++-- crypto/cryptolight-cpu.cpp | 39 +++++++++++++++---- crypto/cryptolight.cu | 18 ++++++--- crypto/cryptolight.h | 2 +- crypto/xmr-rpc.cpp | 7 +++- miner.h | 5 ++- util.cpp | 2 +- 8 files changed, 129 insertions(+), 22 deletions(-) diff --git a/ccminer.cpp b/ccminer.cpp index 04557059c1..652128423d 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -2380,7 +2380,7 @@ static void *miner_thread(void *userdata) rc = scanhash_c11(thr_id, &work, max_nonce, &hashes_done); break; case ALGO_CRYPTOLIGHT: - rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done); + rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done, 1); break; case ALGO_CRYPTONIGHT: { diff --git a/crypto/cryptolight-core.cu b/crypto/cryptolight-core.cu index 110b7f23ce..8f0bb75e61 100644 --- a/crypto/cryptolight-core.cu +++ b/crypto/cryptolight-core.cu @@ -57,8 +57,10 @@ void cryptolight_core_gpu_phase1(int threads, uint32_t * long_state, uint32_t * } } +// -------------------------------------------------------------------------------------------------------------- + __global__ -void cryptolight_core_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b) +void cryptolight_old_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b) { __shared__ uint32_t __align__(16) sharedMemory[1024]; @@ -209,6 +211,70 @@ void cryptolight_core_gpu_phase2(const int threads, const int bfactor, const int #endif // __CUDA_ARCH__ >= 300 } +__device__ __forceinline__ void store_variant1(uint32_t* long_state) +{ + uint4* Z = (uint4*) long_state; + const uint32_t tmp = (Z->z >> 24); // __byte_perm(src, 0, 0x7773); + const uint32_t index = (((tmp >> 3) & 6u) | (tmp & 1u)) << 1; + Z->z = (Z->z & 0x00ffffffu) | ((tmp ^ ((0x75310u >> index) & 0x30u)) << 24); +} + +#define MUL_SUM_XOR_DST_1(a,c,dst,tweak) { \ + uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], ((uint64_t *)dst)[0], &hi) + ((uint64_t *)c)[1]; \ + hi += ((uint64_t *)c)[0]; \ + ((uint64_t *)c)[0] = ((uint64_t *)dst)[0] ^ hi; \ + ((uint64_t *)c)[1] = ((uint64_t *)dst)[1] ^ lo; \ + ((uint64_t *)dst)[0] = hi; \ + ((uint64_t *)dst)[1] = lo ^ tweak; } + +__global__ +void cryptolight_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx, + uint32_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, + uint64_t * __restrict__ d_tweak) +{ + __shared__ __align__(16) uint32_t sharedMemory[1024]; + cn_aes_gpu_init(sharedMemory); + __syncthreads(); + + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + if (thread < threads) + { + const uint32_t batchsize = ITER >> (2 + bfactor); + const uint32_t start = partidx * batchsize; + const uint32_t end = start + batchsize; + const uint32_t longptr = thread << LONG_SHL_IDX; + uint32_t * long_state = &d_long_state[longptr]; + uint64_t tweak = d_tweak[thread]; + + void * ctx_a = (void*)(&d_ctx_a[thread << 2]); + void * ctx_b = (void*)(&d_ctx_b[thread << 2]); + uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4 + uint4 B = AS_UINT4(ctx_b); + uint32_t* a = (uint32_t*)&A; + uint32_t* b = (uint32_t*)&B; + + for (int i = start; i < end; i++) + { + uint32_t c[4]; + uint32_t j = (A.x >> 2) & E2I_MASK2; + cn_aes_single_round(sharedMemory, &long_state[j], c, a); + XOR_BLOCKS_DST(c, b, &long_state[j]); + store_variant1(&long_state[j]); + MUL_SUM_XOR_DST_1(c, a, &long_state[(c[0] >> 2) & E2I_MASK2], tweak); + + j = (A.x >> 2) & E2I_MASK2; + cn_aes_single_round(sharedMemory, &long_state[j], b, a); + XOR_BLOCKS_DST(b, c, &long_state[j]); + store_variant1(&long_state[j]); + MUL_SUM_XOR_DST_1(b, a, &long_state[(b[0] >> 2) & E2I_MASK2], tweak); + } + if (bfactor) { + AS_UINT4(ctx_a) = A; + AS_UINT4(ctx_b) = B; + } + } +} + __global__ void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint32_t * ctx_state, uint32_t * ctx_key2) { @@ -252,7 +318,7 @@ extern int device_bfactor[MAX_GPUS]; __host__ void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, - uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2) + uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak) { dim3 grid(blocks); dim3 block(threads); @@ -271,7 +337,11 @@ void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long for(i = 0; i < partcount; i++) { - cryptolight_core_gpu_phase2 <<= 300 ? block4 : block)>>>(blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b); + dim3 b = device_sm[dev_id] >= 300 ? block4 : block; + if (variant == 0) + cryptolight_old_gpu_phase2 <<>> (blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b); + else + cryptolight_gpu_phase2 <<>> (blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); if(partcount > 1) usleep(bsleep); } diff --git a/crypto/cryptolight-cpu.cpp b/crypto/cryptolight-cpu.cpp index b0ee386ccf..f995b4c71a 100644 --- a/crypto/cryptolight-cpu.cpp +++ b/crypto/cryptolight-cpu.cpp @@ -22,6 +22,16 @@ struct cryptonight_ctx { oaes_ctx* aes_ctx; }; + +static void cryptolight_store_variant(void* state, int variant) { + if (variant == 1) { + // use variant 1 like monero since june 2018 + const uint8_t tmp = ((const uint8_t*)(state))[11]; + const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30); + } +} + static void do_blake_hash(const void* input, int len, void* output) { uchar hash[32]; @@ -132,14 +142,14 @@ static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, ui ((uint64_t*) dst)[0] += ((uint64_t*) c)[0]; } -static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) { +static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak) { uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1]; hi += ((uint64_t*) c)[0]; ((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi; ((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo; ((uint64_t*) dst)[0] = hi; - ((uint64_t*) dst)[1] = lo; + ((uint64_t*) dst)[1] = variant ? lo ^ tweak : lo; } static void copy_block(uint8_t* dst, const uint8_t* src) { @@ -157,13 +167,18 @@ static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) { ((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1]; } -static void cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx) +static int cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx, const int variant) { size_t i, j; + if (variant && len < 43) + return 0; + keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len); ctx->aes_ctx = (oaes_ctx*) oaes_alloc(); memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); + const uint64_t tweak = variant ? *((uint64_t*) (((uint8_t*)input) + 35)) ^ ctx->state.hs.w[24] : 0; + oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE); for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) { #undef RND @@ -186,14 +201,16 @@ static void cryptolight_hash_ctx(void* output, const void* input, const int len, j = e2i(ctx->a); aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a); xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]); + cryptolight_store_variant(&ctx->long_state[j], variant); - mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)]); + mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)], variant, tweak); j = e2i(ctx->a); aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a); xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]); + cryptolight_store_variant(&ctx->long_state[j], variant); - mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)]); + mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)], variant, tweak); } memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); @@ -219,11 +236,19 @@ static void cryptolight_hash_ctx(void* output, const void* input, const int len, if (opt_debug) applog(LOG_DEBUG, "extra algo=%d", extra_algo); oaes_free((OAES_CTX **) &ctx->aes_ctx); + return 1; } -void cryptolight_hash(void* output, const void* input, int len) +int cryptolight_hash_variant(void* output, const void* input, int len, int variant) { struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx)); - cryptolight_hash_ctx(output, input, len, ctx); + int rc = cryptolight_hash_ctx(output, input, len, ctx, variant); free(ctx); + return rc; } + +void cryptolight_hash(void* output, const void* input) +{ + cryptolight_hash_variant(output, input, 76, 1); +} + diff --git a/crypto/cryptolight.cu b/crypto/cryptolight.cu index 2b720e16de..c2a10e419d 100644 --- a/crypto/cryptolight.cu +++ b/crypto/cryptolight.cu @@ -11,12 +11,13 @@ static uint32_t *d_ctx_state[MAX_GPUS]; static uint32_t *d_ctx_key1[MAX_GPUS]; static uint32_t *d_ctx_key2[MAX_GPUS]; static uint32_t *d_ctx_text[MAX_GPUS]; +static uint64_t *d_ctx_tweak[MAX_GPUS]; static uint32_t *d_ctx_a[MAX_GPUS]; static uint32_t *d_ctx_b[MAX_GPUS]; static bool init[MAX_GPUS] = { 0 }; -extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant) { int res = 0; uint32_t throughput = 0; @@ -26,6 +27,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_ uint32_t *nonceptr = (uint32_t*) (&pdata[39]); const uint32_t first_nonce = *nonceptr; uint32_t nonce = first_nonce; + int dev_id = device_map[thr_id]; if(opt_benchmark) { ptarget[7] = 0x00ff; @@ -33,6 +35,10 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_ if(!init[thr_id]) { + if (!device_config[thr_id] && strcmp(device_name[dev_id], "TITAN V") == 0) { + device_config[thr_id] = strdup("80x32"); + } + if (device_config[thr_id]) { sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads); throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads); @@ -79,6 +85,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_ exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cudaMalloc(&d_ctx_tweak[thr_id], sizeof(uint64_t) * throughput); init[thr_id] = true; } @@ -91,8 +98,8 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_ uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX }; cryptonight_extra_setData(thr_id, pdata, ptarget); - cryptonight_extra_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], 0, NULL); - cryptolight_core_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]); + cryptonight_extra_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]); + cryptolight_core_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]); cryptonight_extra_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]); *hashes_done = nonce - first_nonce + throughput; @@ -104,7 +111,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_ uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39); memcpy(tempdata, pdata, 76); *tempnonceptr = resNonces[0]; - cryptolight_hash(vhash, tempdata, 76); + cryptolight_hash_variant(vhash, tempdata, 76, variant); if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) { res = 1; @@ -114,7 +121,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_ if(resNonces[1] != UINT32_MAX) { *tempnonceptr = resNonces[1]; - cryptolight_hash(vhash, tempdata, 76); + cryptolight_hash_variant(vhash, tempdata, 76, variant); if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) { res++; work->nonces[1] = resNonces[1]; @@ -157,6 +164,7 @@ void free_cryptolight(int thr_id) cudaFree(d_ctx_key1[thr_id]); cudaFree(d_ctx_key2[thr_id]); cudaFree(d_ctx_text[thr_id]); + cudaFree(d_ctx_tweak[thr_id]); cudaFree(d_ctx_a[thr_id]); cudaFree(d_ctx_b[thr_id]); diff --git a/crypto/cryptolight.h b/crypto/cryptolight.h index a135aaa82f..482d0f8f7c 100644 --- a/crypto/cryptolight.h +++ b/crypto/cryptolight.h @@ -135,7 +135,7 @@ static inline void exit_if_cudaerror(int thr_id, const char *src, int line) } } -void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2); +void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak); void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget); void cryptonight_extra_init(int thr_id/*, uint32_t threads*/); diff --git a/crypto/xmr-rpc.cpp b/crypto/xmr-rpc.cpp index d19f972236..433caa7d90 100644 --- a/crypto/xmr-rpc.cpp +++ b/crypto/xmr-rpc.cpp @@ -550,16 +550,19 @@ bool rpc2_stratum_submit(struct pool_infos *pool, struct work *work) } else if (opt_algo == ALGO_CRYPTOLIGHT) { + int variant = 1; uint32_t nonce = work->nonces[idnonce]; noncestr = bin2hex((unsigned char*) &nonce, 4); last_found_nonce = nonce; - cryptolight_hash(hash, data, 76); + //if (cryptonight_fork > 1 && ((unsigned char*)work->data)[0] >= cryptonight_fork) + // variant = ((unsigned char*)work->data)[0] - cryptonight_fork + 1; + cryptolight_hash_variant(hash, data, 76, variant); work_set_target_ratio(work, (uint32_t*) hash); } else if (opt_algo == ALGO_CRYPTONIGHT) { - uint32_t nonce = work->nonces[idnonce]; int variant = 0; + uint32_t nonce = work->nonces[idnonce]; noncestr = bin2hex((unsigned char*) &nonce, 4); last_found_nonce = nonce; if (cryptonight_fork > 1 && ((unsigned char*)work->data)[0] >= cryptonight_fork) diff --git a/miner.h b/miner.h index 260e61f067..86088cb84f 100644 --- a/miner.h +++ b/miner.h @@ -279,7 +279,7 @@ extern int scanhash_blake256(int thr_id, struct work* work, uint32_t max_nonce, extern int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); -extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant); extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant); extern int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); @@ -900,7 +900,8 @@ void blake2b_hash(void *output, const void *input); void blake2s_hash(void *output, const void *input); void bmw_hash(void *state, const void *input); void c11hash(void *output, const void *input); -void cryptolight_hash(void* output, const void* input, int len); +int cryptolight_hash_variant(void* output, const void* input, int len, int variant); +void cryptolight_hash(void* output, const void* input); int cryptonight_hash_variant(void* output, const void* input, size_t len, int variant); void cryptonight_hash(void* output, const void* input); void monero_hash(void* output, const void* input); diff --git a/util.cpp b/util.cpp index fb5fafd46f..9c2194d0ca 100644 --- a/util.cpp +++ b/util.cpp @@ -2193,7 +2193,7 @@ void print_hash_tests(void) c11hash(&hash[0], &buf[0]); printpfx("c11", hash); - cryptolight_hash(&hash[0], &buf[0], 76); + cryptolight_hash(&hash[0], &buf[0]); printpfx("cryptolight", hash); cryptonight_hash(&hash[0], &buf[0]);