forked from KlausT/ccminer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
cuda_groestlcoin.cu
103 lines (80 loc) · 3.74 KB
/
cuda_groestlcoin.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
// Auf Groestlcoin spezialisierte Version von Groestl inkl. Bitslice
#include <stdio.h>
#include <memory.h>
#include "cuda_helper.h"
#include <host_defines.h>
// globaler Speicher für alle HeftyHashes aller Threads
static uint32_t *d_resultNonce[MAX_GPUS];
__constant__ uint32_t groestlcoin_gpu_msg[20];
// 64 Register Variante für Compute 3.0
#include "groestl_functions_quad.cu"
#include "bitslice_transformations_quad.cu"
#define SWAB32(x) cuda_swab32(x)
__global__ __launch_bounds__(512, 2)
void groestlcoin_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, int thr_id, uint32_t target)
{
// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
// if (thread < threads)
{
// GROESTL
uint32_t paddedInput[8] = { 0 };
const uint32_t nounce = startNounce + thread;
paddedInput[0] = groestlcoin_gpu_msg[(threadIdx.x & 3)];
paddedInput[1] = groestlcoin_gpu_msg[4 + (threadIdx.x & 3)];
paddedInput[2] = groestlcoin_gpu_msg[8 + (threadIdx.x & 3)];
paddedInput[3] = groestlcoin_gpu_msg[12 + (threadIdx.x & 3)];
paddedInput[4] = groestlcoin_gpu_msg[16 + (threadIdx.x & 3)];
if ((threadIdx.x & 3) == 3) paddedInput[4] = SWAB32(nounce);
if ((threadIdx.x & 3) == 0) paddedInput[5] = 0x80;
if ((threadIdx.x & 3)==3) paddedInput[7] = 0x01000000;
uint32_t msgBitsliced[8];
myr_to_bitslice_quad(paddedInput, msgBitsliced);
uint32_t state[8];
groestl512_progressMessage_quad(state, msgBitsliced);
msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + ((threadIdx.x & 3)==3)*0x2000);
msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341);
msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341);
msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341);
msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341);
msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341);
msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341);
msgBitsliced[7] = __byte_perm(state[7], 0x00800100, 0x4341 + ((threadIdx.x & 3) == 0) * 0x0010);
groestl512_progressMessage_quad(state, msgBitsliced);
uint32_t out_state[16];
from_bitslice_quad_final(state, out_state);
if ((threadIdx.x & 3) == 0)
{
if (out_state[7] <= target)
{
uint32_t tmp = atomicExch(resNounce, nounce);
if (tmp != 0xffffffff)
resNounce[1] = tmp;
}
}
}
}
// Setup-Funktionen
__host__ void groestlcoin_cpu_init(int thr_id, uint32_t threads)
{
cudaMalloc(&d_resultNonce[thr_id], 2 * sizeof(uint32_t));
}
__host__ void groestlcoin_cpu_setBlock(int thr_id, void *data )
{
uint32_t msgBlock[20];
memcpy(&msgBlock[0], data, 80);
cudaMemcpyToSymbolAsync(groestlcoin_gpu_msg, msgBlock, 80, 0, cudaMemcpyHostToDevice, gpustream[thr_id]);
cudaMemsetAsync(d_resultNonce[thr_id], 0xFF, 2 * sizeof(uint32_t), gpustream[thr_id]);
}
__host__ void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *nounce, uint32_t target)
{
uint32_t threadsperblock = 512;
// Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
// mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
int factor = 4;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
dim3 block(threadsperblock);
groestlcoin_gpu_hash_quad<<<grid, block, 0, gpustream[thr_id]>>>(threads, startNounce, d_resultNonce[thr_id], thr_id, target);
CUDA_SAFE_CALL(cudaMemcpyAsync(nounce, d_resultNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id])); cudaStreamSynchronize(gpustream[thr_id]);
}