xcn #46

Open · wants to merge 66 commits into master

Changes from all commits · 66 commits

0ee02d2
dirty version of X15 for nvidia cards
djm34 Jun 27, 2014
fe0aa45
missing include sph_fugue.h
djm34 Jun 29, 2014
d0ee69a
added X14 and qubit algo
djm34 Jul 1, 2014
f17a00a
makefile for X14 and qubit
djm34 Jul 1, 2014
2aaae7b
forgotten \
djm34 Jul 1, 2014
16427c4
new whirlpool kernel
djm34 Jul 4, 2014
8e23d04
added fresh algo
djm34 Jul 7, 2014
ef26604
cleaner code and faster on compute 3.0 cards
djm34 Jul 7, 2014
a795b85
changing default difficulty to fresh algo
djm34 Jul 7, 2014
cb32a5a
speed improvement
djm34 Jul 18, 2014
0c2e3d6
whirlcoin
djm34 Jul 21, 2014
75d1588
change of stratum
djm34 Jul 21, 2014
9bf6ffa
might be easier to compile on linux
djm34 Jul 21, 2014
556f279
xor replaced by xor1
djm34 Jul 21, 2014
de8ba98
inline asm back to cuda_helper.h
djm34 Jul 21, 2014
1241e61
x17 added
djm34 Jul 23, 2014
73ce83f
correction for linux
djm34 Jul 23, 2014
d6a5643
removed double reference
djm34 Jul 23, 2014
02994b0
vxproj with changes for x17
djm34 Jul 23, 2014
fe4ca3d
cuda 5.5 build
djm34 Jul 23, 2014
5da4daa
allow to see the files where they belong
djm34 Jul 23, 2014
b8e8f52
added keccak-256 (maxcoin)
djm34 Jul 25, 2014
c6e2d75
change compilation option
djm34 Jul 26, 2014
165d7f9
makefile correction
djm34 Jul 26, 2014
e990384
removed a remaining compute_50
djm34 Jul 26, 2014
283bbe8
revert back change to whirlpool.cu fix to makefile
djm34 Jul 26, 2014
7ff2ea5
Revert "revert back change to whirlpool.cu fix to makefile"
djm34 Jul 26, 2014
31a443d
revert back change to makefile and whirlpool.cu
djm34 Jul 26, 2014
ec199eb
fix name screw up
djm34 Jul 26, 2014
a4cde8d
added line suppressed by mistake in makefile
djm34 Jul 27, 2014
ff465e0
fed up with github
djm34 Jul 27, 2014
74e418a
doomcoin added (luffa512) -a luffa
djm34 Jul 29, 2014
97a15a9
typo...
djm34 Jul 29, 2014
c66e2f6
speed improvement over doomcoin luffa
djm34 Jul 30, 2014
73283c3
compatibility issue with linux
djm34 Jul 30, 2014
8f943a1
added goalcoin
djm34 Aug 12, 2014
dd6ad01
M7 and Deepcoin algo
djm34 Aug 23, 2014
51fc7d1
cleaning
djm34 Aug 23, 2014
c1f2af2
correct conflict
djm34 Aug 23, 2014
025544c
correct typo in makefile
djm34 Aug 23, 2014
b9dd12d
2 more typos
djm34 Aug 23, 2014
ca0a3ed
correct whirlpool/whirlcoin screw up
djm34 Aug 23, 2014
b6565f3
solve compile error with printf/ redefinition of the integer type
djm34 Aug 24, 2014
46031c5
make use of cuda_helper.h rather than locally defined macro
djm34 Aug 24, 2014
f984397
correct linking problem with mpir lib (linux)
djm34 Aug 24, 2014
b0c429d
improvement on whirlcoin code/fine (or not) tuning
djm34 Aug 24, 2014
a8a46e8
relative path to mpir in release version
djm34 Aug 24, 2014
07f7df6
delete
djm34 Aug 24, 2014
d66c205
add gitignore
djm34 Aug 24, 2014
6d46c8c
various updates
djm34 Sep 1, 2014
3cd198a
correct compatibility issues
djm34 Sep 1, 2014
f45ed01
lyra2re release
djm34 Dec 12, 2014
145cf73
cpu algo for lyra2RE and neoscrypt
djm34 Dec 12, 2014
acf895f
lyra2re
djm34 Dec 12, 2014
bba58e4
lyra2re correcting typing
djm34 Dec 12, 2014
b89c89c
removed thrust / corrected; added some omitted files
djm34 Dec 13, 2014
6a7b4a5
removed autogenerated files
djm34 Dec 13, 2014
3a7de8d
whatever
djm34 Dec 13, 2014
accbaab
90% chance problems fixed
djm34 Dec 13, 2014
e8633a8
95% chance problem fixed
djm34 Dec 13, 2014
2e1bd6d
fix m7 algo as well as optimization
djm34 Dec 14, 2014
1383587
change in threadperblock value... better hashrate for 750ti & 9xx series
djm34 Dec 20, 2014
6174788
pluck algo / globalblocktemplate method added
djm34 Feb 19, 2015
d161aa3
update
djm34 Feb 19, 2015
cdacdb0
updated difficulty to match the one used on pools
djm34 Feb 20, 2015
361d855
bug fix
djm34 Feb 20, 2015
Binary file added .gitignore
Binary file not shown.
251 changes: 251 additions & 0 deletions Algo256/cuda_blake256.cu
@@ -0,0 +1,251 @@
/**
* Blake-256 Cuda Kernel (Tested on SM 5.0)
*
* Tanguy Pruvot - Nov. 2014
*/
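
/*
 * Overview (inferred from the code below, so treat it as a reader's summary):
 * the 80-byte block header is hashed as two 64-byte BLAKE-256 compressions.
 * Bytes 0..63 are constant for a given work item, so blake256_cpu_setBlock_80()
 * compresses them once on the host (blake256_compress1st) and uploads the
 * midstate to cpu_h; the GPU kernel then only runs the second compression
 * (blake256_compress2nd) over the remaining 16 header bytes, which include
 * the nonce, plus fixed padding.
 */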

#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"


#include <stdio.h>
#include <stdint.h>
#include <memory.h>

extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

extern int compute_version[8];
#include "cuda_helper.h"

__constant__ static uint32_t c_data[20];

__constant__ static uint32_t sigma[16][16];
static uint32_t c_sigma[16][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
};


static const uint32_t c_IV256[8] = {
0x6A09E667, 0xBB67AE85,
0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C,
0x1F83D9AB, 0x5BE0CD19
};
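/* BLAKE-256 shares its initial values with SHA-256: the fractional parts of
 * the square roots of the first eight primes. */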

__device__ __constant__ static uint32_t cpu_h[8];

__device__ __constant__ static uint32_t u256[16];
static const uint32_t c_u256[16] = {
0x243F6A88, 0x85A308D3,
0x13198A2E, 0x03707344,
0xA4093822, 0x299F31D0,
0x082EFA98, 0xEC4E6C89,
0x452821E6, 0x38D01377,
0xBE5466CF, 0x34E90C6C,
0xC0AC29B7, 0xC97C50DD,
0x3F84D5B5, 0xB5470917
};
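/* The sixteen BLAKE-256 round constants (the leading hex digits of pi); each
 * G call XORs a pair of them against the two selected message words. */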

#define GS2(a,b,c,d,x) { \
const uint32_t idx1 = sigma[r][x]; \
const uint32_t idx2 = sigma[r][x+1]; \
v[a] += (m[idx1] ^ u256[idx2]) + v[b]; \
v[d] = SPH_ROTL32(v[d] ^ v[a], 16); \
v[c] += v[d]; \
v[b] = SPH_ROTR32(v[b] ^ v[c], 12); \
\
v[a] += (m[idx2] ^ u256[idx1]) + v[b]; \
v[d] = SPH_ROTR32(v[d] ^ v[a], 8); \
v[c] += v[d]; \
v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \
}
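/* GS2 is one full BLAKE G application on state words (a,b,c,d), mixing in the
 * message pair selected by sigma[r][x] and sigma[r][x+1]. Rotations are by
 * 16, 12, 8 and 7 bits; SPH_ROTL32(v,16) equals SPH_ROTR32(v,16), so the
 * first rotate matches the host-side hostGS/ROTR32 variant below. */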
//#define ROTL32(x, n) ((x) << (n)) | ((x) >> (32 - (n)))
#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
#define hostGS(a,b,c,d,x) { \
const uint32_t idx1 = c_sigma[r][x]; \
const uint32_t idx2 = c_sigma[r][x+1]; \
v[a] += (m[idx1] ^ c_u256[idx2]) + v[b]; \
v[d] = ROTR32(v[d] ^ v[a], 16); \
v[c] += v[d]; \
v[b] = ROTR32(v[b] ^ v[c], 12); \
\
v[a] += (m[idx2] ^ c_u256[idx1]) + v[b]; \
v[d] = ROTR32(v[d] ^ v[a], 8); \
v[c] += v[d]; \
v[b] = ROTR32(v[b] ^ v[c], 7); \
}

/* Second part (bytes 64..80) of the message never changes, so store it as a constant */
__device__ __constant__ static const uint32_t c_Padding[16] = {
0, 0, 0, 0,
0x80000000, 0, 0, 0,
0, 0, 0, 0,
0, 1, 0, 640,
};
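/* Layout of that constant tail: words 0..3 are replaced at run time by header
 * bytes 64..75 plus the nonce (see blake256_compress2nd), word 4 holds the
 * 0x80 padding marker, word 13 the final-block '1' bit of BLAKE-256 padding,
 * and words 14..15 the 64-bit message length of 640 bits. */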

__host__ __forceinline__ static void blake256_compress1st(uint32_t *h, const uint32_t *block, const uint32_t T0)
{
uint32_t m[16];
uint32_t v[16];


for (int i = 0; i < 16; i++) {
m[i] = block[i];
}


for (int i = 0; i < 8; i++)
v[i] = h[i];

v[8] = c_u256[0];
v[9] = c_u256[1];
v[10] = c_u256[2];
v[11] = c_u256[3];

v[12] = c_u256[4] ^ T0;
v[13] = c_u256[5] ^ T0;
v[14] = c_u256[6];
v[15] = c_u256[7];


for (int r = 0; r < 14; r++) {
/* column step */
hostGS(0, 4, 0x8, 0xC, 0x0);
hostGS(1, 5, 0x9, 0xD, 0x2);
hostGS(2, 6, 0xA, 0xE, 0x4);
hostGS(3, 7, 0xB, 0xF, 0x6);
/* diagonal step */
hostGS(0, 5, 0xA, 0xF, 0x8);
hostGS(1, 6, 0xB, 0xC, 0xA);
hostGS(2, 7, 0x8, 0xD, 0xC);
hostGS(3, 4, 0x9, 0xE, 0xE);
}

for (int i = 0; i < 16; i++) {
int j = i & 7;
h[j] ^= v[i];
}
}

void blake256_cpu_init(int thr_id, int threads)
{

cudaMemcpyToSymbol(u256, c_u256, sizeof(c_u256), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(sigma, c_sigma, sizeof(c_sigma), 0, cudaMemcpyHostToDevice);
}

__device__ __forceinline__ static void blake256_compress2nd(uint32_t *h, const uint32_t *block, const uint32_t T0)
{
uint32_t m[16];
uint32_t v[16];

m[0] = block[0];
m[1] = block[1];
m[2] = block[2];
m[3] = block[3];

#pragma unroll
for (int i = 4; i < 16; i++) {
m[i] = c_Padding[i];
}

#pragma unroll 8
for (int i = 0; i < 8; i++)
v[i] = h[i];

v[8] = u256[0];
v[9] = u256[1];
v[10] = u256[2];
v[11] = u256[3];

v[12] = u256[4] ^ T0;
v[13] = u256[5] ^ T0;
v[14] = u256[6];
v[15] = u256[7];

#pragma unroll 14
for (int r = 0; r < 14; r++) {
/* column step */
GS2(0, 4, 0x8, 0xC, 0x0);
GS2(1, 5, 0x9, 0xD, 0x2);
GS2(2, 6, 0xA, 0xE, 0x4);
GS2(3, 7, 0xB, 0xF, 0x6);
/* diagonal step */
GS2(0, 5, 0xA, 0xF, 0x8);
GS2(1, 6, 0xB, 0xC, 0xA);
GS2(2, 7, 0x8, 0xD, 0xC);
GS2(3, 4, 0x9, 0xE, 0xE);
}
#pragma unroll 16
for (int i = 0; i < 16; i++) {
int j = i & 7;
h[j] ^= v[i];
}
}


__global__ __launch_bounds__(256,3) void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t * Hash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const uint32_t nonce = startNonce + thread;
uint32_t h[8];

uint32_t input[4];
#pragma unroll 8
for (int i = 0; i<8; i++) { h[i] = cpu_h[i];}
#pragma unroll 3
for (int i = 0; i < 3; ++i) input[i] = c_data[16 + i];
input[3] = nonce;
blake256_compress2nd(h, input, 640);


#pragma unroll
for (int i = 0; i<4; i++) { Hash[i*threads + thread] = cuda_swab32ll(MAKE_ULONGLONG(h[2 * i], h[2*i+1])); }

}
}
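/* The 256-bit digest is stored strided as Hash[i*threads + thread], i.e.
 * column-major over the nonce range, presumably so that the next kernel in
 * an algorithm chain can read it with coalesced accesses. */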

__host__ void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order)
{
const int threadsperblock = 256;


dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
size_t shared_size = 0;


blake256_gpu_hash_80 <<<grid, block, shared_size>>> (threads, startNonce, Hash);
MyStreamSynchronize(NULL, order, thr_id);

}

__host__ void blake256_cpu_setBlock_80(uint32_t *pdata)
{
uint32_t data[20];
memcpy(data, pdata, 80);
uint32_t h[8];
for (int i = 0; i<8; i++) { h[i] = c_IV256[i]; }
blake256_compress1st(h, pdata, 512);
cudaMemcpyToSymbol(cpu_h, h, sizeof(h), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_data, data, sizeof(data), 0, cudaMemcpyHostToDevice);
}
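
/*
 * A minimal host-side usage sketch. This driver snippet is hypothetical (a
 * miner's real scan loop also checks results and handles restarts), but the
 * three calls and their order come from this file:
 *
 *   blake256_cpu_init(thr_id, throughput);   // upload sigma / u256 tables
 *   blake256_cpu_setBlock_80(pdata);         // precompute midstate for this work
 *   blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash, order);
 *
 * Here pdata[19] is the nonce word of the 80-byte header (the kernel writes
 * startNonce + thread into message word 3 of the second block), and d_hash
 * must hold at least 4 * throughput uint64_t values.
 */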
