From 3b210695040903702482ec14e8fce421bd171b92 Mon Sep 17 00:00:00 2001
From: Christian Buchner <Christian.Buchner@gmail.com>
Date: Sat, 14 Jun 2014 01:43:28 +0200
Subject: [PATCH] bump to revision V1.1 with Killer Groestl

---
 JHA/cuda_jha_keccak512.cu        |   2 -
 JHA/jackpotcoin.cu               |  16 +-
 README.txt                       |  13 +-
 bitslice_transformations_quad.cu | 418 +++++++++++++++++++
 ccminer.vcxproj                  |  12 +
 ccminer.vcxproj.filters          |   6 +
 configure.ac                     |   2 +-
 cpu-miner.c                      |   6 +-
 cpuminer-config.h                |   4 +-
 cuda_groestlcoin.cu              | 537 ++++++-------------------
 cuda_myriadgroestl.cu            | 665 ++++++++++---------------------
 cuda_nist5.cu                    |   1 -
 groestl_functions_quad.cu        | 315 +++++++++++++++
 groestlcoin.cpp                  | 227 +++++------
 heavy/cuda_blake512.cu           |   2 -
 heavy/cuda_combine.cu            |   2 -
 heavy/cuda_groestl512.cu         |   2 -
 heavy/cuda_hefty1.cu             |   2 -
 heavy/cuda_keccak512.cu          |   1 -
 heavy/cuda_sha256.cu             |   1 -
 myriadgroestl.cpp                |  10 +-
 quark/animecoin.cu               |   1 -
 quark/cuda_bmw512.cu             |   4 -
 quark/cuda_jh512.cu              |   2 -
 quark/cuda_quark_blake512.cu     |   4 -
 quark/cuda_quark_checkhash.cu    |   2 -
 quark/cuda_quark_groestl512.cu   | 414 +++++--------------
 quark/cuda_quark_keccak512.cu    |   2 -
 quark/cuda_skein512.cu           |   1 -
 quark/quarkcoin.cu               |   1 -
 x11/cuda_x11_cubehash512.cu      |   2 -
 x11/cuda_x11_echo.cu             |   2 -
 x11/cuda_x11_luffa512.cu         |   2 -
 x11/cuda_x11_shavite512.cu       |   2 -
 x11/x11.cu                       |   5 +-
 35 files changed, 1301 insertions(+), 1387 deletions(-)
 create mode 100644 bitslice_transformations_quad.cu
 create mode 100644 groestl_functions_quad.cu

diff --git a/JHA/cuda_jha_keccak512.cu b/JHA/cuda_jha_keccak512.cu
index 2906d47b79..be5b61ac57 100644
--- a/JHA/cuda_jha_keccak512.cu
+++ b/JHA/cuda_jha_keccak512.cu
@@ -567,8 +567,6 @@ __host__ void jackpot_keccak512_cpu_hash(int thr_id, int threads, uint32_t start
     // Größe des dynamischen Shared Memory Bereichs
     size_t shared_size = 0;
 
-//    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
-
     jackpot_keccak512_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash);
     MyStreamSynchronize(NULL, order, thr_id);
 }
diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu
index e67661322a..7d6d4cf5b7 100644
--- a/JHA/jackpotcoin.cu
+++ b/JHA/jackpotcoin.cu
@@ -101,14 +101,12 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 {
 	const uint32_t first_nonce = pdata[19];
 
-	// TODO: entfernen für eine Release! Ist nur zum Testen!
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;
 
 	const uint32_t Htarg = ptarget[7];
 
 	const int throughput = 256*4096*4; // 100;
-	//const int throughput = 256*256*2+100; // 100;
 
 	static bool init[8] = {0,0,0,0,0,0,0,0};
 	if (!init[thr_id])
@@ -167,16 +165,18 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 			quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
 		}
 
-		// Runde 2 (ohne Gröstl)
+		// Runde 3 (komplett)
 
 		// jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01)
 		jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
 			d_branch1Nonces[thr_id], &nrm1,
-			d_branch3Nonces[thr_id], &nrm3,
+			d_branch2Nonces[thr_id], &nrm2,
 			order++);
 
-		// verfolge den skein-pfad weiter
-		quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+		if (nrm1+nrm2 == nrm3) {
+			quark_groestl512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
+			quark_skein512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
+		}
 
 		// jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01)
 		jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
@@ -226,7 +226,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 			if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {
 
 				pdata[19] = foundNonce;
-				*hashes_done = (foundNonce - first_nonce + 1)/4;
+				*hashes_done = (foundNonce - first_nonce + 1)/2;
 				//applog(LOG_INFO, "GPU #%d: result for nonce $%08X does validate on CPU (%d rounds)!", thr_id, foundNonce, rounds);
 				return 1;
 			} else {
@@ -238,6 +238,6 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 
 	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
 
-	*hashes_done = (pdata[19] - first_nonce + 1)/4;
+	*hashes_done = (pdata[19] - first_nonce + 1)/2;
 	return 0;
 }
diff --git a/README.txt b/README.txt
index 09a33b6bdc..aace22699e 100644
--- a/README.txt
+++ b/README.txt
@@ -1,5 +1,5 @@
 
-ccMiner release 1.0 (May 10th 2014) - "Did anyone say X11?"
+ccMiner release 1.1 (June 14th 2014) - "Killer Groestl!"
 -------------------------------------------------------------
 
 ***************************************************************
@@ -30,13 +30,12 @@ FugueCoin
 GroestlCoin & Myriad-Groestl
 JackpotCoin
 QuarkCoin family & AnimeCoin
+TalkCoin
+DarkCoin and other X11 coins
 
 where some of these coins have a VERY NOTABLE nVidia advantage
 over competing AMD (OpenCL) implementations.
 
-X11 algo is being worked on. It will be released when we
-have achieved a nice nVidia advantage.
-
 We did not take a big effort on improving usability, so please set
 your parameters carefuly.
 
@@ -140,6 +139,12 @@ features.
 
 >>> RELEASE HISTORY <<<
 
+  June 14th 2014  released Killer Groestl quad version which I deem
+                  sufficiently hard to port over to AMD. It isn't
+                  the fastest option for Compute 3.5 and 5.0 cards,
+                  but it is still much faster than the table based
+                  versions.
+
   May 10th 2014   added X11, but without the bells & whistles
                   (no killer Groestl, SIMD hash quite slow still)
 
diff --git a/bitslice_transformations_quad.cu b/bitslice_transformations_quad.cu
new file mode 100644
index 0000000000..f4a5f2a325
--- /dev/null
+++ b/bitslice_transformations_quad.cu
@@ -0,0 +1,418 @@
+
+__device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *output)
+{
+    int n = threadIdx.x % 4;
+    uint32_t other[8];
+#pragma unroll 8
+    for (int i = 0; i < 8; i++) {
+        input[i] =__shfl((int)input[i], n ^ (3*(n >=1 && n <=2)), 4);
+        other[i] =__shfl((int)input[i], (threadIdx.x + 1) % 4, 4);
+        input[i] = __shfl((int)input[i], threadIdx.x & 2, 4);
+        other[i] = __shfl((int)other[i], threadIdx.x & 2, 4);
+        if (threadIdx.x & 1) {
+            input[i] = __byte_perm(input[i], 0, 0x1032);
+            other[i] = __byte_perm(other[i], 0, 0x1032);
+        }
+        output[i] = 0;
+    }
+
+    output[ 0] |=  (input[ 0] & 0x00000001);
+    output[ 0] |= ((other[ 0] & 0x00000001) << 1);
+    output[ 0] |= ((input[ 1] & 0x00000001) << 2);
+    output[ 0] |= ((other[ 1] & 0x00000001) << 3);
+    output[ 0] |= ((input[ 2] & 0x00000001) << 4);
+    output[ 0] |= ((other[ 2] & 0x00000001) << 5);
+    output[ 0] |= ((input[ 3] & 0x00000001) << 6);
+    output[ 0] |= ((other[ 3] & 0x00000001) << 7);
+    output[ 0] |= ((input[ 4] & 0x00000001) << 8);
+    output[ 0] |= ((other[ 4] & 0x00000001) << 9);
+    output[ 0] |= ((input[ 5] & 0x00000001) <<10);
+    output[ 0] |= ((other[ 5] & 0x00000001) <<11);
+    output[ 0] |= ((input[ 6] & 0x00000001) <<12);
+    output[ 0] |= ((other[ 6] & 0x00000001) <<13);
+    output[ 0] |= ((input[ 7] & 0x00000001) <<14);
+    output[ 0] |= ((other[ 7] & 0x00000001) <<15);
+    output[ 0] |= ((input[ 0] & 0x00000100) << 8);
+    output[ 0] |= ((other[ 0] & 0x00000100) << 9);
+    output[ 0] |= ((input[ 1] & 0x00000100) <<10);
+    output[ 0] |= ((other[ 1] & 0x00000100) <<11);
+    output[ 0] |= ((input[ 2] & 0x00000100) <<12);
+    output[ 0] |= ((other[ 2] & 0x00000100) <<13);
+    output[ 0] |= ((input[ 3] & 0x00000100) <<14);
+    output[ 0] |= ((other[ 3] & 0x00000100) <<15);
+    output[ 0] |= ((input[ 4] & 0x00000100) <<16);
+    output[ 0] |= ((other[ 4] & 0x00000100) <<17);
+    output[ 0] |= ((input[ 5] & 0x00000100) <<18);
+    output[ 0] |= ((other[ 5] & 0x00000100) <<19);
+    output[ 0] |= ((input[ 6] & 0x00000100) <<20);
+    output[ 0] |= ((other[ 6] & 0x00000100) <<21);
+    output[ 0] |= ((input[ 7] & 0x00000100) <<22);
+    output[ 0] |= ((other[ 7] & 0x00000100) <<23);
+    output[ 1] |= ((input[ 0] & 0x00000002) >> 1);
+    output[ 1] |=  (other[ 0] & 0x00000002);
+    output[ 1] |= ((input[ 1] & 0x00000002) << 1);
+    output[ 1] |= ((other[ 1] & 0x00000002) << 2);
+    output[ 1] |= ((input[ 2] & 0x00000002) << 3);
+    output[ 1] |= ((other[ 2] & 0x00000002) << 4);
+    output[ 1] |= ((input[ 3] & 0x00000002) << 5);
+    output[ 1] |= ((other[ 3] & 0x00000002) << 6);
+    output[ 1] |= ((input[ 4] & 0x00000002) << 7);
+    output[ 1] |= ((other[ 4] & 0x00000002) << 8);
+    output[ 1] |= ((input[ 5] & 0x00000002) << 9);
+    output[ 1] |= ((other[ 5] & 0x00000002) <<10);
+    output[ 1] |= ((input[ 6] & 0x00000002) <<11);
+    output[ 1] |= ((other[ 6] & 0x00000002) <<12);
+    output[ 1] |= ((input[ 7] & 0x00000002) <<13);
+    output[ 1] |= ((other[ 7] & 0x00000002) <<14);
+    output[ 1] |= ((input[ 0] & 0x00000200) << 7);
+    output[ 1] |= ((other[ 0] & 0x00000200) << 8);
+    output[ 1] |= ((input[ 1] & 0x00000200) << 9);
+    output[ 1] |= ((other[ 1] & 0x00000200) <<10);
+    output[ 1] |= ((input[ 2] & 0x00000200) <<11);
+    output[ 1] |= ((other[ 2] & 0x00000200) <<12);
+    output[ 1] |= ((input[ 3] & 0x00000200) <<13);
+    output[ 1] |= ((other[ 3] & 0x00000200) <<14);
+    output[ 1] |= ((input[ 4] & 0x00000200) <<15);
+    output[ 1] |= ((other[ 4] & 0x00000200) <<16);
+    output[ 1] |= ((input[ 5] & 0x00000200) <<17);
+    output[ 1] |= ((other[ 5] & 0x00000200) <<18);
+    output[ 1] |= ((input[ 6] & 0x00000200) <<19);
+    output[ 1] |= ((other[ 6] & 0x00000200) <<20);
+    output[ 1] |= ((input[ 7] & 0x00000200) <<21);
+    output[ 1] |= ((other[ 7] & 0x00000200) <<22);
+    output[ 2] |= ((input[ 0] & 0x00000004) >> 2);
+    output[ 2] |= ((other[ 0] & 0x00000004) >> 1);
+    output[ 2] |=  (input[ 1] & 0x00000004);
+    output[ 2] |= ((other[ 1] & 0x00000004) << 1);
+    output[ 2] |= ((input[ 2] & 0x00000004) << 2);
+    output[ 2] |= ((other[ 2] & 0x00000004) << 3);
+    output[ 2] |= ((input[ 3] & 0x00000004) << 4);
+    output[ 2] |= ((other[ 3] & 0x00000004) << 5);
+    output[ 2] |= ((input[ 4] & 0x00000004) << 6);
+    output[ 2] |= ((other[ 4] & 0x00000004) << 7);
+    output[ 2] |= ((input[ 5] & 0x00000004) << 8);
+    output[ 2] |= ((other[ 5] & 0x00000004) << 9);
+    output[ 2] |= ((input[ 6] & 0x00000004) <<10);
+    output[ 2] |= ((other[ 6] & 0x00000004) <<11);
+    output[ 2] |= ((input[ 7] & 0x00000004) <<12);
+    output[ 2] |= ((other[ 7] & 0x00000004) <<13);
+    output[ 2] |= ((input[ 0] & 0x00000400) << 6);
+    output[ 2] |= ((other[ 0] & 0x00000400) << 7);
+    output[ 2] |= ((input[ 1] & 0x00000400) << 8);
+    output[ 2] |= ((other[ 1] & 0x00000400) << 9);
+    output[ 2] |= ((input[ 2] & 0x00000400) <<10);
+    output[ 2] |= ((other[ 2] & 0x00000400) <<11);
+    output[ 2] |= ((input[ 3] & 0x00000400) <<12);
+    output[ 2] |= ((other[ 3] & 0x00000400) <<13);
+    output[ 2] |= ((input[ 4] & 0x00000400) <<14);
+    output[ 2] |= ((other[ 4] & 0x00000400) <<15);
+    output[ 2] |= ((input[ 5] & 0x00000400) <<16);
+    output[ 2] |= ((other[ 5] & 0x00000400) <<17);
+    output[ 2] |= ((input[ 6] & 0x00000400) <<18);
+    output[ 2] |= ((other[ 6] & 0x00000400) <<19);
+    output[ 2] |= ((input[ 7] & 0x00000400) <<20);
+    output[ 2] |= ((other[ 7] & 0x00000400) <<21);
+    output[ 3] |= ((input[ 0] & 0x00000008) >> 3);
+    output[ 3] |= ((other[ 0] & 0x00000008) >> 2);
+    output[ 3] |= ((input[ 1] & 0x00000008) >> 1);
+    output[ 3] |=  (other[ 1] & 0x00000008);
+    output[ 3] |= ((input[ 2] & 0x00000008) << 1);
+    output[ 3] |= ((other[ 2] & 0x00000008) << 2);
+    output[ 3] |= ((input[ 3] & 0x00000008) << 3);
+    output[ 3] |= ((other[ 3] & 0x00000008) << 4);
+    output[ 3] |= ((input[ 4] & 0x00000008) << 5);
+    output[ 3] |= ((other[ 4] & 0x00000008) << 6);
+    output[ 3] |= ((input[ 5] & 0x00000008) << 7);
+    output[ 3] |= ((other[ 5] & 0x00000008) << 8);
+    output[ 3] |= ((input[ 6] & 0x00000008) << 9);
+    output[ 3] |= ((other[ 6] & 0x00000008) <<10);
+    output[ 3] |= ((input[ 7] & 0x00000008) <<11);
+    output[ 3] |= ((other[ 7] & 0x00000008) <<12);
+    output[ 3] |= ((input[ 0] & 0x00000800) << 5);
+    output[ 3] |= ((other[ 0] & 0x00000800) << 6);
+    output[ 3] |= ((input[ 1] & 0x00000800) << 7);
+    output[ 3] |= ((other[ 1] & 0x00000800) << 8);
+    output[ 3] |= ((input[ 2] & 0x00000800) << 9);
+    output[ 3] |= ((other[ 2] & 0x00000800) <<10);
+    output[ 3] |= ((input[ 3] & 0x00000800) <<11);
+    output[ 3] |= ((other[ 3] & 0x00000800) <<12);
+    output[ 3] |= ((input[ 4] & 0x00000800) <<13);
+    output[ 3] |= ((other[ 4] & 0x00000800) <<14);
+    output[ 3] |= ((input[ 5] & 0x00000800) <<15);
+    output[ 3] |= ((other[ 5] & 0x00000800) <<16);
+    output[ 3] |= ((input[ 6] & 0x00000800) <<17);
+    output[ 3] |= ((other[ 6] & 0x00000800) <<18);
+    output[ 3] |= ((input[ 7] & 0x00000800) <<19);
+    output[ 3] |= ((other[ 7] & 0x00000800) <<20);
+    output[ 4] |= ((input[ 0] & 0x00000010) >> 4);
+    output[ 4] |= ((other[ 0] & 0x00000010) >> 3);
+    output[ 4] |= ((input[ 1] & 0x00000010) >> 2);
+    output[ 4] |= ((other[ 1] & 0x00000010) >> 1);
+    output[ 4] |=  (input[ 2] & 0x00000010);
+    output[ 4] |= ((other[ 2] & 0x00000010) << 1);
+    output[ 4] |= ((input[ 3] & 0x00000010) << 2);
+    output[ 4] |= ((other[ 3] & 0x00000010) << 3);
+    output[ 4] |= ((input[ 4] & 0x00000010) << 4);
+    output[ 4] |= ((other[ 4] & 0x00000010) << 5);
+    output[ 4] |= ((input[ 5] & 0x00000010) << 6);
+    output[ 4] |= ((other[ 5] & 0x00000010) << 7);
+    output[ 4] |= ((input[ 6] & 0x00000010) << 8);
+    output[ 4] |= ((other[ 6] & 0x00000010) << 9);
+    output[ 4] |= ((input[ 7] & 0x00000010) <<10);
+    output[ 4] |= ((other[ 7] & 0x00000010) <<11);
+    output[ 4] |= ((input[ 0] & 0x00001000) << 4);
+    output[ 4] |= ((other[ 0] & 0x00001000) << 5);
+    output[ 4] |= ((input[ 1] & 0x00001000) << 6);
+    output[ 4] |= ((other[ 1] & 0x00001000) << 7);
+    output[ 4] |= ((input[ 2] & 0x00001000) << 8);
+    output[ 4] |= ((other[ 2] & 0x00001000) << 9);
+    output[ 4] |= ((input[ 3] & 0x00001000) <<10);
+    output[ 4] |= ((other[ 3] & 0x00001000) <<11);
+    output[ 4] |= ((input[ 4] & 0x00001000) <<12);
+    output[ 4] |= ((other[ 4] & 0x00001000) <<13);
+    output[ 4] |= ((input[ 5] & 0x00001000) <<14);
+    output[ 4] |= ((other[ 5] & 0x00001000) <<15);
+    output[ 4] |= ((input[ 6] & 0x00001000) <<16);
+    output[ 4] |= ((other[ 6] & 0x00001000) <<17);
+    output[ 4] |= ((input[ 7] & 0x00001000) <<18);
+    output[ 4] |= ((other[ 7] & 0x00001000) <<19);
+    output[ 5] |= ((input[ 0] & 0x00000020) >> 5);
+    output[ 5] |= ((other[ 0] & 0x00000020) >> 4);
+    output[ 5] |= ((input[ 1] & 0x00000020) >> 3);
+    output[ 5] |= ((other[ 1] & 0x00000020) >> 2);
+    output[ 5] |= ((input[ 2] & 0x00000020) >> 1);
+    output[ 5] |=  (other[ 2] & 0x00000020);
+    output[ 5] |= ((input[ 3] & 0x00000020) << 1);
+    output[ 5] |= ((other[ 3] & 0x00000020) << 2);
+    output[ 5] |= ((input[ 4] & 0x00000020) << 3);
+    output[ 5] |= ((other[ 4] & 0x00000020) << 4);
+    output[ 5] |= ((input[ 5] & 0x00000020) << 5);
+    output[ 5] |= ((other[ 5] & 0x00000020) << 6);
+    output[ 5] |= ((input[ 6] & 0x00000020) << 7);
+    output[ 5] |= ((other[ 6] & 0x00000020) << 8);
+    output[ 5] |= ((input[ 7] & 0x00000020) << 9);
+    output[ 5] |= ((other[ 7] & 0x00000020) <<10);
+    output[ 5] |= ((input[ 0] & 0x00002000) << 3);
+    output[ 5] |= ((other[ 0] & 0x00002000) << 4);
+    output[ 5] |= ((input[ 1] & 0x00002000) << 5);
+    output[ 5] |= ((other[ 1] & 0x00002000) << 6);
+    output[ 5] |= ((input[ 2] & 0x00002000) << 7);
+    output[ 5] |= ((other[ 2] & 0x00002000) << 8);
+    output[ 5] |= ((input[ 3] & 0x00002000) << 9);
+    output[ 5] |= ((other[ 3] & 0x00002000) <<10);
+    output[ 5] |= ((input[ 4] & 0x00002000) <<11);
+    output[ 5] |= ((other[ 4] & 0x00002000) <<12);
+    output[ 5] |= ((input[ 5] & 0x00002000) <<13);
+    output[ 5] |= ((other[ 5] & 0x00002000) <<14);
+    output[ 5] |= ((input[ 6] & 0x00002000) <<15);
+    output[ 5] |= ((other[ 6] & 0x00002000) <<16);
+    output[ 5] |= ((input[ 7] & 0x00002000) <<17);
+    output[ 5] |= ((other[ 7] & 0x00002000) <<18);
+    output[ 6] |= ((input[ 0] & 0x00000040) >> 6);
+    output[ 6] |= ((other[ 0] & 0x00000040) >> 5);
+    output[ 6] |= ((input[ 1] & 0x00000040) >> 4);
+    output[ 6] |= ((other[ 1] & 0x00000040) >> 3);
+    output[ 6] |= ((input[ 2] & 0x00000040) >> 2);
+    output[ 6] |= ((other[ 2] & 0x00000040) >> 1);
+    output[ 6] |=  (input[ 3] & 0x00000040);
+    output[ 6] |= ((other[ 3] & 0x00000040) << 1);
+    output[ 6] |= ((input[ 4] & 0x00000040) << 2);
+    output[ 6] |= ((other[ 4] & 0x00000040) << 3);
+    output[ 6] |= ((input[ 5] & 0x00000040) << 4);
+    output[ 6] |= ((other[ 5] & 0x00000040) << 5);
+    output[ 6] |= ((input[ 6] & 0x00000040) << 6);
+    output[ 6] |= ((other[ 6] & 0x00000040) << 7);
+    output[ 6] |= ((input[ 7] & 0x00000040) << 8);
+    output[ 6] |= ((other[ 7] & 0x00000040) << 9);
+    output[ 6] |= ((input[ 0] & 0x00004000) << 2);
+    output[ 6] |= ((other[ 0] & 0x00004000) << 3);
+    output[ 6] |= ((input[ 1] & 0x00004000) << 4);
+    output[ 6] |= ((other[ 1] & 0x00004000) << 5);
+    output[ 6] |= ((input[ 2] & 0x00004000) << 6);
+    output[ 6] |= ((other[ 2] & 0x00004000) << 7);
+    output[ 6] |= ((input[ 3] & 0x00004000) << 8);
+    output[ 6] |= ((other[ 3] & 0x00004000) << 9);
+    output[ 6] |= ((input[ 4] & 0x00004000) <<10);
+    output[ 6] |= ((other[ 4] & 0x00004000) <<11);
+    output[ 6] |= ((input[ 5] & 0x00004000) <<12);
+    output[ 6] |= ((other[ 5] & 0x00004000) <<13);
+    output[ 6] |= ((input[ 6] & 0x00004000) <<14);
+    output[ 6] |= ((other[ 6] & 0x00004000) <<15);
+    output[ 6] |= ((input[ 7] & 0x00004000) <<16);
+    output[ 6] |= ((other[ 7] & 0x00004000) <<17);
+    output[ 7] |= ((input[ 0] & 0x00000080) >> 7);
+    output[ 7] |= ((other[ 0] & 0x00000080) >> 6);
+    output[ 7] |= ((input[ 1] & 0x00000080) >> 5);
+    output[ 7] |= ((other[ 1] & 0x00000080) >> 4);
+    output[ 7] |= ((input[ 2] & 0x00000080) >> 3);
+    output[ 7] |= ((other[ 2] & 0x00000080) >> 2);
+    output[ 7] |= ((input[ 3] & 0x00000080) >> 1);
+    output[ 7] |=  (other[ 3] & 0x00000080);
+    output[ 7] |= ((input[ 4] & 0x00000080) << 1);
+    output[ 7] |= ((other[ 4] & 0x00000080) << 2);
+    output[ 7] |= ((input[ 5] & 0x00000080) << 3);
+    output[ 7] |= ((other[ 5] & 0x00000080) << 4);
+    output[ 7] |= ((input[ 6] & 0x00000080) << 5);
+    output[ 7] |= ((other[ 6] & 0x00000080) << 6);
+    output[ 7] |= ((input[ 7] & 0x00000080) << 7);
+    output[ 7] |= ((other[ 7] & 0x00000080) << 8);
+    output[ 7] |= ((input[ 0] & 0x00008000) << 1);
+    output[ 7] |= ((other[ 0] & 0x00008000) << 2);
+    output[ 7] |= ((input[ 1] & 0x00008000) << 3);
+    output[ 7] |= ((other[ 1] & 0x00008000) << 4);
+    output[ 7] |= ((input[ 2] & 0x00008000) << 5);
+    output[ 7] |= ((other[ 2] & 0x00008000) << 6);
+    output[ 7] |= ((input[ 3] & 0x00008000) << 7);
+    output[ 7] |= ((other[ 3] & 0x00008000) << 8);
+    output[ 7] |= ((input[ 4] & 0x00008000) << 9);
+    output[ 7] |= ((other[ 4] & 0x00008000) <<10);
+    output[ 7] |= ((input[ 5] & 0x00008000) <<11);
+    output[ 7] |= ((other[ 5] & 0x00008000) <<12);
+    output[ 7] |= ((input[ 6] & 0x00008000) <<13);
+    output[ 7] |= ((other[ 6] & 0x00008000) <<14);
+    output[ 7] |= ((input[ 7] & 0x00008000) <<15);
+    output[ 7] |= ((other[ 7] & 0x00008000) <<16);
+}
+
+__device__ __forceinline__ void from_bitslice_quad(uint32_t *input, uint32_t *output)
+{
+#pragma unroll 8
+    for (int i=0; i < 16; i+=2) output[i] = 0;
+
+    output[ 0] |= ((input[ 0] & 0x00000100) >> 8);
+    output[ 0] |= ((input[ 1] & 0x00000100) >> 7);
+    output[ 0] |= ((input[ 2] & 0x00000100) >> 6);
+    output[ 0] |= ((input[ 3] & 0x00000100) >> 5);
+    output[ 0] |= ((input[ 4] & 0x00000100) >> 4);
+    output[ 0] |= ((input[ 5] & 0x00000100) >> 3);
+    output[ 0] |= ((input[ 6] & 0x00000100) >> 2);
+    output[ 0] |= ((input[ 7] & 0x00000100) >> 1);
+    output[ 0] |= ((input[ 0] & 0x01000000) >>16);
+    output[ 0] |= ((input[ 1] & 0x01000000) >>15);
+    output[ 0] |= ((input[ 2] & 0x01000000) >>14);
+    output[ 0] |= ((input[ 3] & 0x01000000) >>13);
+    output[ 0] |= ((input[ 4] & 0x01000000) >>12);
+    output[ 0] |= ((input[ 5] & 0x01000000) >>11);
+    output[ 0] |= ((input[ 6] & 0x01000000) >>10);
+    output[ 0] |= ((input[ 7] & 0x01000000) >> 9);
+    output[ 2] |= ((input[ 0] & 0x00000200) >> 9);
+    output[ 2] |= ((input[ 1] & 0x00000200) >> 8);
+    output[ 2] |= ((input[ 2] & 0x00000200) >> 7);
+    output[ 2] |= ((input[ 3] & 0x00000200) >> 6);
+    output[ 2] |= ((input[ 4] & 0x00000200) >> 5);
+    output[ 2] |= ((input[ 5] & 0x00000200) >> 4);
+    output[ 2] |= ((input[ 6] & 0x00000200) >> 3);
+    output[ 2] |= ((input[ 7] & 0x00000200) >> 2);
+    output[ 2] |= ((input[ 0] & 0x02000000) >>17);
+    output[ 2] |= ((input[ 1] & 0x02000000) >>16);
+    output[ 2] |= ((input[ 2] & 0x02000000) >>15);
+    output[ 2] |= ((input[ 3] & 0x02000000) >>14);
+    output[ 2] |= ((input[ 4] & 0x02000000) >>13);
+    output[ 2] |= ((input[ 5] & 0x02000000) >>12);
+    output[ 2] |= ((input[ 6] & 0x02000000) >>11);
+    output[ 2] |= ((input[ 7] & 0x02000000) >>10);
+    output[ 4] |= ((input[ 0] & 0x00000400) >>10);
+    output[ 4] |= ((input[ 1] & 0x00000400) >> 9);
+    output[ 4] |= ((input[ 2] & 0x00000400) >> 8);
+    output[ 4] |= ((input[ 3] & 0x00000400) >> 7);
+    output[ 4] |= ((input[ 4] & 0x00000400) >> 6);
+    output[ 4] |= ((input[ 5] & 0x00000400) >> 5);
+    output[ 4] |= ((input[ 6] & 0x00000400) >> 4);
+    output[ 4] |= ((input[ 7] & 0x00000400) >> 3);
+    output[ 4] |= ((input[ 0] & 0x04000000) >>18);
+    output[ 4] |= ((input[ 1] & 0x04000000) >>17);
+    output[ 4] |= ((input[ 2] & 0x04000000) >>16);
+    output[ 4] |= ((input[ 3] & 0x04000000) >>15);
+    output[ 4] |= ((input[ 4] & 0x04000000) >>14);
+    output[ 4] |= ((input[ 5] & 0x04000000) >>13);
+    output[ 4] |= ((input[ 6] & 0x04000000) >>12);
+    output[ 4] |= ((input[ 7] & 0x04000000) >>11);
+    output[ 6] |= ((input[ 0] & 0x00000800) >>11);
+    output[ 6] |= ((input[ 1] & 0x00000800) >>10);
+    output[ 6] |= ((input[ 2] & 0x00000800) >> 9);
+    output[ 6] |= ((input[ 3] & 0x00000800) >> 8);
+    output[ 6] |= ((input[ 4] & 0x00000800) >> 7);
+    output[ 6] |= ((input[ 5] & 0x00000800) >> 6);
+    output[ 6] |= ((input[ 6] & 0x00000800) >> 5);
+    output[ 6] |= ((input[ 7] & 0x00000800) >> 4);
+    output[ 6] |= ((input[ 0] & 0x08000000) >>19);
+    output[ 6] |= ((input[ 1] & 0x08000000) >>18);
+    output[ 6] |= ((input[ 2] & 0x08000000) >>17);
+    output[ 6] |= ((input[ 3] & 0x08000000) >>16);
+    output[ 6] |= ((input[ 4] & 0x08000000) >>15);
+    output[ 6] |= ((input[ 5] & 0x08000000) >>14);
+    output[ 6] |= ((input[ 6] & 0x08000000) >>13);
+    output[ 6] |= ((input[ 7] & 0x08000000) >>12);
+    output[ 8] |= ((input[ 0] & 0x00001000) >>12);
+    output[ 8] |= ((input[ 1] & 0x00001000) >>11);
+    output[ 8] |= ((input[ 2] & 0x00001000) >>10);
+    output[ 8] |= ((input[ 3] & 0x00001000) >> 9);
+    output[ 8] |= ((input[ 4] & 0x00001000) >> 8);
+    output[ 8] |= ((input[ 5] & 0x00001000) >> 7);
+    output[ 8] |= ((input[ 6] & 0x00001000) >> 6);
+    output[ 8] |= ((input[ 7] & 0x00001000) >> 5);
+    output[ 8] |= ((input[ 0] & 0x10000000) >>20);
+    output[ 8] |= ((input[ 1] & 0x10000000) >>19);
+    output[ 8] |= ((input[ 2] & 0x10000000) >>18);
+    output[ 8] |= ((input[ 3] & 0x10000000) >>17);
+    output[ 8] |= ((input[ 4] & 0x10000000) >>16);
+    output[ 8] |= ((input[ 5] & 0x10000000) >>15);
+    output[ 8] |= ((input[ 6] & 0x10000000) >>14);
+    output[ 8] |= ((input[ 7] & 0x10000000) >>13);
+    output[10] |= ((input[ 0] & 0x00002000) >>13);
+    output[10] |= ((input[ 1] & 0x00002000) >>12);
+    output[10] |= ((input[ 2] & 0x00002000) >>11);
+    output[10] |= ((input[ 3] & 0x00002000) >>10);
+    output[10] |= ((input[ 4] & 0x00002000) >> 9);
+    output[10] |= ((input[ 5] & 0x00002000) >> 8);
+    output[10] |= ((input[ 6] & 0x00002000) >> 7);
+    output[10] |= ((input[ 7] & 0x00002000) >> 6);
+    output[10] |= ((input[ 0] & 0x20000000) >>21);
+    output[10] |= ((input[ 1] & 0x20000000) >>20);
+    output[10] |= ((input[ 2] & 0x20000000) >>19);
+    output[10] |= ((input[ 3] & 0x20000000) >>18);
+    output[10] |= ((input[ 4] & 0x20000000) >>17);
+    output[10] |= ((input[ 5] & 0x20000000) >>16);
+    output[10] |= ((input[ 6] & 0x20000000) >>15);
+    output[10] |= ((input[ 7] & 0x20000000) >>14);
+    output[12] |= ((input[ 0] & 0x00004000) >>14);
+    output[12] |= ((input[ 1] & 0x00004000) >>13);
+    output[12] |= ((input[ 2] & 0x00004000) >>12);
+    output[12] |= ((input[ 3] & 0x00004000) >>11);
+    output[12] |= ((input[ 4] & 0x00004000) >>10);
+    output[12] |= ((input[ 5] & 0x00004000) >> 9);
+    output[12] |= ((input[ 6] & 0x00004000) >> 8);
+    output[12] |= ((input[ 7] & 0x00004000) >> 7);
+    output[12] |= ((input[ 0] & 0x40000000) >>22);
+    output[12] |= ((input[ 1] & 0x40000000) >>21);
+    output[12] |= ((input[ 2] & 0x40000000) >>20);
+    output[12] |= ((input[ 3] & 0x40000000) >>19);
+    output[12] |= ((input[ 4] & 0x40000000) >>18);
+    output[12] |= ((input[ 5] & 0x40000000) >>17);
+    output[12] |= ((input[ 6] & 0x40000000) >>16);
+    output[12] |= ((input[ 7] & 0x40000000) >>15);
+    output[14] |= ((input[ 0] & 0x00008000) >>15);
+    output[14] |= ((input[ 1] & 0x00008000) >>14);
+    output[14] |= ((input[ 2] & 0x00008000) >>13);
+    output[14] |= ((input[ 3] & 0x00008000) >>12);
+    output[14] |= ((input[ 4] & 0x00008000) >>11);
+    output[14] |= ((input[ 5] & 0x00008000) >>10);
+    output[14] |= ((input[ 6] & 0x00008000) >> 9);
+    output[14] |= ((input[ 7] & 0x00008000) >> 8);
+    output[14] |= ((input[ 0] & 0x80000000) >>23);
+    output[14] |= ((input[ 1] & 0x80000000) >>22);
+    output[14] |= ((input[ 2] & 0x80000000) >>21);
+    output[14] |= ((input[ 3] & 0x80000000) >>20);
+    output[14] |= ((input[ 4] & 0x80000000) >>19);
+    output[14] |= ((input[ 5] & 0x80000000) >>18);
+    output[14] |= ((input[ 6] & 0x80000000) >>17);
+    output[14] |= ((input[ 7] & 0x80000000) >>16);
+
+#pragma unroll 8
+    for (int i = 0; i < 16; i+=2) {
+        if (threadIdx.x & 1) output[i] = __byte_perm(output[i], 0, 0x1032);
+        output[i] = __byte_perm(output[i], __shfl((int)output[i], (threadIdx.x+1)%4, 4), 0x7610);
+        output[i+1] = __shfl((int)output[i], (threadIdx.x+2)%4, 4);
+        if ((threadIdx.x % 4) != 0) output[i] = output[i+1] = 0;
+    }
+}
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index ddca98a9ad..36d4fd5fbe 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -287,6 +287,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
     <ClInclude Include="uint256.h" />
   </ItemGroup>
   <ItemGroup>
+    <CudaCompile Include="bitslice_transformations_quad.cu">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+    </CudaCompile>
     <CudaCompile Include="cuda_fugue256.cu">
       <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
       <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@@ -311,6 +317,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
       <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
       <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
     </CudaCompile>
+    <CudaCompile Include="groestl_functions_quad.cu">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+    </CudaCompile>
     <CudaCompile Include="heavy\cuda_blake512.cu">
       <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
       <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
index 8b7a596809..57c2998a89 100644
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@@ -340,5 +340,11 @@
     <CudaCompile Include="x11\simd_functions.cu">
       <Filter>Source Files\CUDA\x11</Filter>
     </CudaCompile>
+    <CudaCompile Include="bitslice_transformations_quad.cu">
+      <Filter>Source Files\CUDA</Filter>
+    </CudaCompile>
+    <CudaCompile Include="groestl_functions_quad.cu">
+      <Filter>Source Files\CUDA</Filter>
+    </CudaCompile>
   </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/configure.ac b/configure.ac
index e33571afc4..80438fd5de 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([ccminer], [2014.05.10])
+AC_INIT([ccminer], [2014.06.14])
 
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
diff --git a/cpu-miner.c b/cpu-miner.c
index 5eb2d7d041..854802cbf5 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -917,8 +917,8 @@ static void *miner_thread(void *userdata)
 			goto out;
 		}
 
-        if (opt_benchmark)
-            if (++rounds == 1) exit(0);
+//        if (opt_benchmark)
+//            if (++rounds == 1) exit(0);
 
 		/* record scanhash elapsed time */
 		gettimeofday(&tv_end, NULL);
@@ -1469,7 +1469,7 @@ static void signal_handler(int sig)
 }
 #endif
 
-#define PROGRAM_VERSION "1.0"
+#define PROGRAM_VERSION "1.1"
 int main(int argc, char *argv[])
 {
 	struct thr_info *thr;
diff --git a/cpuminer-config.h b/cpuminer-config.h
index 0dca3af1d2..03114b9ab0 100644
--- a/cpuminer-config.h
+++ b/cpuminer-config.h
@@ -152,7 +152,7 @@
 #define PACKAGE_NAME "ccminer"
 
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "ccminer 2014.05.10"
+#define PACKAGE_STRING "ccminer 2014.06.14"
 
 /* Define to the one symbol short name of this package. */
 #undef PACKAGE_TARNAME
@@ -161,7 +161,7 @@
 #undef PACKAGE_URL
 
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "2014.05.10"
+#define PACKAGE_VERSION "2014.06.14"
 
 /* If using the C implementation of alloca, define if you know the
    direction of stack growth for your system; otherwise it will be
diff --git a/cuda_groestlcoin.cu b/cuda_groestlcoin.cu
index b33ff9ebc1..095d3260d7 100644
--- a/cuda_groestlcoin.cu
+++ b/cuda_groestlcoin.cu
@@ -1,4 +1,4 @@
-// Auf Groestlcoin spezialisierte Version von Groestl
+// Auf Groestlcoin spezialisierte Version von Groestl inkl. Bitslice
 
 #include <cuda.h>
 #include "cuda_runtime.h"
@@ -7,9 +7,6 @@
 #include <stdio.h>
 #include <memory.h>
 
-// it's unfortunate that this is a compile time constant.
-#define MAXWELL_OR_FERMI 1
-
 // aus cpu-miner.c
 extern int device_map[8];
 
@@ -18,456 +15,152 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
 
 // Folgende Definitionen später durch header ersetzen
 typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
 typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
 
 // diese Struktur wird in der Init Funktion angefordert
-static cudaDeviceProp props;
+static cudaDeviceProp props[8];
 
+// globaler Speicher für alle HeftyHashes aller Threads
 __constant__ uint32_t pTarget[8]; // Single GPU
 extern uint32_t *d_resultNonce[8];
 
 __constant__ uint32_t groestlcoin_gpu_msg[32];
 
-#define SPH_C32(x)    ((uint32_t)(x ## U))
-#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
-
-#define PC32up(j, r)   ((uint32_t)((j) + (r)))
-#define PC32dn(j, r)   0
-#define QC32up(j, r)   0xFFFFFFFF
-#define QC32dn(j, r)   (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
-
-#define B32_0(x)    __byte_perm(x, 0, 0x4440)
-//((x) & 0xFF)
-#define B32_1(x)    __byte_perm(x, 0, 0x4441)
-//(((x) >> 8) & 0xFF)
-#define B32_2(x)    __byte_perm(x, 0, 0x4442)
-//(((x) >> 16) & 0xFF)
-#define B32_3(x)    __byte_perm(x, 0, 0x4443)
-//((x) >> 24)
-
-#if MAXWELL_OR_FERMI
-#define USE_SHARED 1
-// Maxwell and Fermi cards get the best speed with SHARED access it seems.
-#if USE_SHARED
-#define T0up(x) (*((uint32_t*)mixtabs + (    (x))))
-#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x))))
-#define T1up(x) (*((uint32_t*)mixtabs + (512+(x))))
-#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
-#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x))))
-#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
-#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
-#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x))))
-#else
-#define T0up(x) tex1Dfetch(t0up1, x)
-#define T0dn(x) tex1Dfetch(t0dn1, x)
-#define T1up(x) tex1Dfetch(t1up1, x)
-#define T1dn(x) tex1Dfetch(t1dn1, x)
-#define T2up(x) tex1Dfetch(t2up1, x)
-#define T2dn(x) tex1Dfetch(t2dn1, x)
-#define T3up(x) tex1Dfetch(t3up1, x)
-#define T3dn(x) tex1Dfetch(t3dn1, x)
-#endif
-#else
-#define USE_SHARED 1
-// a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5!
-#define T0up(x) (*((uint32_t*)mixtabs + (    (x))))
-#define T0dn(x) tex1Dfetch(t0dn1, x)
-#define T1up(x) tex1Dfetch(t1up1, x)
-#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
-#define T2up(x) tex1Dfetch(t2up1, x)
-#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
-#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
-#define T3dn(x) tex1Dfetch(t3dn1, x)
-#endif
-
-texture<unsigned int, 1, cudaReadModeElementType> t0up1;
-texture<unsigned int, 1, cudaReadModeElementType> t0dn1;
-texture<unsigned int, 1, cudaReadModeElementType> t1up1;
-texture<unsigned int, 1, cudaReadModeElementType> t1dn1;
-texture<unsigned int, 1, cudaReadModeElementType> t2up1;
-texture<unsigned int, 1, cudaReadModeElementType> t2dn1;
-texture<unsigned int, 1, cudaReadModeElementType> t3up1;
-texture<unsigned int, 1, cudaReadModeElementType> t3dn1;
-
-extern uint32_t T0up_cpu[];
-extern uint32_t T0dn_cpu[];
-extern uint32_t T1up_cpu[];
-extern uint32_t T1dn_cpu[];
-extern uint32_t T2up_cpu[];
-extern uint32_t T2dn_cpu[];
-extern uint32_t T3up_cpu[];
-extern uint32_t T3dn_cpu[];
-
-#define SWAB32(x)		( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
-
-
-__device__ __forceinline__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
+// 64 Register Variante für Compute 3.0
+#include "groestl_functions_quad.cu"
+#include "bitslice_transformations_quad.cu"
+
+#define SWAB32(x)        ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
+
+__global__ void __launch_bounds__(256, 4)
+ groestlcoin_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *resNounce)
 {
-	uint32_t t[32];
-
-//#pragma unroll 14
-	for(int r=0;r<14;r++)
-	{
-		switch(r)
-		{
-			case 0:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break;
-			case 1:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break;
-			case 2:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break;
-			case 3:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break;
-			case 4:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break;
-			case 5:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break;
-			case 6:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break;
-			case 7:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break;
-			case 8:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break;
-			case 9:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break;
-			case 10:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break;
-			case 11:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break;
-			case 12:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break;
-			case 13:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break;
-		}
-
-        // RBTT
-#pragma unroll 16
-        for(int k=0;k<32;k+=2)
-        {
-            uint32_t t0_0 = B32_0(a[(k     ) & 0x1f]), t9_0  = B32_0(a[(k +  9) & 0x1f]);
-            uint32_t t2_1 = B32_1(a[(k +  2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]);
-            uint32_t t4_2 = B32_2(a[(k +  4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]);
-            uint32_t t6_3 = B32_3(a[(k +  6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]);
-        
-            t[k + 0] =  T0up( t0_0 ) ^ T1up(  t2_1 ) ^ T2up(  t4_2 ) ^ T3up(  t6_3 ) ^ 
-                        T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 );
+    // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
+    int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4;
+    if (thread < threads)
+    {
+        // GROESTL
+        uint32_t paddedInput[8];
+#pragma unroll 8
+        for(int k=0;k<8;k++) paddedInput[k] = groestlcoin_gpu_msg[4*k+threadIdx.x%4];
 
-            t[k + 1] =  T0dn( t0_0 ) ^ T1dn(  t2_1 ) ^ T2dn(  t4_2 ) ^ T3dn(  t6_3 ) ^ 
-                        T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 );
-        }
-#pragma unroll 32
-        for(int k=0;k<32;k++)
-            a[k] = t[k];
-    }
-}
+        uint32_t nounce = startNounce + thread;
+        if ((threadIdx.x % 4) == 3)
+            paddedInput[4] = SWAB32(nounce);  // 4*4+3 = 19
 
-__device__ __forceinline__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
-{	
-//#pragma unroll 14
-	for(int r=0;r<14;r++)
-	{
-		uint32_t t[32];
-
-		switch(r)
-		{
-			case 0:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break;
-			case 1:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break;
-			case 2:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break;
-			case 3:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break;
-			case 4:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} break;
-			case 5:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break;
-			case 6:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break;
-			case 7:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break;
-			case 8:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break;
-			case 9:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break;
-			case 10:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break;
-			case 11:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break;
-			case 12:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break;
-			case 13:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break;
-		}
-
-        // RBTT
-#pragma unroll 16
-        for(int k=0;k<32;k+=2)
+        uint32_t msgBitsliced[8];
+        to_bitslice_quad(paddedInput, msgBitsliced);
+
+        uint32_t state[8];
+        for (int round=0; round<2; round++)
         {
-            uint32_t t2_0  = B32_0(a[(k +  2) & 0x1f]), t1_0  = B32_0(a[(k +  1) & 0x1f]);
-            uint32_t t6_1  = B32_1(a[(k +  6) & 0x1f]), t5_1  = B32_1(a[(k +  5) & 0x1f]);
-            uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2  = B32_2(a[(k +  9) & 0x1f]);
-            uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]);
-        
-            t[k + 0] =  T0up( t2_0 ) ^ T1up( t6_1 ) ^ T2up( t10_2 ) ^ T3up( t22_3 ) ^ 
-                        T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn(  t9_2 ) ^ T3dn( t13_3 );
+            groestl512_progressMessage_quad(state, msgBitsliced);
+
+            if (round < 1)
+            {
+                // Verkettung zweier Runden inclusive Padding.
+                msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + ((threadIdx.x%4)==3)*0x2000);
+                msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341);
+                msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341);
+                msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341);
+                msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341);
+                msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341);
+                msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341);
+                msgBitsliced[ 7] = __byte_perm(state[ 7], 0x00800100, 0x4341 + ((threadIdx.x%4)==0)*0x0010);
+            }
+        }
 
-            t[k + 1] =  T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^ 
-                        T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up(  t9_2 ) ^ T3up( t13_3 );
+        // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash
+        uint32_t out_state[16];
+        from_bitslice_quad(state, out_state);
+        
+        if (threadIdx.x % 4 == 0)
+        {
+            int i, position = -1;
+            bool rc = true;
+
+    #pragma unroll 8
+            for (i = 7; i >= 0; i--) {
+                if (out_state[i] > pTarget[i]) {
+                    if(position < i) {
+                        position = i;
+                        rc = false;
+                    }
+                 }
+                 if (out_state[i] < pTarget[i]) {
+                    if(position < i) {
+                        position = i;
+                        rc = true;
+                    }
+                 }
+            }
+
+            if(rc == true)
+                if(resNounce[0] > nounce)
+                    resNounce[0] = nounce;
         }
-#pragma unroll 32
-        for(int k=0;k<32;k++)
-            a[k] = t[k];
     }
 }
-#if USE_SHARED
-__global__ void  /* __launch_bounds__(256) */
-#else
-__global__ void 
-#endif
-
- groestlcoin_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce)
-{
-#if USE_SHARED
-	extern __shared__ char mixtabs[];
-
-	if (threadIdx.x < 256)
-	{
-		*((uint32_t*)mixtabs + (    threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x);
-		*((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x);
-		*((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x);
-		*((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x);
-		*((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x);
-		*((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x);
-		*((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x);
-		*((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x);
-	}
-
-	__syncthreads();
-#endif
-
-	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	if (thread < threads)
-	{
-		// GROESTL
-		uint32_t message[32];
-		uint32_t state[32];
-
-#pragma unroll 32
-		for(int k=0;k<32;k++) message[k] = groestlcoin_gpu_msg[k];
-
-		uint32_t nounce = startNounce + thread;
-		message[19] = SWAB32(nounce);
-
-#pragma unroll 32
-		for(int u=0;u<32;u++) state[u] = message[u];
-		state[31] ^= 0x20000;
-
-		// Perm
-#if USE_SHARED
-		groestlcoin_perm_P(state, mixtabs);
-		state[31] ^= 0x20000;
-		groestlcoin_perm_Q(message, mixtabs);
-#else
-		groestlcoin_perm_P(state, NULL);
-		state[31] ^= 0x20000;
-		groestlcoin_perm_Q(message, NULL);
-#endif
-#pragma unroll 32
-		for(int u=0;u<32;u++) state[u] ^= message[u];
-
-#pragma unroll 32
-		for(int u=0;u<32;u++) message[u] = state[u];
-
-#if USE_SHARED
-		groestlcoin_perm_P(message, mixtabs);
-#else
-		groestlcoin_perm_P(message, NULL);
-#endif
-
-#pragma unroll 32
-		for(int u=0;u<32;u++) state[u] ^= message[u];
-
-		////
-		//// 2. Runde groestl
-		////
-#pragma unroll 16
-		for(int k=0;k<16;k++) message[k] = state[k + 16];
-#pragma unroll 14
-		for(int k=1;k<15;k++)
-			message[k+16] = 0;
-
-		message[16] = 0x80;
-		message[31] = 0x01000000;
-
-#pragma unroll 32
-		for(int u=0;u<32;u++)
-			state[u] = message[u];
-		state[31] ^= 0x20000;
-
-		// Perm
-#if USE_SHARED
-		groestlcoin_perm_P(state, mixtabs);
-		state[31] ^= 0x20000;
-		groestlcoin_perm_Q(message, mixtabs);
-#else
-		groestlcoin_perm_P(state, NULL);
-		state[31] ^= 0x20000;
-		groestlcoin_perm_Q(message, NULL);
-#endif
-		
-#pragma unroll 32
-		for(int u=0;u<32;u++) state[u] ^= message[u];
-
-#pragma unroll 32
-		for(int u=0;u<32;u++) message[u] = state[u];
-
-#if USE_SHARED
-		groestlcoin_perm_P(message, mixtabs);
-#else
-		groestlcoin_perm_P(message, NULL);
-#endif
-
-#pragma unroll 32
-		for(int u=0;u<32;u++) state[u] ^= message[u];
-
-		// kopiere Ergebnis
-		int i, position = -1;
-		bool rc = true;
-
-#pragma unroll 8
-		for (i = 7; i >= 0; i--) {
-			if (state[i+16] > pTarget[i]) {
-				if(position < i) {
-					position = i;
-					rc = false;
-				}
-	 		}
-	 		if (state[i+16] < pTarget[i]) {
-				if(position < i) {
-					position = i;
-					rc = true;
-				}
-	 		}
-		}
-
-		if(rc == true)
-			if(resNounce[0] > nounce)
-				resNounce[0] = nounce;
-	}
-}
-
-#define texDef(texname, texmem, texsource, texsize) \
-	unsigned int *texmem; \
-	cudaMalloc(&texmem, texsize); \
-	cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
-	texname.normalized = 0; \
-	texname.filterMode = cudaFilterModePoint; \
-	texname.addressMode[0] = cudaAddressModeClamp; \
-	{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
-	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
 
 // Setup-Funktionen
 __host__ void groestlcoin_cpu_init(int thr_id, int threads)
 {
-	cudaSetDevice(device_map[thr_id]);
-
-	cudaGetDeviceProperties(&props, device_map[thr_id]);
-
-	// Texturen mit obigem Makro initialisieren
-	texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
-	texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
-	texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
-	texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
-	texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
-	texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
-	texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
-	texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
-
-	// Speicher für Gewinner-Nonce belegen
-	cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); 
+    cudaSetDevice(device_map[thr_id]);
+
+    cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
+
+    // Speicher für Gewinner-Nonce belegen
+    cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); 
 }
 
 __host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
 {
-	// Nachricht expandieren und setzen
-	uint32_t msgBlock[32];
+    // Nachricht expandieren und setzen
+    uint32_t msgBlock[32];
 
-	memset(msgBlock, 0, sizeof(uint32_t) * 32);
-	memcpy(&msgBlock[0], data, 80);
+    memset(msgBlock, 0, sizeof(uint32_t) * 32);
+    memcpy(&msgBlock[0], data, 80);
 
-	// Erweitere die Nachricht auf den Nachrichtenblock (padding)
-	// Unsere Nachricht hat 80 Byte
-	msgBlock[20] = 0x80;
-	msgBlock[31] = 0x01000000;
+    // Erweitere die Nachricht auf den Nachrichtenblock (padding)
+    // Unsere Nachricht hat 80 Byte
+    msgBlock[20] = 0x80;
+    msgBlock[31] = 0x01000000;
 
-	// groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird
-	// auf der GPU ausgeführt)
+    // groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird
+    // auf der GPU ausgeführt)
 
-	// Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch)
-	cudaMemcpyToSymbol(	groestlcoin_gpu_msg,
-						msgBlock,
-						128);
+    // Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch)
+    cudaMemcpyToSymbol( groestlcoin_gpu_msg,
+                        msgBlock,
+                        128);
 
-	cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
-	cudaMemcpyToSymbol(	pTarget,
-						pTargetIn,
-						sizeof(uint32_t) * 8 );
+    cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
+    cudaMemcpyToSymbol( pTarget,
+                        pTargetIn,
+                        sizeof(uint32_t) * 8 );
 }
 
 __host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
 {
-	// Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
-	// alle anderen mit 512 Threads.
-	int threadsperblock = (props.major >= 3) ? 768 : 512;
-
-	// berechne wie viele Thread Blocks wir brauchen
-	dim3 grid((threads + threadsperblock-1)/threadsperblock);
-	dim3 block(threadsperblock);
-
-	// Größe des dynamischen Shared Memory Bereichs
-#if USE_SHARED
-	size_t shared_size = 8 * 256 * sizeof(uint32_t);
-#else
-	size_t shared_size = 0;
-#endif
-
-//	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
-	//fprintf(stderr, "ThrID: %d\n", thr_id);
-	cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
-	groestlcoin_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
-
-	// Strategisches Sleep Kommando zur Senkung der CPU Last
-	MyStreamSynchronize(NULL, 0, thr_id);
-
-	cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+    int threadsperblock = 256;
+
+    // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
+    // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
+    int factor = 4;
+
+        // berechne wie viele Thread Blocks wir brauchen
+    dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
+    dim3 block(threadsperblock);
+
+    // Größe des dynamischen Shared Memory Bereichs
+    size_t shared_size = 0;
+
+    cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
+    groestlcoin_gpu_hash_quad<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
+
+    // Strategisches Sleep Kommando zur Senkung der CPU Last
+    MyStreamSynchronize(NULL, 0, thr_id);
+
+    cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 }
diff --git a/cuda_myriadgroestl.cu b/cuda_myriadgroestl.cu
index 062db46c21..3a992e0302 100644
--- a/cuda_myriadgroestl.cu
+++ b/cuda_myriadgroestl.cu
@@ -1,4 +1,4 @@
-// Auf Myriadcoin spezialisierte Version von Groestl
+// Auf Myriadcoin spezialisierte Version von Groestl inkl. Bitslice
 
 #include <cuda.h>
 #include "cuda_runtime.h"
@@ -7,9 +7,6 @@
 #include <stdio.h>
 #include <memory.h>
 
-// it's unfortunate that this is a compile time constant.
-#define MAXWELL_OR_FERMI 1
-
 // aus cpu-miner.c
 extern int device_map[8];
 
@@ -22,30 +19,49 @@ typedef unsigned short uint16_t;
 typedef unsigned int uint32_t;
 
 // diese Struktur wird in der Init Funktion angefordert
-static cudaDeviceProp props;
+static cudaDeviceProp props[8];
 
+// globaler Speicher für alle HeftyHashes aller Threads
 __constant__ uint32_t pTarget[8]; // Single GPU
+uint32_t *d_outputHashes[8];
 extern uint32_t *d_resultNonce[8];
 
 __constant__ uint32_t myriadgroestl_gpu_msg[32];
 
 // muss expandiert werden
 __constant__ uint32_t myr_sha256_gpu_constantTable[64];
+__constant__ uint32_t myr_sha256_gpu_constantTable2[64];
 __constant__ uint32_t myr_sha256_gpu_hashTable[8];
 
 uint32_t myr_sha256_cpu_hashTable[] = { 
-	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
+    0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
 uint32_t myr_sha256_cpu_constantTable[] = {
-	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
-	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
-	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
-	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
-	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
-	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
-	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
-	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
+    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 };
 
+uint32_t myr_sha256_cpu_w2Table[] = {
+    0x80000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000200,
+    0x80000000, 0x01400000, 0x00205000, 0x00005088, 0x22000800, 0x22550014, 0x05089742, 0xa0000020,
+    0x5a880000, 0x005c9400, 0x0016d49d, 0xfa801f00, 0xd33225d0, 0x11675959, 0xf6e6bfda, 0xb30c1549,
+    0x08b2b050, 0x9d7c4c27, 0x0ce2a393, 0x88e6e1ea, 0xa52b4335, 0x67a16f49, 0xd732016f, 0x4eeb2e91,
+    0x5dbf55e5, 0x8eee2335, 0xe2bc5ec2, 0xa83f4394, 0x45ad78f7, 0x36f3d0cd, 0xd99c05e8, 0xb0511dc7,
+    0x69bc7ac4, 0xbd11375b, 0xe3ba71e5, 0x3b209ff2, 0x18feee17, 0xe25ad9e7, 0x13375046, 0x0515089d,
+    0x4f0d0f04, 0x2627484e, 0x310128d2, 0xc668b434, 0x420841cc, 0x62d311b8, 0xe59ba771, 0x85a7a484 };
+
+// 64 Register Variante für Compute 3.0
+#include "groestl_functions_quad.cu"
+#include "bitslice_transformations_quad.cu"
+
+#define SWAB32(x)        ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
+
 #if __CUDA_ARCH__ < 350 
     // Kepler (Compute 3.0)
     #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
@@ -53,452 +69,219 @@ uint32_t myr_sha256_cpu_constantTable[] = {
     // Kepler (Compute 3.5)
     #define ROTR32(x, n) __funnelshift_r( (x), (x), (n) )
 #endif
-#define R(x, n)			((x) >> (n))
-#define Ch(x, y, z)		((x & (y ^ z)) ^ z)
-#define Maj(x, y, z)	((x & (y | z)) | (y & z))
-#define S0(x)			(ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22))
-#define S1(x)			(ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25))
-#define s0(x)			(ROTR32(x, 7) ^ ROTR32(x, 18) ^ R(x, 3))
-#define s1(x)			(ROTR32(x, 17) ^ ROTR32(x, 19) ^ R(x, 10))
-
-#define SWAB32(x)		( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
+#define R(x, n)            ((x) >> (n))
+#define Ch(x, y, z)        ((x & (y ^ z)) ^ z)
+#define Maj(x, y, z)    ((x & (y | z)) | (y & z))
+#define S0(x)            (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22))
+#define S1(x)            (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25))
+#define s0(x)            (ROTR32(x, 7) ^ ROTR32(x, 18) ^ R(x, 3))
+#define s1(x)            (ROTR32(x, 17) ^ ROTR32(x, 19) ^ R(x, 10))
 
 __device__ void myriadgroestl_gpu_sha256(uint32_t *message)
 {
-	uint32_t W1[16];
-	uint32_t W2[16];
+    uint32_t W1[16];
+    uint32_t W2[16];
 
-	// Initialisiere die register a bis h mit der Hash-Tabelle
-	uint32_t regs[8];
-	uint32_t hash[8];
+    // Initialisiere die register a bis h mit der Hash-Tabelle
+    uint32_t regs[8];
+    uint32_t hash[8];
 
-	// pre
+    // pre
 #pragma unroll 8
-	for (int k=0; k < 8; k++)
-	{
-		regs[k] = myr_sha256_gpu_hashTable[k];
-		hash[k] = regs[k];
-	}
-	
+    for (int k=0; k < 8; k++)
+    {
+        regs[k] = myr_sha256_gpu_hashTable[k];
+        hash[k] = regs[k];
+    }
+    
 #pragma unroll 16
-	for(int k=0;k<16;k++)
-		W1[k] = SWAB32(message[k]);
+    for(int k=0;k<16;k++)
+        W1[k] = SWAB32(message[k]);
 
 // Progress W1
 #pragma unroll 16
-	for(int j=0;j<16;j++)
-	{
-		uint32_t T1, T2;
-		T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j] + W1[j];
-		T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
-		
-		#pragma unroll 7
-		for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
-		regs[0] = T1 + T2;
-		regs[4] += T1;
-	}
+    for(int j=0;j<16;j++)
+    {
+        uint32_t T1, T2;
+        T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j] + W1[j];
+        T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
+        
+        #pragma unroll 7
+        for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
+        regs[0] = T1 + T2;
+        regs[4] += T1;
+    }
 
 // Progress W2...W3
-#pragma unroll 3
-	for(int k=0;k<3;k++)
-	{
+////// PART 1
 #pragma unroll 2
-		for(int j=0;j<2;j++)
-			W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
+    for(int j=0;j<2;j++)
+        W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
 #pragma unroll 5
-		for(int j=2;j<7;j++)
-			W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
+    for(int j=2;j<7;j++)
+        W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
 
 #pragma unroll 8
-		for(int j=7;j<15;j++)
-			W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
+    for(int j=7;j<15;j++)
+        W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
 
-		W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
+    W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
 
-		// Rundenfunktion
+    // Rundenfunktion
 #pragma unroll 16
-		for(int j=0;j<16;j++)
-		{
-			uint32_t T1, T2;
-			T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j];
-			T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
-		
-			#pragma unroll 7
-			for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
-			regs[0] = T1 + T2;
-			regs[4] += T1;
-		}
-
-#pragma unroll 16
-		for(int j=0;j<16;j++)
-			W1[j] = W2[j];
-	}
+    for(int j=0;j<16;j++)
+    {
+        uint32_t T1, T2;
+        T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16] + W2[j];
+        T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
+        
+        #pragma unroll 7
+        for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
+        regs[0] = T1 + T2;
+        regs[4] += T1;
+    }
 
-#pragma unroll 8
-	for(int k=0;k<8;k++)
-		hash[k] += regs[k];
+////// PART 2
+#pragma unroll 2
+    for(int j=0;j<2;j++)
+        W1[j] = s1(W2[14+j]) + W2[9+j] + s0(W2[1+j]) + W2[j];
+#pragma unroll 5
+    for(int j=2;j<7;j++)
+        W1[j] = s1(W1[j-2]) + W2[9+j] + s0(W2[1+j]) + W2[j];
 
-	/////
-	///// Zweite Runde (wegen Msg-Padding)
-	/////
 #pragma unroll 8
-	for(int k=0;k<8;k++)
-		regs[k] = hash[k];
+    for(int j=7;j<15;j++)
+        W1[j] = s1(W1[j-2]) + W1[j-7] + s0(W2[1+j]) + W2[j];
 
-	W1[0] = SWAB32(0x80);
-#pragma unroll 14
-	for(int k=1;k<15;k++)
-		W1[k] = 0;
-	W1[15] = 512;
+    W1[15] = s1(W1[13]) + W1[8] + s0(W1[0]) + W2[15];
 
-// Progress W1
+    // Rundenfunktion
 #pragma unroll 16
-	for(int j=0;j<16;j++)
-	{
-		uint32_t T1, T2;
-		T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j] + W1[j];
-		T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
-		
-		#pragma unroll 7
-		for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
-		regs[0] = T1 + T2;
-		regs[4] += T1;
-	}
+    for(int j=0;j<16;j++)
+    {
+        uint32_t T1, T2;
+        T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 32] + W1[j];
+        T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
+        
+        #pragma unroll 7
+        for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
+        regs[0] = T1 + T2;
+        regs[4] += T1;
+    }
 
-// Progress W2...W3
-#pragma unroll 3
-	for(int k=0;k<3;k++)
-	{
+////// PART 3
 #pragma unroll 2
-		for(int j=0;j<2;j++)
-			W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
+    for(int j=0;j<2;j++)
+        W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
 #pragma unroll 5
-		for(int j=2;j<7;j++)
-			W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
+    for(int j=2;j<7;j++)
+        W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
 
 #pragma unroll 8
-		for(int j=7;j<15;j++)
-			W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
+    for(int j=7;j<15;j++)
+        W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
 
-		W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
+    W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
 
-		// Rundenfunktion
+    // Rundenfunktion
 #pragma unroll 16
-		for(int j=0;j<16;j++)
-		{
-			uint32_t T1, T2;
-			T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j];
-			T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
-		
-			#pragma unroll 7
-			for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
-			regs[0] = T1 + T2;
-			regs[4] += T1;
-		}
+    for(int j=0;j<16;j++)
+    {
+        uint32_t T1, T2;
+        T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 48] + W2[j];
+        T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
+        
+        #pragma unroll 7
+        for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
+        regs[0] = T1 + T2;
+        regs[4] += T1;
+    }
 
-#pragma unroll 16
-		for(int j=0;j<16;j++)
-			W1[j] = W2[j];
-	}
+#pragma unroll 8
+    for(int k=0;k<8;k++)
+        hash[k] += regs[k];
 
+    /////
+    ///// Zweite Runde (wegen Msg-Padding)
+    /////
 #pragma unroll 8
-	for(int k=0;k<8;k++)
-		hash[k] += regs[k];
+    for(int k=0;k<8;k++)
+        regs[k] = hash[k];
 
-	//// FERTIG
+// Progress W1
+#pragma unroll 64
+    for(int j=0;j<64;j++)
+    {
+        uint32_t T1, T2;
+        T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable2[j];
+        T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
+        
+        #pragma unroll 7
+        for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
+        regs[0] = T1 + T2;
+        regs[4] += T1;
+    }
 
 #pragma unroll 8
-	for(int k=0;k<8;k++)
-		message[k] = SWAB32(hash[k]);
-}
+    for(int k=0;k<8;k++)
+        hash[k] += regs[k];
 
-#define SPH_C32(x)    ((uint32_t)(x ## U))
-#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
-
-#define PC32up(j, r)   ((uint32_t)((j) + (r)))
-#define PC32dn(j, r)   0
-#define QC32up(j, r)   0xFFFFFFFF
-#define QC32dn(j, r)   (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
-
-#define B32_0(x)    __byte_perm(x, 0, 0x4440)
-//((x) & 0xFF)
-#define B32_1(x)    __byte_perm(x, 0, 0x4441)
-//(((x) >> 8) & 0xFF)
-#define B32_2(x)    __byte_perm(x, 0, 0x4442)
-//(((x) >> 16) & 0xFF)
-#define B32_3(x)    __byte_perm(x, 0, 0x4443)
-//((x) >> 24)
-
-#if MAXWELL_OR_FERMI
-#define USE_SHARED 1
-// Maxwell and Fermi cards get the best speed with SHARED access it seems.
-#if USE_SHARED
-#define T0up(x) (*((uint32_t*)mixtabs + (    (x))))
-#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x))))
-#define T1up(x) (*((uint32_t*)mixtabs + (512+(x))))
-#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
-#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x))))
-#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
-#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
-#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x))))
-#else
-#define T0up(x) tex1Dfetch(t0up1, x)
-#define T0dn(x) tex1Dfetch(t0dn1, x)
-#define T1up(x) tex1Dfetch(t1up1, x)
-#define T1dn(x) tex1Dfetch(t1dn1, x)
-#define T2up(x) tex1Dfetch(t2up1, x)
-#define T2dn(x) tex1Dfetch(t2dn1, x)
-#define T3up(x) tex1Dfetch(t3up1, x)
-#define T3dn(x) tex1Dfetch(t3dn1, x)
-#endif
-#else
-#define USE_SHARED 1
-// a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5!
-#define T0up(x) (*((uint32_t*)mixtabs + (    (x))))
-#define T0dn(x) tex1Dfetch(t0dn1, x)
-#define T1up(x) tex1Dfetch(t1up1, x)
-#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
-#define T2up(x) tex1Dfetch(t2up1, x)
-#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
-#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
-#define T3dn(x) tex1Dfetch(t3dn1, x)
-#endif
+    //// FERTIG
 
-texture<unsigned int, 1, cudaReadModeElementType> t0up1;
-texture<unsigned int, 1, cudaReadModeElementType> t0dn1;
-texture<unsigned int, 1, cudaReadModeElementType> t1up1;
-texture<unsigned int, 1, cudaReadModeElementType> t1dn1;
-texture<unsigned int, 1, cudaReadModeElementType> t2up1;
-texture<unsigned int, 1, cudaReadModeElementType> t2dn1;
-texture<unsigned int, 1, cudaReadModeElementType> t3up1;
-texture<unsigned int, 1, cudaReadModeElementType> t3dn1;
+#pragma unroll 8
+    for(int k=0;k<8;k++)
+        message[k] = SWAB32(hash[k]);
+}
 
-extern uint32_t T0up_cpu[];
-extern uint32_t T0dn_cpu[];
-extern uint32_t T1up_cpu[];
-extern uint32_t T1dn_cpu[];
-extern uint32_t T2up_cpu[];
-extern uint32_t T2dn_cpu[];
-extern uint32_t T3up_cpu[];
-extern uint32_t T3dn_cpu[];
+__global__ void __launch_bounds__(256, 4)
+ myriadgroestl_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *hashBuffer)
+{
+    // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
+    int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4;
+    if (thread < threads)
+    {
+        // GROESTL
+        uint32_t paddedInput[8];
+#pragma unroll 8
+        for(int k=0;k<8;k++) paddedInput[k] = myriadgroestl_gpu_msg[4*k+threadIdx.x%4];
 
-#define SWAB32(x)		( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
+        uint32_t nounce = startNounce + thread;
+        if ((threadIdx.x % 4) == 3)
+            paddedInput[4] = SWAB32(nounce);  // 4*4+3 = 19
 
+        uint32_t msgBitsliced[8];
+        to_bitslice_quad(paddedInput, msgBitsliced);
 
-__device__ __forceinline__ void myriadgroestl_perm_P(uint32_t *a, char *mixtabs)
-{
-	uint32_t t[32];
-
-//#pragma unroll 14
-	for(int r=0;r<14;r++)
-	{
-		switch(r)
-		{
-			case 0:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break;
-			case 1:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break;
-			case 2:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break;
-			case 3:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break;
-			case 4:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break;
-			case 5:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break;
-			case 6:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break;
-			case 7:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break;
-			case 8:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break;
-			case 9:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break;
-			case 10:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break;
-			case 11:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break;
-			case 12:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break;
-			case 13:
-#pragma unroll 16
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break;
-		}
+        uint32_t state[8];
 
-        // RBTT
-#pragma unroll 16
-        for(int k=0;k<32;k+=2)
-        {
-            uint32_t t0_0 = B32_0(a[(k     ) & 0x1f]), t9_0  = B32_0(a[(k +  9) & 0x1f]);
-            uint32_t t2_1 = B32_1(a[(k +  2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]);
-            uint32_t t4_2 = B32_2(a[(k +  4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]);
-            uint32_t t6_3 = B32_3(a[(k +  6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]);
-        
-            t[k + 0] =  T0up( t0_0 ) ^ T1up(  t2_1 ) ^ T2up(  t4_2 ) ^ T3up(  t6_3 ) ^ 
-                        T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 );
+        groestl512_progressMessage_quad(state, msgBitsliced);
 
-            t[k + 1] =  T0dn( t0_0 ) ^ T1dn(  t2_1 ) ^ T2dn(  t4_2 ) ^ T3dn(  t6_3 ) ^ 
-                        T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 );
-        }
-#pragma unroll 32
-        for(int k=0;k<32;k++)
-            a[k] = t[k];
-    }
-}
+        uint32_t out_state[16];
+        from_bitslice_quad(state, out_state);
 
-__device__ __forceinline__ void myriadgroestl_perm_Q(uint32_t *a, char *mixtabs)
-{	
-//#pragma unroll 14
-	for(int r=0;r<14;r++)
-	{
-		uint32_t t[32];
-
-		switch(r)
-		{
-			case 0:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break;
-			case 1:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break;
-			case 2:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break;
-			case 3:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break;
-			case 4:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} break;
-			case 5:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break;
-			case 6:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break;
-			case 7:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break;
-			case 8:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break;
-			case 9:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break;
-			case 10:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break;
-			case 11:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break;
-			case 12:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break;
-			case 13:
-	#pragma unroll 16
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break;
-		}
-
-        // RBTT
-#pragma unroll 16
-        for(int k=0;k<32;k+=2)
+        if ((threadIdx.x & 0x03) == 0)
         {
-            uint32_t t2_0  = B32_0(a[(k +  2) & 0x1f]), t1_0  = B32_0(a[(k +  1) & 0x1f]);
-            uint32_t t6_1  = B32_1(a[(k +  6) & 0x1f]), t5_1  = B32_1(a[(k +  5) & 0x1f]);
-            uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2  = B32_2(a[(k +  9) & 0x1f]);
-            uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]);
-        
-            t[k + 0] =  T0up( t2_0 ) ^ T1up( t6_1 ) ^ T2up( t10_2 ) ^ T3up( t22_3 ) ^ 
-                        T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn(  t9_2 ) ^ T3dn( t13_3 );
-
-            t[k + 1] =  T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^ 
-                        T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up(  t9_2 ) ^ T3up( t13_3 );
+            uint32_t *outpHash = &hashBuffer[16 * thread];
+#pragma unroll 16
+            for(int k=0;k<16;k++) outpHash[k] = out_state[k];
         }
-#pragma unroll 32
-        for(int k=0;k<32;k++)
-            a[k] = t[k];
     }
 }
 
-__global__ void 
-myriadgroestl_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce)
+__global__ void
+ myriadgroestl_gpu_hash_quad2(int threads, uint32_t startNounce, uint32_t *resNounce, uint32_t *hashBuffer)
 {
-#if USE_SHARED
-	extern __shared__ char mixtabs[];
-
-	if (threadIdx.x < 256)
-	{
-		*((uint32_t*)mixtabs + (    threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x);
-		*((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x);
-		*((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x);
-		*((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x);
-		*((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x);
-		*((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x);
-		*((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x);
-		*((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x);
-	}
-
-	__syncthreads();
-#endif
-
     int thread = (blockDim.x * blockIdx.x + threadIdx.x);
     if (thread < threads)
     {
-		// GROESTL
-		uint32_t message[32];
-		uint32_t state[32];
-
-#pragma unroll 32
-		for(int k=0;k<32;k++) message[k] = myriadgroestl_gpu_msg[k];
-
-		uint32_t nounce = startNounce + thread;
-		message[19] = SWAB32(nounce);
-
-#pragma unroll 32
-		for(int u=0;u<32;u++) state[u] = message[u];
-		state[31] ^= 0x20000;
-
-		// Perm
-#if USE_SHARED
-		myriadgroestl_perm_P(state, mixtabs);
-		state[31] ^= 0x20000;
-		myriadgroestl_perm_Q(message, mixtabs);
-#else
-		myriadgroestl_perm_P(state, NULL);
-		state[31] ^= 0x20000;
-		myriadgroestl_perm_Q(message, NULL);
-#endif
-#pragma unroll 32
-		for(int u=0;u<32;u++) state[u] ^= message[u];
-
-#pragma unroll 32
-		for(int u=0;u<32;u++) message[u] = state[u];
-
-#if USE_SHARED
-		myriadgroestl_perm_P(message, mixtabs);
-#else
-		myriadgroestl_perm_P(message, NULL);
-#endif
-
-#pragma unroll 32
-		for(int u=0;u<32;u++) state[u] ^= message[u];
+        uint32_t nounce = startNounce + thread;
 
         uint32_t out_state[16];
+        uint32_t *inpHash = &hashBuffer[16 * thread];
 #pragma unroll 16
-		for(int u=0;u<16;u++) out_state[u] = state[u+16];
+        for (int i=0; i < 16; i++)
+            out_state[i] = inpHash[i];
+
         myriadgroestl_gpu_sha256(out_state);
         
         int i, position = -1;
@@ -526,43 +309,35 @@ myriadgroestl_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce)
     }
 }
 
-#define texDef(texname, texmem, texsource, texsize) \
-	unsigned int *texmem; \
-	cudaMalloc(&texmem, texsize); \
-	cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
-	texname.normalized = 0; \
-	texname.filterMode = cudaFilterModePoint; \
-	texname.addressMode[0] = cudaAddressModeClamp; \
-	{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
-	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
-
 // Setup-Funktionen
 __host__ void myriadgroestl_cpu_init(int thr_id, int threads)
 {
-	cudaSetDevice(device_map[thr_id]);
-	
-	cudaMemcpyToSymbol(	myr_sha256_gpu_hashTable,
-						myr_sha256_cpu_hashTable,
-						sizeof(uint32_t) * 8 );
-
-	cudaMemcpyToSymbol(	myr_sha256_gpu_constantTable,
-						myr_sha256_cpu_constantTable,
-						sizeof(uint32_t) * 64 );
-
-    cudaGetDeviceProperties(&props, device_map[thr_id]);
-
-	// Texturen mit obigem Makro initialisieren
-	texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
-	texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
-	texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
-	texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
-	texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
-	texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
-	texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
-	texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
+    cudaSetDevice(device_map[thr_id]);
+    
+    cudaMemcpyToSymbol( myr_sha256_gpu_hashTable,
+                        myr_sha256_cpu_hashTable,
+                        sizeof(uint32_t) * 8 );
+
+    cudaMemcpyToSymbol( myr_sha256_gpu_constantTable,
+                        myr_sha256_cpu_constantTable,
+                        sizeof(uint32_t) * 64 );
+
+    // zweite CPU-Tabelle bauen und auf die GPU laden
+    uint32_t temp[64];
+    for(int i=0;i<64;i++)
+        temp[i] = myr_sha256_cpu_w2Table[i] + myr_sha256_cpu_constantTable[i];
+
+    cudaMemcpyToSymbol( myr_sha256_gpu_constantTable2,
+                        temp,
+                        sizeof(uint32_t) * 64 );
+
+    cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
 
     // Speicher für Gewinner-Nonce belegen
     cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); 
+
+    // Speicher für temporäreHashes
+    cudaMalloc(&d_outputHashes[thr_id], 16*sizeof(uint32_t)*threads); 
 }
 
 __host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
@@ -594,25 +369,23 @@ __host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn
 
 __host__ void myriadgroestl_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
 {
-	// Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
-	// alle anderen mit 512 Threads.
-	int threadsperblock = (props.major >= 3) ? 768 : 512;
+    int threadsperblock = 256;
 
-    // berechne wie viele Thread Blocks wir brauchen
-    dim3 grid((threads + threadsperblock-1)/threadsperblock);
-    dim3 block(threadsperblock);
+    // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
+    // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
+    const int factor=4;
 
-	// Größe des dynamischen Shared Memory Bereichs
-#if USE_SHARED
-	size_t shared_size = 8 * 256 * sizeof(uint32_t);
-#else
-	size_t shared_size = 0;
-#endif
+    // Größe des dynamischen Shared Memory Bereichs
+    size_t shared_size = 0;
 
-//    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
-    //fprintf(stderr, "ThrID: %d\n", thr_id);
     cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
-    myriadgroestl_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
+    // berechne wie viele Thread Blocks wir brauchen
+    dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
+    dim3 block(threadsperblock);
+
+    myriadgroestl_gpu_hash_quad<<<grid, block, shared_size>>>(threads, startNounce, d_outputHashes[thr_id]);
+    dim3 grid2((threads + threadsperblock-1)/threadsperblock);
+    myriadgroestl_gpu_hash_quad2<<<grid2, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id], d_outputHashes[thr_id]);
 
     // Strategisches Sleep Kommando zur Senkung der CPU Last
     MyStreamSynchronize(NULL, 0, thr_id);
diff --git a/cuda_nist5.cu b/cuda_nist5.cu
index 0296b4e99a..4e37f6968c 100644
--- a/cuda_nist5.cu
+++ b/cuda_nist5.cu
@@ -85,7 +85,6 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
 {
 	const uint32_t first_nonce = pdata[19];
 
-	// TODO: entfernen für eine Release! Ist nur zum Testen!
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;
 
diff --git a/groestl_functions_quad.cu b/groestl_functions_quad.cu
new file mode 100644
index 0000000000..745a843d54
--- /dev/null
+++ b/groestl_functions_quad.cu
@@ -0,0 +1,315 @@
+
+__device__ __forceinline__ void G256_Mul2(uint32_t *regs)
+{
+    uint32_t tmp = regs[7];
+    regs[7] = regs[6];
+    regs[6] = regs[5];
+    regs[5] = regs[4];
+    regs[4] = regs[3] ^ tmp;
+    regs[3] = regs[2] ^ tmp;
+    regs[2] = regs[1];
+    regs[1] = regs[0] ^ tmp;
+    regs[0] = tmp;
+}
+
+__device__ __forceinline__ void G256_AddRoundConstantQ_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, int round)
+{
+    x0 = ~x0;
+    x1 = ~x1;
+    x2 = ~x2;
+    x3 = ~x3;
+    x4 = ~x4;
+    x5 = ~x5;
+    x6 = ~x6;
+    x7 = ~x7;
+
+    if ((threadIdx.x & 0x03) == 3) {
+        x0 ^= ((- (round & 0x01)    ) & 0xFFFF0000);
+        x1 ^= ((-((round & 0x02)>>1)) & 0xFFFF0000);
+        x2 ^= ((-((round & 0x04)>>2)) & 0xFFFF0000);
+        x3 ^= ((-((round & 0x08)>>3)) & 0xFFFF0000);
+        x4 ^= 0xAAAA0000;
+        x5 ^= 0xCCCC0000;
+        x6 ^= 0xF0F00000;
+        x7 ^= 0xFF000000;
+    }
+}
+
+__device__ __forceinline__ void G256_AddRoundConstantP_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, int round)
+{
+    if ((threadIdx.x & 0x03) == 0)
+    {
+        x4 ^= 0xAAAA;
+        x5 ^= 0xCCCC;
+        x6 ^= 0xF0F0;
+        x7 ^= 0xFF00;
+
+        x0 ^= ((- (round & 0x01)    ) & 0xFFFF);
+        x1 ^= ((-((round & 0x02)>>1)) & 0xFFFF);
+        x2 ^= ((-((round & 0x04)>>2)) & 0xFFFF);
+        x3 ^= ((-((round & 0x08)>>3)) & 0xFFFF);
+    }
+}
+
+__device__ __forceinline__ void G16mul_quad(uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0,
+                                       uint32_t &y3, uint32_t &y2, uint32_t &y1, uint32_t &y0)
+{
+    uint32_t t0,t1,t2;
+    
+    t0 = ((x2 ^ x0) ^ (x3 ^ x1)) & ((y2 ^ y0) ^ (y3 ^ y1));
+    t1 = ((x2 ^ x0) & (y2 ^ y0)) ^ t0;
+    t2 = ((x3 ^ x1) & (y3 ^ y1)) ^ t0 ^ t1;
+
+    t0 = (x2^x3) & (y2^y3);
+    x3 = (x3 & y3) ^ t0 ^ t1;
+    x2 = (x2 & y2) ^ t0 ^ t2;
+
+    t0 = (x0^x1) & (y0^y1);
+    x1 = (x1 & y1) ^ t0 ^ t1;
+    x0 = (x0 & y0) ^ t0 ^ t2;
+}
+
+__device__ __forceinline__ void G256_inv_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0)
+{
+    uint32_t t0,t1,t2,t3,t4,t5,t6,a,b;
+
+    t3 = x7;
+    t2 = x6;
+    t1 = x5;
+    t0 = x4;
+
+    G16mul_quad(t3, t2, t1, t0, x3, x2, x1, x0);
+
+    a = (x4 ^ x0);
+    t0 ^= a;
+    t2 ^= (x7 ^ x3) ^ (x5 ^ x1); 
+    t1 ^= (x5 ^ x1) ^ a;
+    t3 ^= (x6 ^ x2) ^ a;
+
+    b = t0 ^ t1;
+    t4 = (t2 ^ t3) & b;
+    a = t4 ^ t3 ^ t1;
+    t5 = (t3 & t1) ^ a;
+    t6 = (t2 & t0) ^ a ^ (t2 ^ t0);
+
+    t4 = (t5 ^ t6) & b;
+    t1 = (t6 & t1) ^ t4;
+    t0 = (t5 & t0) ^ t4;
+
+    t4 = (t5 ^ t6) & (t2^t3);
+    t3 = (t6 & t3) ^ t4;
+    t2 = (t5 & t2) ^ t4;
+
+    G16mul_quad(x3, x2, x1, x0, t1, t0, t3, t2);
+
+    G16mul_quad(x7, x6, x5, x4, t1, t0, t3, t2);
+}
+
+__device__ __forceinline__ void transAtoX_quad(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &x4, uint32_t &x5, uint32_t &x6, uint32_t &x7)
+{
+    uint32_t t0, t1;
+    t0 = x0 ^ x1 ^ x2;
+    t1 = x5 ^ x6;
+    x2 = t0 ^ t1 ^ x7;
+    x6 = t0 ^ x3 ^ x6;
+    x3 = x0 ^ x1 ^ x3 ^ x4 ^ x7;    
+    x4 = x0 ^ x4 ^ t1;
+    x2 = t0 ^ t1 ^ x7;
+    x1 = x0 ^ x1 ^ t1;
+    x7 = x0 ^ t1 ^ x7;
+    x5 = x0 ^ t1;
+}
+
+__device__ __forceinline__ void transXtoA_quad(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &x4, uint32_t &x5, uint32_t &x6, uint32_t &x7)
+{
+    uint32_t t0,t2,t3,t5;
+
+    x1 ^= x4;
+    t0 = x1 ^ x6;
+    x1 ^= x5;
+
+    t2 = x0 ^ x2;
+    x2 = x3 ^ x5;
+    t2 ^= x2 ^ x6;
+    x2 ^= x7;
+    t3 = x4 ^ x2 ^ x6;
+
+    t5 = x0 ^ x6;
+    x4 = x3 ^ x7;
+    x0 = x3 ^ x5;
+
+    x6 = t0;    
+    x3 = t2;
+    x7 = t3;    
+    x5 = t5;    
+}
+
+__device__ __forceinline__ void sbox_quad(uint32_t *r)
+{
+    transAtoX_quad(r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7]);
+
+    G256_inv_quad(r[2], r[4], r[1], r[7], r[3], r[0], r[5], r[6]);
+
+    transXtoA_quad(r[7], r[1], r[4], r[2], r[6], r[5], r[0], r[3]);
+    
+    r[0] = ~r[0];
+    r[1] = ~r[1];
+    r[5] = ~r[5];
+    r[6] = ~r[6];
+}
+
+__device__ __forceinline__ void G256_ShiftBytesP_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0)
+{
+    uint32_t t0,t1;
+
+    int tpos = threadIdx.x & 0x03;
+    int shift1 = tpos << 1;
+    int shift2 = shift1+1 + ((tpos == 3)<<2);
+
+    t0 = __byte_perm(x0, 0, 0x1010)>>shift1;
+    t1 = __byte_perm(x0, 0, 0x3232)>>shift2;
+    x0 = __byte_perm(t0, t1, 0x5410);
+
+    t0 = __byte_perm(x1, 0, 0x1010)>>shift1;
+    t1 = __byte_perm(x1, 0, 0x3232)>>shift2;
+    x1 = __byte_perm(t0, t1, 0x5410);
+
+    t0 = __byte_perm(x2, 0, 0x1010)>>shift1;
+    t1 = __byte_perm(x2, 0, 0x3232)>>shift2;
+    x2 = __byte_perm(t0, t1, 0x5410);
+
+    t0 = __byte_perm(x3, 0, 0x1010)>>shift1;
+    t1 = __byte_perm(x3, 0, 0x3232)>>shift2;
+    x3 = __byte_perm(t0, t1, 0x5410);
+
+    t0 = __byte_perm(x4, 0, 0x1010)>>shift1;
+    t1 = __byte_perm(x4, 0, 0x3232)>>shift2;
+    x4 = __byte_perm(t0, t1, 0x5410);
+
+    t0 = __byte_perm(x5, 0, 0x1010)>>shift1;
+    t1 = __byte_perm(x5, 0, 0x3232)>>shift2;
+    x5 = __byte_perm(t0, t1, 0x5410);
+
+    t0 = __byte_perm(x6, 0, 0x1010)>>shift1;
+    t1 = __byte_perm(x6, 0, 0x3232)>>shift2;
+    x6 = __byte_perm(t0, t1, 0x5410);
+
+    t0 = __byte_perm(x7, 0, 0x1010)>>shift1;
+    t1 = __byte_perm(x7, 0, 0x3232)>>shift2;
+    x7 = __byte_perm(t0, t1, 0x5410);
+}
+
+__device__ __forceinline__ void G256_ShiftBytesQ_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0)
+{
+    uint32_t t0,t1;
+
+    int tpos = threadIdx.x & 0x03;
+    int shift1 = (1-(tpos>>1)) + ((tpos & 0x01)<<2);
+    int shift2 = shift1+2 + ((tpos == 1)<<2);
+
+    t0 = __byte_perm(x0, 0, 0x1010)>>shift1;
+    t1 = __byte_perm(x0, 0, 0x3232)>>shift2;
+    x0 = __byte_perm(t0, t1, 0x5410);
+
+    t0 = __byte_perm(x1, 0, 0x1010)>>shift1;
+    t1 = __byte_perm(x1, 0, 0x3232)>>shift2;
+    x1 = __byte_perm(t0, t1, 0x5410);
+
+    t0 = __byte_perm(x2, 0, 0x1010)>>shift1;
+    t1 = __byte_perm(x2, 0, 0x3232)>>shift2;
+    x2 = __byte_perm(t0, t1, 0x5410);
+
+    t0 = __byte_perm(x3, 0, 0x1010)>>shift1;
+    t1 = __byte_perm(x3, 0, 0x3232)>>shift2;
+    x3 = __byte_perm(t0, t1, 0x5410);
+
+    t0 = __byte_perm(x4, 0, 0x1010)>>shift1;
+    t1 = __byte_perm(x4, 0, 0x3232)>>shift2;
+    x4 = __byte_perm(t0, t1, 0x5410);
+
+    t0 = __byte_perm(x5, 0, 0x1010)>>shift1;
+    t1 = __byte_perm(x5, 0, 0x3232)>>shift2;
+    x5 = __byte_perm(t0, t1, 0x5410);
+
+    t0 = __byte_perm(x6, 0, 0x1010)>>shift1;
+    t1 = __byte_perm(x6, 0, 0x3232)>>shift2;
+    x6 = __byte_perm(t0, t1, 0x5410);
+
+    t0 = __byte_perm(x7, 0, 0x1010)>>shift1;
+    t1 = __byte_perm(x7, 0, 0x3232)>>shift2;
+    x7 = __byte_perm(t0, t1, 0x5410);
+}
+
+__device__ __forceinline__ void G256_MixFunction_quad(uint32_t *r)
+{
+#define SHIFT64_16(hi, lo)    __byte_perm(lo, hi, 0x5432)
+#define A(v, u)             __shfl((int)r[v], ((threadIdx.x+u)&0x03), 4)
+#define S(idx, l)            SHIFT64_16( A(idx, (l+1)), A(idx, l) )
+
+#define DOUBLE_ODD(i, bc)        ( S(i, (bc)) ^ A(i, (bc) + 1) )
+#define DOUBLE_EVEN(i, bc)        ( S(i, (bc)) ^ A(i, (bc)    ) )
+
+#define SINGLE_ODD(i, bc)        ( S(i, (bc)) )
+#define SINGLE_EVEN(i, bc)        ( A(i, (bc)) )
+    uint32_t b[8];
+
+#pragma unroll 8
+    for(int i=0;i<8;i++)
+        b[i] = DOUBLE_ODD(i, 1) ^ DOUBLE_EVEN(i, 3);
+
+    G256_Mul2(b);
+#pragma unroll 8
+    for(int i=0;i<8;i++)
+        b[i] = b[i] ^ DOUBLE_ODD(i, 3) ^ DOUBLE_ODD(i, 4) ^ SINGLE_ODD(i, 6);
+
+    G256_Mul2(b);
+#pragma unroll 8
+    for(int i=0;i<8;i++)
+        r[i] = b[i] ^ DOUBLE_EVEN(i, 2) ^ DOUBLE_EVEN(i, 3) ^ SINGLE_EVEN(i, 5);
+
+#undef S
+#undef A
+#undef SHIFT64_16
+#undef t
+#undef X
+}
+
+__device__ __forceinline__ void groestl512_perm_P_quad(uint32_t *r)
+{
+    for(int round=0;round<14;round++)
+    {
+        G256_AddRoundConstantP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], round);
+        sbox_quad(r);
+        G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]);
+        G256_MixFunction_quad(r);
+    }
+}
+
+__device__ __forceinline__ void groestl512_perm_Q_quad(uint32_t *r)
+{    
+    for(int round=0;round<14;round++)
+    {
+        G256_AddRoundConstantQ_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], round);
+        sbox_quad(r);
+        G256_ShiftBytesQ_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]);
+        G256_MixFunction_quad(r);
+    }
+}
+
+__device__ __forceinline__ void groestl512_progressMessage_quad(uint32_t *state, uint32_t *message)
+{
+#pragma unroll 8
+    for(int u=0;u<8;u++) state[u] = message[u];
+
+    if ((threadIdx.x & 0x03) == 3) state[ 1] ^= 0x00008000;
+    groestl512_perm_P_quad(state);
+    if ((threadIdx.x & 0x03) == 3) state[ 1] ^= 0x00008000;
+    groestl512_perm_Q_quad(message);
+#pragma unroll 8
+    for(int u=0;u<8;u++) state[u] ^= message[u];
+#pragma unroll 8
+    for(int u=0;u<8;u++) message[u] = state[u];
+    groestl512_perm_P_quad(message);
+#pragma unroll 8
+    for(int u=0;u<8;u++) state[u] ^= message[u];
+}
diff --git a/groestlcoin.cpp b/groestlcoin.cpp
index c8b785056e..dc3b0feabe 100644
--- a/groestlcoin.cpp
+++ b/groestlcoin.cpp
@@ -15,163 +15,118 @@
 
 void sha256func(unsigned char *hash, const unsigned char *data, int len)
 {
-	uint32_t S[16], T[16];
-	int i, r;
-
-	sha256_init(S);
-	for (r = len; r > -9; r -= 64) {
-		if (r < 64)
-			memset(T, 0, 64);
-		memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r));
-		if (r >= 0 && r < 64)
-			((unsigned char *)T)[r] = 0x80;
-		for (i = 0; i < 16; i++)
-			T[i] = be32dec(T + i);
-		if (r < 56)
-			T[15] = 8 * len;
-		sha256_transform(S, T, 0);
-	}
-	/*
-	memcpy(S + 8, sha256d_hash1 + 8, 32);
-	sha256_init(T);
-	sha256_transform(T, S, 0);
-	*/
-	for (i = 0; i < 8; i++)
-		be32enc((uint32_t *)hash + i, T[i]);
+    uint32_t S[16], T[16];
+    int i, r;
+
+    sha256_init(S);
+    for (r = len; r > -9; r -= 64) {
+        if (r < 64)
+            memset(T, 0, 64);
+        memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r));
+        if (r >= 0 && r < 64)
+            ((unsigned char *)T)[r] = 0x80;
+        for (i = 0; i < 16; i++)
+            T[i] = be32dec(T + i);
+        if (r < 56)
+            T[15] = 8 * len;
+        sha256_transform(S, T, 0);
+    }
+    /*
+    memcpy(S + 8, sha256d_hash1 + 8, 32);
+    sha256_init(T);
+    sha256_transform(T, S, 0);
+    */
+    for (i = 0; i < 8; i++)
+        be32enc((uint32_t *)hash + i, T[i]);
 }
 
 static void groestlhash(void *state, const void *input)
 {
-	// Tryout GPU-groestl
+    // Tryout GPU-groestl
 
     sph_groestl512_context     ctx_groestl[2];
     static unsigned char pblank[1];
-	int ii;
     uint32_t mask = 8;
     uint32_t zero = 0;
 
 
-	//these uint512 in the c++ source of the client are backed by an array of uint32
-    uint32_t hashA[16], hashB[16];	
+    //these uint512 in the c++ source of the client are backed by an array of uint32
+    uint32_t hashA[16], hashB[16];    
 
 
     sph_groestl512_init(&ctx_groestl[0]);
     sph_groestl512 (&ctx_groestl[0], input, 80); //6
-    sph_groestl512_close(&ctx_groestl[0], hashA); //7	
+    sph_groestl512_close(&ctx_groestl[0], hashA); //7    
 
-	sph_groestl512_init(&ctx_groestl[1]);
-	sph_groestl512 (&ctx_groestl[1], hashA, 64); //6
+    sph_groestl512_init(&ctx_groestl[1]);
+    sph_groestl512 (&ctx_groestl[1], hashA, 64); //6
     sph_groestl512_close(&ctx_groestl[1], hashB); //7
 
-	memcpy(state, hashB, 32);
+    memcpy(state, hashB, 32);
 }
 
-
+extern bool opt_benchmark;
 
 extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
-	uint32_t max_nonce, unsigned long *hashes_done)
-{	
-	uint32_t start_nonce = pdata[19]++;
-	const uint32_t Htarg = ptarget[7];
-	const uint32_t throughPut = 4096 * 128;
-	//const uint32_t throughPut = 1;
-	int i;
-	uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t));
-
-	// init
-	static bool init[8] = { false, false, false, false, false, false, false, false };
-	if(!init[thr_id])
-	{
-		groestlcoin_cpu_init(thr_id, throughPut);
-		init[thr_id] = true;
-	}
-	
-	// Endian Drehung ist notwendig
-	//char testdata[] = {"\x70\x00\x00\x00\x5d\x38\x5b\xa1\x14\xd0\x79\x97\x0b\x29\xa9\x41\x8f\xd0\x54\x9e\x7d\x68\xa9\x5c\x7f\x16\x86\x21\xa3\x14\x20\x10\x00\x00\x00\x00\x57\x85\x86\xd1\x49\xfd\x07\xb2\x2f\x3a\x8a\x34\x7c\x51\x6d\xe7\x05\x2f\x03\x4d\x2b\x76\xff\x68\xe0\xd6\xec\xff\x9b\x77\xa4\x54\x89\xe3\xfd\x51\x17\x32\x01\x1d\xf0\x73\x10\x00"};
-	//pdata = (uint32_t*)testdata;
-	uint32_t endiandata[32];
-	for (int kk=0; kk < 32; kk++)
-		be32enc(&endiandata[kk], pdata[kk]);
-
-	// Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt)
-	groestlcoin_cpu_setBlock(thr_id, endiandata, (void*)ptarget);
-	
-	do {
-		// GPU
-		uint32_t foundNounce = 0xFFFFFFFF;
-
-		groestlcoin_cpu_hash(thr_id, throughPut, pdata[19], outputHash, &foundNounce);
-
-		/*
-		{
-			for(i=0;i<throughPut;i++)
-			{
-				uint32_t tmpHash[8];
-				endiandata[19] = SWAP32(pdata[19]);
-				groestlhash(tmpHash, endiandata);
-				
-				int ii;
-				printf("result GPU: ");
-				for (ii=0; ii < 32; ii++)
-				{
-					printf ("%.2x",((uint8_t*)&outputHash[8*i])[ii]);
-				};
-				printf ("\n");	
-		
-
-				groestlhash(tmpHash, endiandata);
-				printf("result CPU: ");
-				for (ii=0; ii < 32; ii++)
-				{
-					printf ("%.2x",((uint8_t*)tmpHash)[ii]);
-				};
-				
-				
-			}
-			exit(0);
-		}		
-		*/
-		if(foundNounce < 0xffffffff)
-		{
-			uint32_t tmpHash[8];
-			endiandata[19] = SWAP32(foundNounce);
-			groestlhash(tmpHash, endiandata);
-			if (tmpHash[7] <= Htarg && 
-					fulltest(tmpHash, ptarget)) {
-						pdata[19] = foundNounce;
-						*hashes_done = foundNounce - start_nonce;
-						free(outputHash);
-				return true;
-			} else {
-				applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNounce);
-			}
-
-			foundNounce = 0xffffffff;
-			/*
-			int ii;
-			printf("result GPU: ");
-			for (ii=0; ii < 32; ii++)
-			{
-				printf ("%.2x",((uint8_t*)&outputHash[0])[ii]);
-			};
-			printf ("\n");	
-			printf("result CPU: ");
-			for (ii=0; ii < 32; ii++)
-			{
-				printf ("%.2x",((uint8_t*)tmpHash)[ii]);
-			};
-			printf ("\n");	
-			*/
-		}
-
-		if (pdata[19] + throughPut < pdata[19])
-			pdata[19] = max_nonce;
-		else pdata[19] += throughPut;
-
-	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
-	
-	*hashes_done = pdata[19] - start_nonce;
-	free(outputHash);
-	return 0;
+    uint32_t max_nonce, unsigned long *hashes_done)
+{    
+    if (opt_benchmark)
+        ((uint32_t*)ptarget)[7] = 0x000000ff;
+
+    uint32_t start_nonce = pdata[19]++;
+    const uint32_t Htarg = ptarget[7];
+    const uint32_t throughPut = 4096 * 128;
+    //const uint32_t throughPut = 1;
+    uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t));
+
+    // init
+    static bool init[8] = { false, false, false, false, false, false, false, false };
+    if(!init[thr_id])
+    {
+        groestlcoin_cpu_init(thr_id, throughPut);
+        init[thr_id] = true;
+    }
+    
+    // Endian Drehung ist notwendig
+    uint32_t endiandata[32];
+    for (int kk=0; kk < 32; kk++)
+        be32enc(&endiandata[kk], pdata[kk]);
+
+    // Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt)
+    groestlcoin_cpu_setBlock(thr_id, endiandata, (void*)ptarget);
+    
+    do {
+        // GPU
+        uint32_t foundNounce = 0xFFFFFFFF;
+
+        groestlcoin_cpu_hash(thr_id, throughPut, pdata[19], outputHash, &foundNounce);
+
+        if(foundNounce < 0xffffffff)
+        {
+            uint32_t tmpHash[8];
+            endiandata[19] = SWAP32(foundNounce);
+            groestlhash(tmpHash, endiandata);
+            if (tmpHash[7] <= Htarg && 
+                    fulltest(tmpHash, ptarget)) {
+                        pdata[19] = foundNounce;
+                        *hashes_done = foundNounce - start_nonce;
+                        free(outputHash);
+                return true;
+            } else {
+                applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNounce);
+            }
+
+            foundNounce = 0xffffffff;
+        }
+
+        if (pdata[19] + throughPut < pdata[19])
+            pdata[19] = max_nonce;
+        else pdata[19] += throughPut;
+
+    } while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+    
+    *hashes_done = pdata[19] - start_nonce;
+    free(outputHash);
+    return 0;
 }
 
diff --git a/heavy/cuda_blake512.cu b/heavy/cuda_blake512.cu
index 9e394fd85d..dfa4240547 100644
--- a/heavy/cuda_blake512.cu
+++ b/heavy/cuda_blake512.cu
@@ -269,8 +269,6 @@ __host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 
-//	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
-
 	if (BLOCKSIZE == 80)
 		blake512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
 	else if (BLOCKSIZE == 84)
diff --git a/heavy/cuda_combine.cu b/heavy/cuda_combine.cu
index c9036f3515..e2a8b721a2 100644
--- a/heavy/cuda_combine.cu
+++ b/heavy/cuda_combine.cu
@@ -141,8 +141,6 @@ void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *h
 	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 
-//	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
-
 	combine_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], d_nonceVector[thr_id]);
 
 	// da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden
diff --git a/heavy/cuda_groestl512.cu b/heavy/cuda_groestl512.cu
index bf86105a51..5b1b26745a 100644
--- a/heavy/cuda_groestl512.cu
+++ b/heavy/cuda_groestl512.cu
@@ -824,8 +824,6 @@ __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 
-//	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
-
 	if (BLOCKSIZE == 84)
 		groestl512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);	
 	else if (BLOCKSIZE == 80)
diff --git a/heavy/cuda_hefty1.cu b/heavy/cuda_hefty1.cu
index db5fca3321..4b60818c15 100644
--- a/heavy/cuda_hefty1.cu
+++ b/heavy/cuda_hefty1.cu
@@ -416,8 +416,6 @@ __host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
     size_t shared_size = 0;
 #endif
 
-//    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
-
     hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]);
 
     // Strategisches Sleep Kommando zur Senkung der CPU Last
diff --git a/heavy/cuda_keccak512.cu b/heavy/cuda_keccak512.cu
index 958579392c..eb69e3bc3b 100644
--- a/heavy/cuda_keccak512.cu
+++ b/heavy/cuda_keccak512.cu
@@ -279,7 +279,6 @@ __host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 
-//	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 	if (BLOCKSIZE==84)
 		keccak512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
 	else if (BLOCKSIZE==80)
diff --git a/heavy/cuda_sha256.cu b/heavy/cuda_sha256.cu
index 404a2a29b1..f520778b51 100644
--- a/heavy/cuda_sha256.cu
+++ b/heavy/cuda_sha256.cu
@@ -271,7 +271,6 @@ __host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce)
 	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 
-//	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 	if (BLOCKSIZE == 84)
 		sha256_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
 	else if (BLOCKSIZE == 80) {
diff --git a/myriadgroestl.cpp b/myriadgroestl.cpp
index 6ccecce599..45d9745102 100644
--- a/myriadgroestl.cpp
+++ b/myriadgroestl.cpp
@@ -35,17 +35,19 @@ static void myriadhash(void *state, const void *input)
     memcpy(state, hashB, 32);
 }
 
-
+extern bool opt_benchmark;
 
 extern "C" int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
 	uint32_t max_nonce, unsigned long *hashes_done)
 {	
-	uint32_t start_nonce = pdata[19]++;	
+    if (opt_benchmark)
+        ((uint32_t*)ptarget)[7] = 0x000000ff;
+
+	uint32_t start_nonce = pdata[19]++;
 	const uint32_t throughPut = 128 * 1024;
-//	const uint32_t throughPut = 1;
+
 	uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t));
 
-	// TODO: entfernen für eine Release! Ist nur zum Testen!
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;
 
diff --git a/quark/animecoin.cu b/quark/animecoin.cu
index fa771aa9e3..6d395be0d9 100644
--- a/quark/animecoin.cu
+++ b/quark/animecoin.cu
@@ -175,7 +175,6 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
 {
 	const uint32_t first_nonce = pdata[19];
 
-	// TODO: entfernen für eine Release! Ist nur zum Testen!
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x00000f;
 
diff --git a/quark/cuda_bmw512.cu b/quark/cuda_bmw512.cu
index 7c706f228c..760d028883 100644
--- a/quark/cuda_bmw512.cu
+++ b/quark/cuda_bmw512.cu
@@ -447,8 +447,6 @@ __host__ void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNo
     // Größe des dynamischen Shared Memory Bereichs
     size_t shared_size = 0;
 
-//    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
-
     quark_bmw512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
     MyStreamSynchronize(NULL, order, thr_id);
 }
@@ -464,8 +462,6 @@ __host__ void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNo
     // Größe des dynamischen Shared Memory Bereichs
     size_t shared_size = 0;
 
-//    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
-
     quark_bmw512_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash);
     MyStreamSynchronize(NULL, order, thr_id);
 }
diff --git a/quark/cuda_jh512.cu b/quark/cuda_jh512.cu
index 779af44f67..c55ac1a977 100644
--- a/quark/cuda_jh512.cu
+++ b/quark/cuda_jh512.cu
@@ -350,8 +350,6 @@ __host__ void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNou
     // Größe des dynamischen Shared Memory Bereichs
     size_t shared_size = 0;
 
-//    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
-
     quark_jh512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
     MyStreamSynchronize(NULL, order, thr_id);
 }
diff --git a/quark/cuda_quark_blake512.cu b/quark/cuda_quark_blake512.cu
index 80e500eea3..eabef09fc2 100644
--- a/quark/cuda_quark_blake512.cu
+++ b/quark/cuda_quark_blake512.cu
@@ -406,8 +406,6 @@ __host__ void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t start
 	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 
-//	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
-
 	quark_blake512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_nonceVector, (uint64_t*)d_outputHash);
 
 	// Strategisches Sleep Kommando zur Senkung der CPU Last
@@ -425,8 +423,6 @@ __host__ void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t start
 	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 
-//	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
-
 	quark_blake512_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);
 
 	// Strategisches Sleep Kommando zur Senkung der CPU Last
diff --git a/quark/cuda_quark_checkhash.cu b/quark/cuda_quark_checkhash.cu
index c4052f2336..8502bf4336 100644
--- a/quark/cuda_quark_checkhash.cu
+++ b/quark/cuda_quark_checkhash.cu
@@ -89,8 +89,6 @@ __host__ uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t star
 	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 
-//	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
-
 	quark_check_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
 
 	// Strategisches Sleep Kommando zur Senkung der CPU Last
diff --git a/quark/cuda_quark_groestl512.cu b/quark/cuda_quark_groestl512.cu
index 3d71853208..dbe23c8f9f 100644
--- a/quark/cuda_quark_groestl512.cu
+++ b/quark/cuda_quark_groestl512.cu
@@ -1,4 +1,4 @@
-// Auf QuarkCoin spezialisierte Version von Groestl
+// Auf QuarkCoin spezialisierte Version von Groestl inkl. Bitslice
 
 #include <cuda.h>
 #include "cuda_runtime.h"
@@ -7,9 +7,6 @@
 #include <stdio.h>
 #include <memory.h>
 
-// it's unfortunate that this is a compile time constant.
-#define MAXWELL_OR_FERMI 1
-
 // aus cpu-miner.c
 extern int device_map[8];
 
@@ -18,353 +15,137 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
 
 // Folgende Definitionen später durch header ersetzen
 typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
 typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
 
 // diese Struktur wird in der Init Funktion angefordert
 static cudaDeviceProp props[8];
 
-#define SPH_C32(x)    ((uint32_t)(x ## U))
-#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
-
-#define PC32up(j, r)   ((uint32_t)((j) + (r)))
-#define PC32dn(j, r)   0
-#define QC32up(j, r)   0xFFFFFFFF
-#define QC32dn(j, r)   (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
-
-#define B32_0(x)    __byte_perm(x, 0, 0x4440)
-//((x) & 0xFF)
-#define B32_1(x)    __byte_perm(x, 0, 0x4441)
-//(((x) >> 8) & 0xFF)
-#define B32_2(x)    __byte_perm(x, 0, 0x4442)
-//(((x) >> 16) & 0xFF)
-#define B32_3(x)    __byte_perm(x, 0, 0x4443)
-//((x) >> 24)
-
-#if MAXWELL_OR_FERMI
-#define USE_SHARED 1
-// Maxwell and Fermi cards get the best speed with SHARED access it seems.
-#if USE_SHARED
-#define T0up(x) (*((uint32_t*)mixtabs + (    (x))))
-#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x))))
-#define T1up(x) (*((uint32_t*)mixtabs + (512+(x))))
-#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
-#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x))))
-#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
-#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
-#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x))))
-#else
-#define T0up(x) tex1Dfetch(t0up1, x)
-#define T0dn(x) tex1Dfetch(t0dn1, x)
-#define T1up(x) tex1Dfetch(t1up1, x)
-#define T1dn(x) tex1Dfetch(t1dn1, x)
-#define T2up(x) tex1Dfetch(t2up1, x)
-#define T2dn(x) tex1Dfetch(t2dn1, x)
-#define T3up(x) tex1Dfetch(t3up1, x)
-#define T3dn(x) tex1Dfetch(t3dn1, x)
-#endif
-#else
-#define USE_SHARED 1
-// a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5!
-#define T0up(x) (*((uint32_t*)mixtabs + (    (x))))
-#define T0dn(x) tex1Dfetch(t0dn1, x)
-#define T1up(x) tex1Dfetch(t1up1, x)
-#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
-#define T2up(x) tex1Dfetch(t2up1, x)
-#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
-#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
-#define T3dn(x) tex1Dfetch(t3dn1, x)
-#endif
-
-texture<unsigned int, 1, cudaReadModeElementType> t0up1;
-texture<unsigned int, 1, cudaReadModeElementType> t0dn1;
-texture<unsigned int, 1, cudaReadModeElementType> t1up1;
-texture<unsigned int, 1, cudaReadModeElementType> t1dn1;
-texture<unsigned int, 1, cudaReadModeElementType> t2up1;
-texture<unsigned int, 1, cudaReadModeElementType> t2dn1;
-texture<unsigned int, 1, cudaReadModeElementType> t3up1;
-texture<unsigned int, 1, cudaReadModeElementType> t3dn1;
-
-extern uint32_t T0up_cpu[];
-extern uint32_t T0dn_cpu[];
-extern uint32_t T1up_cpu[];
-extern uint32_t T1dn_cpu[];
-extern uint32_t T2up_cpu[];
-extern uint32_t T2dn_cpu[];
-extern uint32_t T3up_cpu[];
-extern uint32_t T3dn_cpu[];
-
-__device__ __forceinline__ void quark_groestl512_perm_P(uint32_t *a, char *mixtabs)
-{
-    uint32_t t[32];
+// 64 Register Variante für Compute 3.0
+#include "groestl_functions_quad.cu"
+#include "bitslice_transformations_quad.cu"
 
-//#pragma unroll 14
-    for(int r=0;r<14;r++)
+__global__ void __launch_bounds__(256, 4)
+ quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
+{
+    // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
+    int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
+    if (thread < threads)
     {
-        switch(r)
-        {
-            case 0:
-#pragma unroll 16
-                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 0); break;
-            case 1:
-#pragma unroll 16
-                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 1); break;
-            case 2:
-#pragma unroll 16
-                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 2); break;
-            case 3:
-#pragma unroll 16
-                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 3); break;
-            case 4:
-#pragma unroll 16
-                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 4); break;
-            case 5:
-#pragma unroll 16
-                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 5); break;
-            case 6:
-#pragma unroll 16
-                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 6); break;
-            case 7:
-#pragma unroll 16
-                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 7); break;
-            case 8:
-#pragma unroll 16
-                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 8); break;
-            case 9:
-#pragma unroll 16
-                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 9); break;
-            case 10:
-#pragma unroll 16
-                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 10); break;
-            case 11:
-#pragma unroll 16
-                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 11); break;
-            case 12:
-#pragma unroll 16
-                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 12); break;
-            case 13:
-#pragma unroll 16
-                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 13); break;
-        }
+        // GROESTL
+        uint32_t message[8];
+        uint32_t state[8];
 
-        // RBTT
-#pragma unroll 16
-        for(int k=0;k<32;k+=2)
-        {
-            uint32_t t0_0 = B32_0(a[(k     ) & 0x1f]), t9_0  = B32_0(a[(k +  9) & 0x1f]);
-            uint32_t t2_1 = B32_1(a[(k +  2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]);
-            uint32_t t4_2 = B32_2(a[(k +  4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]);
-            uint32_t t6_3 = B32_3(a[(k +  6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]);
-	    
-            t[k + 0] =  T0up( t0_0 ) ^ T1up(  t2_1 ) ^ T2up(  t4_2 ) ^ T3up(  t6_3 ) ^ 
-                        T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 );
-
-            t[k + 1] =  T0dn( t0_0 ) ^ T1dn(  t2_1 ) ^ T2dn(  t4_2 ) ^ T3dn(  t6_3 ) ^ 
-                        T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 );
-        }
-#pragma unroll 32
-        for(int k=0;k<32;k++)
-            a[k] = t[k];
-    }
-}
+        uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
 
-__device__ __forceinline__ void quark_groestl512_perm_Q(uint32_t *a, char *mixtabs)
-{    
-//#pragma unroll 14
-    for(int r=0;r<14;r++)
-    {
-        uint32_t t[32];
+        int hashPosition = nounce - startNounce;
+        uint32_t *inpHash = &g_hash[hashPosition<<4];
 
-        switch(r)
-        {
-            case 0:
-    #pragma unroll 16
-                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 0); a[(k*2)+1] ^= QC32dn(k<< 4, 0);} break;
-            case 1:
-    #pragma unroll 16
-                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 1); a[(k*2)+1] ^= QC32dn(k<< 4, 1);} break;
-            case 2:
-    #pragma unroll 16
-                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 2); a[(k*2)+1] ^= QC32dn(k<< 4, 2);} break;
-            case 3:
-    #pragma unroll 16
-                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 3); a[(k*2)+1] ^= QC32dn(k<< 4, 3);} break;
-            case 4:
-    #pragma unroll 16
-                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 4); a[(k*2)+1] ^= QC32dn(k<< 4, 4);} break;
-            case 5:
-    #pragma unroll 16
-                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 5); a[(k*2)+1] ^= QC32dn(k<< 4, 5);} break;
-            case 6:
-    #pragma unroll 16
-                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 6); a[(k*2)+1] ^= QC32dn(k<< 4, 6);} break;
-            case 7:
-    #pragma unroll 16
-                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 7); a[(k*2)+1] ^= QC32dn(k<< 4, 7);} break;
-            case 8:
-    #pragma unroll 16
-                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 8); a[(k*2)+1] ^= QC32dn(k<< 4, 8);} break;
-            case 9:
-    #pragma unroll 16
-                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 9); a[(k*2)+1] ^= QC32dn(k<< 4, 9);} break;
-            case 10:
-    #pragma unroll 16
-                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 10); a[(k*2)+1] ^= QC32dn(k<< 4, 10);} break;
-            case 11:
-    #pragma unroll 16
-                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 11); a[(k*2)+1] ^= QC32dn(k<< 4, 11);} break;
-            case 12:
-    #pragma unroll 16
-                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 12); a[(k*2)+1] ^= QC32dn(k<< 4, 12);} break;
-            case 13:
-    #pragma unroll 16
-                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 13); a[(k*2)+1] ^= QC32dn(k<< 4, 13);} break;
-        }
+#pragma unroll 4
+        for(int k=0;k<4;k++) message[k] = inpHash[(k<<2) + (threadIdx.x&0x03)];
+#pragma unroll 4
+        for(int k=4;k<8;k++) message[k] = 0;
 
-        // RBTT
-#pragma unroll 16
-        for(int k=0;k<32;k+=2)
+        if ((threadIdx.x&0x03) == 0) message[4] = 0x80;
+        if ((threadIdx.x&0x03) == 3) message[7] = 0x01000000;
+
+        uint32_t msgBitsliced[8];
+        to_bitslice_quad(message, msgBitsliced);
+
+        groestl512_progressMessage_quad(state, msgBitsliced);
+
+        // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash
+        uint32_t *outpHash = &g_hash[hashPosition<<4];
+        uint32_t hash[16];
+        from_bitslice_quad(state, hash);
+
+        if ((threadIdx.x & 0x03) == 0)
         {
-            uint32_t t2_0  = B32_0(a[(k +  2) & 0x1f]), t1_0  = B32_0(a[(k +  1) & 0x1f]);
-            uint32_t t6_1  = B32_1(a[(k +  6) & 0x1f]), t5_1  = B32_1(a[(k +  5) & 0x1f]);
-            uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2  = B32_2(a[(k +  9) & 0x1f]);
-            uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]);
-	    
-            t[k + 0] =  T0up( t2_0 ) ^ T1up( t6_1 ) ^ T2up( t10_2 ) ^ T3up( t22_3 ) ^ 
-                        T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn(  t9_2 ) ^ T3dn( t13_3 );
-
-            t[k + 1] =  T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^ 
-                        T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up(  t9_2 ) ^ T3up( t13_3 );
+#pragma unroll 16
+            for(int k=0;k<16;k++) outpHash[k] = hash[k];
         }
-#pragma unroll 32
-        for(int k=0;k<32;k++)
-            a[k] = t[k];
     }
 }
-__global__ void  quark_groestl512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
-{
-#if USE_SHARED
-    extern __shared__ char mixtabs[];
-
-    if (threadIdx.x < 256)
-    {
-        *((uint32_t*)mixtabs + (    threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x);
-        *((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x);
-        *((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x);
-        *((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x);
-        *((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x);
-        *((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x);
-        *((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x);
-        *((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x);
-    }
-
-    __syncthreads();
-#endif
 
-    int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+__global__ void __launch_bounds__(256, 4)
+ quark_doublegroestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
+{
+    int thread = (blockDim.x * blockIdx.x + threadIdx.x)>>2;
     if (thread < threads)
     {
         // GROESTL
-        uint32_t message[32];
-        uint32_t state[32];
+        uint32_t message[8];
+        uint32_t state[8];
 
         uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
 
         int hashPosition = nounce - startNounce;
-        uint32_t *inpHash = &g_hash[16 * hashPosition];
+        uint32_t *inpHash = &g_hash[hashPosition<<4];
 
-#pragma unroll 16
-        for(int k=0;k<16;k++) message[k] = inpHash[k];
-#pragma unroll 14
-        for(int k=1;k<15;k++)
-            message[k+16] = 0;
-
-        message[16] = 0x80;
-        message[31] = 0x01000000;
-
-#pragma unroll 32
-        for(int u=0;u<32;u++) state[u] = message[u];
-        state[31] ^= 0x20000;
-
-        // Perm
-#if USE_SHARED
-        quark_groestl512_perm_P(state, mixtabs);
-        state[31] ^= 0x20000;
-        quark_groestl512_perm_Q(message, mixtabs);
-#else
-        quark_groestl512_perm_P(state, NULL);
-        state[31] ^= 0x20000;
-        quark_groestl512_perm_Q(message, NULL);
-#endif
-#pragma unroll 32
-        for(int u=0;u<32;u++) state[u] ^= message[u];
-
-#pragma unroll 32
-        for(int u=0;u<32;u++) message[u] = state[u];
-
-#if USE_SHARED
-        quark_groestl512_perm_P(message, mixtabs);
-#else
-        quark_groestl512_perm_P(message, NULL);
-#endif
-
-#pragma unroll 32
-        for(int u=0;u<32;u++) state[u] ^= message[u];
-        // Erzeugten Hash rausschreiben
-
-        uint32_t *outpHash = &g_hash[16 * hashPosition];
+#pragma unroll 4
+        for(int k=0;k<4;k++) message[k] = inpHash[(k<<2)+(threadIdx.x&0x03)];
+#pragma unroll 4
+        for(int k=4;k<8;k++) message[k] = 0;
+
+        if ((threadIdx.x&0x03) == 0) message[4] = 0x80;
+        if ((threadIdx.x&0x03) == 3) message[7] = 0x01000000;
+
+        uint32_t msgBitsliced[8];
+        to_bitslice_quad(message, msgBitsliced);
+
+        for (int round=0; round<2; round++)
+        {
+            groestl512_progressMessage_quad(state, msgBitsliced);
+
+            if (round < 1)
+            {
+                // Verkettung zweier Runden inclusive Padding.
+                msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + (((threadIdx.x%4)==3)<<13));
+                msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341);
+                msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341);
+                msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341);
+                msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341);
+                msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341);
+                msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341);
+                msgBitsliced[ 7] = __byte_perm(state[ 7], 0x00800100, 0x4341 + (((threadIdx.x%4)==0)<<4));
+            }
+        }
 
+        // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash
+        uint32_t *outpHash = &g_hash[hashPosition<<4];
+        uint32_t hash[16];
+        from_bitslice_quad(state, hash);
+
+        if ((threadIdx.x & 0x03) == 0)
+        {
 #pragma unroll 16
-        for(int k=0;k<16;k++) outpHash[k] = state[k+16];
+            for(int k=0;k<16;k++) outpHash[k] = hash[k];
+        }
     }
 }
 
-#define texDef(texname, texmem, texsource, texsize) \
-    unsigned int *texmem; \
-    cudaMalloc(&texmem, texsize); \
-    cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
-    texname.normalized = 0; \
-    texname.filterMode = cudaFilterModePoint; \
-    texname.addressMode[0] = cudaAddressModeClamp; \
-    { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
-      cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
-
 // Setup-Funktionen
 __host__ void quark_groestl512_cpu_init(int thr_id, int threads)
 {
     cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
-
-// Texturen mit obigem Makro initialisieren
-    texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
-    texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
-    texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
-    texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
-    texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
-    texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
-    texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
-    texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
 }
 
 __host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
-    // Compute 3.5 und 5.x Geräte am besten mit 768 Threads ansteuern,
-    // alle anderen mit 512 Threads.
-    int threadsperblock = ((props[thr_id].major == 3 && props[thr_id].minor == 5) || props[thr_id].major > 3) ? 768 : 512;
+    int threadsperblock = 256;
+
+    // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
+    // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
+    const int factor = 4;
 
     // berechne wie viele Thread Blocks wir brauchen
-    dim3 grid((threads + threadsperblock-1)/threadsperblock);
+    dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
     dim3 block(threadsperblock);
 
     // Größe des dynamischen Shared Memory Bereichs
-#if USE_SHARED
-    size_t shared_size = 8 * 256 * sizeof(uint32_t);
-#else
     size_t shared_size = 0;
-#endif
 
-//    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
-    //fprintf(stderr, "ThrID: %d\n", thr_id);
-    quark_groestl512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
+    quark_groestl512_gpu_hash_64_quad<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
 
     // Strategisches Sleep Kommando zur Senkung der CPU Last
     MyStreamSynchronize(NULL, order, thr_id);
@@ -372,25 +153,20 @@ __host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t sta
 
 __host__ void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
-    // Compute 3.5 und 5.x Geräte am besten mit 768 Threads ansteuern,
-    // alle anderen mit 512 Threads.
-    int threadsperblock = ((props[thr_id].major == 3 && props[thr_id].minor == 5) || props[thr_id].major > 3) ? 768 : 512;
+    int threadsperblock = 256;
+
+    // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
+    // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
+    const int factor = 4;
 
     // berechne wie viele Thread Blocks wir brauchen
-    dim3 grid((threads + threadsperblock-1)/threadsperblock);
+    dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
     dim3 block(threadsperblock);
 
     // Größe des dynamischen Shared Memory Bereichs
-#if USE_SHARED
-    size_t shared_size = 8 * 256 * sizeof(uint32_t);
-#else
     size_t shared_size = 0;
-#endif
 
-//  fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
-    //fprintf(stderr, "ThrID: %d\n", thr_id);
-    quark_groestl512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
-    quark_groestl512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
+    quark_doublegroestl512_gpu_hash_64_quad<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
 
     // Strategisches Sleep Kommando zur Senkung der CPU Last
     MyStreamSynchronize(NULL, order, thr_id);
diff --git a/quark/cuda_quark_keccak512.cu b/quark/cuda_quark_keccak512.cu
index c55a7a1289..775d5e2ee4 100644
--- a/quark/cuda_quark_keccak512.cu
+++ b/quark/cuda_quark_keccak512.cu
@@ -175,8 +175,6 @@ __host__ void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t star
     // Größe des dynamischen Shared Memory Bereichs
     size_t shared_size = 0;
 
-//    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
-
     quark_keccak512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
     MyStreamSynchronize(NULL, order, thr_id);
 }
diff --git a/quark/cuda_skein512.cu b/quark/cuda_skein512.cu
index dc4030a607..d32e9c0bbd 100644
--- a/quark/cuda_skein512.cu
+++ b/quark/cuda_skein512.cu
@@ -442,7 +442,6 @@ __host__ void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t start
 	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 
-//	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 	quark_skein512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
 
 	// Strategisches Sleep Kommando zur Senkung der CPU Last
diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu
index acfe731946..0bc46d9fc0 100644
--- a/quark/quarkcoin.cu
+++ b/quark/quarkcoin.cu
@@ -157,7 +157,6 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
 {
 	const uint32_t first_nonce = pdata[19];
 
-	// TODO: entfernen für eine Release! Ist nur zum Testen!
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;
 
diff --git a/x11/cuda_x11_cubehash512.cu b/x11/cuda_x11_cubehash512.cu
index 8769b4ef6e..602e7c4197 100644
--- a/x11/cuda_x11_cubehash512.cu
+++ b/x11/cuda_x11_cubehash512.cu
@@ -307,8 +307,6 @@ __host__ void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t star
     // Größe des dynamischen Shared Memory Bereichs
     size_t shared_size = 0;
 
-//    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
-
     x11_cubehash512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
     MyStreamSynchronize(NULL, order, thr_id);
 }
diff --git a/x11/cuda_x11_echo.cu b/x11/cuda_x11_echo.cu
index 34e0b028d4..ce96728ee9 100644
--- a/x11/cuda_x11_echo.cu
+++ b/x11/cuda_x11_echo.cu
@@ -225,8 +225,6 @@ __host__ void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNou
     // Größe des dynamischen Shared Memory Bereichs
     size_t shared_size = 0;
 
-//    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
-
     x11_echo512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
     MyStreamSynchronize(NULL, order, thr_id);
 }
diff --git a/x11/cuda_x11_luffa512.cu b/x11/cuda_x11_luffa512.cu
index b1a03c971a..c1ca6c0da3 100644
--- a/x11/cuda_x11_luffa512.cu
+++ b/x11/cuda_x11_luffa512.cu
@@ -376,8 +376,6 @@ __host__ void x11_luffa512_cpu_hash_64(int thr_id, int threads, uint32_t startNo
     // Größe des dynamischen Shared Memory Bereichs
     size_t shared_size = 0;
 
-//    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
-
     x11_luffa512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
     MyStreamSynchronize(NULL, order, thr_id);
 }
diff --git a/x11/cuda_x11_shavite512.cu b/x11/cuda_x11_shavite512.cu
index 9d217e85cb..b3fd9258ed 100644
--- a/x11/cuda_x11_shavite512.cu
+++ b/x11/cuda_x11_shavite512.cu
@@ -1372,8 +1372,6 @@ __host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t start
     // Größe des dynamischen Shared Memory Bereichs
     size_t shared_size = 0;
 
-//    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
-
     x11_shavite512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
     MyStreamSynchronize(NULL, order, thr_id);
 }
diff --git a/x11/x11.cu b/x11/x11.cu
index 77b6a72358..5b22e4f20b 100644
--- a/x11/x11.cu
+++ b/x11/x11.cu
@@ -162,11 +162,8 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 {
 	const uint32_t first_nonce = pdata[19];
 
-	// TODO: entfernen für eine Release! Ist nur zum Testen!
-	if (opt_benchmark) {
+	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;
-        pdata[17] = 0;
-    }
 
 	const uint32_t Htarg = ptarget[7];