diff --git a/.gitattributes b/.gitattributes index 412eeda78d..d63baf172c 100644 --- a/.gitattributes +++ b/.gitattributes @@ -20,3 +20,7 @@ *.PDF diff=astextplain *.rtf diff=astextplain *.RTF diff=astextplain + +*.cu diff=cpp + + diff --git a/.gitignore b/.gitignore index 147df83941..d4fb4d4284 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ - ccminer *.o @@ -7,7 +6,6 @@ autom4te.cache Makefile Makefile.in -INSTALL aclocal.m4 configure configure.lineno @@ -15,7 +13,7 @@ depcomp missing install-sh stamp-h1 -cpuminer-config.h* +ccminer-config.h.in compile config.log config.status @@ -41,6 +39,8 @@ x64/Release/ *.opensdf *.sdf *.pdb +*.db +*.opendb .settings/ .project @@ -50,5 +50,3 @@ x64/Release/ .cproject .buildpath -compat/curl-for-windows/ - diff --git a/Algo256/blake256.cu b/Algo256/blake256.cu index 21b0276eae..0cc43ec50b 100644 --- a/Algo256/blake256.cu +++ b/Algo256/blake256.cu @@ -10,10 +10,12 @@ extern "C" { #include "sph/sph_blake.h" -#include -#include } +#include +#include + + /* threads per block and throughput (intensity) */ #define TPB 128 @@ -21,7 +23,7 @@ extern "C" { extern "C" int blake256_rounds = 14; /* hash by cpu with blake 256 */ -extern "C" void blake256hash(void *output, const void *input, int8_t rounds = 14) +void blake256hash(void *output, const void *input, int8_t rounds = 14) { uchar hash[64]; sph_blake256_context ctx; @@ -38,7 +40,8 @@ extern "C" void blake256hash(void *output, const void *input, int8_t rounds = 14 #include "cuda_helper.h" #if PRECALC64 -__constant__ uint32_t _ALIGN(32) d_data[12]; +__constant__ uint32_t _ALIGN(32) d_data[15]; +static THREAD uint32_t *h_data; #else __constant__ static uint32_t _ALIGN(32) c_data[20]; /* midstate hash cache, this algo is run on 2 parts */ @@ -50,32 +53,11 @@ extern "C" uint32_t crc32_u32t(const uint32_t *buf, size_t size); /* 8 adapters max */ static uint32_t *d_resNonce[MAX_GPUS]; -static uint32_t *h_resNonce[MAX_GPUS]; +static THREAD uint32_t *h_resNonce; /* max count of 
found nonces in one call */ #define NBN 2 -static uint32_t extra_results[NBN] = { UINT32_MAX }; - -/* prefer uint32_t to prevent size conversions = speed +5/10 % */ -__constant__ -static uint32_t _ALIGN(32) c_sigma[16][16] = { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 }, - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } -}; +static uint32_t extra_results[MAX_GPUS][NBN] = { UINT32_MAX }; #if !PRECALC64 __device__ __constant__ @@ -87,41 +69,18 @@ static const uint32_t __align__(32) c_IV256[8] = { }; #endif -__device__ __constant__ -static const uint32_t __align__(32) c_u256[16] = { - SPH_C32(0x243F6A88), SPH_C32(0x85A308D3), - SPH_C32(0x13198A2E), SPH_C32(0x03707344), - SPH_C32(0xA4093822), SPH_C32(0x299F31D0), - SPH_C32(0x082EFA98), SPH_C32(0xEC4E6C89), - SPH_C32(0x452821E6), SPH_C32(0x38D01377), - SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C), - SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD), - SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917) -}; - -#define GS(a,b,c,d,x) { \ - const uint32_t idx1 = c_sigma[r][x]; \ - const uint32_t idx2 = c_sigma[r][x+1]; \ - v[a] += (m[idx1] ^ c_u256[idx2]) + v[b]; \ - v[d] = SPH_ROTL32(v[d] ^ v[a], 16); \ 
+#define GSPREC(a,b,c,d,x,y) { \ + v[a] += (m[x] ^ c_u256[y]) + v[b]; \ + v[d] = __byte_perm(v[d] ^ v[a],0, 0x1032); \ v[c] += v[d]; \ - v[b] = SPH_ROTR32(v[b] ^ v[c], 12); \ -\ - v[a] += (m[idx2] ^ c_u256[idx1]) + v[b]; \ - v[d] = SPH_ROTR32(v[d] ^ v[a], 8); \ + v[b] = ROTR32(v[b] ^ v[c], 12); \ + v[a] += (m[y] ^ c_u256[x]) + v[b]; \ + v[d] = __byte_perm(v[d] ^ v[a],0, 0x0321); \ v[c] += v[d]; \ - v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \ -} + v[b] = ROTR32(v[b] ^ v[c], 7); \ + } /* Second part (64-80) msg never change, store it */ -__device__ __constant__ -static const uint32_t __align__(32) c_Padding[16] = { - 0, 0, 0, 0, - 0x80000000UL, 0, 0, 0, - 0, 0, 0, 0, - 0, 1, 0, 640, -}; - __device__ static void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, const int rounds) { @@ -133,7 +92,29 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, co m[2] = block[2]; m[3] = block[3]; - for (uint32_t i = 4; i < 16; i++) { + const uint32_t c_u256[16] = + { + SPH_C32(0x243F6A88), SPH_C32(0x85A308D3), + SPH_C32(0x13198A2E), SPH_C32(0x03707344), + SPH_C32(0xA4093822), SPH_C32(0x299F31D0), + SPH_C32(0x082EFA98), SPH_C32(0xEC4E6C89), + SPH_C32(0x452821E6), SPH_C32(0x38D01377), + SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C), + SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD), + SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917) + }; + + const uint32_t c_Padding[16] = { + 0, 0, 0, 0, + 0x80000000UL, 0, 0, 0, + 0, 0, 0, 0, + 0, 1, 0, 640, + }; + + + #pragma unroll + for (uint32_t i = 4; i < 16; i++) + { #if PRECALC64 m[i] = c_Padding[i]; #else @@ -141,7 +122,7 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, co #endif } - //#pragma unroll 8 +#pragma unroll for(uint32_t i = 0; i < 8; i++) v[i] = h[i]; @@ -155,22 +136,288 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, co v[14] = c_u256[6]; v[15] = c_u256[7]; - for (int r = 0; r < rounds; r++) { - /* column step */ - GS(0, 4, 0x8, 0xC, 0x0); 
- GS(1, 5, 0x9, 0xD, 0x2); - GS(2, 6, 0xA, 0xE, 0x4); - GS(3, 7, 0xB, 0xF, 0x6); - /* diagonal step */ - GS(0, 5, 0xA, 0xF, 0x8); - GS(1, 6, 0xB, 0xC, 0xA); - GS(2, 7, 0x8, 0xD, 0xC); - GS(3, 4, 0x9, 0xE, 0xE); + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + GSPREC(0, 4, 0x8, 0xC,0,1); + GSPREC(1, 5, 0x9, 0xD,2,3); + GSPREC(2, 6, 0xA, 0xE, 4,5); + GSPREC(3, 7, 0xB, 0xF, 6,7); + GSPREC(0, 5, 0xA, 0xF, 8,9); + GSPREC(1, 6, 0xB, 0xC, 10,11); + GSPREC(2, 7, 0x8, 0xD, 12,13); + GSPREC(3, 4, 0x9, 0xE, 14,15); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + GSPREC(0, 4, 0x8, 0xC, 14, 10); + GSPREC(1, 5, 0x9, 0xD, 4, 8); + GSPREC(2, 6, 0xA, 0xE, 9, 15); + GSPREC(3, 7, 0xB, 0xF, 13, 6); + GSPREC(0, 5, 0xA, 0xF, 1, 12); + GSPREC(1, 6, 0xB, 0xC, 0, 2); + GSPREC(2, 7, 0x8, 0xD, 11, 7); + GSPREC(3, 4, 0x9, 0xE, 5, 3); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + GSPREC(0, 4, 0x8, 0xC, 11, 8); + GSPREC(1, 5, 0x9, 0xD, 12, 0); + GSPREC(2, 6, 0xA, 0xE, 5, 2); + GSPREC(3, 7, 0xB, 0xF, 15, 13); + GSPREC(0, 5, 0xA, 0xF, 10, 14); + GSPREC(1, 6, 0xB, 0xC, 3, 6); + GSPREC(2, 7, 0x8, 0xD, 7, 1); + GSPREC(3, 4, 0x9, 0xE, 9, 4); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + GSPREC(0, 4, 0x8, 0xC, 7, 9); + GSPREC(1, 5, 0x9, 0xD, 3, 1); + GSPREC(2, 6, 0xA, 0xE, 13, 12); + GSPREC(3, 7, 0xB, 0xF, 11, 14); + GSPREC(0, 5, 0xA, 0xF, 2, 6); + GSPREC(1, 6, 0xB, 0xC, 5, 10); + GSPREC(2, 7, 0x8, 0xD, 4, 0); + GSPREC(3, 4, 0x9, 0xE, 15, 8); + + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + GSPREC(0, 4, 0x8, 0xC, 9, 0); + GSPREC(1, 5, 0x9, 0xD, 5, 7); + GSPREC(2, 6, 0xA, 0xE, 2, 4); + GSPREC(3, 7, 0xB, 0xF, 10, 15); + GSPREC(0, 5, 0xA, 0xF, 14, 1); + GSPREC(1, 6, 0xB, 0xC, 11, 12); + GSPREC(2, 7, 0x8, 0xD, 6, 8); + GSPREC(3, 4, 0x9, 0xE, 3, 13); + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + GSPREC(0, 4, 0x8, 0xC, 2, 12); + GSPREC(1, 5, 0x9, 0xD, 6, 10); + GSPREC(2, 6, 0xA, 0xE, 0, 11); + 
GSPREC(3, 7, 0xB, 0xF, 8, 3); + GSPREC(0, 5, 0xA, 0xF, 4, 13); + GSPREC(1, 6, 0xB, 0xC, 7, 5); + GSPREC(2, 7, 0x8, 0xD, 15, 14); + GSPREC(3, 4, 0x9, 0xE, 1, 9); + + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + GSPREC(0, 4, 0x8, 0xC, 12, 5); + GSPREC(1, 5, 0x9, 0xD, 1, 15); + GSPREC(2, 6, 0xA, 0xE, 14, 13); + GSPREC(3, 7, 0xB, 0xF, 4, 10); + GSPREC(0, 5, 0xA, 0xF, 0, 7); + GSPREC(1, 6, 0xB, 0xC, 6, 3); + GSPREC(2, 7, 0x8, 0xD, 9, 2); + GSPREC(3, 4, 0x9, 0xE, 8, 11); + +// { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + GSPREC(0, 4, 0x8, 0xC, 13, 11); + GSPREC(1, 5, 0x9, 0xD, 7, 14); + GSPREC(2, 6, 0xA, 0xE, 12, 1); + GSPREC(3, 7, 0xB, 0xF, 3, 9); + GSPREC(0, 5, 0xA, 0xF, 5, 0); + GSPREC(1, 6, 0xB, 0xC, 15, 4); + GSPREC(2, 7, 0x8, 0xD, 8, 6); + GSPREC(3, 4, 0x9, 0xE, 2, 10); +// { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + GSPREC(0, 4, 0x8, 0xC, 6, 15); + GSPREC(1, 5, 0x9, 0xD, 14, 9); + GSPREC(2, 6, 0xA, 0xE, 11, 3); + GSPREC(3, 7, 0xB, 0xF, 0, 8); + GSPREC(0, 5, 0xA, 0xF, 12, 2); + GSPREC(1, 6, 0xB, 0xC, 13, 7); + GSPREC(2, 7, 0x8, 0xD, 1, 4); + GSPREC(3, 4, 0x9, 0xE, 10, 5); +// { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + GSPREC(0, 4, 0x8, 0xC, 10, 2); + GSPREC(1, 5, 0x9, 0xD, 8, 4); + GSPREC(2, 6, 0xA, 0xE, 7, 6); + GSPREC(3, 7, 0xB, 0xF, 1, 5); + GSPREC(0, 5, 0xA, 0xF, 15, 11); + GSPREC(1, 6, 0xB, 0xC, 9, 14); + GSPREC(2, 7, 0x8, 0xD, 3, 12); + GSPREC(3, 4, 0x9, 0xE, 13, 0); +// { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + GSPREC(0, 4, 0x8, 0xC, 0, 1); + GSPREC(1, 5, 0x9, 0xD, 2, 3); + GSPREC(2, 6, 0xA, 0xE, 4, 5); + GSPREC(3, 7, 0xB, 0xF, 6, 7); + GSPREC(0, 5, 0xA, 0xF, 8, 9); + GSPREC(1, 6, 0xB, 0xC, 10, 11); + GSPREC(2, 7, 0x8, 0xD, 12, 13); + GSPREC(3, 4, 0x9, 0xE, 14, 15); + +// { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + GSPREC(0, 4, 0x8, 0xC, 14, 10); + GSPREC(1, 5, 0x9, 0xD, 4, 8); + GSPREC(2, 6, 0xA, 0xE, 9, 15); + GSPREC(3, 7, 0xB, 0xF, 13, 6); + GSPREC(0, 5, 
0xA, 0xF, 1, 12); + GSPREC(1, 6, 0xB, 0xC, 0, 2); + GSPREC(2, 7, 0x8, 0xD, 11, 7); + GSPREC(3, 4, 0x9, 0xE, 5, 3); + +// { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + GSPREC(0, 4, 0x8, 0xC, 11, 8); + GSPREC(1, 5, 0x9, 0xD, 12, 0); + GSPREC(2, 6, 0xA, 0xE, 5, 2); + GSPREC(3, 7, 0xB, 0xF, 15, 13); + GSPREC(0, 5, 0xA, 0xF, 10, 14); + GSPREC(1, 6, 0xB, 0xC, 3, 6); + GSPREC(2, 7, 0x8, 0xD, 7, 1); + GSPREC(3, 4, 0x9, 0xE, 9, 4); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + GSPREC(0, 4, 0x8, 0xC, 7, 9); + GSPREC(1, 5, 0x9, 0xD, 3, 1); + GSPREC(2, 6, 0xA, 0xE, 13, 12); + GSPREC(3, 7, 0xB, 0xF, 11, 14); + GSPREC(0, 5, 0xA, 0xF, 2, 6); + GSPREC(1, 6, 0xB, 0xC, 5, 10); + GSPREC(2, 7, 0x8, 0xD, 4, 0); +// GSPREC(3, 4, 0x9, 0xE, 15, 8); + + +#if PRECALC64 + // only compute h6 & 7 +// h[6U] ^= v[6U] ^ v[14U]; + h[7] ^= v[7] ^ v[15]; +#else + //#pragma unroll 16 + for (uint32_t i = 0; i < 16; i++) { + uint32_t j = i & 7U; + h[j] ^= v[i]; + } +#endif +} + + +/* Second part (64-80) msg never change, store it */ +__device__ static +void blake256_compress_8(uint32_t *const __restrict__ h, const uint32_t *const __restrict__ block) +{ + uint32_t /*_ALIGN(8)*/ m[16]; + uint32_t v[16]; + + m[0] = block[0]; + m[1] = block[1]; + m[2] = block[2]; + m[3] = block[3]; + + const uint32_t c_u256[16] = + { + SPH_C32(0x243F6A88), SPH_C32(0x85A308D3), + SPH_C32(0x13198A2E), SPH_C32(0x03707344), + SPH_C32(0xA4093822), SPH_C32(0x299F31D0), + SPH_C32(0x082EFA98), SPH_C32(0xEC4E6C89), + SPH_C32(0x452821E6), SPH_C32(0x38D01377), + SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C), + SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD), + SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917) + }; + + const uint32_t c_Padding[16] = { + 0, 0, 0, 0, + 0x80000000UL, 0, 0, 0, + 0, 0, 0, 0, + 0, 1, 0, 640, + }; + + +#pragma unroll + for (int i = 4; i < 16; i++) + { + m[i] = c_Padding[i]; } + +#pragma unroll + for(int i = 0; i < 8; i++) + v[i] = h[i]; + + v[ 9] = c_u256[1]; + v[10] = c_u256[2]; + v[11] = 
c_u256[3]; + + v[13] = c_u256[5] ^ 640; + v[14] = c_u256[6]; + v[15] = c_u256[7]; + + v[0] = d_data[11]; + v[4] = d_data[12]; + v[8] = d_data[13]; + v[12] = d_data[14]; + + GSPREC(1, 5, 0x9, 0xD,2,3); + GSPREC(2, 6, 0xA, 0xE, 4,5); + GSPREC(3, 7, 0xB, 0xF, 6,7); + GSPREC(0, 5, 0xA, 0xF, 8,9); + GSPREC(1, 6, 0xB, 0xC, 10,11); + GSPREC(2, 7, 0x8, 0xD, 12,13); + GSPREC(3, 4, 0x9, 0xE, 14,15); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + GSPREC(0, 4, 0x8, 0xC, 14, 10); + GSPREC(1, 5, 0x9, 0xD, 4, 8); + GSPREC(2, 6, 0xA, 0xE, 9, 15); + GSPREC(3, 7, 0xB, 0xF, 13, 6); + GSPREC(0, 5, 0xA, 0xF, 1, 12); + GSPREC(1, 6, 0xB, 0xC, 0, 2); + GSPREC(2, 7, 0x8, 0xD, 11, 7); + GSPREC(3, 4, 0x9, 0xE, 5, 3); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + GSPREC(0, 4, 0x8, 0xC, 11, 8); + GSPREC(1, 5, 0x9, 0xD, 12, 0); + GSPREC(2, 6, 0xA, 0xE, 5, 2); + GSPREC(3, 7, 0xB, 0xF, 15, 13); + GSPREC(0, 5, 0xA, 0xF, 10, 14); + GSPREC(1, 6, 0xB, 0xC, 3, 6); + GSPREC(2, 7, 0x8, 0xD, 7, 1); + GSPREC(3, 4, 0x9, 0xE, 9, 4); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + GSPREC(0, 4, 0x8, 0xC, 7, 9); + GSPREC(1, 5, 0x9, 0xD, 3, 1); + GSPREC(2, 6, 0xA, 0xE, 13, 12); + GSPREC(3, 7, 0xB, 0xF, 11, 14); + GSPREC(0, 5, 0xA, 0xF, 2, 6); + GSPREC(1, 6, 0xB, 0xC, 5, 10); + GSPREC(2, 7, 0x8, 0xD, 4, 0); + GSPREC(3, 4, 0x9, 0xE, 15, 8); + + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + GSPREC(0, 4, 0x8, 0xC, 9, 0); + GSPREC(1, 5, 0x9, 0xD, 5, 7); + GSPREC(2, 6, 0xA, 0xE, 2, 4); + GSPREC(3, 7, 0xB, 0xF, 10, 15); + GSPREC(0, 5, 0xA, 0xF, 14, 1); + GSPREC(1, 6, 0xB, 0xC, 11, 12); + GSPREC(2, 7, 0x8, 0xD, 6, 8); + GSPREC(3, 4, 0x9, 0xE, 3, 13); + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + GSPREC(0, 4, 0x8, 0xC, 2, 12); + GSPREC(1, 5, 0x9, 0xD, 6, 10); + GSPREC(2, 6, 0xA, 0xE, 0, 11); + GSPREC(3, 7, 0xB, 0xF, 8, 3); + GSPREC(0, 5, 0xA, 0xF, 4, 13); + GSPREC(1, 6, 0xB, 0xC, 7, 5); + GSPREC(2, 7, 0x8, 0xD, 15, 14); + GSPREC(3, 
4, 0x9, 0xE, 1, 9); + + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + GSPREC(0, 4, 0x8, 0xC, 12, 5); + GSPREC(1, 5, 0x9, 0xD, 1, 15); + GSPREC(2, 6, 0xA, 0xE, 14, 13); + GSPREC(3, 7, 0xB, 0xF, 4, 10); + GSPREC(0, 5, 0xA, 0xF, 0, 7); + GSPREC(1, 6, 0xB, 0xC, 6, 3); + GSPREC(2, 7, 0x8, 0xD, 9, 2); + GSPREC(3, 4, 0x9, 0xE, 8, 11); + + // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + GSPREC(0, 4, 0x8, 0xC, 13, 11); + GSPREC(1, 5, 0x9, 0xD, 7, 14); + GSPREC(2, 6, 0xA, 0xE, 12, 1); + GSPREC(3, 7, 0xB, 0xF, 3, 9); + GSPREC(0, 5, 0xA, 0xF, 5, 0); + GSPREC(1, 6, 0xB, 0xC, 15, 4); + GSPREC(2, 7, 0x8, 0xD, 8, 6); +// GSPREC(3, 4, 0x9, 0xE, 2, 10); + // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + + #if PRECALC64 // only compute h6 & 7 - h[6U] ^= v[6U] ^ v[14U]; - h[7U] ^= v[7U] ^ v[15U]; +// h[6] ^= v[6] ^ v[14]; + h[7] ^= v[7] ^ v[15]; #else //#pragma unroll 16 for (uint32_t i = 0; i < 16; i++) { @@ -180,12 +427,13 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, co #endif } + #if !PRECALC64 /* original method */ __global__ void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint64_t highTarget, const int crcsum, const int rounds) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { const uint32_t nonce = startNonce + thread; @@ -253,23 +501,23 @@ uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const ui if (cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess) return result; - blake256_gpu_hash_80<<>>(threads, startNonce, d_resNonce[thr_id], highTarget, crcsum, (int) rounds); - cudaDeviceSynchronize(); - if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { + blake256_gpu_hash_80<<>>(threads, startNonce, d_resNonce[thr_id], highTarget, 
crcsum, (int) rounds); + //cudaDeviceSynchronize(); + if (cudaSuccess == cudaMemcpyAsync(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { //cudaDeviceSynchronize(); /* seems no more required */ result = h_resNonce[thr_id][0]; for (int n=0; n < (NBN-1); n++) - extra_results[n] = h_resNonce[thr_id][n+1]; + extra_results[thr_id][n] = h_resNonce[thr_id][n+1]; } return result; } __host__ -void blake256_cpu_setBlock_80(uint32_t *pdata, const uint32_t *ptarget) +void blake256_cpu_setBlock_80(int thr_id, uint32_t *pdata, const uint32_t *ptarget) { uint32_t data[20]; memcpy(data, pdata, 80); - CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_data, data, sizeof(data), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(c_data, data, sizeof(data), 0, cudaMemcpyHostToDevice)); } #else @@ -278,10 +526,10 @@ void blake256_cpu_setBlock_80(uint32_t *pdata, const uint32_t *ptarget) __global__ void blake256_gpu_hash_16(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, - const uint64_t highTarget, const int rounds, const bool trace) + const uint32_t Target6, const uint32_t Target7, const int rounds, const bool trace) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { const uint32_t nonce = startNonce + thread; uint32_t _ALIGN(16) h[8]; @@ -301,14 +549,57 @@ void blake256_gpu_hash_16(const uint32_t threads, const uint32_t startNonce, uin blake256_compress(h, ending, 640, rounds); - if (h[7] == 0 && cuda_swab32(h[6]) <= highTarget) { + if (h[7] <= Target7) + { #if NBN == 2 - /* keep the smallest nonce, + extra one if found */ - if (resNonce[0] > nonce) { - resNonce[1] = resNonce[0]; - resNonce[0] = nonce; + uint32_t tmp = atomicCAS(resNonce, 0xffffffff, nonce); + if(tmp != 0xffffffff) + resNonce[1] = nonce; +#else + resNonce[0] = nonce; +#endif +#ifdef _DEBUG + if (trace) { + uint64_t high64 
= ((uint64_t*)h)[3]; + printf("gpu: %16llx\n", high64); + printf("gpu: %08x.%08x\n", h[7], h[6]); + printf("tgt: %16llx\n", highTarget); } - else +#endif + } + } +} + + +__global__ +void blake256_gpu_hash_16_8(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, + const uint32_t Target6, const uint32_t Target7, const int rounds, const bool trace) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) + { + const uint32_t nonce = startNonce + thread; + uint32_t _ALIGN(16) h[8]; + +#pragma unroll + for (int i = 0; i < 8; i++) + h[i] = d_data[i]; + + // ------ Close: Bytes 64 to 80 ------ + + uint32_t _ALIGN(16) ending[4]; + ending[0] = d_data[8]; + ending[1] = d_data[9]; + ending[2] = d_data[10]; + ending[3] = nonce; /* our tested value */ + + blake256_compress_8(h, ending); + + if (h[7] <= Target7) + { +#if NBN == 2 + uint32_t tmp = atomicCAS(resNonce, 0xffffffff, nonce); + if(tmp != 0xffffffff) resNonce[1] = nonce; #else resNonce[0] = nonce; @@ -325,8 +616,9 @@ void blake256_gpu_hash_16(const uint32_t threads, const uint32_t startNonce, uin } } + __host__ -static uint32_t blake256_cpu_hash_16(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint64_t highTarget, +static uint32_t blake256_cpu_hash_16(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint32_t Target6, const uint32_t Target7, const int8_t rounds) { uint32_t result = UINT32_MAX; @@ -334,18 +626,23 @@ static uint32_t blake256_cpu_hash_16(const int thr_id, const uint32_t threads, c dim3 grid((threads + TPB-1)/TPB); dim3 block(TPB); - /* Check error on Ctrl+C or kill to prevent segfaults on exit */ - if (cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess) - return result; + CUDA_SAFE_CALL(cudaMemsetAsync(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t), gpustream[thr_id])); - blake256_gpu_hash_16 <<>> (threads, startNonce, d_resNonce[thr_id], highTarget, (int) rounds, 
opt_tracegpu); - cudaDeviceSynchronize(); - if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { - //cudaDeviceSynchronize(); /* seems no more required */ - result = h_resNonce[thr_id][0]; - for (int n=0; n < (NBN-1); n++) - extra_results[n] = h_resNonce[thr_id][n+1]; + if(rounds == 8) + blake256_gpu_hash_16_8 << > > (threads, startNonce, d_resNonce[thr_id], Target6, Target7, (int)rounds, opt_tracegpu); + else + { + if(rounds == 14) + blake256_gpu_hash_16 << > > (threads, startNonce, d_resNonce[thr_id], Target6, Target7, (int)rounds, opt_tracegpu); + else + applog(LOG_ERR, "Number of blake rounds not supported"); } + CUDA_SAFE_CALL(cudaMemcpyAsync(h_resNonce, d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id])); cudaStreamSynchronize(gpustream[thr_id]); + CUDA_SAFE_CALL(cudaStreamSynchronize(gpustream[thr_id])); + result = h_resNonce[0]; + + for (int n=0; n < (NBN-1); n++) + extra_results[thr_id][n] = h_resNonce[n + 1]; return result; } @@ -364,119 +661,154 @@ static void blake256mid(uint32_t *output, const uint32_t *input, int8_t rounds = } __host__ -void blake256_cpu_setBlock_16(uint32_t *penddata, const uint32_t *midstate, const uint32_t *ptarget) +static void blake256_cpu_setBlock_16(int thr_id, uint32_t *penddata, const uint32_t *midstate, const uint32_t *ptarget) { - uint32_t _ALIGN(64) data[11]; - memcpy(data, midstate, 32); - data[8] = penddata[0]; - data[9] = penddata[1]; - data[10]= penddata[2]; - CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_data, data, 32 + 12, 0, cudaMemcpyHostToDevice)); + memcpy(h_data, midstate, 32); + h_data[8] = penddata[0]; + h_data[9] = penddata[1]; + h_data[10] = penddata[2]; + + // precalc v[0], v[4], v[8], v[12] + h_data[11] = h_data[0] + (h_data[8] ^ 0x85A308D3) + h_data[4]; + h_data[14] = ROTL32(0xA4093822 ^ 640 ^ h_data[11], 16); + h_data[13] = 0x243F6A88 + h_data[14]; + h_data[12] = ROTR32(h_data[4] ^ h_data[13], 12); + h_data[11] += 
(h_data[9] ^ 0x243F6A88) + h_data[12]; + h_data[14] = ROTR32(h_data[14] ^ h_data[11], 8); + h_data[13] += h_data[14]; + h_data[12] = ROTR32(h_data[12] ^ h_data[13], 7); + + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(d_data, h_data, 15 * 4, 0, cudaMemcpyHostToDevice, gpustream[thr_id])); } #endif -static bool init[MAX_GPUS] = { 0 }; - -extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done, int8_t blakerounds=14) +extern int scanhash_blake256(int thr_id, uint32_t *pdata, uint32_t *ptarget, + uint32_t max_nonce, uint32_t *hashes_done, int8_t blakerounds=14) { const uint32_t first_nonce = pdata[19]; - uint64_t targetHigh = ((uint64_t*)ptarget)[3]; uint32_t _ALIGN(64) endiandata[20]; #if PRECALC64 uint32_t _ALIGN(64) midstate[8]; #else uint32_t crcsum; #endif - unsigned int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 20; - uint32_t throughput = device_intensity(thr_id, __func__, 1U << intensity); - throughput = min(throughput, max_nonce - first_nonce); + unsigned int intensity = 28; + uint32_t throughput = device_intensity(device_map[thr_id], __func__, 1U << intensity); + throughput = min(throughput, max_nonce - first_nonce) & 0xfffffc00; int rc = 0; - if (opt_benchmark) { - targetHigh = 0x1ULL << 32; - ((uint32_t*)ptarget)[6] = swab32(0x4); + if (opt_benchmark) + { + ptarget[7] = 0x00000000; + ptarget[6] = 0xffffffff; } + uint32_t target6 = ptarget[6]; + uint32_t target7 = swab32(ptarget[7]); // don't ask me why - if (opt_tracegpu) { + if (opt_tracegpu) + { /* test call from util.c */ throughput = 1; for (int k = 0; k < 20; k++) pdata[k] = swab32(pdata[k]); } - if (!init[thr_id]) { - if (active_gpus > 1) - CUDA_CALL_OR_RET_X(cudaSetDevice(device_map[thr_id]),0); - CUDA_CALL_OR_RET_X(cudaMallocHost(&h_resNonce[thr_id], NBN * sizeof(uint32_t)), 0); - CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t)), 0); - init[thr_id] = true; + static THREAD volatile bool 
init = false; + if(!init) + { + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + CUDA_SAFE_CALL(cudaMallocHost(&h_data, 15 * sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMallocHost(&h_resNonce, NBN * sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t))); + mining_has_stopped[thr_id] = false; + init = true; } #if PRECALC64 for (int k = 0; k < 16; k++) be32enc(&endiandata[k], pdata[k]); blake256mid(midstate, endiandata, blakerounds); - blake256_cpu_setBlock_16(&pdata[16], midstate, ptarget); + blake256_cpu_setBlock_16(thr_id, &pdata[16], midstate, ptarget); #else - blake256_cpu_setBlock_80(pdata, ptarget); + blake256_cpu_setBlock_80(thr_id, pdata, ptarget); crcsum = crc32_u32t(pdata, 64); #endif /* PRECALC64 */ do { - uint32_t foundNonce = #if PRECALC64 // GPU HASH (second block only, first is midstate) - blake256_cpu_hash_16(thr_id, throughput, pdata[19], targetHigh, blakerounds); + uint32_t foundNonce = blake256_cpu_hash_16(thr_id, throughput, pdata[19], target6, target7, blakerounds); #else // GPU FULL HASH - blake256_cpu_hash_80(thr_id, throughput, pdata[19], targetHigh, crcsum, blakerounds); + uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19], targetHigh, crcsum, blakerounds); #endif - if (foundNonce != UINT32_MAX) + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(foundNonce != UINT32_MAX) { - uint32_t vhashcpu[8]; - uint32_t Htarg = (uint32_t)targetHigh; + uint32_t vhashcpu[8] = { 0 }; for (int k=0; k < 19; k++) be32enc(&endiandata[k], pdata[k]); - be32enc(&endiandata[19], foundNonce); - blake256hash(vhashcpu, endiandata, blakerounds); - + if(opt_verify) + { + be32enc(&endiandata[19], foundNonce); + blake256hash(vhashcpu, endiandata, blakerounds); + } //applog(LOG_BLUE, 
"%08x %16llx", vhashcpu[6], targetHigh); - if (vhashcpu[6] <= Htarg && fulltest(vhashcpu, ptarget)) + if (vhashcpu[7] <= target7 && fulltest(vhashcpu, ptarget)) { + if (opt_benchmark) applog(LOG_INFO, "GPU #%d Found nounce %08x", thr_id, foundNonce); rc = 1; *hashes_done = pdata[19] - first_nonce + throughput; pdata[19] = foundNonce; #if NBN > 1 - if (extra_results[0] != UINT32_MAX) { - be32enc(&endiandata[19], extra_results[0]); - blake256hash(vhashcpu, endiandata, blakerounds); - if (vhashcpu[6] <= Htarg /* && fulltest(vhashcpu, ptarget) */) { - pdata[21] = extra_results[0]; - applog(LOG_BLUE, "1:%x 2:%x", foundNonce, extra_results[0]); + if (extra_results[thr_id][0] != UINT32_MAX) + { + if(opt_verify) + { + be32enc(&endiandata[19], extra_results[thr_id][0]); + blake256hash(vhashcpu, endiandata, blakerounds); + } + if (vhashcpu[7] <= target7 && fulltest(vhashcpu, ptarget)) + { + pdata[21] = extra_results[thr_id][0]; + if(opt_benchmark) applog(LOG_INFO, "GPU #%d Found second nounce %08x", thr_id, extra_results[thr_id][0]); +// applog(LOG_BLUE, "1:%x 2:%x", foundNonce, extra_results[thr_id][0]); rc = 2; } - extra_results[0] = UINT32_MAX; + else + { + if(vhashcpu[7]>target7) + applog(LOG_ERR, "GPU #%d: result for second nonce %08x does not validate on CPU!", device_map[thr_id], extra_results[thr_id][0]); + } + extra_results[thr_id][0] = UINT32_MAX; } #endif //applog_hash((uint8_t*)ptarget); //applog_compare_hash((uint8_t*)vhashcpu,(uint8_t*)ptarget); return rc; } - else if (opt_debug) { - applog_hash((uchar*)ptarget); - applog_compare_hash((uchar*)vhashcpu, (uchar*)ptarget); - applog(LOG_DEBUG, "GPU #%d: result for nonce %08x does not validate on CPU!", thr_id, foundNonce); + else + { + if(opt_debug) + { + applog_hash((uchar*)ptarget); + applog_compare_hash((uchar*)vhashcpu, (uchar*)ptarget); + } + if(vhashcpu[7]>target7) + applog(LOG_ERR, "GPU #%d: result for nonce %08x does not validate on CPU!", device_map[thr_id], foundNonce); } } - pdata[19] += throughput; + 
pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce; + *hashes_done = pdata[19] - first_nonce ; return rc; } diff --git a/Algo256/cuda_blake256.cu b/Algo256/cuda_blake256.cu index 718d18ecff..e599ef8e5d 100644 --- a/Algo256/cuda_blake256.cu +++ b/Algo256/cuda_blake256.cu @@ -8,16 +8,17 @@ extern "C" { } #include "cuda_helper.h" - #include -static __device__ uint64_t cuda_swab32ll(uint64_t x) { - return MAKE_ULONGLONG(cuda_swab32(_LOWORD(x)), cuda_swab32(_HIWORD(x))); -} +#define UINT2(x,y) make_uint2(x,y) + +//static __device__ __forceinline__ uint64_t cuda_swab32ll(uint64_t x) { +// return MAKE_UINT64(cuda_swab32(_LOWORD(x)), cuda_swab32(_HIWORD(x))); +//} -__constant__ static uint32_t c_data[20]; +__constant__ static uint32_t c_data[3]; -__constant__ static uint8_t sigma[16][16]; +//__constant__ static uint8_t sigma[16][16]; static uint8_t c_sigma[16][16] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, @@ -37,17 +38,22 @@ static uint8_t c_sigma[16][16] = { { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } }; -static const uint32_t c_IV256[8] = { - 0x6A09E667, 0xBB67AE85, - 0x3C6EF372, 0xA54FF53A, - 0x510E527F, 0x9B05688C, - 0x1F83D9AB, 0x5BE0CD19 -}; - __device__ __constant__ static uint32_t cpu_h[8]; - -__device__ __constant__ static uint32_t u256[16]; -static const uint32_t c_u256[16] = { +/* +__device__ __constant__ static uint32_t u256[16] = +{ + 0x243F6A88, 0x85A308D3, + 0x13198A2E, 0x03707344, + 0xA4093822, 0x299F31D0, + 0x082EFA98, 0xEC4E6C89, + 0x452821E6, 0x38D01377, + 0xBE5466CF, 0x34E90C6C, + 0xC0AC29B7, 0xC97C50DD, + 0x3F84D5B5, 0xB5470917 +}; +*/ +static const uint32_t c_u256[16] = +{ 0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344, 0xA4093822, 0x299F31D0, @@ -58,25 +64,9 @@ static const uint32_t c_u256[16] = { 
0x3F84D5B5, 0xB5470917 }; -#define GS2(a,b,c,d,x) { \ - const uint8_t idx1 = sigma[r][x]; \ - const uint8_t idx2 = sigma[r][x+1]; \ - v[a] += (m[idx1] ^ u256[idx2]) + v[b]; \ - v[d] = SPH_ROTL32(v[d] ^ v[a], 16); \ - v[c] += v[d]; \ - v[b] = SPH_ROTR32(v[b] ^ v[c], 12); \ -\ - v[a] += (m[idx2] ^ u256[idx1]) + v[b]; \ - v[d] = SPH_ROTR32(v[d] ^ v[a], 8); \ - v[c] += v[d]; \ - v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \ -} - -//#define ROTL32(x, n) ((x) << (n)) | ((x) >> (32 - (n))) -#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) #define hostGS(a,b,c,d,x) { \ const uint8_t idx1 = c_sigma[r][x]; \ - const uint8_t idx2 = c_sigma[r][x+1]; \ + const uint8_t idx2 = c_sigma[r][x + 1]; \ v[a] += (m[idx1] ^ c_u256[idx2]) + v[b]; \ v[d] = ROTR32(v[d] ^ v[a], 16); \ v[c] += v[d]; \ @@ -88,30 +78,67 @@ static const uint32_t c_u256[16] = { v[b] = ROTR32(v[b] ^ v[c], 7); \ } +#define GSPREC(a,b,c,d,x,y) { \ + v[a] += (m[x] ^ u256[y]) + v[b]; \ + v[d] = __byte_perm(v[d] ^ v[a],0, 0x1032); \ + v[c] += v[d]; \ + v[b] = ROTR32(v[b] ^ v[c], 12); \ + v[a] += (m[y] ^ u256[x]) + v[b]; \ + v[d] = __byte_perm(v[d] ^ v[a],0, 0x0321); \ + v[c] += v[d]; \ + v[b] = ROTR32(v[b] ^ v[c], 7); \ + } + +__constant__ uint64_t keccak_round_constants[24] = { + 0x0000000000000001ull, 0x0000000000008082ull, + 0x800000000000808aull, 0x8000000080008000ull, + 0x000000000000808bull, 0x0000000080000001ull, + 0x8000000080008081ull, 0x8000000000008009ull, + 0x000000000000008aull, 0x0000000000000088ull, + 0x0000000080008009ull, 0x000000008000000aull, + 0x000000008000808bull, 0x800000000000008bull, + 0x8000000000008089ull, 0x8000000000008003ull, + 0x8000000000008002ull, 0x8000000000000080ull, + 0x000000000000800aull, 0x800000008000000aull, + 0x8000000080008081ull, 0x8000000000008080ull, + 0x0000000080000001ull, 0x8000000080008008ull +}; + +__constant__ uint2 keccak_round_constants35[24] = { + { 0x00000001ul, 0x00000000 }, { 0x00008082ul, 0x00000000 }, + { 0x0000808aul, 0x80000000 }, { 0x80008000ul, 0x80000000 
}, + { 0x0000808bul, 0x00000000 }, { 0x80000001ul, 0x00000000 }, + { 0x80008081ul, 0x80000000 }, { 0x00008009ul, 0x80000000 }, + { 0x0000008aul, 0x00000000 }, { 0x00000088ul, 0x00000000 }, + { 0x80008009ul, 0x00000000 }, { 0x8000000aul, 0x00000000 }, + { 0x8000808bul, 0x00000000 }, { 0x0000008bul, 0x80000000 }, + { 0x00008089ul, 0x80000000 }, { 0x00008003ul, 0x80000000 }, + { 0x00008002ul, 0x80000000 }, { 0x00000080ul, 0x80000000 }, + { 0x0000800aul, 0x00000000 }, { 0x8000000aul, 0x80000000 }, + { 0x80008081ul, 0x80000000 }, { 0x00008080ul, 0x80000000 }, + { 0x80000001ul, 0x00000000 }, { 0x80008008ul, 0x80000000 } +}; __host__ __forceinline__ -static void blake256_compress1st(uint32_t *h, const uint32_t *block, const uint32_t T0) +static void blake256_compress1st(uint32_t *h, const uint32_t *block) { uint32_t m[16]; - uint32_t v[16]; + uint32_t v[16] = + { + 0x6A09E667, 0xBB67AE85, + 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, + 0x1F83D9AB, 0x5BE0CD19, + 0x243F6A88, 0x85A308D3, + 0x13198A2E, 0x03707344, + 0xA4093A22, 0x299F33D0, + 0x082EFA98, 0xEC4E6C89 + }; for (int i = 0; i < 16; i++) { m[i] = block[i]; } - for (int i = 0; i < 8; i++) - v[i] = h[i]; - - v[8] = c_u256[0]; - v[9] = c_u256[1]; - v[10] = c_u256[2]; - v[11] = c_u256[3]; - - v[12] = c_u256[4] ^ T0; - v[13] = c_u256[5] ^ T0; - v[14] = c_u256[6]; - v[15] = c_u256[7]; - for (int r = 0; r < 14; r++) { /* column step */ hostGS(0, 4, 0x8, 0xC, 0x0); @@ -134,110 +161,528 @@ static void blake256_compress1st(uint32_t *h, const uint32_t *block, const uint3 h[6] ^= v[6] ^ v[14]; h[7] ^= v[7] ^ v[15]; } +#define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a)))) -__device__ __forceinline__ -static void blake256_compress2nd(uint32_t *h, const uint32_t *block, const uint32_t T0) +static void __forceinline__ __device__ keccak_block(uint2 *s) { - uint32_t v[16]; - uint32_t m[16]= - { - block[0], block[1], block[2], block[3], - 0x80000000, 0, 0, 0, - 0, 0, 0, 0, - 0, 1, 0, 640 - }; + uint2 bc[5], tmpxor[5], tmp1, 
tmp2; + // uint2 s[25]; - #pragma unroll 8 - for (int i = 0; i < 8; i++) - v[i] = h[i]; - - v[8] = u256[0]; - v[9] = u256[1]; - v[10] = u256[2]; - v[11] = u256[3]; - v[12] = u256[4] ^ T0; - v[13] = u256[5] ^ T0; - v[14] = u256[6]; - v[15] = u256[7]; - - for (int r = 0; r < 14; r++) { - /* column step */ - GS2(0, 4, 0x8, 0xC, 0x0); - GS2(1, 5, 0x9, 0xD, 0x2); - GS2(2, 6, 0xA, 0xE, 0x4); - GS2(3, 7, 0xB, 0xF, 0x6); - /* diagonal step */ - GS2(0, 5, 0xA, 0xF, 0x8); - GS2(1, 6, 0xB, 0xC, 0xA); - GS2(2, 7, 0x8, 0xD, 0xC); - GS2(3, 4, 0x9, 0xE, 0xE); +#pragma unroll 1 + for (int i = 0; i < 24; i++) + { +#pragma unroll + for (uint32_t x = 0; x < 5; x++) + tmpxor[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20]; + + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = ROL2(s[6] ^ bc[0], 44); + s[6] = ROL2(s[9] ^ bc[3], 20); + s[9] = ROL2(s[22] ^ bc[1], 61); + s[22] = ROL2(s[14] ^ bc[3], 39); + s[14] = ROL2(s[20] ^ bc[4], 18); + s[20] = ROL2(s[2] ^ bc[1], 62); + s[2] = ROL2(s[12] ^ bc[1], 43); + s[12] = ROL2(s[13] ^ bc[2], 25); + s[13] = ROL8(s[19] ^ bc[3]); + s[19] = ROR8(s[23] ^ bc[2]); + s[23] = ROL2(s[15] ^ bc[4], 41); + s[15] = ROL2(s[4] ^ bc[3], 27); + s[4] = ROL2(s[24] ^ bc[3], 14); + s[24] = ROL2(s[21] ^ bc[0], 2); + s[21] = ROL2(s[8] ^ bc[2], 55); + s[8] = ROL2(s[16] ^ bc[0], 45); + s[16] = ROL2(s[5] ^ bc[4], 36); + s[5] = ROL2(s[3] ^ bc[2], 28); + s[3] = ROL2(s[18] ^ bc[2], 21); + s[18] = ROL2(s[17] ^ bc[1], 15); + s[17] = ROL2(s[11] ^ bc[0], 10); + s[11] = ROL2(s[7] ^ bc[1], 6); + s[7] = ROL2(s[10] ^ bc[4], 3); + s[10] = ROL2(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = 
bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0] ^= keccak_round_constants35[i]; } - - h[0] ^= v[0] ^ v[8]; - h[1] ^= v[1] ^ v[9]; - h[2] ^= v[2] ^ v[10]; - h[3] ^= v[3] ^ v[11]; - h[4] ^= v[4] ^ v[12]; - h[5] ^= v[5] ^ v[13]; - h[6] ^= v[6] ^ v[14]; - h[7] ^= v[7] ^ v[15]; } -__global__ __launch_bounds__(256,4) -void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t * Hash) +//__launch_bounds__(256) +__global__ +void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t * Hash) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) +// if (thread < threads) { const uint32_t nonce = startNonce + thread; uint32_t h[8]; - uint32_t input[4]; - +// uint32_t input[4]; + const uint32_t T0 = 640; #pragma unroll 8 for (int i = 0; i<8; i++) { h[i] = cpu_h[i];} - #pragma unroll 3 - for (int i = 0; i < 3; ++i) input[i] = c_data[16 + i]; + uint32_t v[16]; + + const 
uint32_t c_Padding[12] = { + 0x80000000, 0, 0, 0, + 0, 0, 0, 0, + 0, 1, 0, 640 + }; + + const uint32_t u256[16] = + { + 0x243F6A88, 0x85A308D3, + 0x13198A2E, 0x03707344, + 0xA4093822, 0x299F31D0, + 0x082EFA98, 0xEC4E6C89, + 0x452821E6, 0x38D01377, + 0xBE5466CF, 0x34E90C6C, + 0xC0AC29B7, 0xC97C50DD, + 0x3F84D5B5, 0xB5470917 + }; + + uint32_t m[16] = + { + c_data[0], c_data[1], c_data[2], nonce, + c_Padding[0], c_Padding[1], c_Padding[2], c_Padding[3], + c_Padding[4], c_Padding[5], c_Padding[6], c_Padding[7], + c_Padding[8], c_Padding[9], c_Padding[10], c_Padding[11] + }; + +#pragma unroll 8 + for (int i = 0; i < 8; i++) + v[i] = h[i]; + + v[8] = u256[0]; + v[9] = u256[1]; + v[10] = u256[2]; + v[11] = u256[3]; + v[12] = u256[4] ^ T0; + v[13] = u256[5] ^ T0; + v[14] = u256[6]; + v[15] = u256[7]; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + GSPREC(0, 4, 0x8, 0xC, 0, 1); + GSPREC(1, 5, 0x9, 0xD, 2, 3); + GSPREC(2, 6, 0xA, 0xE, 4, 5); + GSPREC(3, 7, 0xB, 0xF, 6, 7); + GSPREC(0, 5, 0xA, 0xF, 8, 9); + GSPREC(1, 6, 0xB, 0xC, 10, 11); + GSPREC(2, 7, 0x8, 0xD, 12, 13); + GSPREC(3, 4, 0x9, 0xE, 14, 15); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + GSPREC(0, 4, 0x8, 0xC, 14, 10); + GSPREC(1, 5, 0x9, 0xD, 4, 8); + GSPREC(2, 6, 0xA, 0xE, 9, 15); + GSPREC(3, 7, 0xB, 0xF, 13, 6); + GSPREC(0, 5, 0xA, 0xF, 1, 12); + GSPREC(1, 6, 0xB, 0xC, 0, 2); + GSPREC(2, 7, 0x8, 0xD, 11, 7); + GSPREC(3, 4, 0x9, 0xE, 5, 3); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + GSPREC(0, 4, 0x8, 0xC, 11, 8); + GSPREC(1, 5, 0x9, 0xD, 12, 0); + GSPREC(2, 6, 0xA, 0xE, 5, 2); + GSPREC(3, 7, 0xB, 0xF, 15, 13); + GSPREC(0, 5, 0xA, 0xF, 10, 14); + GSPREC(1, 6, 0xB, 0xC, 3, 6); + GSPREC(2, 7, 0x8, 0xD, 7, 1); + GSPREC(3, 4, 0x9, 0xE, 9, 4); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + GSPREC(0, 4, 0x8, 0xC, 7, 9); + GSPREC(1, 5, 0x9, 0xD, 3, 1); + GSPREC(2, 6, 0xA, 0xE, 13, 12); + GSPREC(3, 7, 0xB, 0xF, 11, 14); + GSPREC(0, 5, 0xA, 
0xF, 2, 6); + GSPREC(1, 6, 0xB, 0xC, 5, 10); + GSPREC(2, 7, 0x8, 0xD, 4, 0); + GSPREC(3, 4, 0x9, 0xE, 15, 8); + + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + GSPREC(0, 4, 0x8, 0xC, 9, 0); + GSPREC(1, 5, 0x9, 0xD, 5, 7); + GSPREC(2, 6, 0xA, 0xE, 2, 4); + GSPREC(3, 7, 0xB, 0xF, 10, 15); + GSPREC(0, 5, 0xA, 0xF, 14, 1); + GSPREC(1, 6, 0xB, 0xC, 11, 12); + GSPREC(2, 7, 0x8, 0xD, 6, 8); + GSPREC(3, 4, 0x9, 0xE, 3, 13); + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + GSPREC(0, 4, 0x8, 0xC, 2, 12); + GSPREC(1, 5, 0x9, 0xD, 6, 10); + GSPREC(2, 6, 0xA, 0xE, 0, 11); + GSPREC(3, 7, 0xB, 0xF, 8, 3); + GSPREC(0, 5, 0xA, 0xF, 4, 13); + GSPREC(1, 6, 0xB, 0xC, 7, 5); + GSPREC(2, 7, 0x8, 0xD, 15, 14); + GSPREC(3, 4, 0x9, 0xE, 1, 9); + + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + GSPREC(0, 4, 0x8, 0xC, 12, 5); + GSPREC(1, 5, 0x9, 0xD, 1, 15); + GSPREC(2, 6, 0xA, 0xE, 14, 13); + GSPREC(3, 7, 0xB, 0xF, 4, 10); + GSPREC(0, 5, 0xA, 0xF, 0, 7); + GSPREC(1, 6, 0xB, 0xC, 6, 3); + GSPREC(2, 7, 0x8, 0xD, 9, 2); + GSPREC(3, 4, 0x9, 0xE, 8, 11); + + // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + GSPREC(0, 4, 0x8, 0xC, 13, 11); + GSPREC(1, 5, 0x9, 0xD, 7, 14); + GSPREC(2, 6, 0xA, 0xE, 12, 1); + GSPREC(3, 7, 0xB, 0xF, 3, 9); + GSPREC(0, 5, 0xA, 0xF, 5, 0); + GSPREC(1, 6, 0xB, 0xC, 15, 4); + GSPREC(2, 7, 0x8, 0xD, 8, 6); + GSPREC(3, 4, 0x9, 0xE, 2, 10); + // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + GSPREC(0, 4, 0x8, 0xC, 6, 15); + GSPREC(1, 5, 0x9, 0xD, 14, 9); + GSPREC(2, 6, 0xA, 0xE, 11, 3); + GSPREC(3, 7, 0xB, 0xF, 0, 8); + GSPREC(0, 5, 0xA, 0xF, 12, 2); + GSPREC(1, 6, 0xB, 0xC, 13, 7); + GSPREC(2, 7, 0x8, 0xD, 1, 4); + GSPREC(3, 4, 0x9, 0xE, 10, 5); + // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + GSPREC(0, 4, 0x8, 0xC, 10, 2); + GSPREC(1, 5, 0x9, 0xD, 8, 4); + GSPREC(2, 6, 0xA, 0xE, 7, 6); + GSPREC(3, 7, 0xB, 0xF, 1, 5); + GSPREC(0, 5, 0xA, 0xF, 15, 11); + GSPREC(1, 6, 0xB, 0xC, 9, 14); 
+ GSPREC(2, 7, 0x8, 0xD, 3, 12); + GSPREC(3, 4, 0x9, 0xE, 13, 0); + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + GSPREC(0, 4, 0x8, 0xC, 0, 1); + GSPREC(1, 5, 0x9, 0xD, 2, 3); + GSPREC(2, 6, 0xA, 0xE, 4, 5); + GSPREC(3, 7, 0xB, 0xF, 6, 7); + GSPREC(0, 5, 0xA, 0xF, 8, 9); + GSPREC(1, 6, 0xB, 0xC, 10, 11); + GSPREC(2, 7, 0x8, 0xD, 12, 13); + GSPREC(3, 4, 0x9, 0xE, 14, 15); + + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + GSPREC(0, 4, 0x8, 0xC, 14, 10); + GSPREC(1, 5, 0x9, 0xD, 4, 8); + GSPREC(2, 6, 0xA, 0xE, 9, 15); + GSPREC(3, 7, 0xB, 0xF, 13, 6); + GSPREC(0, 5, 0xA, 0xF, 1, 12); + GSPREC(1, 6, 0xB, 0xC, 0, 2); + GSPREC(2, 7, 0x8, 0xD, 11, 7); + GSPREC(3, 4, 0x9, 0xE, 5, 3); + + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + GSPREC(0, 4, 0x8, 0xC, 11, 8); + GSPREC(1, 5, 0x9, 0xD, 12, 0); + GSPREC(2, 6, 0xA, 0xE, 5, 2); + GSPREC(3, 7, 0xB, 0xF, 15, 13); + GSPREC(0, 5, 0xA, 0xF, 10, 14); + GSPREC(1, 6, 0xB, 0xC, 3, 6); + GSPREC(2, 7, 0x8, 0xD, 7, 1); + GSPREC(3, 4, 0x9, 0xE, 9, 4); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + GSPREC(0, 4, 0x8, 0xC, 7, 9); + GSPREC(1, 5, 0x9, 0xD, 3, 1); + GSPREC(2, 6, 0xA, 0xE, 13, 12); + GSPREC(3, 7, 0xB, 0xF, 11, 14); + GSPREC(0, 5, 0xA, 0xF, 2, 6); + GSPREC(1, 6, 0xB, 0xC, 5, 10); + GSPREC(2, 7, 0x8, 0xD, 4, 0); + GSPREC(3, 4, 0x9, 0xE, 15, 8); + + + + + h[0] = cuda_swab32(h[0] ^ v[0] ^ v[8]); + h[1] = cuda_swab32(h[1] ^ v[1] ^ v[9]); + h[2] = cuda_swab32(h[2] ^ v[2] ^ v[10]); + h[3] = cuda_swab32(h[3] ^ v[3] ^ v[11]); + h[4] = cuda_swab32(h[4] ^ v[4] ^ v[12]); + h[5] = cuda_swab32(h[5] ^ v[5] ^ v[13]); + h[6] = cuda_swab32(h[6] ^ v[6] ^ v[14]); + h[7] = cuda_swab32(h[7] ^ v[7] ^ v[15]); + + uint2 keccak_gpu_state[25] = {0}; + keccak_gpu_state[0].x = h[0]; + keccak_gpu_state[0].y = h[1]; + keccak_gpu_state[1].x = h[2]; + keccak_gpu_state[1].y = h[3]; + keccak_gpu_state[2].x = h[4]; + keccak_gpu_state[2].y = h[5]; + keccak_gpu_state[3].x = h[6]; + 
keccak_gpu_state[3].y = h[7]; + keccak_gpu_state[4] = UINT2(1, 0); + + keccak_gpu_state[16] = UINT2(0, 0x80000000); + keccak_block(keccak_gpu_state); + uint64_t *outputHash = (uint64_t *)Hash; +#pragma unroll 4 + for (int i = 0; i<4; i++) + outputHash[i*threads + thread] = devectorize(keccak_gpu_state[i]); + } + - input[3] = nonce; - blake256_compress2nd(h, input, 640); - #pragma unroll - for (int i = 0; i<4; i++) { - Hash[i*threads + thread] = cuda_swab32ll(MAKE_ULONGLONG(h[2 * i], h[2*i+1])); - } +} + + +__global__ __launch_bounds__(256, 4) +void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t * Hash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) + { + const uint32_t nonce = startNonce + thread; + uint32_t h[8]; + // uint32_t input[4]; +#pragma unroll 8 + for (int i = 0; i<8; i++) { h[i] = cpu_h[i]; } + + uint32_t v[16]; + + const uint32_t c_Padding[12] = { + 0x80000000, 0, 0, 0, + 0, 0, 0, 0, + 0, 1, 0, 640 + }; + const uint32_t u256[16] = + { + 0x243F6A88, 0x85A308D3, + 0x13198A2E, 0x03707344, + 0xA4093822, 0x299F31D0, + 0x082EFA98, 0xEC4E6C89, + 0x452821E6, 0x38D01377, + 0xBE5466CF, 0x34E90C6C, + 0xC0AC29B7, 0xC97C50DD, + 0x3F84D5B5, 0xB5470917 + }; + + uint32_t m[16] = + { + c_data[0], c_data[1], c_data[2], nonce, + c_Padding[0], c_Padding[1], c_Padding[2], c_Padding[3], + c_Padding[4], c_Padding[5], c_Padding[6], c_Padding[7], + c_Padding[8], c_Padding[9], c_Padding[10], c_Padding[11] + }; + +#pragma unroll 8 + for (int i = 0; i < 8; i++) + v[i] = h[i]; + + v[8] = u256[0]; + v[9] = u256[1]; + v[10] = u256[2]; + v[11] = u256[3]; + v[12] = u256[4] ^ 640; + v[13] = u256[5] ^ 640; + v[14] = u256[6]; + v[15] = u256[7]; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + GSPREC(0, 4, 0x8, 0xC, 0, 1); + GSPREC(1, 5, 0x9, 0xD, 2, 3); + GSPREC(2, 6, 0xA, 0xE, 4, 5); + GSPREC(3, 7, 0xB, 0xF, 6, 7); + GSPREC(0, 5, 0xA, 0xF, 8, 9); + GSPREC(1, 6, 0xB, 0xC, 10, 11); + GSPREC(2, 7, 
0x8, 0xD, 12, 13); + GSPREC(3, 4, 0x9, 0xE, 14, 15); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + GSPREC(0, 4, 0x8, 0xC, 14, 10); + GSPREC(1, 5, 0x9, 0xD, 4, 8); + GSPREC(2, 6, 0xA, 0xE, 9, 15); + GSPREC(3, 7, 0xB, 0xF, 13, 6); + GSPREC(0, 5, 0xA, 0xF, 1, 12); + GSPREC(1, 6, 0xB, 0xC, 0, 2); + GSPREC(2, 7, 0x8, 0xD, 11, 7); + GSPREC(3, 4, 0x9, 0xE, 5, 3); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + GSPREC(0, 4, 0x8, 0xC, 11, 8); + GSPREC(1, 5, 0x9, 0xD, 12, 0); + GSPREC(2, 6, 0xA, 0xE, 5, 2); + GSPREC(3, 7, 0xB, 0xF, 15, 13); + GSPREC(0, 5, 0xA, 0xF, 10, 14); + GSPREC(1, 6, 0xB, 0xC, 3, 6); + GSPREC(2, 7, 0x8, 0xD, 7, 1); + GSPREC(3, 4, 0x9, 0xE, 9, 4); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + GSPREC(0, 4, 0x8, 0xC, 7, 9); + GSPREC(1, 5, 0x9, 0xD, 3, 1); + GSPREC(2, 6, 0xA, 0xE, 13, 12); + GSPREC(3, 7, 0xB, 0xF, 11, 14); + GSPREC(0, 5, 0xA, 0xF, 2, 6); + GSPREC(1, 6, 0xB, 0xC, 5, 10); + GSPREC(2, 7, 0x8, 0xD, 4, 0); + GSPREC(3, 4, 0x9, 0xE, 15, 8); + + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + GSPREC(0, 4, 0x8, 0xC, 9, 0); + GSPREC(1, 5, 0x9, 0xD, 5, 7); + GSPREC(2, 6, 0xA, 0xE, 2, 4); + GSPREC(3, 7, 0xB, 0xF, 10, 15); + GSPREC(0, 5, 0xA, 0xF, 14, 1); + GSPREC(1, 6, 0xB, 0xC, 11, 12); + GSPREC(2, 7, 0x8, 0xD, 6, 8); + GSPREC(3, 4, 0x9, 0xE, 3, 13); + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + GSPREC(0, 4, 0x8, 0xC, 2, 12); + GSPREC(1, 5, 0x9, 0xD, 6, 10); + GSPREC(2, 6, 0xA, 0xE, 0, 11); + GSPREC(3, 7, 0xB, 0xF, 8, 3); + GSPREC(0, 5, 0xA, 0xF, 4, 13); + GSPREC(1, 6, 0xB, 0xC, 7, 5); + GSPREC(2, 7, 0x8, 0xD, 15, 14); + GSPREC(3, 4, 0x9, 0xE, 1, 9); + + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + GSPREC(0, 4, 0x8, 0xC, 12, 5); + GSPREC(1, 5, 0x9, 0xD, 1, 15); + GSPREC(2, 6, 0xA, 0xE, 14, 13); + GSPREC(3, 7, 0xB, 0xF, 4, 10); + GSPREC(0, 5, 0xA, 0xF, 0, 7); + GSPREC(1, 6, 0xB, 0xC, 6, 3); + GSPREC(2, 7, 0x8, 0xD, 9, 2); + GSPREC(3, 4, 0x9, 0xE, 
8, 11); + + // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + GSPREC(0, 4, 0x8, 0xC, 13, 11); + GSPREC(1, 5, 0x9, 0xD, 7, 14); + GSPREC(2, 6, 0xA, 0xE, 12, 1); + GSPREC(3, 7, 0xB, 0xF, 3, 9); + GSPREC(0, 5, 0xA, 0xF, 5, 0); + GSPREC(1, 6, 0xB, 0xC, 15, 4); + GSPREC(2, 7, 0x8, 0xD, 8, 6); + GSPREC(3, 4, 0x9, 0xE, 2, 10); + // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + GSPREC(0, 4, 0x8, 0xC, 6, 15); + GSPREC(1, 5, 0x9, 0xD, 14, 9); + GSPREC(2, 6, 0xA, 0xE, 11, 3); + GSPREC(3, 7, 0xB, 0xF, 0, 8); + GSPREC(0, 5, 0xA, 0xF, 12, 2); + GSPREC(1, 6, 0xB, 0xC, 13, 7); + GSPREC(2, 7, 0x8, 0xD, 1, 4); + GSPREC(3, 4, 0x9, 0xE, 10, 5); + // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + GSPREC(0, 4, 0x8, 0xC, 10, 2); + GSPREC(1, 5, 0x9, 0xD, 8, 4); + GSPREC(2, 6, 0xA, 0xE, 7, 6); + GSPREC(3, 7, 0xB, 0xF, 1, 5); + GSPREC(0, 5, 0xA, 0xF, 15, 11); + GSPREC(1, 6, 0xB, 0xC, 9, 14); + GSPREC(2, 7, 0x8, 0xD, 3, 12); + GSPREC(3, 4, 0x9, 0xE, 13, 0); + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + GSPREC(0, 4, 0x8, 0xC, 0, 1); + GSPREC(1, 5, 0x9, 0xD, 2, 3); + GSPREC(2, 6, 0xA, 0xE, 4, 5); + GSPREC(3, 7, 0xB, 0xF, 6, 7); + GSPREC(0, 5, 0xA, 0xF, 8, 9); + GSPREC(1, 6, 0xB, 0xC, 10, 11); + GSPREC(2, 7, 0x8, 0xD, 12, 13); + GSPREC(3, 4, 0x9, 0xE, 14, 15); + + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + GSPREC(0, 4, 0x8, 0xC, 14, 10); + GSPREC(1, 5, 0x9, 0xD, 4, 8); + GSPREC(2, 6, 0xA, 0xE, 9, 15); + GSPREC(3, 7, 0xB, 0xF, 13, 6); + GSPREC(0, 5, 0xA, 0xF, 1, 12); + GSPREC(1, 6, 0xB, 0xC, 0, 2); + GSPREC(2, 7, 0x8, 0xD, 11, 7); + GSPREC(3, 4, 0x9, 0xE, 5, 3); + + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + GSPREC(0, 4, 0x8, 0xC, 11, 8); + GSPREC(1, 5, 0x9, 0xD, 12, 0); + GSPREC(2, 6, 0xA, 0xE, 5, 2); + GSPREC(3, 7, 0xB, 0xF, 15, 13); + GSPREC(0, 5, 0xA, 0xF, 10, 14); + GSPREC(1, 6, 0xB, 0xC, 3, 6); + GSPREC(2, 7, 0x8, 0xD, 7, 1); + GSPREC(3, 4, 0x9, 0xE, 9, 4); + // { 7, 9, 3, 1, 13, 12, 11, 14, 
2, 6, 5, 10, 4, 0, 15, 8 }, + GSPREC(0, 4, 0x8, 0xC, 7, 9); + GSPREC(1, 5, 0x9, 0xD, 3, 1); + GSPREC(2, 6, 0xA, 0xE, 13, 12); + GSPREC(3, 7, 0xB, 0xF, 11, 14); + GSPREC(0, 5, 0xA, 0xF, 2, 6); + GSPREC(1, 6, 0xB, 0xC, 5, 10); + GSPREC(2, 7, 0x8, 0xD, 4, 0); + GSPREC(3, 4, 0x9, 0xE, 15, 8); + + h[0] = cuda_swab32(h[0] ^ v[0] ^ v[8]); + h[1] = cuda_swab32(h[1] ^ v[1] ^ v[9]); + h[2] = cuda_swab32(h[2] ^ v[2] ^ v[10]); + h[3] = cuda_swab32(h[3] ^ v[3] ^ v[11]); + h[4] = cuda_swab32(h[4] ^ v[4] ^ v[12]); + h[5] = cuda_swab32(h[5] ^ v[5] ^ v[13]); + h[6] = cuda_swab32(h[6] ^ v[6] ^ v[14]); + h[7] = cuda_swab32(h[7] ^ v[7] ^ v[15]); + + Hash[((0 * threads) + thread)*2] = (h[0]); + Hash[((0 * threads) + thread) * 2 + 1] = (h[1]); + Hash[((1 * threads) + thread) * 2] = (h[2]); + Hash[((1 * threads) + thread) * 2 + 1] = (h[3]); + Hash[((2 * threads) + thread) * 2] = (h[4]); + Hash[((2 * threads) + thread) * 2 + 1] = (h[5]); + Hash[((3 * threads) + thread) * 2] = (h[6]); + Hash[((3 * threads) + thread) * 2 + 1] = (h[7]); } } __host__ -void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order) +void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash) { const uint32_t threadsperblock = 64; dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - blake256_gpu_hash_80 <<>> (threads, startNonce, Hash); + blake256_gpu_hash_80 <<>> (threads, startNonce, (uint32_t*)Hash); + CUDA_SAFE_CALL(cudaGetLastError()); } __host__ -void blake256_cpu_setBlock_80(uint32_t *pdata) +void blake256_cpu_setBlock_80(int thr_id, uint32_t *pdata) { - uint32_t h[8]; - uint32_t data[20]; - memcpy(data, pdata, 80); - for (int i = 0; i<8; i++) { - h[i] = c_IV256[i]; - } - blake256_compress1st(h, pdata, 512); + uint32_t h[8] = + { + 0x6A09E667, 0xBB67AE85, + 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, + 0x1F83D9AB, 0x5BE0CD19 + }; + + 
CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(c_data, pdata + 16, 3 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); - cudaMemcpyToSymbol(cpu_h, h, sizeof(h), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(c_data, data, sizeof(data), 0, cudaMemcpyHostToDevice); + blake256_compress1st(h, pdata); + + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(cpu_h, h, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); } __host__ -void blake256_cpu_init(int thr_id, uint32_t threads) +void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash) { - cudaMemcpyToSymbol(u256, c_u256, sizeof(c_u256), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(sigma, c_sigma, sizeof(c_sigma), 0, cudaMemcpyHostToDevice); -} + const uint32_t threadsperblock = 256; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + blakeKeccak256_gpu_hash_80 << > > (threads, startNonce, (uint32_t *)Hash); + CUDA_SAFE_CALL(cudaGetLastError()); +} \ No newline at end of file diff --git a/Algo256/cuda_bmw256.cu b/Algo256/cuda_bmw256.cu new file mode 100644 index 0000000000..1982df8ee9 --- /dev/null +++ b/Algo256/cuda_bmw256.cu @@ -0,0 +1,328 @@ +#include +#include + +#include "cuda_helper.h" + +static uint32_t *h_nonce[MAX_GPUS]; +static uint32_t *d_nonce[MAX_GPUS]; + +#define shl(x, n) ((x) << (n)) +#define shr(x, n) ((x) >> (n)) +//#define SHR(x, n) SHR2(x, n) +//#define SHL(x, n) SHL2(x, n) + +#undef SPH_ROTL32 +#define SPH_ROTL32 ROTL32 + + +#define ROTL32host(x, n) ROTL32(x,n) +// #define SPH_ROTL32 SPH_ROTL32 +#define ss0(x) (shr((x), 1) ^ shl((x), 3) ^ SPH_ROTL32((x), 4) ^ SPH_ROTL32((x), 19)) +#define ss1(x) (shr((x), 1) ^ shl((x), 2) ^ __byte_perm(x,0,0x2103) ^ SPH_ROTL32((x), 23)) +#define ss2(x) (shr((x), 2) ^ shl((x), 1) ^ SPH_ROTL32((x), 12) ^ SPH_ROTL32((x), 25)) +#define ss3(x) (shr((x), 2) ^ shl((x), 2) ^ SPH_ROTL32((x), 15) ^ SPH_ROTL32((x), 29)) +#define ss4(x) (shr((x), 
1) ^ (x)) +#define ss5(x) (shr((x), 2) ^ (x)) +#define rs1(x) SPH_ROTL32((x), 3) +#define rs2(x) SPH_ROTL32((x), 7) +#define rs3(x) SPH_ROTL32((x), 13) +#define rs4(x) __byte_perm(x,0,0x1032) +#define rs5(x) SPH_ROTL32((x), 19) +#define rs6(x) SPH_ROTL32((x), 23) +#define rs7(x) SPH_ROTL32((x), 27) + + +/* Message expansion function 1 */ +__forceinline__ __device__ uint32_t expand32_1(int i, const uint32_t *message, const uint32_t *H, const uint32_t *Q) +{ + return (ss1(Q[i - 16]) + ss2(Q[i - 15]) + ss3(Q[i - 14]) + ss0(Q[i - 13]) + + ss1(Q[i - 12]) + ss2(Q[i - 11]) + ss3(Q[i - 10]) + ss0(Q[i - 9]) + + ss1(Q[i - 8]) + ss2(Q[i - 7]) + ss3(Q[i - 6]) + ss0(Q[i - 5]) + + ss1(Q[i - 4]) + ss2(Q[i - 3]) + ss3(Q[i - 2]) + ss0(Q[i - 1]) + + ((i*(0x05555555ul) + SPH_ROTL32(message[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(message[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(message[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16])); +} + +/* Message expansion function 2 */ +__forceinline__ __device__ uint32_t expand32_2(const int i, const uint32_t *message, const uint32_t *H, const uint32_t *Q) +{ + return ( + rs2(Q[i - 13]) + rs3(Q[i - 11]) + rs4(Q[i - 9]) + rs1(Q[i - 15]) + + +rs5(Q[i - 7]) + rs6(Q[i - 5]) + rs7(Q[i - 3]) + ss4(Q[i - 2]) + ss5(Q[i - 1])); +} + +#define TPB 512 +__global__ __launch_bounds__(TPB, 2) +void bmw256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash, uint32_t *const __restrict__ nonceVector, uint32_t Target) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + // if (thread < threads) + { + uint32_t backup = Target; + uint32_t message[16] = {0}; + + ((uint2*)message)[0] = __ldg(&g_hash[thread]); + ((uint2*)message)[1] = __ldg(&g_hash[thread + 1 * threads]); + ((uint2*)message)[2] = __ldg(&g_hash[thread + 2 * threads]); + ((uint2*)message)[3] = __ldg(&g_hash[thread + 3 * threads]); + + + const uint32_t h1[16] = { + (0x40414243), (0x44454647), + (0x48494A4B), (0x4C4D4E4F), + (0x50515253), 
(0x54555657), + (0x58595A5B), (0x5C5D5E5F), + (0x60616263), (0x64656667), + (0x68696A6B), (0x6C6D6E6F), + (0x70717273), (0x74757677), + (0x78797A7B), (0x7C7D7E7F) + }; + + message[8] = 0x80; + message[14] = 0x100; + + uint32_t XL32, XH32, Q[32]; + + Q[0] = (message[5] ^ h1[5]) - (message[7] ^ h1[7]) + (message[10] ^ h1[10]) + (message[13] ^ h1[13]) + (message[14] ^ h1[14]); + Q[1] = (message[6] ^ h1[6]) - (message[8] ^ h1[8]) + (message[11] ^ h1[11]) + (message[14] ^ h1[14]) - (message[15] ^ h1[15]); + Q[2] = (message[0] ^ h1[0]) + (message[7] ^ h1[7]) + (message[9] ^ h1[9]) - (message[12] ^ h1[12]) + (message[15] ^ h1[15]); + Q[3] = (message[0] ^ h1[0]) - (message[1] ^ h1[1]) + (message[8] ^ h1[8]) - (message[10] ^ h1[10]) + (message[13] ^ h1[13]); + Q[4] = (message[1] ^ h1[1]) + (message[2] ^ h1[2]) + (message[9] ^ h1[9]) - (message[11] ^ h1[11]) - (message[14] ^ h1[14]); + Q[5] = (message[3] ^ h1[3]) - (message[2] ^ h1[2]) + (message[10] ^ h1[10]) - (message[12] ^ h1[12]) + (message[15] ^ h1[15]); + Q[6] = (message[4] ^ h1[4]) - (message[0] ^ h1[0]) - (message[3] ^ h1[3]) - (message[11] ^ h1[11]) + (message[13] ^ h1[13]); + Q[7] = (message[1] ^ h1[1]) - (message[4] ^ h1[4]) - (message[5] ^ h1[5]) - (message[12] ^ h1[12]) - (message[14] ^ h1[14]); + Q[8] = (message[2] ^ h1[2]) - (message[5] ^ h1[5]) - (message[6] ^ h1[6]) + (message[13] ^ h1[13]) - (message[15] ^ h1[15]); + Q[9] = (message[0] ^ h1[0]) - (message[3] ^ h1[3]) + (message[6] ^ h1[6]) - (message[7] ^ h1[7]) + (message[14] ^ h1[14]); + Q[10] = (message[8] ^ h1[8]) - (message[1] ^ h1[1]) - (message[4] ^ h1[4]) - (message[7] ^ h1[7]) + (message[15] ^ h1[15]); + Q[11] = (message[8] ^ h1[8]) - (message[0] ^ h1[0]) - (message[2] ^ h1[2]) - (message[5] ^ h1[5]) + (message[9] ^ h1[9]); + Q[12] = (message[1] ^ h1[1]) + (message[3] ^ h1[3]) - (message[6] ^ h1[6]) - (message[9] ^ h1[9]) + (message[10] ^ h1[10]); + Q[13] = (message[2] ^ h1[2]) + (message[4] ^ h1[4]) + (message[7] ^ h1[7]) + (message[10] ^ h1[10]) 
+ (message[11] ^ h1[11]); + Q[14] = (message[3] ^ h1[3]) - (message[5] ^ h1[5]) + (message[8] ^ h1[8]) - (message[11] ^ h1[11]) - (message[12] ^ h1[12]); + Q[15] = (message[12] ^ h1[12]) - (message[4] ^ h1[4]) - (message[6] ^ h1[6]) - (message[9] ^ h1[9]) + (message[13] ^ h1[13]); + + Q[0] = ss0(Q[0]) + h1[1]; + Q[1] = ss1(Q[1]) + h1[2]; + Q[2] = ss2(Q[2]) + h1[3]; + Q[3] = ss3(Q[3]) + h1[4]; + Q[4] = ss4(Q[4]) + h1[5]; + Q[5] = ss0(Q[5]) + h1[6]; + Q[6] = ss1(Q[6]) + h1[7]; + Q[7] = ss2(Q[7]) + h1[8]; + Q[8] = ss3(Q[8]) + h1[9]; + Q[9] = ss4(Q[9]) + h1[10]; + Q[10] = ss0(Q[10]) + h1[11]; + Q[11] = ss1(Q[11]) + h1[12]; + Q[12] = ss2(Q[12]) + h1[13]; + Q[13] = ss3(Q[13]) + h1[14]; + Q[14] = ss4(Q[14]) + h1[15]; + Q[15] = ss0(Q[15]) + h1[0]; + + Q[16] = ss1(Q[16 - 16]) + ss2(Q[16 - 15]) + ss3(Q[16 - 14]) + ss0(Q[16 - 13]) + + ss1(Q[16 - 12]) + ss2(Q[16 - 11]) + ss3(Q[16 - 10]) + ss0(Q[16 - 9]) + + ss1(Q[16 - 8]) + ss2(Q[16 - 7]) + ss3(Q[16 - 6]) + ss0(Q[16 - 5]) + + ss1(Q[16 - 4]) + ss2(Q[16 - 3]) + ss3(Q[16 - 2]) + ss0(Q[16 - 1]) + + ((16 * (0x05555555ul) + SPH_ROTL32(message[0], ((16 - 16) % 16) + 1) + SPH_ROTL32(message[3], ((16 - 13) % 16) + 1)) ^ h1[(16 - 16 + 7) % 16]); + + Q[17] = ss1(Q[17 - 16]) + ss2(Q[17 - 15]) + ss3(Q[17 - 14]) + ss0(Q[17 - 13]) + + ss1(Q[17 - 12]) + ss2(Q[17 - 11]) + ss3(Q[17 - 10]) + ss0(Q[17 - 9]) + + ss1(Q[17 - 8]) + ss2(Q[17 - 7]) + ss3(Q[17 - 6]) + ss0(Q[17 - 5]) + + ss1(Q[17 - 4]) + ss2(Q[17 - 3]) + ss3(Q[17 - 2]) + ss0(Q[17 - 1]) + + ((17 * (0x05555555ul) + SPH_ROTL32(message[(17 - 16) % 16], ((17 - 16) % 16) + 1) + SPH_ROTL32(message[(17 - 13) % 16], ((17 - 13) % 16) + 1)) ^ h1[(17 - 16 + 7) % 16]); + + uint32_t precalc = Q[18 - 16] + Q[18 - 14] + Q[18 - 12] + Q[18 - 10] + Q[18 - 8] + Q[18 - 6]; //+ Q[18 - 4] + uint32_t precalc2 = Q[19 - 16] + Q[19 - 14] + Q[19 - 12] + Q[19 - 10] + Q[19 - 8] + Q[19 - 6];//+ Q[19 - 4] + + precalc = precalc + Q[18 - 4]; + precalc2 = precalc2 + Q[18 + 1 - 4]; + uint32_t p1 = ((18 * (0x05555555ul) + 
SPH_ROTL32(message[2], ((18 - 16) % 16) + 1) + SPH_ROTL32(message[5], ((18 - 13) % 16) + 1)) ^ h1[(18 - 16 + 7) % 16]); + uint32_t p2 = (((18 + 1)*(0x05555555ul) + SPH_ROTL32(message[3], (((18 + 1) - 16) % 16) + 1) + SPH_ROTL32(message[6], (((18 + 1) - 13) % 16) + 1)) ^ h1[((18 + 1) - 16 + 7) % 16]); + Q[18] = precalc + expand32_2(18, message, h1, Q) + p1; + Q[18 + 1] = precalc2 + expand32_2(18 + 1, message, h1, Q) + p2; + precalc = precalc - Q[18 - 16]; + precalc2 = precalc2 - Q[18 + 1 - 16]; + + precalc = precalc + Q[20 - 4]; + precalc2 = precalc2 + Q[20 + 1 - 4]; + p1 = ((20 * (0x05555555ul) + SPH_ROTL32(message[4], ((20 - 16) % 16) + 1) + SPH_ROTL32(message[7], ((20 - 13) % 16) + 1) - (0x100 << 15)) ^ h1[(20 - 16 + 7) % 16]); + p2 = (((20 + 1)*(0x05555555ul) + SPH_ROTL32(message[5], (((20 + 1) - 16) % 16) + 1) + (0x80 << 9)) ^ h1[((20 + 1) - 16 + 7) % 16]); + Q[20] = precalc + expand32_2(20, message, h1, Q) + p1; + Q[20 + 1] = precalc2 + expand32_2(20 + 1, message, h1, Q) + p2; + precalc = precalc - Q[20 - 16]; + precalc2 = precalc2 - Q[20 + 1 - 16]; + + precalc = precalc + Q[22 - 4]; + precalc2 = precalc2 + Q[22 + 1 - 4]; + p1 = ((22 * (0x05555555ul) + SPH_ROTL32(message[6], ((22 - 16) % 16) + 1) - SPH_ROTL32(message[0], ((22 - 6) % 16) + 1)) ^ h1[(22 - 16 + 7) % 16]); + p2 = (((22 + 1)*(0x05555555ul) + SPH_ROTL32(message[7], (((22 + 1) - 16) % 16) + 1) - SPH_ROTL32(message[1], (((22 + 1) - 6) % 16) + 1)) ^ h1[((22 + 1) - 16 + 7) % 16]); + Q[22] = precalc + expand32_2(22, message, h1, Q) + p1; + Q[22 + 1] = precalc2 + expand32_2(22 + 1, message, h1, Q) + p2; + precalc = precalc - Q[22 - 16]; + precalc2 = precalc2 - Q[22 + 1 - 16]; + + precalc = precalc + Q[24 - 4]; + precalc2 = precalc2 + Q[24 + 1 - 4]; + p1 = ((24 * (0x05555555ul) + (0x80 << 9) - SPH_ROTL32(message[2], ((24 - 6) % 16) + 1)) ^ h1[(24 - 16 + 7) % 16]); + p2 = (((24 + 1)*(0x05555555ul) - SPH_ROTL32(message[3], (((24 + 1) - 6) % 16) + 1)) ^ h1[((24 + 1) - 16 + 7) % 16]); + Q[24] = precalc + 
expand32_2(24, message, h1, Q) + p1; + Q[24 + 1] = precalc2 + expand32_2(24 + 1, message, h1, Q) + p2; + precalc = precalc - Q[24 - 16]; + precalc2 = precalc2 - Q[24 + 1 - 16]; + + precalc = precalc + Q[26 - 4]; + precalc2 = precalc2 + Q[26 + 1 - 4]; + p1 = ((26 * (0x05555555ul) - SPH_ROTL32(message[4], ((26 - 6) % 16) + 1)) ^ h1[(26 - 16 + 7) % 16]); + p2 = (((26 + 1)*(0x05555555ul) + (0x100 << 15) - SPH_ROTL32(message[5], (((26 + 1) - 6) % 16) + 1)) ^ h1[((26 + 1) - 16 + 7) % 16]); + Q[26] = precalc + expand32_2(26, message, h1, Q) + p1; + Q[26 + 1] = precalc2 + expand32_2(26 + 1, message, h1, Q) + p2; + precalc = precalc - Q[26 - 16]; + precalc2 = precalc2 - Q[26 + 1 - 16]; + + precalc = precalc + Q[28 - 4]; + precalc2 = precalc2 + Q[28 + 1 - 4]; + p1 = ((28 * (0x05555555ul) - SPH_ROTL32(message[6], ((28 - 6) % 16) + 1)) ^ h1[(28 - 16 + 7) % 16]); + p2 = (((28 + 1)*(0x05555555ul) + SPH_ROTL32(message[0], (((28 + 1) - 13) % 16) + 1) - SPH_ROTL32(message[7], (((28 + 1) - 6) % 16) + 1)) ^ h1[((28 + 1) - 16 + 7) % 16]); + Q[28] = precalc + expand32_2(28, message, h1, Q) + p1; + Q[28 + 1] = precalc2 + expand32_2(28 + 1, message, h1, Q) + p2; + precalc = precalc - Q[28 - 16]; + precalc2 = precalc2 - Q[28 + 1 - 16]; + + precalc = precalc + Q[30 - 4]; + precalc2 = precalc2 + Q[30 + 1 - 4]; + p1 = ((30 * (0x05555555ul) + (0x100 << 15) + SPH_ROTL32(message[1], ((30 - 13) % 16) + 1) - (0x80 << 9)) ^ h1[(30 - 16 + 7) % 16]); + p2 = (((30 + 1)*(0x05555555ul) + SPH_ROTL32(message[2], (((30 + 1) - 13) % 16) + 1)) ^ h1[((30 + 1) - 16 + 7) % 16]); + Q[30] = precalc + expand32_2(30, message, h1, Q) + p1; + Q[30 + 1] = precalc2 + expand32_2(30 + 1, message, h1, Q) + p2; + precalc = precalc - Q[30 - 16]; + precalc2 = precalc2 - Q[30 + 1 - 16]; + + XL32 = Q[16] ^ Q[17] ^ Q[18] ^ Q[19] ^ Q[20] ^ Q[21] ^ Q[22] ^ Q[23]; + XH32 = XL32^Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31]; + + message[0] = (shl(XH32, 5) ^ shr(Q[16], 5) ^ message[0]) + (XL32 ^ Q[24] ^ Q[0]); + 
message[1] = (shr(XH32, 7) ^ shl(Q[17], 8) ^ message[1]) + (XL32 ^ Q[25] ^ Q[1]); + message[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ message[2]) + (XL32 ^ Q[26] ^ Q[2]); + message[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ message[3]) + (XL32 ^ Q[27] ^ Q[3]); + message[4] = (shr(XH32, 3) ^ Q[20] ^ message[4]) + (XL32 ^ Q[28] ^ Q[4]); + message[5] = (shl(XH32, 6) ^ shr(Q[21], 6) ^ message[5]) + (XL32 ^ Q[29] ^ Q[5]); + message[6] = (shr(XH32, 4) ^ shl(Q[22], 6) ^ message[6]) + (XL32 ^ Q[30] ^ Q[6]); + message[7] = (shr(XH32, 11) ^ shl(Q[23], 2) ^ message[7]) + (XL32 ^ Q[31] ^ Q[7]); + + message[8] = SPH_ROTL32(message[4], 9) + (XH32 ^ Q[24] ^ message[8]) + (shl(XL32, 8) ^ Q[23] ^ Q[8]); + message[9] = SPH_ROTL32(message[5], 10) + (XH32 ^ Q[25] ^ message[9]) + (shr(XL32, 6) ^ Q[16] ^ Q[9]); + message[10] = SPH_ROTL32(message[6], 11) + (XH32 ^ Q[26] ^ message[10]) + (shl(XL32, 6) ^ Q[17] ^ Q[10]); + message[11] = SPH_ROTL32(message[7], 12) + (XH32 ^ Q[27] ^ message[11]) + (shl(XL32, 4) ^ Q[18] ^ Q[11]); + message[12] = SPH_ROTL32(message[0], 13) + (XH32 ^ Q[28] ^ message[12]) + (shr(XL32, 3) ^ Q[19] ^ Q[12]); + message[13] = SPH_ROTL32(message[1], 14) + (XH32 ^ Q[29] ^ message[13]) + (shr(XL32, 4) ^ Q[20] ^ Q[13]); + message[14] = SPH_ROTL32(message[2], 15) + (XH32 ^ Q[30] ^ message[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]); + message[15] = SPH_ROTL32(message[3], 16) + (XH32 ^ Q[31] ^ message[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]); + + const uint32_t h2[16] = { + (0xaaaaaaa0), (0xaaaaaaa1), (0xaaaaaaa2), + (0xaaaaaaa3), (0xaaaaaaa4), (0xaaaaaaa5), + (0xaaaaaaa6), (0xaaaaaaa7), (0xaaaaaaa8), + (0xaaaaaaa9), (0xaaaaaaaa), (0xaaaaaaab), + (0xaaaaaaac), (0xaaaaaaad), (0xaaaaaaae), + (0xaaaaaaaf) + }; + + Q[0] = (message[5] ^ h2[5]) - (message[7] ^ h2[7]) + (message[10] ^ h2[10]) + (message[13] ^ h2[13]) + (message[14] ^ h2[14]); + Q[1] = (message[6] ^ h2[6]) - (message[8] ^ h2[8]) + (message[11] ^ h2[11]) + (message[14] ^ h2[14]) - (message[15] ^ h2[15]); + Q[2] = (message[0] ^ h2[0]) + 
(message[7] ^ h2[7]) + (message[9] ^ h2[9]) - (message[12] ^ h2[12]) + (message[15] ^ h2[15]); + Q[3] = (message[0] ^ h2[0]) - (message[1] ^ h2[1]) + (message[8] ^ h2[8]) - (message[10] ^ h2[10]) + (message[13] ^ h2[13]); + Q[4] = (message[1] ^ h2[1]) + (message[2] ^ h2[2]) + (message[9] ^ h2[9]) - (message[11] ^ h2[11]) - (message[14] ^ h2[14]); + Q[5] = (message[3] ^ h2[3]) - (message[2] ^ h2[2]) + (message[10] ^ h2[10]) - (message[12] ^ h2[12]) + (message[15] ^ h2[15]); + Q[6] = (message[4] ^ h2[4]) - (message[0] ^ h2[0]) - (message[3] ^ h2[3]) - (message[11] ^ h2[11]) + (message[13] ^ h2[13]); + Q[7] = (message[1] ^ h2[1]) - (message[4] ^ h2[4]) - (message[5] ^ h2[5]) - (message[12] ^ h2[12]) - (message[14] ^ h2[14]); + Q[8] = (message[2] ^ h2[2]) - (message[5] ^ h2[5]) - (message[6] ^ h2[6]) + (message[13] ^ h2[13]) - (message[15] ^ h2[15]); + Q[9] = (message[0] ^ h2[0]) - (message[3] ^ h2[3]) + (message[6] ^ h2[6]) - (message[7] ^ h2[7]) + (message[14] ^ h2[14]); + Q[10] = (message[8] ^ h2[8]) - (message[1] ^ h2[1]) - (message[4] ^ h2[4]) - (message[7] ^ h2[7]) + (message[15] ^ h2[15]); + Q[11] = (message[8] ^ h2[8]) - (message[0] ^ h2[0]) - (message[2] ^ h2[2]) - (message[5] ^ h2[5]) + (message[9] ^ h2[9]); + Q[12] = (message[1] ^ h2[1]) + (message[3] ^ h2[3]) - (message[6] ^ h2[6]) - (message[9] ^ h2[9]) + (message[10] ^ h2[10]); + Q[13] = (message[2] ^ h2[2]) + (message[4] ^ h2[4]) + (message[7] ^ h2[7]) + (message[10] ^ h2[10]) + (message[11] ^ h2[11]); + Q[14] = (message[3] ^ h2[3]) - (message[5] ^ h2[5]) + (message[8] ^ h2[8]) - (message[11] ^ h2[11]) - (message[12] ^ h2[12]); + Q[15] = (message[12] ^ h2[12]) - (message[4] ^ h2[4]) - (message[6] ^ h2[6]) - (message[9] ^ h2[9]) + (message[13] ^ h2[13]); + + Q[0] = ss0(Q[0]) + h2[1]; + Q[1] = ss1(Q[1]) + h2[2]; + Q[2] = ss2(Q[2]) + h2[3]; + Q[3] = ss3(Q[3]) + h2[4]; + Q[4] = ss4(Q[4]) + h2[5]; + Q[5] = ss0(Q[5]) + h2[6]; + Q[6] = ss1(Q[6]) + h2[7]; + Q[7] = ss2(Q[7]) + h2[8]; + Q[8] = ss3(Q[8]) + h2[9]; + 
Q[9] = ss4(Q[9]) + h2[10]; + Q[10] = ss0(Q[10]) + h2[11]; + Q[11] = ss1(Q[11]) + h2[12]; + Q[12] = ss2(Q[12]) + h2[13]; + Q[13] = ss3(Q[13]) + h2[14]; + Q[14] = ss4(Q[14]) + h2[15]; + Q[15] = ss0(Q[15]) + h2[0]; + +#pragma unroll + for(int i = 0; i<2; i++) + Q[i + 16] = expand32_1(i + 16, message, h2, Q); + + precalc = Q[18 - 16] + Q[18 - 14] + Q[18 - 12] + Q[18 - 10] + Q[18 - 8] + Q[18 - 6]; + precalc2 = Q[19 - 16] + Q[19 - 14] + Q[19 - 12] + Q[19 - 10] + Q[19 - 8] + Q[19 - 6]; + +#pragma unroll + for(int i = 2 + 16; i < 16 + 16; i += 2) + { + precalc = precalc + Q[i - 4]; + precalc2 = precalc2 + Q[i + 1 - 4]; + p1 = ((i*(0x05555555ul) + SPH_ROTL32(message[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(message[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(message[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ h2[(i - 16 + 7) % 16]); + p2 = (((i + 1)*(0x05555555ul) + SPH_ROTL32(message[((i + 1) - 16) % 16], (((i + 1) - 16) % 16) + 1) + SPH_ROTL32(message[((i + 1) - 13) % 16], (((i + 1) - 13) % 16) + 1) - SPH_ROTL32(message[((i + 1) - 6) % 16], (((i + 1) - 6) % 16) + 1)) ^ h2[((i + 1) - 16 + 7) % 16]); + Q[i] = precalc + expand32_2(i, message, h2, Q) + p1; + Q[i + 1] = precalc2 + expand32_2(i + 1, message, h2, Q) + p2; + precalc = precalc - Q[i - 16]; + precalc2 = precalc2 - Q[i + 1 - 16]; + } + + XL32 = Q[16] ^ Q[17] ^ Q[18] ^ Q[19] ^ Q[20] ^ Q[21] ^ Q[22] ^ Q[23]; + XH32 = XL32^Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31]; + + message[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ message[3]) + (XL32 ^ Q[27] ^ Q[3]); + message[15] = SPH_ROTL32(message[3], 16) + (XH32 ^ Q[31] ^ message[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]); + + if(message[15] <= backup) + { + + uint32_t tmp = atomicExch(nonceVector, startNounce + thread); + if(tmp != 0) + nonceVector[1] = tmp; + } + } +} + + +__host__ +void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces, uint32_t Target) +{ + 
CUDA_SAFE_CALL(cudaMemsetAsync(d_nonce[thr_id], 0x0, 2 * sizeof(uint32_t), gpustream[thr_id])); + + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + TPB - 1) / TPB); + dim3 block(TPB); + + bmw256_gpu_hash_32 << > >(threads, startNounce, (uint2 *)g_hash, d_nonce[thr_id], Target); + CUDA_SAFE_CALL(cudaGetLastError()); + CUDA_SAFE_CALL(cudaMemcpy(h_nonce[thr_id], d_nonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + resultnonces[0] = *(h_nonce[thr_id]); + resultnonces[1] = *(h_nonce[thr_id] + 1); +} + + +__host__ +void bmw256_cpu_init(int thr_id) +{ + CUDA_SAFE_CALL(cudaMalloc(&d_nonce[thr_id], 2 * sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMallocHost(&h_nonce[thr_id], 2 * sizeof(uint32_t))); +} + +/* +__host__ +void bmw256_setTarget(int thr_id, const void *pTargetIn) +{ +cudaMemcpyToSymbolAsync(pTarget, pTargetIn, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id]); +} +*/ \ No newline at end of file diff --git a/Algo256/cuda_cubehash256.cu b/Algo256/cuda_cubehash256.cu new file mode 100644 index 0000000000..8071d812c3 --- /dev/null +++ b/Algo256/cuda_cubehash256.cu @@ -0,0 +1,471 @@ +#include "cuda_helper.h" + +#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */ +#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */ +#define TPB 1024 + +#define ROTATEUPWARDS7(a) ROTL32(a,7) +#define ROTATEUPWARDS11(a) ROTL32(a,11) + +//#define SWAP(a,b) { uint32_t u = a; a = b; b = u; } +#define SWAP(a,b) { a ^= b; b ^=a; a ^=b;} +__device__ __forceinline__ void rrounds(uint32_t x[2][2][2][2][2]) +{ + int r; + int j; + int k; + int l; + int m; + + #pragma unroll 2 + for (r = 0; r < CUBEHASH_ROUNDS; ++r) { + + /* "add x_0jklm into x_1jklmn modulo 2^32" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[1][j][k][l][m] += x[0][j][k][l][m]; + + /* "rotate x_0jklm upwards by 7 bits" */ 
+#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]); + + /* "swap x_00klm with x_01klm" */ +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[0][0][k][l][m], x[0][1][k][l][m]) + + /* "xor x_1jklm into x_0jklm" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] ^= x[1][j][k][l][m]; + + /* "swap x_1jk0m with x_1jk1m" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[1][j][k][0][m], x[1][j][k][1][m]) + + /* "add x_0jklm into x_1jklm modulo 2^32" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[1][j][k][l][m] += x[0][j][k][l][m]; + + /* "rotate x_0jklm upwards by 11 bits" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]); + + /* "swap x_0j0lm with x_0j1lm" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[0][j][0][l][m], x[0][j][1][l][m]) + + /* "xor x_1jklm into x_0jklm" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] ^= x[1][j][k][l][m]; + + /* "swap x_1jkl0 with x_1jkl1" */ +#pragma unroll 2 + for (j = 0; j < 2; 
++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) + SWAP(x[1][j][k][l][0], x[1][j][k][l][1]) + + } +} + +__device__ __forceinline__ void block_tox(const uint32_t *in, uint32_t x[2][2][2][2][2]) +{ + x[0][0][0][0][0] ^= in[0]; + x[0][0][0][0][1] ^= in[1]; + x[0][0][0][1][0] ^= in[2]; + x[0][0][0][1][1] ^= in[3]; + x[0][0][1][0][0] ^= in[4]; + x[0][0][1][0][1] ^= in[5]; + x[0][0][1][1][0] ^= in[6]; + x[0][0][1][1][1] ^= in[7]; +} + +__device__ __forceinline__ void hash_fromx(uint32_t *out, uint32_t x[2][2][2][2][2]) +{ + out[0] = x[0][0][0][0][0]; + out[1] = x[0][0][0][0][1]; + out[2] = x[0][0][0][1][0]; + out[3] = x[0][0][0][1][1]; + out[4] = x[0][0][1][0][0]; + out[5] = x[0][0][1][0][1]; + out[6] = x[0][0][1][1][0]; + out[7] = x[0][0][1][1][1]; + +} + +void __device__ __forceinline__ Update32(uint32_t x[2][2][2][2][2], const uint32_t *data) +{ + /* "xor the block into the first b bytes of the state" */ + /* "and then transform the state invertibly through r identical rounds" */ + block_tox(data, x); + rrounds(x); +} + +void __device__ __forceinline__ Update32_const(uint32_t x[2][2][2][2][2]) +{ + x[0][0][0][0][0] ^= 0x80; + rrounds(x); +} + + + +void __device__ __forceinline__ Final(uint32_t x[2][2][2][2][2], uint32_t *hashval) +{ + int i; + + /* "the integer 1 is xored into the last state word x_11111" */ + x[1][1][1][1][1] ^= 1; + + /* "the state is then transformed invertibly through 10r identical rounds" */ + #pragma unroll 2 + for (i = 0; i < 10; ++i) rrounds(x); + + /* "output the first h/8 bytes of the state" */ + hash_fromx(hashval, x); +} + + +#if __CUDA_ARCH__ <500 +__global__ __launch_bounds__(TPB, 1) +#else +__global__ __launch_bounds__(TPB, 1) +#endif +void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) + { + + uint2 Hash[4]; + + + Hash[0]= __ldg(&g_hash[thread]); + Hash[1] = 
__ldg(&g_hash[thread + 1 * threads]); // LOHI(Hash[2], Hash[3], __ldg(&g_hash[thread + 1 * threads])); + Hash[2] = __ldg(&g_hash[thread + 2 * threads]); // LOHI(Hash[4], Hash[5], __ldg(&g_hash[thread + 2 * threads])); + Hash[3] = __ldg(&g_hash[thread + 3 * threads]); // LOHI(Hash[6], Hash[7], __ldg(&g_hash[thread + 3 * threads])); + + uint32_t x[2][2][2][2][2] = + { + 0xEA2BD4B4, 0xCCD6F29F, 0x63117E71, + 0x35481EAE, 0x22512D5B, 0xE5D94E63, + 0x7E624131, 0xF4CC12BE, 0xC2D0B696, + 0x42AF2070, 0xD0720C35, 0x3361DA8C, + 0x28CCECA4, 0x8EF8AD83, 0x4680AC00, + 0x40E5FBAB, 0xD89041C3, 0x6107FBD5, + 0x6C859D41, 0xF0B26679, 0x09392549, + 0x5FA25603, 0x65C892FD, 0x93CB6285, + 0x2AF2B5AE, 0x9E4B4E60, 0x774ABFDD, + 0x85254725, 0x15815AEB, 0x4AB6AAD6, + 0x9CDAF8AF, 0xD6032C0A + + }; + x[0][0][0][0][0] ^= Hash[0].x; + x[0][0][0][0][1] ^= Hash[0].y; + x[0][0][0][1][0] ^= Hash[1].x; + x[0][0][0][1][1] ^= Hash[1].y; + x[0][0][1][0][0] ^= Hash[2].x; + x[0][0][1][0][1] ^= Hash[2].y; + x[0][0][1][1][0] ^= Hash[3].x; + x[0][0][1][1][1] ^= Hash[3].y; + +// rrounds(x); + int r; + int j; + int k; + int l; + int m; + + /* "add x_0jklm into x_1jklmn modulo 2^32" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[1][j][k][l][m] += x[0][j][k][l][m]; + + /* "rotate x_0jklm upwards by 7 bits" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]); + + /* "swap x_00klm with x_01klm" */ +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[0][0][k][l][m], x[0][1][k][l][m]) + + /* "xor x_1jklm into x_0jklm" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma 
unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] ^= x[1][j][k][l][m]; + + /* "swap x_1jk0m with x_1jk1m" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[1][j][k][0][m], x[1][j][k][1][m]) + + /* "add x_0jklm into x_1jklm modulo 2^32" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[1][j][k][l][m] += x[0][j][k][l][m]; + + /* "rotate x_0jklm upwards by 11 bits" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]); + + /* "swap x_0j0lm with x_0j1lm" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[0][j][0][l][m], x[0][j][1][l][m]) + + /* "xor x_1jklm into x_0jklm" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] ^= x[1][j][k][l][m]; + + /* "swap x_1jkl0 with x_1jkl1" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) + SWAP(x[1][j][k][l][0], x[1][j][k][l][1]) + + +#pragma unroll 1 + for (r = 1; r < CUBEHASH_ROUNDS; ++r) + { + + /* "add x_0jklm into x_1jklmn modulo 2^32" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[1][j][k][l][m] += x[0][j][k][l][m]; + + /* "rotate x_0jklm upwards by 7 bits" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for 
(k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]); + + /* "swap x_00klm with x_01klm" */ +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[0][0][k][l][m], x[0][1][k][l][m]) + + /* "xor x_1jklm into x_0jklm" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] ^= x[1][j][k][l][m]; + + /* "swap x_1jk0m with x_1jk1m" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[1][j][k][0][m], x[1][j][k][1][m]) + + /* "add x_0jklm into x_1jklm modulo 2^32" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[1][j][k][l][m] += x[0][j][k][l][m]; + + /* "rotate x_0jklm upwards by 11 bits" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]); + + /* "swap x_0j0lm with x_0j1lm" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[0][j][0][l][m], x[0][j][1][l][m]) + + /* "xor x_1jklm into x_0jklm" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] ^= x[1][j][k][l][m]; + + /* "swap x_1jkl0 with x_1jkl1" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + 
for (l = 0; l < 2; ++l) + SWAP(x[1][j][k][l][0], x[1][j][k][l][1]) + + } + + + + x[0][0][0][0][0] ^= 0x80; + rrounds(x); + + Final(x, (uint32_t *)Hash); + + g_hash[thread] = ((uint2*)Hash)[0]; + g_hash[1 * threads + thread] = ((uint2*)Hash)[1]; + g_hash[2 * threads + thread] = ((uint2*)Hash)[2]; + g_hash[3 * threads + thread] = ((uint2*)Hash)[3]; + } +} + + +__host__ +void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash) +{ + + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + (TPB) - 1) / (TPB)); + dim3 block(TPB); + + cubehash256_gpu_hash_32 << > >(threads, startNounce, (uint2 *)d_hash); + CUDA_SAFE_CALL(cudaGetLastError()); +} diff --git a/Algo256/cuda_fugue256.cu b/Algo256/cuda_fugue256.cu index 08950a5d41..7285e2cd70 100644 --- a/Algo256/cuda_fugue256.cu +++ b/Algo256/cuda_fugue256.cu @@ -6,10 +6,11 @@ #include "cuda_helper.h" #include + #define USE_SHARED 1 -uint32_t *d_fugue256_hashoutput[MAX_GPUS]; -uint32_t *d_resultNonce[MAX_GPUS]; +static uint32_t *d_fugue256_hashoutput[MAX_GPUS]; +static uint32_t *d_resultNonce[MAX_GPUS]; __constant__ uint32_t GPUstate[30]; // Single GPU __constant__ uint32_t pTarget[8]; // Single GPU @@ -540,7 +541,7 @@ static const uint32_t mixtab3_cpu[] = { #define S34 (sc[34]) #define S35 (sc[35]) -#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) +//#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) /* GPU - FUNKTIONEN */ #if USE_SHARED @@ -561,8 +562,8 @@ fugue256_gpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outp __syncthreads(); #endif - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { /* Nimm den State und verarbeite das letztenByte (die Nounce) */ uint32_t sc[30]; 
@@ -571,7 +572,7 @@ fugue256_gpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outp for(int i=0;i<30;i++) sc[i] = GPUstate[i]; - uint32_t nounce = startNounce + thread; // muss noch ermittelt werden + const uint32_t nounce = startNounce + thread; // muss noch ermittelt werden uint32_t q; @@ -679,11 +680,11 @@ fugue256_gpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outp uint32_t hash[8]; #pragma unroll 4 for(int i=0;i<4;i++) - ((uint32_t*)hash)[i] = SWAB32(sc[19+i]); + ((uint32_t*)hash)[i] = cuda_swab32(sc[19+i]); #pragma unroll 4 for(int i=0;i<4;i++) - ((uint32_t*)hash)[i+4] = SWAB32(sc[3+i]); + ((uint32_t*)hash)[i + 4] = cuda_swab32(sc[3 + i]); int i; bool rc = true; @@ -710,7 +711,7 @@ fugue256_gpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outp #define texDef(texname, texmem, texsource, texsize) \ unsigned int *texmem; \ cudaMalloc(&texmem, texsize); \ - cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ + cudaMemcpyAsync(texmem, texsource, texsize, cudaMemcpyHostToDevice, gpustream[thr_id]); \ texname.normalized = 0; \ texname.filterMode = cudaFilterModePoint; \ texname.addressMode[0] = cudaAddressModeClamp; \ @@ -721,6 +722,8 @@ fugue256_gpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outp void fugue256_cpu_init(int thr_id, uint32_t threads) { CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); // Kopiere die Hash-Tabellen in den GPU-Speicher texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256); @@ -729,8 +732,8 @@ void fugue256_cpu_init(int thr_id, uint32_t threads) texDef(mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256); // Speicher für alle Ergebnisse belegen - cudaMalloc(&d_fugue256_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads); - cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); + 
CUDA_SAFE_CALL(cudaMalloc(&d_fugue256_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t))); } __host__ void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn) @@ -740,15 +743,15 @@ __host__ void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn) sph_fugue256_init(&ctx_fugue_const); sph_fugue256 (&ctx_fugue_const, data, 80); // State speichern - cudaMemcpyToSymbol( GPUstate, + cudaMemcpyToSymbolAsync( GPUstate, ctx_fugue_const.S, - sizeof(uint32_t) * 30 ); + sizeof(uint32_t) * 30 , 0,cudaMemcpyHostToDevice, gpustream[thr_id]); - cudaMemcpyToSymbol( pTarget, + cudaMemcpyToSymbolAsync( pTarget, pTargetIn, - sizeof(uint32_t) * 8 ); + sizeof(uint32_t) * 8, 0, cudaMemcpyHostToDevice, gpustream[thr_id]); - cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); + cudaMemsetAsync(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t), gpustream[thr_id]); } __host__ void fugue256_cpu_hash(int thr_id, uint32_t threads, int startNounce, void *outputHashes, uint32_t *nounce) @@ -762,8 +765,8 @@ __host__ void fugue256_cpu_hash(int thr_id, uint32_t threads, int startNounce, v dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - fugue256_gpu_hash<<>>(thr_id, threads, startNounce, d_fugue256_hashoutput[thr_id], d_resultNonce[thr_id]); + fugue256_gpu_hash<<>>(thr_id, threads, startNounce, d_fugue256_hashoutput[thr_id], d_resultNonce[thr_id]); - //cudaMemcpy(outputHashes, d_fugue256_hashoutput[thr_id], 8 * sizeof(uint32_t), cudaMemcpyDeviceToHost); - cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + //cudaMemcpyAsync(outputHashes, d_fugue256_hashoutput[thr_id], 8 * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); + CUDA_SAFE_CALL(cudaMemcpyAsync(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id])); cudaStreamSynchronize(gpustream[thr_id]); } diff --git 
a/Algo256/cuda_groestl256.cu b/Algo256/cuda_groestl256.cu index 4700fa0cbf..1f48acc6c1 100644 --- a/Algo256/cuda_groestl256.cu +++ b/Algo256/cuda_groestl256.cu @@ -1,23 +1,10 @@ #include - #include "cuda_helper.h" -uint32_t *d_gnounce[MAX_GPUS]; -uint32_t *d_GNonce[MAX_GPUS]; +static uint32_t *d_GNonce[MAX_GPUS]; __constant__ uint32_t pTarget[8]; -#define C32e(x) \ - ((SPH_C32(x) >> 24) \ - | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ - | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ - | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) - -#define PC32up(j, r) ((uint32_t)((j) + (r))) -#define PC32dn(j, r) 0 -#define QC32up(j, r) 0xFFFFFFFF -#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ ~((uint32_t)(j) << 24)) - #define B32_0(x) __byte_perm(x, 0, 0x4440) //((x) & 0xFF) #define B32_1(x) __byte_perm(x, 0, 0x4441) @@ -91,32 +78,566 @@ texture t3dn2; ^ T3up(B32_3(a[b7])); \ } while (0) - -extern uint32_t T0up_cpu[]; -extern uint32_t T0dn_cpu[]; -extern uint32_t T1up_cpu[]; -extern uint32_t T1dn_cpu[]; -extern uint32_t T2up_cpu[]; -extern uint32_t T2dn_cpu[]; -extern uint32_t T3up_cpu[]; -extern uint32_t T3dn_cpu[]; +#ifndef SPH_C32 +#define SPH_C32(x) ((uint32_t)(x ## U)) +#endif +#define C32e(x) ((SPH_C32(x) >> 24) \ + | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ + | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ + | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) + +uint32_t T0up_cpu[] = { + C32e(0xc632f4a5), C32e(0xf86f9784), C32e(0xee5eb099), C32e(0xf67a8c8d), + C32e(0xffe8170d), C32e(0xd60adcbd), C32e(0xde16c8b1), C32e(0x916dfc54), + C32e(0x6090f050), C32e(0x02070503), C32e(0xce2ee0a9), C32e(0x56d1877d), + C32e(0xe7cc2b19), C32e(0xb513a662), C32e(0x4d7c31e6), C32e(0xec59b59a), + C32e(0x8f40cf45), C32e(0x1fa3bc9d), C32e(0x8949c040), C32e(0xfa689287), + C32e(0xefd03f15), C32e(0xb29426eb), C32e(0x8ece40c9), C32e(0xfbe61d0b), + C32e(0x416e2fec), C32e(0xb31aa967), C32e(0x5f431cfd), C32e(0x456025ea), + C32e(0x23f9dabf), C32e(0x535102f7), C32e(0xe445a196), C32e(0x9b76ed5b), + 
C32e(0x75285dc2), C32e(0xe1c5241c), C32e(0x3dd4e9ae), C32e(0x4cf2be6a), + C32e(0x6c82ee5a), C32e(0x7ebdc341), C32e(0xf5f30602), C32e(0x8352d14f), + C32e(0x688ce45c), C32e(0x515607f4), C32e(0xd18d5c34), C32e(0xf9e11808), + C32e(0xe24cae93), C32e(0xab3e9573), C32e(0x6297f553), C32e(0x2a6b413f), + C32e(0x081c140c), C32e(0x9563f652), C32e(0x46e9af65), C32e(0x9d7fe25e), + C32e(0x30487828), C32e(0x37cff8a1), C32e(0x0a1b110f), C32e(0x2febc4b5), + C32e(0x0e151b09), C32e(0x247e5a36), C32e(0x1badb69b), C32e(0xdf98473d), + C32e(0xcda76a26), C32e(0x4ef5bb69), C32e(0x7f334ccd), C32e(0xea50ba9f), + C32e(0x123f2d1b), C32e(0x1da4b99e), C32e(0x58c49c74), C32e(0x3446722e), + C32e(0x3641772d), C32e(0xdc11cdb2), C32e(0xb49d29ee), C32e(0x5b4d16fb), + C32e(0xa4a501f6), C32e(0x76a1d74d), C32e(0xb714a361), C32e(0x7d3449ce), + C32e(0x52df8d7b), C32e(0xdd9f423e), C32e(0x5ecd9371), C32e(0x13b1a297), + C32e(0xa6a204f5), C32e(0xb901b868), C32e(0x00000000), C32e(0xc1b5742c), + C32e(0x40e0a060), C32e(0xe3c2211f), C32e(0x793a43c8), C32e(0xb69a2ced), + C32e(0xd40dd9be), C32e(0x8d47ca46), C32e(0x671770d9), C32e(0x72afdd4b), + C32e(0x94ed79de), C32e(0x98ff67d4), C32e(0xb09323e8), C32e(0x855bde4a), + C32e(0xbb06bd6b), C32e(0xc5bb7e2a), C32e(0x4f7b34e5), C32e(0xedd73a16), + C32e(0x86d254c5), C32e(0x9af862d7), C32e(0x6699ff55), C32e(0x11b6a794), + C32e(0x8ac04acf), C32e(0xe9d93010), C32e(0x040e0a06), C32e(0xfe669881), + C32e(0xa0ab0bf0), C32e(0x78b4cc44), C32e(0x25f0d5ba), C32e(0x4b753ee3), + C32e(0xa2ac0ef3), C32e(0x5d4419fe), C32e(0x80db5bc0), C32e(0x0580858a), + C32e(0x3fd3ecad), C32e(0x21fedfbc), C32e(0x70a8d848), C32e(0xf1fd0c04), + C32e(0x63197adf), C32e(0x772f58c1), C32e(0xaf309f75), C32e(0x42e7a563), + C32e(0x20705030), C32e(0xe5cb2e1a), C32e(0xfdef120e), C32e(0xbf08b76d), + C32e(0x8155d44c), C32e(0x18243c14), C32e(0x26795f35), C32e(0xc3b2712f), + C32e(0xbe8638e1), C32e(0x35c8fda2), C32e(0x88c74fcc), C32e(0x2e654b39), + C32e(0x936af957), C32e(0x55580df2), C32e(0xfc619d82), C32e(0x7ab3c947), + 
C32e(0xc827efac), C32e(0xba8832e7), C32e(0x324f7d2b), C32e(0xe642a495), + C32e(0xc03bfba0), C32e(0x19aab398), C32e(0x9ef668d1), C32e(0xa322817f), + C32e(0x44eeaa66), C32e(0x54d6827e), C32e(0x3bdde6ab), C32e(0x0b959e83), + C32e(0x8cc945ca), C32e(0xc7bc7b29), C32e(0x6b056ed3), C32e(0x286c443c), + C32e(0xa72c8b79), C32e(0xbc813de2), C32e(0x1631271d), C32e(0xad379a76), + C32e(0xdb964d3b), C32e(0x649efa56), C32e(0x74a6d24e), C32e(0x1436221e), + C32e(0x92e476db), C32e(0x0c121e0a), C32e(0x48fcb46c), C32e(0xb88f37e4), + C32e(0x9f78e75d), C32e(0xbd0fb26e), C32e(0x43692aef), C32e(0xc435f1a6), + C32e(0x39dae3a8), C32e(0x31c6f7a4), C32e(0xd38a5937), C32e(0xf274868b), + C32e(0xd5835632), C32e(0x8b4ec543), C32e(0x6e85eb59), C32e(0xda18c2b7), + C32e(0x018e8f8c), C32e(0xb11dac64), C32e(0x9cf16dd2), C32e(0x49723be0), + C32e(0xd81fc7b4), C32e(0xacb915fa), C32e(0xf3fa0907), C32e(0xcfa06f25), + C32e(0xca20eaaf), C32e(0xf47d898e), C32e(0x476720e9), C32e(0x10382818), + C32e(0x6f0b64d5), C32e(0xf0738388), C32e(0x4afbb16f), C32e(0x5cca9672), + C32e(0x38546c24), C32e(0x575f08f1), C32e(0x732152c7), C32e(0x9764f351), + C32e(0xcbae6523), C32e(0xa125847c), C32e(0xe857bf9c), C32e(0x3e5d6321), + C32e(0x96ea7cdd), C32e(0x611e7fdc), C32e(0x0d9c9186), C32e(0x0f9b9485), + C32e(0xe04bab90), C32e(0x7cbac642), C32e(0x712657c4), C32e(0xcc29e5aa), + C32e(0x90e373d8), C32e(0x06090f05), C32e(0xf7f40301), C32e(0x1c2a3612), + C32e(0xc23cfea3), C32e(0x6a8be15f), C32e(0xaebe10f9), C32e(0x69026bd0), + C32e(0x17bfa891), C32e(0x9971e858), C32e(0x3a536927), C32e(0x27f7d0b9), + C32e(0xd9914838), C32e(0xebde3513), C32e(0x2be5ceb3), C32e(0x22775533), + C32e(0xd204d6bb), C32e(0xa9399070), C32e(0x07878089), C32e(0x33c1f2a7), + C32e(0x2decc1b6), C32e(0x3c5a6622), C32e(0x15b8ad92), C32e(0xc9a96020), + C32e(0x875cdb49), C32e(0xaab01aff), C32e(0x50d88878), C32e(0xa52b8e7a), + C32e(0x03898a8f), C32e(0x594a13f8), C32e(0x09929b80), C32e(0x1a233917), + C32e(0x651075da), C32e(0xd7845331), C32e(0x84d551c6), C32e(0xd003d3b8), + 
C32e(0x82dc5ec3), C32e(0x29e2cbb0), C32e(0x5ac39977), C32e(0x1e2d3311), + C32e(0x7b3d46cb), C32e(0xa8b71ffc), C32e(0x6d0c61d6), C32e(0x2c624e3a) +}; + +uint32_t T0dn_cpu[] = { + C32e(0xf497a5c6), C32e(0x97eb84f8), C32e(0xb0c799ee), C32e(0x8cf78df6), + C32e(0x17e50dff), C32e(0xdcb7bdd6), C32e(0xc8a7b1de), C32e(0xfc395491), + C32e(0xf0c05060), C32e(0x05040302), C32e(0xe087a9ce), C32e(0x87ac7d56), + C32e(0x2bd519e7), C32e(0xa67162b5), C32e(0x319ae64d), C32e(0xb5c39aec), + C32e(0xcf05458f), C32e(0xbc3e9d1f), C32e(0xc0094089), C32e(0x92ef87fa), + C32e(0x3fc515ef), C32e(0x267febb2), C32e(0x4007c98e), C32e(0x1ded0bfb), + C32e(0x2f82ec41), C32e(0xa97d67b3), C32e(0x1cbefd5f), C32e(0x258aea45), + C32e(0xda46bf23), C32e(0x02a6f753), C32e(0xa1d396e4), C32e(0xed2d5b9b), + C32e(0x5deac275), C32e(0x24d91ce1), C32e(0xe97aae3d), C32e(0xbe986a4c), + C32e(0xeed85a6c), C32e(0xc3fc417e), C32e(0x06f102f5), C32e(0xd11d4f83), + C32e(0xe4d05c68), C32e(0x07a2f451), C32e(0x5cb934d1), C32e(0x18e908f9), + C32e(0xaedf93e2), C32e(0x954d73ab), C32e(0xf5c45362), C32e(0x41543f2a), + C32e(0x14100c08), C32e(0xf6315295), C32e(0xaf8c6546), C32e(0xe2215e9d), + C32e(0x78602830), C32e(0xf86ea137), C32e(0x11140f0a), C32e(0xc45eb52f), + C32e(0x1b1c090e), C32e(0x5a483624), C32e(0xb6369b1b), C32e(0x47a53ddf), + C32e(0x6a8126cd), C32e(0xbb9c694e), C32e(0x4cfecd7f), C32e(0xbacf9fea), + C32e(0x2d241b12), C32e(0xb93a9e1d), C32e(0x9cb07458), C32e(0x72682e34), + C32e(0x776c2d36), C32e(0xcda3b2dc), C32e(0x2973eeb4), C32e(0x16b6fb5b), + C32e(0x0153f6a4), C32e(0xd7ec4d76), C32e(0xa37561b7), C32e(0x49face7d), + C32e(0x8da47b52), C32e(0x42a13edd), C32e(0x93bc715e), C32e(0xa2269713), + C32e(0x0457f5a6), C32e(0xb86968b9), C32e(0x00000000), C32e(0x74992cc1), + C32e(0xa0806040), C32e(0x21dd1fe3), C32e(0x43f2c879), C32e(0x2c77edb6), + C32e(0xd9b3bed4), C32e(0xca01468d), C32e(0x70ced967), C32e(0xdde44b72), + C32e(0x7933de94), C32e(0x672bd498), C32e(0x237be8b0), C32e(0xde114a85), + C32e(0xbd6d6bbb), C32e(0x7e912ac5), 
C32e(0x349ee54f), C32e(0x3ac116ed), + C32e(0x5417c586), C32e(0x622fd79a), C32e(0xffcc5566), C32e(0xa7229411), + C32e(0x4a0fcf8a), C32e(0x30c910e9), C32e(0x0a080604), C32e(0x98e781fe), + C32e(0x0b5bf0a0), C32e(0xccf04478), C32e(0xd54aba25), C32e(0x3e96e34b), + C32e(0x0e5ff3a2), C32e(0x19bafe5d), C32e(0x5b1bc080), C32e(0x850a8a05), + C32e(0xec7ead3f), C32e(0xdf42bc21), C32e(0xd8e04870), C32e(0x0cf904f1), + C32e(0x7ac6df63), C32e(0x58eec177), C32e(0x9f4575af), C32e(0xa5846342), + C32e(0x50403020), C32e(0x2ed11ae5), C32e(0x12e10efd), C32e(0xb7656dbf), + C32e(0xd4194c81), C32e(0x3c301418), C32e(0x5f4c3526), C32e(0x719d2fc3), + C32e(0x3867e1be), C32e(0xfd6aa235), C32e(0x4f0bcc88), C32e(0x4b5c392e), + C32e(0xf93d5793), C32e(0x0daaf255), C32e(0x9de382fc), C32e(0xc9f4477a), + C32e(0xef8bacc8), C32e(0x326fe7ba), C32e(0x7d642b32), C32e(0xa4d795e6), + C32e(0xfb9ba0c0), C32e(0xb3329819), C32e(0x6827d19e), C32e(0x815d7fa3), + C32e(0xaa886644), C32e(0x82a87e54), C32e(0xe676ab3b), C32e(0x9e16830b), + C32e(0x4503ca8c), C32e(0x7b9529c7), C32e(0x6ed6d36b), C32e(0x44503c28), + C32e(0x8b5579a7), C32e(0x3d63e2bc), C32e(0x272c1d16), C32e(0x9a4176ad), + C32e(0x4dad3bdb), C32e(0xfac85664), C32e(0xd2e84e74), C32e(0x22281e14), + C32e(0x763fdb92), C32e(0x1e180a0c), C32e(0xb4906c48), C32e(0x376be4b8), + C32e(0xe7255d9f), C32e(0xb2616ebd), C32e(0x2a86ef43), C32e(0xf193a6c4), + C32e(0xe372a839), C32e(0xf762a431), C32e(0x59bd37d3), C32e(0x86ff8bf2), + C32e(0x56b132d5), C32e(0xc50d438b), C32e(0xebdc596e), C32e(0xc2afb7da), + C32e(0x8f028c01), C32e(0xac7964b1), C32e(0x6d23d29c), C32e(0x3b92e049), + C32e(0xc7abb4d8), C32e(0x1543faac), C32e(0x09fd07f3), C32e(0x6f8525cf), + C32e(0xea8fafca), C32e(0x89f38ef4), C32e(0x208ee947), C32e(0x28201810), + C32e(0x64ded56f), C32e(0x83fb88f0), C32e(0xb1946f4a), C32e(0x96b8725c), + C32e(0x6c702438), C32e(0x08aef157), C32e(0x52e6c773), C32e(0xf3355197), + C32e(0x658d23cb), C32e(0x84597ca1), C32e(0xbfcb9ce8), C32e(0x637c213e), + C32e(0x7c37dd96), C32e(0x7fc2dc61), 
C32e(0x911a860d), C32e(0x941e850f), + C32e(0xabdb90e0), C32e(0xc6f8427c), C32e(0x57e2c471), C32e(0xe583aacc), + C32e(0x733bd890), C32e(0x0f0c0506), C32e(0x03f501f7), C32e(0x3638121c), + C32e(0xfe9fa3c2), C32e(0xe1d45f6a), C32e(0x1047f9ae), C32e(0x6bd2d069), + C32e(0xa82e9117), C32e(0xe8295899), C32e(0x6974273a), C32e(0xd04eb927), + C32e(0x48a938d9), C32e(0x35cd13eb), C32e(0xce56b32b), C32e(0x55443322), + C32e(0xd6bfbbd2), C32e(0x904970a9), C32e(0x800e8907), C32e(0xf266a733), + C32e(0xc15ab62d), C32e(0x6678223c), C32e(0xad2a9215), C32e(0x608920c9), + C32e(0xdb154987), C32e(0x1a4fffaa), C32e(0x88a07850), C32e(0x8e517aa5), + C32e(0x8a068f03), C32e(0x13b2f859), C32e(0x9b128009), C32e(0x3934171a), + C32e(0x75cada65), C32e(0x53b531d7), C32e(0x5113c684), C32e(0xd3bbb8d0), + C32e(0x5e1fc382), C32e(0xcb52b029), C32e(0x99b4775a), C32e(0x333c111e), + C32e(0x46f6cb7b), C32e(0x1f4bfca8), C32e(0x61dad66d), C32e(0x4e583a2c) +}; + +uint32_t T1up_cpu[] = { + C32e(0xc6c632f4), C32e(0xf8f86f97), C32e(0xeeee5eb0), C32e(0xf6f67a8c), + C32e(0xffffe817), C32e(0xd6d60adc), C32e(0xdede16c8), C32e(0x91916dfc), + C32e(0x606090f0), C32e(0x02020705), C32e(0xcece2ee0), C32e(0x5656d187), + C32e(0xe7e7cc2b), C32e(0xb5b513a6), C32e(0x4d4d7c31), C32e(0xecec59b5), + C32e(0x8f8f40cf), C32e(0x1f1fa3bc), C32e(0x898949c0), C32e(0xfafa6892), + C32e(0xefefd03f), C32e(0xb2b29426), C32e(0x8e8ece40), C32e(0xfbfbe61d), + C32e(0x41416e2f), C32e(0xb3b31aa9), C32e(0x5f5f431c), C32e(0x45456025), + C32e(0x2323f9da), C32e(0x53535102), C32e(0xe4e445a1), C32e(0x9b9b76ed), + C32e(0x7575285d), C32e(0xe1e1c524), C32e(0x3d3dd4e9), C32e(0x4c4cf2be), + C32e(0x6c6c82ee), C32e(0x7e7ebdc3), C32e(0xf5f5f306), C32e(0x838352d1), + C32e(0x68688ce4), C32e(0x51515607), C32e(0xd1d18d5c), C32e(0xf9f9e118), + C32e(0xe2e24cae), C32e(0xabab3e95), C32e(0x626297f5), C32e(0x2a2a6b41), + C32e(0x08081c14), C32e(0x959563f6), C32e(0x4646e9af), C32e(0x9d9d7fe2), + C32e(0x30304878), C32e(0x3737cff8), C32e(0x0a0a1b11), C32e(0x2f2febc4), + 
C32e(0x0e0e151b), C32e(0x24247e5a), C32e(0x1b1badb6), C32e(0xdfdf9847), + C32e(0xcdcda76a), C32e(0x4e4ef5bb), C32e(0x7f7f334c), C32e(0xeaea50ba), + C32e(0x12123f2d), C32e(0x1d1da4b9), C32e(0x5858c49c), C32e(0x34344672), + C32e(0x36364177), C32e(0xdcdc11cd), C32e(0xb4b49d29), C32e(0x5b5b4d16), + C32e(0xa4a4a501), C32e(0x7676a1d7), C32e(0xb7b714a3), C32e(0x7d7d3449), + C32e(0x5252df8d), C32e(0xdddd9f42), C32e(0x5e5ecd93), C32e(0x1313b1a2), + C32e(0xa6a6a204), C32e(0xb9b901b8), C32e(0x00000000), C32e(0xc1c1b574), + C32e(0x4040e0a0), C32e(0xe3e3c221), C32e(0x79793a43), C32e(0xb6b69a2c), + C32e(0xd4d40dd9), C32e(0x8d8d47ca), C32e(0x67671770), C32e(0x7272afdd), + C32e(0x9494ed79), C32e(0x9898ff67), C32e(0xb0b09323), C32e(0x85855bde), + C32e(0xbbbb06bd), C32e(0xc5c5bb7e), C32e(0x4f4f7b34), C32e(0xededd73a), + C32e(0x8686d254), C32e(0x9a9af862), C32e(0x666699ff), C32e(0x1111b6a7), + C32e(0x8a8ac04a), C32e(0xe9e9d930), C32e(0x04040e0a), C32e(0xfefe6698), + C32e(0xa0a0ab0b), C32e(0x7878b4cc), C32e(0x2525f0d5), C32e(0x4b4b753e), + C32e(0xa2a2ac0e), C32e(0x5d5d4419), C32e(0x8080db5b), C32e(0x05058085), + C32e(0x3f3fd3ec), C32e(0x2121fedf), C32e(0x7070a8d8), C32e(0xf1f1fd0c), + C32e(0x6363197a), C32e(0x77772f58), C32e(0xafaf309f), C32e(0x4242e7a5), + C32e(0x20207050), C32e(0xe5e5cb2e), C32e(0xfdfdef12), C32e(0xbfbf08b7), + C32e(0x818155d4), C32e(0x1818243c), C32e(0x2626795f), C32e(0xc3c3b271), + C32e(0xbebe8638), C32e(0x3535c8fd), C32e(0x8888c74f), C32e(0x2e2e654b), + C32e(0x93936af9), C32e(0x5555580d), C32e(0xfcfc619d), C32e(0x7a7ab3c9), + C32e(0xc8c827ef), C32e(0xbaba8832), C32e(0x32324f7d), C32e(0xe6e642a4), + C32e(0xc0c03bfb), C32e(0x1919aab3), C32e(0x9e9ef668), C32e(0xa3a32281), + C32e(0x4444eeaa), C32e(0x5454d682), C32e(0x3b3bdde6), C32e(0x0b0b959e), + C32e(0x8c8cc945), C32e(0xc7c7bc7b), C32e(0x6b6b056e), C32e(0x28286c44), + C32e(0xa7a72c8b), C32e(0xbcbc813d), C32e(0x16163127), C32e(0xadad379a), + C32e(0xdbdb964d), C32e(0x64649efa), C32e(0x7474a6d2), C32e(0x14143622), + 
C32e(0x9292e476), C32e(0x0c0c121e), C32e(0x4848fcb4), C32e(0xb8b88f37), + C32e(0x9f9f78e7), C32e(0xbdbd0fb2), C32e(0x4343692a), C32e(0xc4c435f1), + C32e(0x3939dae3), C32e(0x3131c6f7), C32e(0xd3d38a59), C32e(0xf2f27486), + C32e(0xd5d58356), C32e(0x8b8b4ec5), C32e(0x6e6e85eb), C32e(0xdada18c2), + C32e(0x01018e8f), C32e(0xb1b11dac), C32e(0x9c9cf16d), C32e(0x4949723b), + C32e(0xd8d81fc7), C32e(0xacacb915), C32e(0xf3f3fa09), C32e(0xcfcfa06f), + C32e(0xcaca20ea), C32e(0xf4f47d89), C32e(0x47476720), C32e(0x10103828), + C32e(0x6f6f0b64), C32e(0xf0f07383), C32e(0x4a4afbb1), C32e(0x5c5cca96), + C32e(0x3838546c), C32e(0x57575f08), C32e(0x73732152), C32e(0x979764f3), + C32e(0xcbcbae65), C32e(0xa1a12584), C32e(0xe8e857bf), C32e(0x3e3e5d63), + C32e(0x9696ea7c), C32e(0x61611e7f), C32e(0x0d0d9c91), C32e(0x0f0f9b94), + C32e(0xe0e04bab), C32e(0x7c7cbac6), C32e(0x71712657), C32e(0xcccc29e5), + C32e(0x9090e373), C32e(0x0606090f), C32e(0xf7f7f403), C32e(0x1c1c2a36), + C32e(0xc2c23cfe), C32e(0x6a6a8be1), C32e(0xaeaebe10), C32e(0x6969026b), + C32e(0x1717bfa8), C32e(0x999971e8), C32e(0x3a3a5369), C32e(0x2727f7d0), + C32e(0xd9d99148), C32e(0xebebde35), C32e(0x2b2be5ce), C32e(0x22227755), + C32e(0xd2d204d6), C32e(0xa9a93990), C32e(0x07078780), C32e(0x3333c1f2), + C32e(0x2d2decc1), C32e(0x3c3c5a66), C32e(0x1515b8ad), C32e(0xc9c9a960), + C32e(0x87875cdb), C32e(0xaaaab01a), C32e(0x5050d888), C32e(0xa5a52b8e), + C32e(0x0303898a), C32e(0x59594a13), C32e(0x0909929b), C32e(0x1a1a2339), + C32e(0x65651075), C32e(0xd7d78453), C32e(0x8484d551), C32e(0xd0d003d3), + C32e(0x8282dc5e), C32e(0x2929e2cb), C32e(0x5a5ac399), C32e(0x1e1e2d33), + C32e(0x7b7b3d46), C32e(0xa8a8b71f), C32e(0x6d6d0c61), C32e(0x2c2c624e) +}; + +uint32_t T1dn_cpu[] = { + C32e(0xa5f497a5), C32e(0x8497eb84), C32e(0x99b0c799), C32e(0x8d8cf78d), + C32e(0x0d17e50d), C32e(0xbddcb7bd), C32e(0xb1c8a7b1), C32e(0x54fc3954), + C32e(0x50f0c050), C32e(0x03050403), C32e(0xa9e087a9), C32e(0x7d87ac7d), + C32e(0x192bd519), C32e(0x62a67162), 
C32e(0xe6319ae6), C32e(0x9ab5c39a), + C32e(0x45cf0545), C32e(0x9dbc3e9d), C32e(0x40c00940), C32e(0x8792ef87), + C32e(0x153fc515), C32e(0xeb267feb), C32e(0xc94007c9), C32e(0x0b1ded0b), + C32e(0xec2f82ec), C32e(0x67a97d67), C32e(0xfd1cbefd), C32e(0xea258aea), + C32e(0xbfda46bf), C32e(0xf702a6f7), C32e(0x96a1d396), C32e(0x5bed2d5b), + C32e(0xc25deac2), C32e(0x1c24d91c), C32e(0xaee97aae), C32e(0x6abe986a), + C32e(0x5aeed85a), C32e(0x41c3fc41), C32e(0x0206f102), C32e(0x4fd11d4f), + C32e(0x5ce4d05c), C32e(0xf407a2f4), C32e(0x345cb934), C32e(0x0818e908), + C32e(0x93aedf93), C32e(0x73954d73), C32e(0x53f5c453), C32e(0x3f41543f), + C32e(0x0c14100c), C32e(0x52f63152), C32e(0x65af8c65), C32e(0x5ee2215e), + C32e(0x28786028), C32e(0xa1f86ea1), C32e(0x0f11140f), C32e(0xb5c45eb5), + C32e(0x091b1c09), C32e(0x365a4836), C32e(0x9bb6369b), C32e(0x3d47a53d), + C32e(0x266a8126), C32e(0x69bb9c69), C32e(0xcd4cfecd), C32e(0x9fbacf9f), + C32e(0x1b2d241b), C32e(0x9eb93a9e), C32e(0x749cb074), C32e(0x2e72682e), + C32e(0x2d776c2d), C32e(0xb2cda3b2), C32e(0xee2973ee), C32e(0xfb16b6fb), + C32e(0xf60153f6), C32e(0x4dd7ec4d), C32e(0x61a37561), C32e(0xce49face), + C32e(0x7b8da47b), C32e(0x3e42a13e), C32e(0x7193bc71), C32e(0x97a22697), + C32e(0xf50457f5), C32e(0x68b86968), C32e(0x00000000), C32e(0x2c74992c), + C32e(0x60a08060), C32e(0x1f21dd1f), C32e(0xc843f2c8), C32e(0xed2c77ed), + C32e(0xbed9b3be), C32e(0x46ca0146), C32e(0xd970ced9), C32e(0x4bdde44b), + C32e(0xde7933de), C32e(0xd4672bd4), C32e(0xe8237be8), C32e(0x4ade114a), + C32e(0x6bbd6d6b), C32e(0x2a7e912a), C32e(0xe5349ee5), C32e(0x163ac116), + C32e(0xc55417c5), C32e(0xd7622fd7), C32e(0x55ffcc55), C32e(0x94a72294), + C32e(0xcf4a0fcf), C32e(0x1030c910), C32e(0x060a0806), C32e(0x8198e781), + C32e(0xf00b5bf0), C32e(0x44ccf044), C32e(0xbad54aba), C32e(0xe33e96e3), + C32e(0xf30e5ff3), C32e(0xfe19bafe), C32e(0xc05b1bc0), C32e(0x8a850a8a), + C32e(0xadec7ead), C32e(0xbcdf42bc), C32e(0x48d8e048), C32e(0x040cf904), + C32e(0xdf7ac6df), C32e(0xc158eec1), 
C32e(0x759f4575), C32e(0x63a58463), + C32e(0x30504030), C32e(0x1a2ed11a), C32e(0x0e12e10e), C32e(0x6db7656d), + C32e(0x4cd4194c), C32e(0x143c3014), C32e(0x355f4c35), C32e(0x2f719d2f), + C32e(0xe13867e1), C32e(0xa2fd6aa2), C32e(0xcc4f0bcc), C32e(0x394b5c39), + C32e(0x57f93d57), C32e(0xf20daaf2), C32e(0x829de382), C32e(0x47c9f447), + C32e(0xacef8bac), C32e(0xe7326fe7), C32e(0x2b7d642b), C32e(0x95a4d795), + C32e(0xa0fb9ba0), C32e(0x98b33298), C32e(0xd16827d1), C32e(0x7f815d7f), + C32e(0x66aa8866), C32e(0x7e82a87e), C32e(0xabe676ab), C32e(0x839e1683), + C32e(0xca4503ca), C32e(0x297b9529), C32e(0xd36ed6d3), C32e(0x3c44503c), + C32e(0x798b5579), C32e(0xe23d63e2), C32e(0x1d272c1d), C32e(0x769a4176), + C32e(0x3b4dad3b), C32e(0x56fac856), C32e(0x4ed2e84e), C32e(0x1e22281e), + C32e(0xdb763fdb), C32e(0x0a1e180a), C32e(0x6cb4906c), C32e(0xe4376be4), + C32e(0x5de7255d), C32e(0x6eb2616e), C32e(0xef2a86ef), C32e(0xa6f193a6), + C32e(0xa8e372a8), C32e(0xa4f762a4), C32e(0x3759bd37), C32e(0x8b86ff8b), + C32e(0x3256b132), C32e(0x43c50d43), C32e(0x59ebdc59), C32e(0xb7c2afb7), + C32e(0x8c8f028c), C32e(0x64ac7964), C32e(0xd26d23d2), C32e(0xe03b92e0), + C32e(0xb4c7abb4), C32e(0xfa1543fa), C32e(0x0709fd07), C32e(0x256f8525), + C32e(0xafea8faf), C32e(0x8e89f38e), C32e(0xe9208ee9), C32e(0x18282018), + C32e(0xd564ded5), C32e(0x8883fb88), C32e(0x6fb1946f), C32e(0x7296b872), + C32e(0x246c7024), C32e(0xf108aef1), C32e(0xc752e6c7), C32e(0x51f33551), + C32e(0x23658d23), C32e(0x7c84597c), C32e(0x9cbfcb9c), C32e(0x21637c21), + C32e(0xdd7c37dd), C32e(0xdc7fc2dc), C32e(0x86911a86), C32e(0x85941e85), + C32e(0x90abdb90), C32e(0x42c6f842), C32e(0xc457e2c4), C32e(0xaae583aa), + C32e(0xd8733bd8), C32e(0x050f0c05), C32e(0x0103f501), C32e(0x12363812), + C32e(0xa3fe9fa3), C32e(0x5fe1d45f), C32e(0xf91047f9), C32e(0xd06bd2d0), + C32e(0x91a82e91), C32e(0x58e82958), C32e(0x27697427), C32e(0xb9d04eb9), + C32e(0x3848a938), C32e(0x1335cd13), C32e(0xb3ce56b3), C32e(0x33554433), + C32e(0xbbd6bfbb), C32e(0x70904970), 
C32e(0x89800e89), C32e(0xa7f266a7), + C32e(0xb6c15ab6), C32e(0x22667822), C32e(0x92ad2a92), C32e(0x20608920), + C32e(0x49db1549), C32e(0xff1a4fff), C32e(0x7888a078), C32e(0x7a8e517a), + C32e(0x8f8a068f), C32e(0xf813b2f8), C32e(0x809b1280), C32e(0x17393417), + C32e(0xda75cada), C32e(0x3153b531), C32e(0xc65113c6), C32e(0xb8d3bbb8), + C32e(0xc35e1fc3), C32e(0xb0cb52b0), C32e(0x7799b477), C32e(0x11333c11), + C32e(0xcb46f6cb), C32e(0xfc1f4bfc), C32e(0xd661dad6), C32e(0x3a4e583a) +}; + +uint32_t T2up_cpu[] = { + C32e(0xa5c6c632), C32e(0x84f8f86f), C32e(0x99eeee5e), C32e(0x8df6f67a), + C32e(0x0dffffe8), C32e(0xbdd6d60a), C32e(0xb1dede16), C32e(0x5491916d), + C32e(0x50606090), C32e(0x03020207), C32e(0xa9cece2e), C32e(0x7d5656d1), + C32e(0x19e7e7cc), C32e(0x62b5b513), C32e(0xe64d4d7c), C32e(0x9aecec59), + C32e(0x458f8f40), C32e(0x9d1f1fa3), C32e(0x40898949), C32e(0x87fafa68), + C32e(0x15efefd0), C32e(0xebb2b294), C32e(0xc98e8ece), C32e(0x0bfbfbe6), + C32e(0xec41416e), C32e(0x67b3b31a), C32e(0xfd5f5f43), C32e(0xea454560), + C32e(0xbf2323f9), C32e(0xf7535351), C32e(0x96e4e445), C32e(0x5b9b9b76), + C32e(0xc2757528), C32e(0x1ce1e1c5), C32e(0xae3d3dd4), C32e(0x6a4c4cf2), + C32e(0x5a6c6c82), C32e(0x417e7ebd), C32e(0x02f5f5f3), C32e(0x4f838352), + C32e(0x5c68688c), C32e(0xf4515156), C32e(0x34d1d18d), C32e(0x08f9f9e1), + C32e(0x93e2e24c), C32e(0x73abab3e), C32e(0x53626297), C32e(0x3f2a2a6b), + C32e(0x0c08081c), C32e(0x52959563), C32e(0x654646e9), C32e(0x5e9d9d7f), + C32e(0x28303048), C32e(0xa13737cf), C32e(0x0f0a0a1b), C32e(0xb52f2feb), + C32e(0x090e0e15), C32e(0x3624247e), C32e(0x9b1b1bad), C32e(0x3ddfdf98), + C32e(0x26cdcda7), C32e(0x694e4ef5), C32e(0xcd7f7f33), C32e(0x9feaea50), + C32e(0x1b12123f), C32e(0x9e1d1da4), C32e(0x745858c4), C32e(0x2e343446), + C32e(0x2d363641), C32e(0xb2dcdc11), C32e(0xeeb4b49d), C32e(0xfb5b5b4d), + C32e(0xf6a4a4a5), C32e(0x4d7676a1), C32e(0x61b7b714), C32e(0xce7d7d34), + C32e(0x7b5252df), C32e(0x3edddd9f), C32e(0x715e5ecd), C32e(0x971313b1), + 
C32e(0xf5a6a6a2), C32e(0x68b9b901), C32e(0x00000000), C32e(0x2cc1c1b5), + C32e(0x604040e0), C32e(0x1fe3e3c2), C32e(0xc879793a), C32e(0xedb6b69a), + C32e(0xbed4d40d), C32e(0x468d8d47), C32e(0xd9676717), C32e(0x4b7272af), + C32e(0xde9494ed), C32e(0xd49898ff), C32e(0xe8b0b093), C32e(0x4a85855b), + C32e(0x6bbbbb06), C32e(0x2ac5c5bb), C32e(0xe54f4f7b), C32e(0x16ededd7), + C32e(0xc58686d2), C32e(0xd79a9af8), C32e(0x55666699), C32e(0x941111b6), + C32e(0xcf8a8ac0), C32e(0x10e9e9d9), C32e(0x0604040e), C32e(0x81fefe66), + C32e(0xf0a0a0ab), C32e(0x447878b4), C32e(0xba2525f0), C32e(0xe34b4b75), + C32e(0xf3a2a2ac), C32e(0xfe5d5d44), C32e(0xc08080db), C32e(0x8a050580), + C32e(0xad3f3fd3), C32e(0xbc2121fe), C32e(0x487070a8), C32e(0x04f1f1fd), + C32e(0xdf636319), C32e(0xc177772f), C32e(0x75afaf30), C32e(0x634242e7), + C32e(0x30202070), C32e(0x1ae5e5cb), C32e(0x0efdfdef), C32e(0x6dbfbf08), + C32e(0x4c818155), C32e(0x14181824), C32e(0x35262679), C32e(0x2fc3c3b2), + C32e(0xe1bebe86), C32e(0xa23535c8), C32e(0xcc8888c7), C32e(0x392e2e65), + C32e(0x5793936a), C32e(0xf2555558), C32e(0x82fcfc61), C32e(0x477a7ab3), + C32e(0xacc8c827), C32e(0xe7baba88), C32e(0x2b32324f), C32e(0x95e6e642), + C32e(0xa0c0c03b), C32e(0x981919aa), C32e(0xd19e9ef6), C32e(0x7fa3a322), + C32e(0x664444ee), C32e(0x7e5454d6), C32e(0xab3b3bdd), C32e(0x830b0b95), + C32e(0xca8c8cc9), C32e(0x29c7c7bc), C32e(0xd36b6b05), C32e(0x3c28286c), + C32e(0x79a7a72c), C32e(0xe2bcbc81), C32e(0x1d161631), C32e(0x76adad37), + C32e(0x3bdbdb96), C32e(0x5664649e), C32e(0x4e7474a6), C32e(0x1e141436), + C32e(0xdb9292e4), C32e(0x0a0c0c12), C32e(0x6c4848fc), C32e(0xe4b8b88f), + C32e(0x5d9f9f78), C32e(0x6ebdbd0f), C32e(0xef434369), C32e(0xa6c4c435), + C32e(0xa83939da), C32e(0xa43131c6), C32e(0x37d3d38a), C32e(0x8bf2f274), + C32e(0x32d5d583), C32e(0x438b8b4e), C32e(0x596e6e85), C32e(0xb7dada18), + C32e(0x8c01018e), C32e(0x64b1b11d), C32e(0xd29c9cf1), C32e(0xe0494972), + C32e(0xb4d8d81f), C32e(0xfaacacb9), C32e(0x07f3f3fa), C32e(0x25cfcfa0), + 
C32e(0xafcaca20), C32e(0x8ef4f47d), C32e(0xe9474767), C32e(0x18101038), + C32e(0xd56f6f0b), C32e(0x88f0f073), C32e(0x6f4a4afb), C32e(0x725c5cca), + C32e(0x24383854), C32e(0xf157575f), C32e(0xc7737321), C32e(0x51979764), + C32e(0x23cbcbae), C32e(0x7ca1a125), C32e(0x9ce8e857), C32e(0x213e3e5d), + C32e(0xdd9696ea), C32e(0xdc61611e), C32e(0x860d0d9c), C32e(0x850f0f9b), + C32e(0x90e0e04b), C32e(0x427c7cba), C32e(0xc4717126), C32e(0xaacccc29), + C32e(0xd89090e3), C32e(0x05060609), C32e(0x01f7f7f4), C32e(0x121c1c2a), + C32e(0xa3c2c23c), C32e(0x5f6a6a8b), C32e(0xf9aeaebe), C32e(0xd0696902), + C32e(0x911717bf), C32e(0x58999971), C32e(0x273a3a53), C32e(0xb92727f7), + C32e(0x38d9d991), C32e(0x13ebebde), C32e(0xb32b2be5), C32e(0x33222277), + C32e(0xbbd2d204), C32e(0x70a9a939), C32e(0x89070787), C32e(0xa73333c1), + C32e(0xb62d2dec), C32e(0x223c3c5a), C32e(0x921515b8), C32e(0x20c9c9a9), + C32e(0x4987875c), C32e(0xffaaaab0), C32e(0x785050d8), C32e(0x7aa5a52b), + C32e(0x8f030389), C32e(0xf859594a), C32e(0x80090992), C32e(0x171a1a23), + C32e(0xda656510), C32e(0x31d7d784), C32e(0xc68484d5), C32e(0xb8d0d003), + C32e(0xc38282dc), C32e(0xb02929e2), C32e(0x775a5ac3), C32e(0x111e1e2d), + C32e(0xcb7b7b3d), C32e(0xfca8a8b7), C32e(0xd66d6d0c), C32e(0x3a2c2c62) +}; + +uint32_t T2dn_cpu[] = { + C32e(0xf4a5f497), C32e(0x978497eb), C32e(0xb099b0c7), C32e(0x8c8d8cf7), + C32e(0x170d17e5), C32e(0xdcbddcb7), C32e(0xc8b1c8a7), C32e(0xfc54fc39), + C32e(0xf050f0c0), C32e(0x05030504), C32e(0xe0a9e087), C32e(0x877d87ac), + C32e(0x2b192bd5), C32e(0xa662a671), C32e(0x31e6319a), C32e(0xb59ab5c3), + C32e(0xcf45cf05), C32e(0xbc9dbc3e), C32e(0xc040c009), C32e(0x928792ef), + C32e(0x3f153fc5), C32e(0x26eb267f), C32e(0x40c94007), C32e(0x1d0b1ded), + C32e(0x2fec2f82), C32e(0xa967a97d), C32e(0x1cfd1cbe), C32e(0x25ea258a), + C32e(0xdabfda46), C32e(0x02f702a6), C32e(0xa196a1d3), C32e(0xed5bed2d), + C32e(0x5dc25dea), C32e(0x241c24d9), C32e(0xe9aee97a), C32e(0xbe6abe98), + C32e(0xee5aeed8), C32e(0xc341c3fc), 
C32e(0x060206f1), C32e(0xd14fd11d), + C32e(0xe45ce4d0), C32e(0x07f407a2), C32e(0x5c345cb9), C32e(0x180818e9), + C32e(0xae93aedf), C32e(0x9573954d), C32e(0xf553f5c4), C32e(0x413f4154), + C32e(0x140c1410), C32e(0xf652f631), C32e(0xaf65af8c), C32e(0xe25ee221), + C32e(0x78287860), C32e(0xf8a1f86e), C32e(0x110f1114), C32e(0xc4b5c45e), + C32e(0x1b091b1c), C32e(0x5a365a48), C32e(0xb69bb636), C32e(0x473d47a5), + C32e(0x6a266a81), C32e(0xbb69bb9c), C32e(0x4ccd4cfe), C32e(0xba9fbacf), + C32e(0x2d1b2d24), C32e(0xb99eb93a), C32e(0x9c749cb0), C32e(0x722e7268), + C32e(0x772d776c), C32e(0xcdb2cda3), C32e(0x29ee2973), C32e(0x16fb16b6), + C32e(0x01f60153), C32e(0xd74dd7ec), C32e(0xa361a375), C32e(0x49ce49fa), + C32e(0x8d7b8da4), C32e(0x423e42a1), C32e(0x937193bc), C32e(0xa297a226), + C32e(0x04f50457), C32e(0xb868b869), C32e(0x00000000), C32e(0x742c7499), + C32e(0xa060a080), C32e(0x211f21dd), C32e(0x43c843f2), C32e(0x2ced2c77), + C32e(0xd9bed9b3), C32e(0xca46ca01), C32e(0x70d970ce), C32e(0xdd4bdde4), + C32e(0x79de7933), C32e(0x67d4672b), C32e(0x23e8237b), C32e(0xde4ade11), + C32e(0xbd6bbd6d), C32e(0x7e2a7e91), C32e(0x34e5349e), C32e(0x3a163ac1), + C32e(0x54c55417), C32e(0x62d7622f), C32e(0xff55ffcc), C32e(0xa794a722), + C32e(0x4acf4a0f), C32e(0x301030c9), C32e(0x0a060a08), C32e(0x988198e7), + C32e(0x0bf00b5b), C32e(0xcc44ccf0), C32e(0xd5bad54a), C32e(0x3ee33e96), + C32e(0x0ef30e5f), C32e(0x19fe19ba), C32e(0x5bc05b1b), C32e(0x858a850a), + C32e(0xecadec7e), C32e(0xdfbcdf42), C32e(0xd848d8e0), C32e(0x0c040cf9), + C32e(0x7adf7ac6), C32e(0x58c158ee), C32e(0x9f759f45), C32e(0xa563a584), + C32e(0x50305040), C32e(0x2e1a2ed1), C32e(0x120e12e1), C32e(0xb76db765), + C32e(0xd44cd419), C32e(0x3c143c30), C32e(0x5f355f4c), C32e(0x712f719d), + C32e(0x38e13867), C32e(0xfda2fd6a), C32e(0x4fcc4f0b), C32e(0x4b394b5c), + C32e(0xf957f93d), C32e(0x0df20daa), C32e(0x9d829de3), C32e(0xc947c9f4), + C32e(0xefacef8b), C32e(0x32e7326f), C32e(0x7d2b7d64), C32e(0xa495a4d7), + C32e(0xfba0fb9b), C32e(0xb398b332), 
C32e(0x68d16827), C32e(0x817f815d), + C32e(0xaa66aa88), C32e(0x827e82a8), C32e(0xe6abe676), C32e(0x9e839e16), + C32e(0x45ca4503), C32e(0x7b297b95), C32e(0x6ed36ed6), C32e(0x443c4450), + C32e(0x8b798b55), C32e(0x3de23d63), C32e(0x271d272c), C32e(0x9a769a41), + C32e(0x4d3b4dad), C32e(0xfa56fac8), C32e(0xd24ed2e8), C32e(0x221e2228), + C32e(0x76db763f), C32e(0x1e0a1e18), C32e(0xb46cb490), C32e(0x37e4376b), + C32e(0xe75de725), C32e(0xb26eb261), C32e(0x2aef2a86), C32e(0xf1a6f193), + C32e(0xe3a8e372), C32e(0xf7a4f762), C32e(0x593759bd), C32e(0x868b86ff), + C32e(0x563256b1), C32e(0xc543c50d), C32e(0xeb59ebdc), C32e(0xc2b7c2af), + C32e(0x8f8c8f02), C32e(0xac64ac79), C32e(0x6dd26d23), C32e(0x3be03b92), + C32e(0xc7b4c7ab), C32e(0x15fa1543), C32e(0x090709fd), C32e(0x6f256f85), + C32e(0xeaafea8f), C32e(0x898e89f3), C32e(0x20e9208e), C32e(0x28182820), + C32e(0x64d564de), C32e(0x838883fb), C32e(0xb16fb194), C32e(0x967296b8), + C32e(0x6c246c70), C32e(0x08f108ae), C32e(0x52c752e6), C32e(0xf351f335), + C32e(0x6523658d), C32e(0x847c8459), C32e(0xbf9cbfcb), C32e(0x6321637c), + C32e(0x7cdd7c37), C32e(0x7fdc7fc2), C32e(0x9186911a), C32e(0x9485941e), + C32e(0xab90abdb), C32e(0xc642c6f8), C32e(0x57c457e2), C32e(0xe5aae583), + C32e(0x73d8733b), C32e(0x0f050f0c), C32e(0x030103f5), C32e(0x36123638), + C32e(0xfea3fe9f), C32e(0xe15fe1d4), C32e(0x10f91047), C32e(0x6bd06bd2), + C32e(0xa891a82e), C32e(0xe858e829), C32e(0x69276974), C32e(0xd0b9d04e), + C32e(0x483848a9), C32e(0x351335cd), C32e(0xceb3ce56), C32e(0x55335544), + C32e(0xd6bbd6bf), C32e(0x90709049), C32e(0x8089800e), C32e(0xf2a7f266), + C32e(0xc1b6c15a), C32e(0x66226678), C32e(0xad92ad2a), C32e(0x60206089), + C32e(0xdb49db15), C32e(0x1aff1a4f), C32e(0x887888a0), C32e(0x8e7a8e51), + C32e(0x8a8f8a06), C32e(0x13f813b2), C32e(0x9b809b12), C32e(0x39173934), + C32e(0x75da75ca), C32e(0x533153b5), C32e(0x51c65113), C32e(0xd3b8d3bb), + C32e(0x5ec35e1f), C32e(0xcbb0cb52), C32e(0x997799b4), C32e(0x3311333c), + C32e(0x46cb46f6), C32e(0x1ffc1f4b), 
C32e(0x61d661da), C32e(0x4e3a4e58) +}; + +uint32_t T3up_cpu[] = { + C32e(0x97a5c6c6), C32e(0xeb84f8f8), C32e(0xc799eeee), C32e(0xf78df6f6), + C32e(0xe50dffff), C32e(0xb7bdd6d6), C32e(0xa7b1dede), C32e(0x39549191), + C32e(0xc0506060), C32e(0x04030202), C32e(0x87a9cece), C32e(0xac7d5656), + C32e(0xd519e7e7), C32e(0x7162b5b5), C32e(0x9ae64d4d), C32e(0xc39aecec), + C32e(0x05458f8f), C32e(0x3e9d1f1f), C32e(0x09408989), C32e(0xef87fafa), + C32e(0xc515efef), C32e(0x7febb2b2), C32e(0x07c98e8e), C32e(0xed0bfbfb), + C32e(0x82ec4141), C32e(0x7d67b3b3), C32e(0xbefd5f5f), C32e(0x8aea4545), + C32e(0x46bf2323), C32e(0xa6f75353), C32e(0xd396e4e4), C32e(0x2d5b9b9b), + C32e(0xeac27575), C32e(0xd91ce1e1), C32e(0x7aae3d3d), C32e(0x986a4c4c), + C32e(0xd85a6c6c), C32e(0xfc417e7e), C32e(0xf102f5f5), C32e(0x1d4f8383), + C32e(0xd05c6868), C32e(0xa2f45151), C32e(0xb934d1d1), C32e(0xe908f9f9), + C32e(0xdf93e2e2), C32e(0x4d73abab), C32e(0xc4536262), C32e(0x543f2a2a), + C32e(0x100c0808), C32e(0x31529595), C32e(0x8c654646), C32e(0x215e9d9d), + C32e(0x60283030), C32e(0x6ea13737), C32e(0x140f0a0a), C32e(0x5eb52f2f), + C32e(0x1c090e0e), C32e(0x48362424), C32e(0x369b1b1b), C32e(0xa53ddfdf), + C32e(0x8126cdcd), C32e(0x9c694e4e), C32e(0xfecd7f7f), C32e(0xcf9feaea), + C32e(0x241b1212), C32e(0x3a9e1d1d), C32e(0xb0745858), C32e(0x682e3434), + C32e(0x6c2d3636), C32e(0xa3b2dcdc), C32e(0x73eeb4b4), C32e(0xb6fb5b5b), + C32e(0x53f6a4a4), C32e(0xec4d7676), C32e(0x7561b7b7), C32e(0xface7d7d), + C32e(0xa47b5252), C32e(0xa13edddd), C32e(0xbc715e5e), C32e(0x26971313), + C32e(0x57f5a6a6), C32e(0x6968b9b9), C32e(0x00000000), C32e(0x992cc1c1), + C32e(0x80604040), C32e(0xdd1fe3e3), C32e(0xf2c87979), C32e(0x77edb6b6), + C32e(0xb3bed4d4), C32e(0x01468d8d), C32e(0xced96767), C32e(0xe44b7272), + C32e(0x33de9494), C32e(0x2bd49898), C32e(0x7be8b0b0), C32e(0x114a8585), + C32e(0x6d6bbbbb), C32e(0x912ac5c5), C32e(0x9ee54f4f), C32e(0xc116eded), + C32e(0x17c58686), C32e(0x2fd79a9a), C32e(0xcc556666), C32e(0x22941111), + 
C32e(0x0fcf8a8a), C32e(0xc910e9e9), C32e(0x08060404), C32e(0xe781fefe), + C32e(0x5bf0a0a0), C32e(0xf0447878), C32e(0x4aba2525), C32e(0x96e34b4b), + C32e(0x5ff3a2a2), C32e(0xbafe5d5d), C32e(0x1bc08080), C32e(0x0a8a0505), + C32e(0x7ead3f3f), C32e(0x42bc2121), C32e(0xe0487070), C32e(0xf904f1f1), + C32e(0xc6df6363), C32e(0xeec17777), C32e(0x4575afaf), C32e(0x84634242), + C32e(0x40302020), C32e(0xd11ae5e5), C32e(0xe10efdfd), C32e(0x656dbfbf), + C32e(0x194c8181), C32e(0x30141818), C32e(0x4c352626), C32e(0x9d2fc3c3), + C32e(0x67e1bebe), C32e(0x6aa23535), C32e(0x0bcc8888), C32e(0x5c392e2e), + C32e(0x3d579393), C32e(0xaaf25555), C32e(0xe382fcfc), C32e(0xf4477a7a), + C32e(0x8bacc8c8), C32e(0x6fe7baba), C32e(0x642b3232), C32e(0xd795e6e6), + C32e(0x9ba0c0c0), C32e(0x32981919), C32e(0x27d19e9e), C32e(0x5d7fa3a3), + C32e(0x88664444), C32e(0xa87e5454), C32e(0x76ab3b3b), C32e(0x16830b0b), + C32e(0x03ca8c8c), C32e(0x9529c7c7), C32e(0xd6d36b6b), C32e(0x503c2828), + C32e(0x5579a7a7), C32e(0x63e2bcbc), C32e(0x2c1d1616), C32e(0x4176adad), + C32e(0xad3bdbdb), C32e(0xc8566464), C32e(0xe84e7474), C32e(0x281e1414), + C32e(0x3fdb9292), C32e(0x180a0c0c), C32e(0x906c4848), C32e(0x6be4b8b8), + C32e(0x255d9f9f), C32e(0x616ebdbd), C32e(0x86ef4343), C32e(0x93a6c4c4), + C32e(0x72a83939), C32e(0x62a43131), C32e(0xbd37d3d3), C32e(0xff8bf2f2), + C32e(0xb132d5d5), C32e(0x0d438b8b), C32e(0xdc596e6e), C32e(0xafb7dada), + C32e(0x028c0101), C32e(0x7964b1b1), C32e(0x23d29c9c), C32e(0x92e04949), + C32e(0xabb4d8d8), C32e(0x43faacac), C32e(0xfd07f3f3), C32e(0x8525cfcf), + C32e(0x8fafcaca), C32e(0xf38ef4f4), C32e(0x8ee94747), C32e(0x20181010), + C32e(0xded56f6f), C32e(0xfb88f0f0), C32e(0x946f4a4a), C32e(0xb8725c5c), + C32e(0x70243838), C32e(0xaef15757), C32e(0xe6c77373), C32e(0x35519797), + C32e(0x8d23cbcb), C32e(0x597ca1a1), C32e(0xcb9ce8e8), C32e(0x7c213e3e), + C32e(0x37dd9696), C32e(0xc2dc6161), C32e(0x1a860d0d), C32e(0x1e850f0f), + C32e(0xdb90e0e0), C32e(0xf8427c7c), C32e(0xe2c47171), C32e(0x83aacccc), + 
C32e(0x3bd89090), C32e(0x0c050606), C32e(0xf501f7f7), C32e(0x38121c1c), + C32e(0x9fa3c2c2), C32e(0xd45f6a6a), C32e(0x47f9aeae), C32e(0xd2d06969), + C32e(0x2e911717), C32e(0x29589999), C32e(0x74273a3a), C32e(0x4eb92727), + C32e(0xa938d9d9), C32e(0xcd13ebeb), C32e(0x56b32b2b), C32e(0x44332222), + C32e(0xbfbbd2d2), C32e(0x4970a9a9), C32e(0x0e890707), C32e(0x66a73333), + C32e(0x5ab62d2d), C32e(0x78223c3c), C32e(0x2a921515), C32e(0x8920c9c9), + C32e(0x15498787), C32e(0x4fffaaaa), C32e(0xa0785050), C32e(0x517aa5a5), + C32e(0x068f0303), C32e(0xb2f85959), C32e(0x12800909), C32e(0x34171a1a), + C32e(0xcada6565), C32e(0xb531d7d7), C32e(0x13c68484), C32e(0xbbb8d0d0), + C32e(0x1fc38282), C32e(0x52b02929), C32e(0xb4775a5a), C32e(0x3c111e1e), + C32e(0xf6cb7b7b), C32e(0x4bfca8a8), C32e(0xdad66d6d), C32e(0x583a2c2c) +}; + +uint32_t T3dn_cpu[] = { + C32e(0x32f4a5f4), C32e(0x6f978497), C32e(0x5eb099b0), C32e(0x7a8c8d8c), + C32e(0xe8170d17), C32e(0x0adcbddc), C32e(0x16c8b1c8), C32e(0x6dfc54fc), + C32e(0x90f050f0), C32e(0x07050305), C32e(0x2ee0a9e0), C32e(0xd1877d87), + C32e(0xcc2b192b), C32e(0x13a662a6), C32e(0x7c31e631), C32e(0x59b59ab5), + C32e(0x40cf45cf), C32e(0xa3bc9dbc), C32e(0x49c040c0), C32e(0x68928792), + C32e(0xd03f153f), C32e(0x9426eb26), C32e(0xce40c940), C32e(0xe61d0b1d), + C32e(0x6e2fec2f), C32e(0x1aa967a9), C32e(0x431cfd1c), C32e(0x6025ea25), + C32e(0xf9dabfda), C32e(0x5102f702), C32e(0x45a196a1), C32e(0x76ed5bed), + C32e(0x285dc25d), C32e(0xc5241c24), C32e(0xd4e9aee9), C32e(0xf2be6abe), + C32e(0x82ee5aee), C32e(0xbdc341c3), C32e(0xf3060206), C32e(0x52d14fd1), + C32e(0x8ce45ce4), C32e(0x5607f407), C32e(0x8d5c345c), C32e(0xe1180818), + C32e(0x4cae93ae), C32e(0x3e957395), C32e(0x97f553f5), C32e(0x6b413f41), + C32e(0x1c140c14), C32e(0x63f652f6), C32e(0xe9af65af), C32e(0x7fe25ee2), + C32e(0x48782878), C32e(0xcff8a1f8), C32e(0x1b110f11), C32e(0xebc4b5c4), + C32e(0x151b091b), C32e(0x7e5a365a), C32e(0xadb69bb6), C32e(0x98473d47), + C32e(0xa76a266a), C32e(0xf5bb69bb), 
C32e(0x334ccd4c), C32e(0x50ba9fba), + C32e(0x3f2d1b2d), C32e(0xa4b99eb9), C32e(0xc49c749c), C32e(0x46722e72), + C32e(0x41772d77), C32e(0x11cdb2cd), C32e(0x9d29ee29), C32e(0x4d16fb16), + C32e(0xa501f601), C32e(0xa1d74dd7), C32e(0x14a361a3), C32e(0x3449ce49), + C32e(0xdf8d7b8d), C32e(0x9f423e42), C32e(0xcd937193), C32e(0xb1a297a2), + C32e(0xa204f504), C32e(0x01b868b8), C32e(0x00000000), C32e(0xb5742c74), + C32e(0xe0a060a0), C32e(0xc2211f21), C32e(0x3a43c843), C32e(0x9a2ced2c), + C32e(0x0dd9bed9), C32e(0x47ca46ca), C32e(0x1770d970), C32e(0xafdd4bdd), + C32e(0xed79de79), C32e(0xff67d467), C32e(0x9323e823), C32e(0x5bde4ade), + C32e(0x06bd6bbd), C32e(0xbb7e2a7e), C32e(0x7b34e534), C32e(0xd73a163a), + C32e(0xd254c554), C32e(0xf862d762), C32e(0x99ff55ff), C32e(0xb6a794a7), + C32e(0xc04acf4a), C32e(0xd9301030), C32e(0x0e0a060a), C32e(0x66988198), + C32e(0xab0bf00b), C32e(0xb4cc44cc), C32e(0xf0d5bad5), C32e(0x753ee33e), + C32e(0xac0ef30e), C32e(0x4419fe19), C32e(0xdb5bc05b), C32e(0x80858a85), + C32e(0xd3ecadec), C32e(0xfedfbcdf), C32e(0xa8d848d8), C32e(0xfd0c040c), + C32e(0x197adf7a), C32e(0x2f58c158), C32e(0x309f759f), C32e(0xe7a563a5), + C32e(0x70503050), C32e(0xcb2e1a2e), C32e(0xef120e12), C32e(0x08b76db7), + C32e(0x55d44cd4), C32e(0x243c143c), C32e(0x795f355f), C32e(0xb2712f71), + C32e(0x8638e138), C32e(0xc8fda2fd), C32e(0xc74fcc4f), C32e(0x654b394b), + C32e(0x6af957f9), C32e(0x580df20d), C32e(0x619d829d), C32e(0xb3c947c9), + C32e(0x27efacef), C32e(0x8832e732), C32e(0x4f7d2b7d), C32e(0x42a495a4), + C32e(0x3bfba0fb), C32e(0xaab398b3), C32e(0xf668d168), C32e(0x22817f81), + C32e(0xeeaa66aa), C32e(0xd6827e82), C32e(0xdde6abe6), C32e(0x959e839e), + C32e(0xc945ca45), C32e(0xbc7b297b), C32e(0x056ed36e), C32e(0x6c443c44), + C32e(0x2c8b798b), C32e(0x813de23d), C32e(0x31271d27), C32e(0x379a769a), + C32e(0x964d3b4d), C32e(0x9efa56fa), C32e(0xa6d24ed2), C32e(0x36221e22), + C32e(0xe476db76), C32e(0x121e0a1e), C32e(0xfcb46cb4), C32e(0x8f37e437), + C32e(0x78e75de7), C32e(0x0fb26eb2), 
C32e(0x692aef2a), C32e(0x35f1a6f1), + C32e(0xdae3a8e3), C32e(0xc6f7a4f7), C32e(0x8a593759), C32e(0x74868b86), + C32e(0x83563256), C32e(0x4ec543c5), C32e(0x85eb59eb), C32e(0x18c2b7c2), + C32e(0x8e8f8c8f), C32e(0x1dac64ac), C32e(0xf16dd26d), C32e(0x723be03b), + C32e(0x1fc7b4c7), C32e(0xb915fa15), C32e(0xfa090709), C32e(0xa06f256f), + C32e(0x20eaafea), C32e(0x7d898e89), C32e(0x6720e920), C32e(0x38281828), + C32e(0x0b64d564), C32e(0x73838883), C32e(0xfbb16fb1), C32e(0xca967296), + C32e(0x546c246c), C32e(0x5f08f108), C32e(0x2152c752), C32e(0x64f351f3), + C32e(0xae652365), C32e(0x25847c84), C32e(0x57bf9cbf), C32e(0x5d632163), + C32e(0xea7cdd7c), C32e(0x1e7fdc7f), C32e(0x9c918691), C32e(0x9b948594), + C32e(0x4bab90ab), C32e(0xbac642c6), C32e(0x2657c457), C32e(0x29e5aae5), + C32e(0xe373d873), C32e(0x090f050f), C32e(0xf4030103), C32e(0x2a361236), + C32e(0x3cfea3fe), C32e(0x8be15fe1), C32e(0xbe10f910), C32e(0x026bd06b), + C32e(0xbfa891a8), C32e(0x71e858e8), C32e(0x53692769), C32e(0xf7d0b9d0), + C32e(0x91483848), C32e(0xde351335), C32e(0xe5ceb3ce), C32e(0x77553355), + C32e(0x04d6bbd6), C32e(0x39907090), C32e(0x87808980), C32e(0xc1f2a7f2), + C32e(0xecc1b6c1), C32e(0x5a662266), C32e(0xb8ad92ad), C32e(0xa9602060), + C32e(0x5cdb49db), C32e(0xb01aff1a), C32e(0xd8887888), C32e(0x2b8e7a8e), + C32e(0x898a8f8a), C32e(0x4a13f813), C32e(0x929b809b), C32e(0x23391739), + C32e(0x1075da75), C32e(0x84533153), C32e(0xd551c651), C32e(0x03d3b8d3), + C32e(0xdc5ec35e), C32e(0xe2cbb0cb), C32e(0xc3997799), C32e(0x2d331133), + C32e(0x3d46cb46), C32e(0xb71ffc1f), C32e(0x0c61d661), C32e(0x624e3a4e) +}; __device__ __forceinline__ -void groestl256_perm_P(uint32_t thread, uint32_t *a, uint32_t *mixtabs) +void groestl256_perm_P(uint32_t *const __restrict__ a, const uint32_t *const __restrict__ mixtabs) { #pragma unroll 10 for (int r = 0; r<10; r++) { uint32_t t[16]; - a[0x0] ^= PC32up(0x00, r); - a[0x2] ^= PC32up(0x10, r); - a[0x4] ^= PC32up(0x20, r); - a[0x6] ^= PC32up(0x30, r); - a[0x8] ^= PC32up(0x40, 
r); - a[0xA] ^= PC32up(0x50, r); - a[0xC] ^= PC32up(0x60, r); - a[0xE] ^= PC32up(0x70, r); + a[0x0] ^= 0x00 + r; + a[0x2] ^= 0x10 + r; + a[0x4] ^= 0x20 + r; + a[0x6] ^= 0x30 + r; + a[0x8] ^= 0x40 + r; + a[0xA] ^= 0x50 + r; + a[0xC] ^= 0x60 + r; + a[0xE] ^= 0x70 + r; RSTT(0x0, 0x1, a, 0x0, 0x2, 0x4, 0x6, 0x9, 0xB, 0xD, 0xF); RSTT(0x2, 0x3, a, 0x2, 0x4, 0x6, 0x8, 0xB, 0xD, 0xF, 0x1); RSTT(0x4, 0x5, a, 0x4, 0x6, 0x8, 0xA, 0xD, 0xF, 0x1, 0x3); @@ -131,31 +652,69 @@ void groestl256_perm_P(uint32_t thread, uint32_t *a, uint32_t *mixtabs) a[k] = t[k]; } } +__device__ __forceinline__ + +void groestl256_perm_P_final(uint32_t *const __restrict__ a, const uint32_t *const __restrict__ mixtabs) +{ + uint32_t t[16]; +#pragma unroll + for(int r = 0; r<9; r++) + { + a[0x0] ^= 0x00 + r; + a[0x2] ^= 0x10 + r; + a[0x4] ^= 0x20 + r; + a[0x6] ^= 0x30 + r; + a[0x8] ^= 0x40 + r; + a[0xA] ^= 0x50 + r; + a[0xC] ^= 0x60 + r; + a[0xE] ^= 0x70 + r; + RSTT(0x0, 0x1, a, 0x0, 0x2, 0x4, 0x6, 0x9, 0xB, 0xD, 0xF); + RSTT(0x2, 0x3, a, 0x2, 0x4, 0x6, 0x8, 0xB, 0xD, 0xF, 0x1); + RSTT(0x4, 0x5, a, 0x4, 0x6, 0x8, 0xA, 0xD, 0xF, 0x1, 0x3); + RSTT(0x6, 0x7, a, 0x6, 0x8, 0xA, 0xC, 0xF, 0x1, 0x3, 0x5); + RSTT(0x8, 0x9, a, 0x8, 0xA, 0xC, 0xE, 0x1, 0x3, 0x5, 0x7); + RSTT(0xA, 0xB, a, 0xA, 0xC, 0xE, 0x0, 0x3, 0x5, 0x7, 0x9); + RSTT(0xC, 0xD, a, 0xC, 0xE, 0x0, 0x2, 0x5, 0x7, 0x9, 0xB); + RSTT(0xE, 0xF, a, 0xE, 0x0, 0x2, 0x4, 0x7, 0x9, 0xB, 0xD); + +#pragma unroll 16 + for(int k = 0; k<16; k++) + a[k] = t[k]; + } + a[15] = T0dn(B32_0(a[14] ^ 0x79)) + ^ T1dn(B32_1(a[ 0] ^ 0x09)) + ^ T2dn(B32_2(a[ 2] ^ 0x19)) + ^ T3dn(B32_3(a[ 4] ^ 0x29)) + ^ T0up(B32_0(a[ 7])) + ^ T1up(B32_1(a[ 9])) + ^ T2up(B32_2(a[11])) + ^ T3up(B32_3(a[13])); +} __device__ __forceinline__ -void groestl256_perm_Q(uint32_t thread, uint32_t *a, uint32_t *mixtabs) +void groestl256_perm_Q(uint32_t *const __restrict__ a, const uint32_t *const __restrict__ mixtabs) { #pragma unroll - for (int r = 0; r<10; r++) + for (uint32_t r = 0; r<0x0a000000; 
r+=0x01000000) { uint32_t t[16]; - a[0x0] ^= QC32up(0x00, r); - a[0x1] ^= QC32dn(0x00, r); - a[0x2] ^= QC32up(0x10, r); - a[0x3] ^= QC32dn(0x10, r); - a[0x4] ^= QC32up(0x20, r); - a[0x5] ^= QC32dn(0x20, r); - a[0x6] ^= QC32up(0x30, r); - a[0x7] ^= QC32dn(0x30, r); - a[0x8] ^= QC32up(0x40, r); - a[0x9] ^= QC32dn(0x40, r); - a[0xA] ^= QC32up(0x50, r); - a[0xB] ^= QC32dn(0x50, r); - a[0xC] ^= QC32up(0x60, r); - a[0xD] ^= QC32dn(0x60, r); - a[0xE] ^= QC32up(0x70, r); - a[0xF] ^= QC32dn(0x70, r); + a[0x0] ^= 0xFFFFFFFF; + a[0x1] ^= ~r; + a[0x2] ^= 0xFFFFFFFF; + a[0x3] ^= r ^ 0xefffffff; + a[0x4] ^= 0xFFFFFFFF; + a[0x5] ^= r ^ 0xdfffffff; + a[0x6] ^= 0xFFFFFFFF; + a[0x7] ^= r ^ 0xcfffffff; + a[0x8] ^= 0xFFFFFFFF; + a[0x9] ^= r ^ 0xbfffffff; + a[0xA] ^= 0xFFFFFFFF; + a[0xB] ^= r ^ 0xafffffff; + a[0xC] ^= 0xFFFFFFFF; + a[0xD] ^= r ^ 0x9fffffff; + a[0xE] ^= 0xFFFFFFFF; + a[0xF] ^= r ^ 0x8fffffff; RSTT(0x0, 0x1, a, 0x2, 0x6, 0xA, 0xE, 0x1, 0x5, 0x9, 0xD); RSTT(0x2, 0x3, a, 0x4, 0x8, 0xC, 0x0, 0x3, 0x7, 0xB, 0xF); RSTT(0x4, 0x5, a, 0x6, 0xA, 0xE, 0x2, 0x5, 0x9, 0xD, 0x1); @@ -172,7 +731,7 @@ void groestl256_perm_Q(uint32_t thread, uint32_t *a, uint32_t *mixtabs) } __global__ __launch_bounds__(256,1) -void groestl256_gpu_hash32(uint32_t threads, uint32_t startNounce, uint64_t *const __restrict__ outputHash, uint32_t *const __restrict__ nonceVector) +void groestl256_gpu_hash32(uint32_t threads, uint32_t startNounce, const uint64_t *const __restrict__ outputHash, uint32_t *const __restrict__ nonceVector) { #if USE_SHARED __shared__ uint32_t mixtabs[2048]; @@ -191,8 +750,8 @@ void groestl256_gpu_hash32(uint32_t threads, uint32_t startNounce, uint64_t *con __syncthreads(); #endif - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { // GROESTL uint32_t message[16]; @@ -218,22 +777,22 @@ void groestl256_gpu_hash32(uint32_t threads, uint32_t startNounce, 
uint64_t *con // Perm #if USE_SHARED - groestl256_perm_P(thread, state, mixtabs); + groestl256_perm_P(state, mixtabs); state[15] ^= 0x10000; - groestl256_perm_Q(thread, message, mixtabs); + groestl256_perm_Q(message, mixtabs); #else - groestl256_perm_P(thread, state, NULL); + groestl256_perm_P(state, NULL); state[15] ^= 0x10000; - groestl256_perm_P(thread, message, NULL); + groestl256_perm_P(message, NULL); #endif #pragma unroll 16 for (int u = 0; u<16; u++) state[u] ^= message[u]; #pragma unroll 16 for (int u = 0; u<16; u++) message[u] = state[u]; #if USE_SHARED - groestl256_perm_P(thread, message, mixtabs); + groestl256_perm_P_final(message, mixtabs); #else - groestl256_perm_P(thread, message, NULL); + groestl256_perm_P(message, NULL); #endif state[15] ^= message[15]; @@ -248,13 +807,13 @@ void groestl256_gpu_hash32(uint32_t threads, uint32_t startNounce, uint64_t *con #define texDef(texname, texmem, texsource, texsize) \ unsigned int *texmem; \ - cudaMalloc(&texmem, texsize); \ - cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ + CUDA_SAFE_CALL(cudaMalloc(&texmem, texsize)); \ + CUDA_SAFE_CALL(cudaMemcpyAsync(texmem, texsource, texsize, cudaMemcpyHostToDevice, gpustream[thr_id])); \ texname.normalized = 0; \ texname.filterMode = cudaFilterModePoint; \ texname.addressMode[0] = cudaAddressModeClamp; \ { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ - cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \ + CUDA_SAFE_CALL(cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize)); } __host__ void groestl256_cpu_init(int thr_id, uint32_t threads) @@ -270,28 +829,27 @@ void groestl256_cpu_init(int thr_id, uint32_t threads) texDef(t3up2, d_T3up, T3up_cpu, sizeof(uint32_t) * 256); texDef(t3dn2, d_T3dn, T3dn_cpu, sizeof(uint32_t) * 256); - cudaMalloc(&d_GNonce[thr_id], 2*sizeof(uint32_t)); - cudaMallocHost(&d_gnounce[thr_id], 2*sizeof(uint32_t)); + CUDA_SAFE_CALL(cudaMalloc(&d_GNonce[thr_id], 2 * sizeof(uint32_t))); } 
__host__ -void groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order, uint32_t *resultnonces) +void groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, uint32_t *resultnonces) { - cudaMemset(d_GNonce[thr_id], 0, 2*sizeof(uint32_t)); + CUDA_SAFE_CALL(cudaMemsetAsync(d_GNonce[thr_id], 0, 2 * sizeof(uint32_t), gpustream[thr_id])); const uint32_t threadsperblock = 256; // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - groestl256_gpu_hash32<<>>(threads, startNounce, d_outputHash, d_GNonce[thr_id]); - cudaMemcpy(d_gnounce[thr_id], d_GNonce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost); - resultnonces[0] = *(d_gnounce[thr_id]); - resultnonces[1] = *(d_gnounce[thr_id] + 1); + groestl256_gpu_hash32<<>>(threads, startNounce, d_outputHash, d_GNonce[thr_id]); + CUDA_SAFE_CALL(cudaGetLastError()); + CUDA_SAFE_CALL(cudaMemcpyAsync(resultnonces, d_GNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id])); + CUDA_SAFE_CALL(cudaStreamSynchronize(gpustream[thr_id])); } __host__ -void groestl256_setTarget(const void *pTargetIn) +void groestl256_setTarget(int thr_id, const void *pTargetIn) { - cudaMemcpyToSymbol(pTarget, pTargetIn, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(pTarget, pTargetIn, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); } diff --git a/Algo256/cuda_keccak256.cu b/Algo256/cuda_keccak256.cu index 7b99cd7fcc..3cd851e44f 100644 --- a/Algo256/cuda_keccak256.cu +++ b/Algo256/cuda_keccak256.cu @@ -1,12 +1,13 @@ #include "miner.h" - -extern "C" { +#ifdef __cplusplus +#include +#else #include +#endif #include -} - #include "cuda_helper.h" + #define UINT2(x,y) make_uint2(x,y) static uint32_t *d_KNonce[MAX_GPUS]; @@ -43,602 +44,217 @@ __constant__ uint2 keccak_round_constants35[24] = { }; 
-__constant__ uint64_t c_PaddedMessage80[10]; // padded message (80 bytes + padding?) - -#if __CUDA_ARCH__ >= 350 -__device__ __forceinline__ -static void keccak_blockv35_32(uint2 *s) -{ - int i; - uint2 t1, t[5], u[5], v, w; - - t1 = s[1] ^ s[16]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = s[4] ^ ROL2(t1, 1); - u[1] = s[0] ^ ROL2(s[2], 1); - u[2] = t1 ^ ROL2(s[3], 1); - u[3] = s[2] ^ ROL2(s[4], 1); - u[4] = s[3] ^ ROL2(s[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[1]; - s[1] = ROL2(s[6], 44); - s[6] = ROL2(s[9], 20); - s[9] = ROL2(s[22], 61); - s[22] = ROL2(s[14], 39); - s[14] = ROL2(s[20], 18); - s[20] = ROL2(s[2], 62); - s[2] = ROL2(s[12], 43); - s[12] = ROL2(s[13], 25); - s[13] = ROL2(s[19], 8); - s[19] = ROL2(s[23], 56); - s[23] = ROL2(s[15], 41); - s[15] = ROL2(s[4], 27); - s[4] = ROL2(s[24], 14); - s[24] = ROL2(s[21], 2); - s[21] = ROL2(s[8], 55); - s[8] = ROL2(s[16], 45); - s[16] = ROL2(s[5], 36); - s[5] = ROL2(s[3], 28); - s[3] = ROL2(s[18], 21); - s[18] = ROL2(s[17], 15); - s[17] = ROL2(s[11], 10); - s[11] = ROL2(s[7], 6); - s[7] = ROL2(s[10], 3); - s[10] = ROL2(v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; - v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; 
- v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] = s[0]^1; //vectorize(keccak_round_constants[0]); - -#pragma unroll - for (i = 1; i < 24; i++) { - /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROL2(t[1], 1); - u[1] = t[0] ^ ROL2(t[2], 1); - u[2] = t[1] ^ ROL2(t[3], 1); - u[3] = t[2] ^ ROL2(t[4], 1); - u[4] = t[3] ^ ROL2(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) 
*/ - v = s[1]; - s[1] = ROL2(s[6], 44); - s[6] = ROL2(s[9], 20); - s[9] = ROL2(s[22], 61); - s[22] = ROL2(s[14], 39); - s[14] = ROL2(s[20], 18); - s[20] = ROL2(s[2], 62); - s[2] = ROL2(s[12], 43); - s[12] = ROL2(s[13], 25); - s[13] = ROL2(s[19], 8); - s[19] = ROL2(s[23], 56); - s[23] = ROL2(s[15], 41); - s[15] = ROL2(s[4], 27); - s[4] = ROL2(s[24], 14); - s[24] = ROL2(s[21], 2); - s[21] = ROL2(s[8], 55); - s[8] = ROL2(s[16], 45); - s[16] = ROL2(s[5], 36); - s[5] = ROL2(s[3], 28); - s[3] = ROL2(s[18], 21); - s[18] = ROL2(s[17], 15); - s[17] = ROL2(s[11], 10); - s[11] = ROL2(s[7], 6); - s[7] = ROL2(s[10], 3); - s[10] = ROL2(v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; - v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] ^= keccak_round_constants35[i]; //vectorize(keccak_round_constants[i]); - } -} -#else - -__device__ __forceinline__ -static void keccak_blockv30_32(uint64_t *s, const uint64_t *keccak_round_constants) -{ - int i; - uint64_t t1, t[5], u[5], v, w; - - /* absorb input */ - /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ - t1 = s[1] ^ s[16]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = s[4] ^ ROTL64(t1, 1); - u[1] = s[0] ^ ROTL64(s[2], 1); - u[2] = t1 ^ ROTL64(s[3], 1); - u[3] = s[2] ^ ROTL64(s[4], 1); - u[4] = s[3] ^ ROTL64(s[0], 1); - - /* theta: a[0,i], a[1,i], .. 
a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[1]; - s[1] = ROTL64(s[6], 44); - s[6] = ROTL64(s[9], 20); - s[9] = ROTL64(s[22], 61); - s[22] = ROTL64(s[14], 39); - s[14] = ROTL64(s[20], 18); - s[20] = ROTL64(s[2], 62); - s[2] = ROTL64(s[12], 43); - s[12] = ROTL64(s[13], 25); - s[13] = ROTL64(s[19], 8); - s[19] = ROTL64(s[23], 56); - s[23] = ROTL64(s[15], 41); - s[15] = ROTL64(s[4], 27); - s[4] = ROTL64(s[24], 14); - s[24] = ROTL64(s[21], 2); - s[21] = ROTL64(s[8], 55); - s[8] = ROTL64(s[16], 45); - s[16] = ROTL64(s[5], 36); - s[5] = ROTL64(s[3], 28); - s[3] = ROTL64(s[18], 21); - s[18] = ROTL64(s[17], 15); - s[17] = ROTL64(s[11], 10); - s[11] = ROTL64(s[7], 6); - s[7] = ROTL64(s[10], 3); - s[10] = ROTL64(v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; - v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] ^= 1;//keccak_round_constants[0]; - - for (i = 1; i < 24; i++) { - /* theta: c = a[0,i] ^ a[1,i] ^ 
.. a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; +__constant__ uint2 c_PaddedMessage80[10]; // padded message (80 bytes + padding?) - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROTL64(t[1], 1); - u[1] = t[0] ^ ROTL64(t[2], 1); - u[2] = t[1] ^ ROTL64(t[3], 1); - u[3] = t[2] ^ ROTL64(t[4], 1); - u[4] = t[3] ^ ROTL64(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[1]; - s[1] = ROTL64(s[6], 44); - s[6] = ROTL64(s[9], 20); - s[9] = ROTL64(s[22], 61); - s[22] = ROTL64(s[14], 39); - s[14] = ROTL64(s[20], 18); - s[20] = ROTL64(s[2], 62); - s[2] = ROTL64(s[12], 43); - s[12] = ROTL64(s[13], 25); - s[13] = ROTL64(s[19], 8); - s[19] = ROTL64(s[23], 56); - s[23] = ROTL64(s[15], 41); - s[15] = ROTL64(s[4], 27); - s[4] = ROTL64(s[24], 14); - s[24] = ROTL64(s[21], 2); - s[21] = ROTL64(s[8], 55); - s[8] = ROTL64(s[16], 45); - s[16] = ROTL64(s[5], 36); - s[5] = ROTL64(s[3], 28); - s[3] = ROTL64(s[18], 21); - s[18] = ROTL64(s[17], 15); - s[17] = ROTL64(s[11], 10); - s[11] = ROTL64(s[7], 6); - s[7] = ROTL64(s[10], 3); - s[10] = ROTL64(v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; - v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; - v = 
s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] ^= keccak_round_constants[i]; - } -} -#endif +#define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a)))) -#if __CUDA_ARCH__ >= 350 -__device__ __forceinline__ -static void keccak_blockv35_80(uint2 *s) +static void __forceinline__ __device__ keccak_block(uint2 *s) { - int i; - uint2 t[5], u[5], v, w; - - /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10]; - t[1] = s[1] ^ s[6] ^ s[16]; - t[2] = s[2] ^ s[7]; - t[3] = s[3] ^ s[8]; - t[4] = s[4] ^ s[9]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROL2(t[1], 1); - u[1] = t[0] ^ ROL2(t[2], 1); - u[2] = t[1] ^ ROL2(t[3], 1); - u[3] = t[2] ^ ROL2(t[4], 1); - u[4] = t[3] ^ ROL2(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[16] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) 
*/ - v = s[1]; - s[1] = ROL2(s[6], 44); - s[6] = ROL2(s[9], 20); - s[9] = ROL2(u[2], 61); - s[22] = ROL2(u[4], 39); - s[14] = ROL2(u[0], 18); - s[20] = ROL2(s[2], 62); - s[2] = ROL2(u[2], 43); - s[12] = ROL2(u[3], 25); - s[13] = ROL2(u[4], 8); - s[19] = ROL2(u[3], 56); - s[23] = ROL2(u[0], 41); - s[15] = ROL2(s[4], 27); - s[4] = ROL2(u[4], 14); - s[24] = ROL2(u[1], 2); - s[21] = ROL2(s[8], 55); - s[8] = ROL2(s[16], 45); - s[16] = ROL2(s[5], 36); - s[5] = ROL2(s[3], 28); - s[3] = ROL2(u[3], 21); - s[18] = ROL2(u[2], 15); - s[17] = ROL2(u[1], 10); - s[11] = ROL2(s[7], 6); - s[7] = ROL2(s[10], 3); - s[10] = ROL2(v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; - v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] = s[0]^1; //keccak_round_constants[0]; - - #pragma unroll - for (i = 1; i < 23; i++) { - /* theta: c = a[0,i] ^ a[1,i] ^ .. 
a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + uint2 bc[5], tmpxor[5], tmp1, tmp2; +// uint2 s[25]; - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROL2(t[1], 1); - u[1] = t[0] ^ ROL2(t[2], 1); - u[2] = t[1] ^ ROL2(t[3], 1); - u[3] = t[2] ^ ROL2(t[4], 1); - u[4] = t[3] ^ ROL2(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[1]; - s[1] = ROL2(s[6], 44); - s[6] = ROL2(s[9], 20); - s[9] = ROL2(s[22], 61); - s[22] = ROL2(s[14], 39); - s[14] = ROL2(s[20], 18); - s[20] = ROL2(s[2], 62); - s[2] = ROL2(s[12], 43); - s[12] = ROL2(s[13], 25); - s[13] = ROL2(s[19], 8); - s[19] = ROL2(s[23], 56); - s[23] = ROL2(s[15], 41); - s[15] = ROL2(s[4], 27); - s[4] = ROL2(s[24], 14); - s[24] = ROL2(s[21], 2); - s[21] = ROL2(s[8], 55); - s[8] = ROL2(s[16], 45); - s[16] = ROL2(s[5], 36); - s[5] = ROL2(s[3], 28); - s[3] = ROL2(s[18], 21); - s[18] = ROL2(s[17], 15); - s[17] = ROL2(s[11], 10); - s[11] = ROL2(s[7], 6); - s[7] = ROL2(s[10], 3); - s[10] = ROL2(v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; - v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & 
s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ +#pragma unroll 1 + for (int i= 0; i < 24; i++) + { +#pragma unroll + for (uint32_t x = 0; x < 5; x++) + tmpxor[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20]; + + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = ROL2(s[6] ^ bc[0], 44); + s[6] = ROL2(s[9] ^ bc[3], 20); + s[9] = ROL2(s[22] ^ bc[1], 61); + s[22] = ROL2(s[14] ^ bc[3], 39); + s[14] = ROL2(s[20] ^ bc[4], 18); + s[20] = ROL2(s[2] ^ bc[1], 62); + s[2] = ROL2(s[12] ^ bc[1], 43); + s[12] = ROL2(s[13] ^ bc[2], 25); + s[13] = ROL8(s[19] ^ bc[3]); + s[19] = ROR8(s[23] ^ bc[2]); + s[23] = ROL2(s[15] ^ bc[4], 41); + s[15] = ROL2(s[4] ^ bc[3], 27); + s[4] = ROL2(s[24] ^ bc[3], 14); + s[24] = ROL2(s[21] ^ bc[0], 2); + s[21] = ROL2(s[8] ^ bc[2], 55); + s[8] = ROL2(s[16] ^ bc[0], 45); + s[16] = ROL2(s[5] ^ bc[4], 36); + s[5] = ROL2(s[3] ^ bc[2], 28); + s[3] = ROL2(s[18] ^ bc[2], 21); + s[18] = ROL2(s[17] ^ bc[1], 15); + s[17] = ROL2(s[11] ^ bc[0], 10); + s[11] = ROL2(s[7] ^ bc[1], 6); + s[7] = ROL2(s[10] ^ bc[4], 3); + s[10] = ROL2(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], 
s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); s[0] ^= keccak_round_constants35[i]; } - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; - - s[0] ^= t[4] ^ ROL2(t[1], 1); - s[18] ^= t[2] ^ ROL2(t[4], 1); - s[24] ^= t[3] ^ ROL2(t[0], 1); - - s[3] = ROL2(s[18], 21) ^ ((~ROL2(s[24], 14)) & s[0]); } -#else -__device__ __forceinline__ -static void keccak_blockv30_80(uint64_t *s, const uint64_t *keccak_round_constants) +__global__ __launch_bounds__(512) +void keccak256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ resNounce) { - int i; - uint64_t t[5], u[5], v, w; - - /* absorb input */ - /* theta: c = a[0,i] ^ a[1,i] ^ .. 
a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10]; - t[1] = s[1] ^ s[6] ^ s[16]; - t[2] = s[2] ^ s[7]; - t[3] = s[3] ^ s[8] ; - t[4] = s[4] ^ s[9]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROTL64(t[1], 1); - u[1] = t[0] ^ ROTL64(t[2], 1); - u[2] = t[1] ^ ROTL64(t[3], 1); - u[3] = t[2] ^ ROTL64(t[4], 1); - u[4] = t[3] ^ ROTL64(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[16] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[1]; - s[1] = ROTL64(s[6], 44); - s[6] = ROTL64(s[9], 20); - s[9] = ROTL64(u[2], 61); - s[22] = ROTL64(u[4], 39); - s[14] = ROTL64(u[0], 18); - s[20] = ROTL64(s[2], 62); - s[2] = ROTL64(u[2], 43); - s[12] = ROTL64(u[3], 25); - s[13] = ROTL64(u[4], 8); - s[19] = ROTL64(u[3], 56); - s[23] = ROTL64(u[0], 41); - s[15] = ROTL64(s[4], 27); - s[4] = ROTL64(u[4], 14); - s[24] = ROTL64(u[1], 2); - s[21] = ROTL64(s[8], 55); - s[8] = ROTL64(s[16], 45); - s[16] = ROTL64(s[5], 36); - s[5] = ROTL64(s[3], 28); - s[3] = ROTL64(u[3], 21); - s[18] = ROTL64(u[2], 15); - s[17] = ROTL64(u[1], 10); - s[11] = ROTL64(s[7], 6); - s[7] = ROTL64(s[10], 3); - s[10] = ROTL64(v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; - v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= 
(~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] ^= keccak_round_constants[0]; - - for (i = 1; i < 23; i++) { - /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) + { + const uint32_t nounce = startNounce + thread; + uint2 bc[5], tmpxor[5], tmp1, tmp2; + uint2 s[25]; + + s[9] = make_uint2(c_PaddedMessage80[9].x, cuda_swab32(nounce)); + s[10] = make_uint2(1, 0); + s[16] = make_uint2(0, 0x80000000); + + tmpxor[0] = c_PaddedMessage80[0] ^ c_PaddedMessage80[5] ^ s[10]; + tmpxor[1] = c_PaddedMessage80[1] ^ c_PaddedMessage80[6] ^ s[16]; + tmpxor[2] = c_PaddedMessage80[2] ^ c_PaddedMessage80[7]; + tmpxor[3] = c_PaddedMessage80[3] ^ c_PaddedMessage80[8]; + tmpxor[4] = c_PaddedMessage80[4] ^ s[9]; + + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); + + tmp1 = c_PaddedMessage80[1] ^ bc[0]; + + s[0] = c_PaddedMessage80[0] ^ bc[4]; + s[1] = ROL2(c_PaddedMessage80[6] ^ bc[0], 44); + s[6] = ROL2(s[9] ^ bc[3], 20); + s[9] = ROL2(bc[1], 61); + s[22] = ROL2(bc[3], 39); + s[14] = ROL2(bc[4], 18); + s[20] = ROL2(c_PaddedMessage80[2] ^ bc[1], 62); + s[2] = ROL2(bc[1], 43); + s[12] = ROL2(bc[2], 25); + s[13] = ROL8(bc[3]); + s[19] = ROR8(bc[2]); + s[23] = ROL2(bc[4], 41); + s[15] = ROL2(c_PaddedMessage80[4] ^ bc[3], 27); + s[4] = ROL2(bc[3], 14); + s[24] = ROL2(bc[0], 2); + s[21] = ROL2(c_PaddedMessage80[8] ^ bc[2], 55); + s[8] = ROL2(s[16] ^ bc[0], 45); + s[16] = ROL2(c_PaddedMessage80[5] ^ bc[4], 36); + s[5] = ROL2(c_PaddedMessage80[3] ^ bc[2], 28); + s[3] = ROL2( bc[2], 21); + s[18] = ROL2(bc[1], 15); + s[17] = ROL2(bc[0], 10); + s[11] = ROL2(c_PaddedMessage80[7] ^ bc[1], 6); + s[7] = ROL2(s[10] ^ bc[4], 3); + s[10] = ROL2(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] 
= bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0].x ^= 1; + +#pragma unroll 2 + for (int i = 1; i < 23; i++) + { + +#pragma unroll + for (uint32_t x = 0; x < 5; x++) + tmpxor[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20]; + + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = ROL2(s[6] ^ bc[0], 44); + s[6] = ROL2(s[9] ^ bc[3], 20); + s[9] = ROL2(s[22] ^ bc[1], 61); + s[22] = ROL2(s[14] ^ bc[3], 39); + s[14] = ROL2(s[20] ^ bc[4], 18); + s[20] = ROL2(s[2] ^ bc[1], 62); + s[2] = ROL2(s[12] ^ bc[1], 43); + s[12] = ROL2(s[13] ^ bc[2], 25); + s[13] = ROL8(s[19] ^ bc[3]); + s[19] = ROR8(s[23] ^ bc[2]); + s[23] 
= ROL2(s[15] ^ bc[4], 41); + s[15] = ROL2(s[4] ^ bc[3], 27); + s[4] = ROL2(s[24] ^ bc[3], 14); + s[24] = ROL2(s[21] ^ bc[0], 2); + s[21] = ROL2(s[8] ^ bc[2], 55); + s[8] = ROL2(s[16] ^ bc[0], 45); + s[16] = ROL2(s[5] ^ bc[4], 36); + s[5] = ROL2(s[3] ^ bc[2], 28); + s[3] = ROL2(s[18] ^ bc[2], 21); + s[18] = ROL2(s[17] ^ bc[1], 15); + s[17] = ROL2(s[11] ^ bc[0], 10); + s[11] = ROL2(s[7] ^ bc[1], 6); + s[7] = ROL2(s[10] ^ bc[4], 3); + s[10] = ROL2(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0] ^= keccak_round_constants35[i]; + } + uint2 t[5]; t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; t[4] = s[4] ^ s[9] 
^ s[14] ^ s[19] ^ s[24]; - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROTL64(t[1], 1); - u[1] = t[0] ^ ROTL64(t[2], 1); - u[2] = t[1] ^ ROTL64(t[3], 1); - u[3] = t[2] ^ ROTL64(t[4], 1); - u[4] = t[3] ^ ROTL64(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[ 1]; - s[ 1] = ROTL64(s[ 6], 44); - s[ 6] = ROTL64(s[ 9], 20); - s[ 9] = ROTL64(s[22], 61); - s[22] = ROTL64(s[14], 39); - s[14] = ROTL64(s[20], 18); - s[20] = ROTL64(s[ 2], 62); - s[ 2] = ROTL64(s[12], 43); - s[12] = ROTL64(s[13], 25); - s[13] = ROTL64(s[19], 8); - s[19] = ROTL64(s[23], 56); - s[23] = ROTL64(s[15], 41); - s[15] = ROTL64(s[ 4], 27); - s[ 4] = ROTL64(s[24], 14); - s[24] = ROTL64(s[21], 2); - s[21] = ROTL64(s[ 8], 55); - s[ 8] = ROTL64(s[16], 45); - s[16] = ROTL64(s[ 5], 36); - s[ 5] = ROTL64(s[ 3], 28); - s[ 3] = ROTL64(s[18], 21); - s[18] = ROTL64(s[17], 15); - s[17] = ROTL64(s[11], 10); - s[11] = ROTL64(s[ 7], 6); - s[ 7] = ROTL64(s[10], 3); - s[10] = ROTL64( v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; - v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) 
& v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] ^= keccak_round_constants[i]; - } - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; - - s[0] ^= t[4] ^ ROTL64(t[1], 1); - s[18] ^= t[2] ^ ROTL64(t[4], 1); - s[24] ^= t[3] ^ ROTL64(t[0], 1); + s[0] ^= t[4] ^ ROL2(t[1], 1); + s[18] ^= t[2] ^ ROL2(t[4], 1); + s[24] ^= t[3] ^ ROL2(t[0], 1); - s[3] = ROTL64(s[18], 21) ^ ((~ROTL64(s[24], 14)) & s[0]); -} -#endif + s[3] = ROL2(s[18], 21) ^ ((~ROL2(s[24], 14)) & s[0]); -__global__ __launch_bounds__(128,5) -void keccak256_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = startNounce + thread; - -#if __CUDA_ARCH__ >= 350 - uint2 keccak_gpu_state[25]; - #pragma unroll 25 - for (int i=0; i<25; i++) { - if (i<9) keccak_gpu_state[i] = vectorize(c_PaddedMessage80[i]); - else keccak_gpu_state[i] = UINT2(0, 0); - } - - keccak_gpu_state[9]= vectorize(c_PaddedMessage80[9]); - keccak_gpu_state[9].y = cuda_swab32(nounce); - keccak_gpu_state[10] = UINT2(1, 0); - keccak_gpu_state[16] = UINT2(0, 0x80000000); - keccak_blockv35_80(keccak_gpu_state); - if (devectorize(keccak_gpu_state[3]) <= ((uint64_t*)pTarget)[3]) - { - uint32_t tmp = atomicCAS(resNounce, 0xffffffff, nounce); - if (tmp != 0xffffffff) - resNounce[1] = nounce; - } -#else - uint64_t keccak_gpu_state[25]; - #pragma unroll 25 - for (int i=0; i<25; i++) { - if (i<9) keccak_gpu_state[i] = c_PaddedMessage80[i]; - else keccak_gpu_state[i] = 0; - } - keccak_gpu_state[9] = REPLACE_HIWORD(c_PaddedMessage80[9], cuda_swab32(nounce)); - 
keccak_gpu_state[10] = 0x0000000000000001; - keccak_gpu_state[16] = 0x8000000000000000; - - keccak_blockv30_80(keccak_gpu_state, keccak_round_constants); - if (keccak_gpu_state[3] <= ((uint64_t*)pTarget)[3]) + if (devectorize(s[3]) <= ((uint64_t*)pTarget)[3]) { uint32_t tmp = atomicCAS(resNounce, 0xffffffff, nounce); if (tmp != 0xffffffff) resNounce[1] = nounce; } -#endif } } __host__ -void keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order, uint32_t *h_nounce) +void keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *h_nounce) { - cudaMemset(d_KNonce[thr_id], 0xff, 4*sizeof(uint32_t)); - const uint32_t threadsperblock = 128; + cudaMemsetAsync(d_KNonce[thr_id], 0xff, 2 * sizeof(uint32_t), gpustream[thr_id]); + const uint32_t threadsperblock = 512; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - - keccak256_gpu_hash_80<<>>(threads, startNounce, d_outputHash, d_KNonce[thr_id]); + keccak256_gpu_hash_80<<>>(threads, startNounce, d_KNonce[thr_id]); //MyStreamSynchronize(NULL, order, thr_id); - cudaMemcpy(h_nounce, d_KNonce[thr_id], 4 * sizeof(uint32_t), cudaMemcpyDeviceToHost); + CUDA_SAFE_CALL(cudaMemcpyAsync(h_nounce, d_KNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id])); cudaStreamSynchronize(gpustream[thr_id]); } __global__ __launch_bounds__(256,3) void keccak256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { -#if __CUDA_ARCH__ >= 350 /* tpr: to double check if faster on SM5+ */ uint2 keccak_gpu_state[25]; #pragma unroll 25 for (int i = 0; i<25; i++) { @@ -647,53 +263,37 @@ void keccak256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *out } keccak_gpu_state[4] = UINT2(1, 0); 
keccak_gpu_state[16] = UINT2(0, 0x80000000); - keccak_blockv35_32(keccak_gpu_state); + keccak_block(keccak_gpu_state); #pragma unroll 4 for (int i=0; i<4; i++) outputHash[i*threads+thread] = devectorize(keccak_gpu_state[i]); -#else - uint64_t keccak_gpu_state[25]; - #pragma unroll 25 - for (int i = 0; i<25; i++) { - if (i<4) - keccak_gpu_state[i] = outputHash[i*threads+thread]; - else - keccak_gpu_state[i] = 0; - } - keccak_gpu_state[4] = 0x0000000000000001; - keccak_gpu_state[16] = 0x8000000000000000; - - keccak_blockv30_32(keccak_gpu_state, keccak_round_constants); - #pragma unroll 4 - for (int i = 0; i<4; i++) - outputHash[i*threads + thread] = keccak_gpu_state[i]; -#endif } } __host__ -void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order) +void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash) { const uint32_t threadsperblock = 256; dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - keccak256_gpu_hash_32 <<>> (threads, startNounce, d_outputHash); + keccak256_gpu_hash_32 <<>> (threads, startNounce, d_outputHash); + CUDA_SAFE_CALL(cudaGetLastError()); } __host__ -void keccak256_setBlock_80(void *pdata,const void *pTargetIn) +void keccak256_setBlock_80(int thr_id, void *pdata,const void *pTargetIn) { unsigned char PaddedMessage[80]; memcpy(PaddedMessage, pdata, 80); - CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, pTargetIn, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); - CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 10*sizeof(uint64_t), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(pTarget, pTargetIn, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(c_PaddedMessage80, PaddedMessage, 10 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); } __host__ void keccak256_cpu_init(int thr_id, uint32_t 
threads) { - CUDA_SAFE_CALL(cudaMalloc(&d_KNonce[thr_id], 4*sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMalloc(&d_KNonce[thr_id], 2*sizeof(uint32_t))); } \ No newline at end of file diff --git a/Algo256/cuda_skein256.cu b/Algo256/cuda_skein256.cu index d2060e0911..0361d66ee1 100644 --- a/Algo256/cuda_skein256.cu +++ b/Algo256/cuda_skein256.cu @@ -2,146 +2,142 @@ #include "cuda_helper.h" -#if 0 -static __constant__ uint64_t SKEIN_IV512_256[8] = { - 0xCCD044A12FDB3E13, 0xE83590301A79A9EB, - 0x55AEA0614F816E6F, 0x2A2767A4AE9B94DB, - 0xEC06025E74DD7683, 0xE7A436CDC4746251, - 0xC36FBAF9393AD185, 0x3EEDBA1833EDFC13 -}; -#endif - -static __constant__ uint2 vSKEIN_IV512_256[8] = { - { 0x2FDB3E13, 0xCCD044A1 }, - { 0x1A79A9EB, 0xE8359030 }, - { 0x4F816E6F, 0x55AEA061 }, - { 0xAE9B94DB, 0x2A2767A4 }, - { 0x74DD7683, 0xEC06025E }, - { 0xC4746251, 0xE7A436CD }, - { 0x393AD185, 0xC36FBAF9 }, - { 0x33EDFC13, 0x3EEDBA18 } -}; - -static __constant__ int ROT256[8][4] = -{ - 46,36, 19, 37, - 33,27, 14, 42, - 17,49, 36, 39, - 44, 9, 54, 56, - 39,30, 34, 24, - 13,50, 10, 17, - 25,29, 39, 43, - 8, 35, 56, 22, -}; - -static __constant__ uint2 skein_ks_parity = { 0xA9FC1A22,0x1BD11BDA}; -static __constant__ uint2 t12[6] = { - { 0x20, 0 }, - { 0, 0xf0000000 }, - { 0x20, 0xf0000000 }, - { 0x08, 0 }, - { 0, 0xff000000 }, - { 0x08, 0xff000000 } -}; - -#if 0 -static __constant__ uint64_t t12_30[6] = { - 0x20, - 0xf000000000000000, - 0xf000000000000020, - 0x08, - 0xff00000000000000, - 0xff00000000000008 -}; -#endif - static __forceinline__ __device__ -void Round512v35(uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, int ROT) +void Round512v35(uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, + const int ROT0, const int ROT1, const int ROT2, const int ROT3) { - p0 += p1; p1 = ROL2(p1, ROT256[ROT][0]); p1 ^= p0; - p2 += p3; p3 = ROL2(p3, ROT256[ROT][1]); p3 ^= p2; - p4 += p5; p5 = ROL2(p5, ROT256[ROT][2]); p5 ^= p4; - p6 
+= p7; p7 = ROL2(p7, ROT256[ROT][3]); p7 ^= p6; + p0 += p1; p1 = ROL2(p1, ROT0) ^ p0; + p2 += p3; p3 = ROL2(p3, ROT1) ^ p2; + p4 += p5; p5 = ROL2(p5, ROT2) ^ p4; + p6 += p7; p7 = ROL2(p7, ROT3) ^ p6; } +__forceinline__ __device__ +void Round_8_512v35(const uint2 *const __restrict__ ks, const uint2 *const __restrict__ ts, + uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, const int R) +{ + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); + + p0 += ks[(R+0) % 9]; + p1 += ks[(R+1) % 9]; + p2 += ks[(R+2) % 9]; + p3 += ks[(R+3) % 9]; + p4 += ks[(R+4) % 9]; + p5 += ks[(R+5) % 9] + ts[(R+0) % 3]; + p6 += ks[(R+6) % 9] + ts[(R+1) % 3]; + p7 += ks[(R+7) % 9] + make_uint2(R, 0); + + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); + + p0 += ks[(R+1) % 9]; + p1 += ks[(R+2) % 9]; + p2 += ks[(R+3) % 9]; + p3 += ks[(R+4) % 9]; + p4 += ks[(R+5) % 9]; + p5 += ks[(R+6) % 9] + ts[(R+1) % 3]; + p6 += ks[(R+7) % 9] + ts[(R+2) % 3]; + p7 += ks[(R+8) % 9] + make_uint2(R+1, 0); +} -static __forceinline__ __device__ -void Round_8_512v35(uint2 *ks, uint2 *ts, - uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, - uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, int R) +__forceinline__ __device__ +void Round_8_512v35_final(const uint2 *const __restrict__ ks, const uint2 *const __restrict__ ts, + uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7) { - Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 0); - Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 1); - Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 2); - Round512v35(p6, p1, p0, p7, p2, p5, p4, 
p3, 3); - p0 += ks[((R)+0) % 9]; /* inject the key schedule value */ - p1 += ks[((R)+1) % 9]; - p2 += ks[((R)+2) % 9]; - p3 += ks[((R)+3) % 9]; - p4 += ks[((R)+4) % 9]; - p5 += ks[((R)+5) % 9] + ts[((R)+0) % 3]; - p6 += ks[((R)+6) % 9] + ts[((R)+1) % 3]; - p7 += ks[((R)+7) % 9] + make_uint2((R),0); - Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 4); - Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 5); - Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 6); - Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 7); - p0 += ks[((R)+1) % 9]; /* inject the key schedule value */ - p1 += ks[((R)+2) % 9]; - p2 += ks[((R)+3) % 9]; - p3 += ks[((R)+4) % 9]; - p4 += ks[((R)+5) % 9]; - p5 += ks[((R)+6) % 9] + ts[((R)+1) % 3]; - p6 += ks[((R)+7) % 9] + ts[((R)+2) % 3]; - p7 += ks[((R)+8) % 9] + make_uint2((R)+1, 0); + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); + + p0 += ks[8]; + p1 += ks[0]; + p2 += ks[1]; + p3 += ks[2]; + p4 += ks[3]; + p5 += ks[4] + ts[2]; + p6 += ks[5] + ts[0]; + p7 += ks[6] + make_uint2(17, 0); + + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); + + p0 += ks[0]; + p1 += ks[1]; + p2 += ks[2]; + p3 += ks[3]; } -__global__ __launch_bounds__(256,3) + +__global__ __launch_bounds__(256,4) void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint2 skein_ks_parity = { 0xA9FC1A22, 0x1BD11BDA }; + + const uint2 h2[9] = { + { 0x2FDB3E13, 0xCCD044A1 }, + { 0x1A79A9EB, 0xE8359030 }, + { 
0x4F816E6F, 0x55AEA061 }, + { 0xAE9B94DB, 0x2A2767A4 }, + { 0x74DD7683, 0xEC06025E }, + { 0xC4746251, 0xE7A436CD }, + { 0x393AD185, 0xC36FBAF9 }, + { 0x33EDFC13, 0x3EEDBA18 }, + { 0xC73A4E2A, 0xB69D3CFC } + }; + const uint2 t12[6] = { + { 0x20, 0 }, + { 0, 0xf0000000 }, + { 0x20, 0xf0000000 }, + { 0x08, 0 }, + { 0, 0xff000000 }, + { 0x08, 0xff000000 } + }; + +// if (thread < threads) { - uint2 h[9]; - uint2 t[3]; + uint2 dt0,dt1,dt2,dt3; uint2 p0, p1, p2, p3, p4, p5, p6, p7; - h[8] = skein_ks_parity; - for (int i = 0; i<8; i++) { - h[i] = vSKEIN_IV512_256[i]; - h[8] ^= h[i]; - } - - t[0]=t12[0]; - t[1]=t12[1]; - t[2]=t12[2]; - LOHI(dt0.x,dt0.y,outputHash[thread]); LOHI(dt1.x,dt1.y,outputHash[threads+thread]); LOHI(dt2.x,dt2.y,outputHash[2*threads+thread]); LOHI(dt3.x,dt3.y,outputHash[3*threads+thread]); - p0 = h[0] + dt0; - p1 = h[1] + dt1; - p2 = h[2] + dt2; - p3 = h[3] + dt3; - p4 = h[4]; - p5 = h[5] + t[0]; - p6 = h[6] + t[1]; - p7 = h[7]; - - #pragma unroll - for (int i = 1; i<19; i+=2) { - Round_8_512v35(h,t,p0,p1,p2,p3,p4,p5,p6,p7,i); - } + p0 = h2[0] + dt0; + p1 = h2[1] + dt1; + p2 = h2[2] + dt2; + p3 = h2[3] + dt3; + p4 = h2[4]; + p5 = h2[5] + t12[0]; + p6 = h2[6] + t12[1]; + p7 = h2[7]; + + Round_8_512v35(h2, t12, p0, p1, p2, p3, p4, p5, p6, p7, 1); + Round_8_512v35(h2, t12, p0, p1, p2, p3, p4, p5, p6, p7, 3); + Round_8_512v35(h2, t12, p0, p1, p2, p3, p4, p5, p6, p7, 5); + Round_8_512v35(h2, t12, p0, p1, p2, p3, p4, p5, p6, p7, 7); + Round_8_512v35(h2, t12, p0, p1, p2, p3, p4, p5, p6, p7, 9); + Round_8_512v35(h2, t12, p0, p1, p2, p3, p4, p5, p6, p7, 11); + Round_8_512v35(h2, t12, p0, p1, p2, p3, p4, p5, p6, p7, 13); + Round_8_512v35(h2, t12, p0, p1, p2, p3, p4, p5, p6, p7, 15); + Round_8_512v35(h2, t12, p0, p1, p2, p3, p4, p5, p6, p7, 17); p0 ^= dt0; p1 ^= dt1; p2 ^= dt2; p3 ^= dt3; + uint2 h[9]; h[0] = p0; h[1] = p1; h[2] = p2; @@ -150,23 +146,21 @@ void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outp h[5] = p5; h[6] = p6; h[7] 
= p7; - h[8] = skein_ks_parity; - - #pragma unroll 8 - for (int i = 0; i<8; i++) { - h[8] ^= h[i]; - } - - t[0] = t12[3]; - t[1] = t12[4]; - t[2] = t12[5]; - p5 += t[0]; //p5 already equal h[5] - p6 += t[1]; - - #pragma unroll - for (int i = 1; i<19; i+=2) { - Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, i); - } + h[8] = skein_ks_parity ^ h[0] ^ h[1] ^ h[2] ^ h[3] ^ h[4] ^ h[5] ^ h[6] ^ h[7]; + + const uint2 *t = t12+3; + p5 += t12[3]; //p5 already equal h[5] + p6 += t12[4]; + + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 1); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 3); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 5); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 7); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 9); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 11); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 13); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 15); + Round_8_512v35_final(h, t, p0, p1, p2, p3, p4, p5, p6, p7); outputHash[thread] = devectorize(p0); outputHash[threads+thread] = devectorize(p1); @@ -178,18 +172,17 @@ void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outp __host__ void skein256_cpu_init(int thr_id, uint32_t threads) { - //empty } __host__ -void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order) +void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash) { - const uint32_t threadsperblock = 256; + const uint32_t threadsperblock = 32; dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - skein256_gpu_hash_32<<>>(threads, startNounce, d_outputHash); - + skein256_gpu_hash_32<<>>(threads, startNounce, d_outputHash); + CUDA_SAFE_CALL(cudaGetLastError()); } diff --git a/Algo256/keccak256.cu b/Algo256/keccak256.cu index 5d4db89b46..cec9ded7ba 100644 --- a/Algo256/keccak256.cu +++ 
b/Algo256/keccak256.cu @@ -8,21 +8,18 @@ extern "C" #include "sph/sph_shavite.h" #include "sph/sph_simd.h" #include "sph/sph_keccak.h" - -#include "miner.h" } +#include "miner.h" -#include "cuda_helper.h" -static uint32_t *d_hash[MAX_GPUS]; -static uint32_t *h_nounce[MAX_GPUS]; +#include "cuda_helper.h" extern void keccak256_cpu_init(int thr_id, uint32_t threads); -extern void keccak256_setBlock_80(void *pdata,const void *ptarget); -extern void keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order, uint32_t *h_nounce); +extern void keccak256_setBlock_80(int thr_id, void *pdata,const void *ptarget); +extern void keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *h_nounce); // CPU Hash -extern "C" void keccak256_hash(void *state, const void *input) +void keccak256_hash(void *state, const void *input) { sph_keccak_context ctx_keccak; @@ -35,78 +32,94 @@ extern "C" void keccak256_hash(void *state, const void *input) memcpy(state, hash, 32); } -static bool init[MAX_GPUS] = { 0 }; - -extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_keccak256(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { + static THREAD uint32_t *h_nounce = nullptr; + const uint32_t first_nonce = pdata[19]; - uint32_t throughput = device_intensity(thr_id, __func__, 1U << 21); // 256*256*8*4 - throughput = min(throughput, (max_nonce - first_nonce)); + uint32_t intensity = (device_sm[device_map[thr_id]] > 500) ? 
1 << 28 : 1 << 27;; + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, intensity); // 256*4096 + uint32_t throughput = min(throughputmax, max_nonce - first_nonce) & 0xfffffc00; + if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0005; + ptarget[7] = 0x0002; - if (!init[thr_id]) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + static THREAD volatile bool init = false; + if(!init) + { + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); - - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput)); - keccak256_cpu_init(thr_id, (int)throughput); - CUDA_SAFE_CALL(cudaMallocHost(&h_nounce[thr_id], 4 * sizeof(uint32_t))); - init[thr_id] = true; + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + CUDA_SAFE_CALL(cudaMallocHost(&h_nounce, 2 * sizeof(uint32_t))); + keccak256_cpu_init(thr_id, (int)throughputmax); +// CUDA_SAFE_CALL(cudaMallocHost(&h_nounce, 2 * sizeof(uint32_t))); + mining_has_stopped[thr_id] = false; + init = true; } uint32_t endiandata[20]; for (int k=0; k < 20; k++) { - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); } - keccak256_setBlock_80((void*)endiandata, ptarget); + keccak256_setBlock_80(thr_id, (void*)endiandata, ptarget); do { - int order = 0; - keccak256_cpu_hash_80(thr_id, (int) throughput, pdata[19], d_hash[thr_id], order++, h_nounce[thr_id]); - if (h_nounce[thr_id][0] != UINT32_MAX) + keccak256_cpu_hash_80(thr_id, (int) throughput, pdata[19], h_nounce); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(h_nounce[0] != UINT32_MAX) { uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; - be32enc(&endiandata[19], h_nounce[thr_id][0]); + uint32_t vhash64[8]={0}; + if(opt_verify){ be32enc(&endiandata[19], h_nounce[0]); keccak256_hash(vhash64, endiandata); - if (vhash64[7] <= Htarg && 
fulltest(vhash64, ptarget)) + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { int res = 1; // check if there was some other ones... *hashes_done = pdata[19] - first_nonce + throughput; - if (h_nounce[thr_id][1] != 0xffffffff) + if (h_nounce[1] != 0xffffffff) { - pdata[21] = h_nounce[thr_id][1]; - res++; - if (opt_benchmark) - applog(LOG_INFO, "GPU #%d Found second nounce %08x", thr_id, h_nounce[thr_id][1], vhash64[7], Htarg); + if(opt_verify){ be32enc(&endiandata[19], h_nounce[1]); + keccak256_hash(vhash64, endiandata); + + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + pdata[21] = h_nounce[1]; + res++; + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d Found second nounce %08x", device_map[thr_id], h_nounce[1]); + } + else + { + if (vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_nounce[1]); + } + } } - pdata[19] = h_nounce[thr_id][0]; + pdata[19] = h_nounce[0]; if (opt_benchmark) - applog(LOG_INFO, "GPU #%d Found nounce %08x", thr_id, h_nounce[thr_id][0], vhash64[7], Htarg); + applog(LOG_INFO, "GPU #%d Found nounce %08x", device_map[thr_id], h_nounce[0]); return res; } else { if (vhash64[7] != Htarg) { - applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, h_nounce[thr_id][0]); + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_nounce[0]); } } } - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - - *hashes_done = pdata[19] - first_nonce; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/JHA/cuda_jha_compactionTest.cu b/JHA/cuda_jha_compactionTest.cu index 9d4c5d63e4..faf8373ab7 100644 --- a/JHA/cuda_jha_compactionTest.cu +++ b/JHA/cuda_jha_compactionTest.cu @@ -4,6 +4,7 @@ #include "cuda_helper.h" #include + static uint32_t 
*d_tempBranch1Nonces[MAX_GPUS]; static uint32_t *d_numValid[MAX_GPUS]; static uint32_t *h_numValid[MAX_GPUS]; @@ -32,8 +33,8 @@ cuda_compactTestFunction_t h_JackpotTrueFunction[MAX_GPUS], h_JackpotFalseFuncti // Setup-Funktionen __host__ void jackpot_compactTest_cpu_init(int thr_id, uint32_t threads) { - cudaMemcpyFromSymbol(&h_JackpotTrueFunction[thr_id], d_JackpotTrueFunction, sizeof(cuda_compactTestFunction_t)); - cudaMemcpyFromSymbol(&h_JackpotFalseFunction[thr_id], d_JackpotFalseFunction, sizeof(cuda_compactTestFunction_t)); + cudaMemcpyFromSymbolAsync(&h_JackpotTrueFunction[thr_id], d_JackpotTrueFunction, sizeof(cuda_compactTestFunction_t), 0, cudaMemcpyDeviceToHost, gpustream[thr_id]); + cudaMemcpyFromSymbolAsync(&h_JackpotFalseFunction[thr_id], d_JackpotFalseFunction, sizeof(cuda_compactTestFunction_t), 0, cudaMemcpyDeviceToHost, gpustream[thr_id]); // wir brauchen auch Speicherplatz auf dem Device cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2); @@ -47,18 +48,10 @@ __host__ void jackpot_compactTest_cpu_init(int thr_id, uint32_t threads) cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block) } -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 -/** - * __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1 - */ -#undef __shfl_up -#define __shfl_up(var, delta, width) (0) -#endif - // Die Summenfunktion (vom NVIDIA SDK) __global__ void jackpot_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) { - extern __shared__ uint32_t sums[]; + __shared__ uint32_t sums[32]; int id = ((blockIdx.x * blockDim.x) + threadIdx.x); //int lane_id = id % warpSize; int lane_id = id % width; @@ -192,7 +185,7 @@ __global__ void jackpot_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t 
*outp, c uint32_t value; if (id < threads) { -// uint32_t nounce = startNounce + id; +// const uint32_t nounce = startNounce + id; uint32_t *inpHash; if(d_validNonceTable == NULL) { @@ -252,38 +245,38 @@ __host__ void jackpot_compactTest_cpu_singleCompaction(int thr_id, uint32_t thre bool callThrid = (thr2 > 0) ? true : false; // Erster Initialscan - jackpot_compactTest_gpu_SCAN<<>>( + jackpot_compactTest_gpu_SCAN<<>>( d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable); // weitere Scans if(callThrid) { - jackpot_compactTest_gpu_SCAN<<>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]); - jackpot_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2); + jackpot_compactTest_gpu_SCAN<<>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]); + jackpot_compactTest_gpu_SCAN<<<1, thr2, 0, gpustream[thr_id]>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2); }else { - jackpot_compactTest_gpu_SCAN<<>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2); + jackpot_compactTest_gpu_SCAN<<>>(d_partSum[0][thr_id], (blockSize2>32) ? 
32 : blockSize2); } if(callThrid) - cudaMemcpy(nrm, &(d_partSum[1][thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaMemcpyAsync(nrm, &(d_partSum[1][thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); else - cudaMemcpy(nrm, &(d_partSum[0][thr_id])[nSummen-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaMemcpyAsync(nrm, &(d_partSum[0][thr_id])[nSummen - 1], sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); // Addieren if(callThrid) { - jackpot_compactTest_gpu_ADD<<>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2); + jackpot_compactTest_gpu_ADD<<>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2); } - jackpot_compactTest_gpu_ADD<<>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads); + jackpot_compactTest_gpu_ADD<<>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads); // Scatter - jackpot_compactTest_gpu_SCATTER<<>>(d_tempBranch1Nonces[thr_id], d_nonces1, + jackpot_compactTest_gpu_SCATTER<<>>(d_tempBranch1Nonces[thr_id], d_nonces1, function, orgThreads, startNounce, inpHashes, d_validNonceTable); // Sync - cudaDeviceSynchronize(); + cudaStreamSynchronize(gpustream[thr_id]); } ////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048) @@ -301,33 +294,32 @@ __host__ void jackpot_compactTest_cpu_dualCompaction(int thr_id, uint32_t thread int thr2 = threads / (blockSize*blockSize); // 1 - jackpot_compactTest_gpu_SCAN<<>>(d_tempBranch1Nonces[thr_id], 32, d_partSum1[thr_id], h_JackpotTrueFunction[thr_id], threads, startNounce, inpHashes); - jackpot_compactTest_gpu_SCAN<<>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]); - jackpot_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 
32 : thr2); - cudaMemcpy(&nrm[0], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); - jackpot_compactTest_gpu_ADD<<>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2); - jackpot_compactTest_gpu_ADD<<>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads); + jackpot_compactTest_gpu_SCAN<<>>(d_tempBranch1Nonces[thr_id], 32, d_partSum1[thr_id], h_JackpotTrueFunction[thr_id], threads, startNounce, inpHashes); + jackpot_compactTest_gpu_SCAN<<>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]); + jackpot_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t), 0, gpustream[thr_id]>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2); + cudaMemcpyAsync(&nrm[0], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); + jackpot_compactTest_gpu_ADD<<>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2); + jackpot_compactTest_gpu_ADD<<>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads); // 2 - jackpot_compactTest_gpu_SCAN<<>>(d_tempBranch2Nonces[thr_id], 32, d_partSum1[thr_id], h_JackpotFalseFunction[thr_id], threads, startNounce, inpHashes); - jackpot_compactTest_gpu_SCAN<<>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]); - jackpot_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 
32 : thr2); - cudaMemcpy(&nrm[1], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); - jackpot_compactTest_gpu_ADD<<>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2); - jackpot_compactTest_gpu_ADD<<>>(d_tempBranch2Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads); + jackpot_compactTest_gpu_SCAN<<>>(d_tempBranch2Nonces[thr_id], 32, d_partSum1[thr_id], h_JackpotFalseFunction[thr_id], threads, startNounce, inpHashes); + jackpot_compactTest_gpu_SCAN<<>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]); + jackpot_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t), 0, gpustream[thr_id]>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2); + cudaMemcpyAsync(&nrm[1], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); + jackpot_compactTest_gpu_ADD<<>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2); + jackpot_compactTest_gpu_ADD<<>>(d_tempBranch2Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads); // Hier ist noch eine Besonderheit: in d_tempBranch1Nonces sind die element von 1...nrm1 die Interessanten // Schritt 3: Scatter - jackpot_compactTest_gpu_SCATTER<<>>(d_tempBranch1Nonces[thr_id], d_nonces1, h_JackpotTrueFunction[thr_id], threads, startNounce, inpHashes); - jackpot_compactTest_gpu_SCATTER<<>>(d_tempBranch2Nonces[thr_id], d_nonces2, h_JackpotFalseFunction[thr_id], threads, startNounce, inpHashes); + jackpot_compactTest_gpu_SCATTER<<>>(d_tempBranch1Nonces[thr_id], d_nonces1, h_JackpotTrueFunction[thr_id], threads, startNounce, inpHashes); + jackpot_compactTest_gpu_SCATTER<<>>(d_tempBranch2Nonces[thr_id], d_nonces2, h_JackpotFalseFunction[thr_id], threads, startNounce, inpHashes); cudaDeviceSynchronize(); */ } __host__ void jackpot_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, uint32_t *d_nonces1, uint32_t *nrm1, - uint32_t *d_nonces2, uint32_t *nrm2, - int order) + uint32_t *d_nonces2, 
uint32_t *nrm2) { // Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind // "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen! diff --git a/JHA/cuda_jha_keccak512.cu b/JHA/cuda_jha_keccak512.cu index 37b15ee578..47c3594d9e 100644 --- a/JHA/cuda_jha_keccak512.cu +++ b/JHA/cuda_jha_keccak512.cu @@ -3,6 +3,7 @@ #include "cuda_helper.h" + __constant__ uint64_t c_State[25]; __constant__ uint32_t c_PaddedMessage[18]; @@ -12,23 +13,21 @@ __constant__ uint32_t c_PaddedMessage[18]; #define U64TO32_LE(p, v) \ *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32); -static const uint64_t host_keccak_round_constants[24] = { - 0x0000000000000001ull, 0x0000000000008082ull, - 0x800000000000808aull, 0x8000000080008000ull, - 0x000000000000808bull, 0x0000000080000001ull, - 0x8000000080008081ull, 0x8000000000008009ull, - 0x000000000000008aull, 0x0000000000000088ull, - 0x0000000080008009ull, 0x000000008000000aull, - 0x000000008000808bull, 0x800000000000008bull, - 0x8000000000008089ull, 0x8000000000008003ull, - 0x8000000000008002ull, 0x8000000000000080ull, - 0x000000000000800aull, 0x800000008000000aull, - 0x8000000080008081ull, 0x8000000000008080ull, - 0x0000000080000001ull, 0x8000000080008008ull +__constant__ uint64_t c_keccak_round_constants[24] = { + 0x0000000000000001ull, 0x0000000000008082ull, + 0x800000000000808aull, 0x8000000080008000ull, + 0x000000000000808bull, 0x0000000080000001ull, + 0x8000000080008081ull, 0x8000000000008009ull, + 0x000000000000008aull, 0x0000000000000088ull, + 0x0000000080008009ull, 0x000000008000000aull, + 0x000000008000808bull, 0x800000000000008bull, + 0x8000000000008089ull, 0x8000000000008003ull, + 0x8000000000008002ull, 0x8000000000000080ull, + 0x000000000000800aull, 0x800000008000000aull, + 0x8000000080008081ull, 0x8000000000008080ull, + 0x0000000080000001ull, 0x8000000080008008ull }; -__constant__ uint64_t c_keccak_round_constants[24]; - static __device__ __forceinline__ void 
keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) { int i; @@ -102,12 +101,12 @@ keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_const __global__ void jackpot_keccak512_gpu_hash(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = startNounce + thread; + const uint32_t nounce = startNounce + thread; - int hashPosition = nounce - startNounce; + const uint32_t hashPosition = nounce - startNounce; // Nachricht kopieren uint32_t message[18]; @@ -147,11 +146,6 @@ __global__ void jackpot_keccak512_gpu_hash(uint32_t threads, uint32_t startNounc // Setup-Funktionen __host__ void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads) { - // Kopiere die Hash-Tabellen in den GPU-Speicher - cudaMemcpyToSymbol( c_keccak_round_constants, - host_keccak_round_constants, - sizeof(host_keccak_round_constants), - 0, cudaMemcpyHostToDevice); } #define cKeccakB 1600 @@ -161,7 +155,7 @@ __host__ void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads) #define crypto_hash_BYTES 64 #if (cKeccakB == 1600) - typedef unsigned long long UINT64; + typedef uint64_t UINT64; typedef UINT64 tKeccakLane; #define cKeccakNumberOfRounds 24 #endif @@ -487,7 +481,7 @@ void KeccakF( tKeccakLane * state, const tKeccakLane *in, int laneCount ) } // inlen kann 72...143 betragen -__host__ void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen) +__host__ void jackpot_keccak512_cpu_setBlock(int thr_id, void *pdata, size_t inlen) { const unsigned char *in = (const unsigned char*)pdata; @@ -503,10 +497,10 @@ __host__ void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen) // Kopiere den state nach der ersten Runde (nach Absorption von 72 Bytes Inputdaten) // ins Constant Memory - cudaMemcpyToSymbol( c_State, + cudaMemcpyToSymbolAsync( c_State, state, 
sizeof(state), - 0, cudaMemcpyHostToDevice); + 0, cudaMemcpyHostToDevice, gpustream[thr_id]); // padding memcpy( temp, in, (size_t)inlen ); @@ -516,13 +510,13 @@ __host__ void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen) // Kopiere den Rest der Message und das Padding ins Constant Memory - cudaMemcpyToSymbol( c_PaddedMessage, + cudaMemcpyToSymbolAsync( c_PaddedMessage, temp, cKeccakR_SizeInBytes, - 0, cudaMemcpyHostToDevice); + 0, cudaMemcpyHostToDevice, gpustream[thr_id]); } -__host__ void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order) +__host__ void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) { const uint32_t threadsperblock = 256; @@ -530,5 +524,5 @@ __host__ void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - jackpot_keccak512_gpu_hash<<>>(threads, startNounce, (uint64_t*)d_hash); + jackpot_keccak512_gpu_hash<<>>(threads, startNounce, (uint64_t*)d_hash); } diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu index e455acee6d..eaa5ea16c5 100644 --- a/JHA/jackpotcoin.cu +++ b/JHA/jackpotcoin.cu @@ -10,38 +10,29 @@ extern "C" #include "miner.h" #include "cuda_helper.h" -static uint32_t *d_hash[MAX_GPUS]; - extern void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads); -extern void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen); -extern void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); +extern void jackpot_keccak512_cpu_setBlock(int thr_id, void *pdata, size_t inlen); +extern void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); -extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void 
quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); -extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_skein512_cpu_init(int thr_id); -extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void jackpot_compactTest_cpu_init(int thr_id, uint32_t threads); extern void jackpot_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, uint32_t *d_nonces1, uint32_t *nrm1, - uint32_t *d_nonces2, uint32_t *nrm2, - int order); - -extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); + uint32_t *d_nonces2, uint32_t *nrm2); -// Speicher zur Generierung der Noncevektoren für die bedingten Hashes -static uint32_t *d_jackpotNonces[MAX_GPUS]; -static uint32_t *d_branch1Nonces[MAX_GPUS]; -static uint32_t *d_branch2Nonces[MAX_GPUS]; -static uint32_t *d_branch3Nonces[MAX_GPUS]; +extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash); // Original 
jackpothash Funktion aus einem miner Quelltext -extern "C" unsigned int jackpothash(void *state, const void *input) +unsigned int jackpothash(void *state, const void *input) { sph_blake512_context ctx_blake; sph_groestl512_context ctx_groestl; @@ -83,155 +74,171 @@ extern "C" unsigned int jackpothash(void *state, const void *input) return round; } -static bool init[MAX_GPUS] = { 0 }; - -extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_jackpot(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { const uint32_t first_nonce = pdata[19]; - uint32_t throughput = device_intensity(thr_id, __func__, 1U << 20); - throughput = min(throughput, (max_nonce - first_nonce)); + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, 1U << 20); + uint32_t throughput = min(throughputmax, (max_nonce - first_nonce)) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x000f; + ptarget[7] = 0x000f; - if (!init[thr_id]) + static THREAD uint32_t *d_hash = nullptr; + static THREAD uint32_t *d_jackpotNonces = nullptr; + static THREAD uint32_t *d_branch1Nonces = nullptr; + static THREAD uint32_t *d_branch2Nonces = nullptr; + static THREAD uint32_t *d_branch3Nonces = nullptr; + static THREAD volatile bool init = false; + + if (!init) { - CUDA_CALL_OR_RET_X(cudaSetDevice(device_map[thr_id]), 0); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + get_cuda_arch(&cuda_arch[thr_id]); - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput)); + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * sizeof(uint32_t) * throughputmax)); - jackpot_keccak512_cpu_init(thr_id, throughput); - jackpot_compactTest_cpu_init(thr_id, throughput); - 
quark_groestl512_cpu_init(thr_id, throughput); + jackpot_keccak512_cpu_init(thr_id, throughputmax); + jackpot_compactTest_cpu_init(thr_id, throughputmax); + quark_groestl512_cpu_init(thr_id, throughputmax); quark_skein512_cpu_init(thr_id); - cuda_check_cpu_init(thr_id, throughput); + cuda_check_cpu_init(thr_id, throughputmax); - cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput*2); - cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput*2); - cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput*2); + cudaMalloc(&d_branch1Nonces, sizeof(uint32_t)*throughputmax * 1.25/2); + cudaMalloc(&d_branch2Nonces, sizeof(uint32_t)*throughputmax * 1.25/2); + cudaMalloc(&d_branch3Nonces, sizeof(uint32_t)*throughputmax * 1.25); // 25% more than we need, just in case - CUDA_SAFE_CALL(cudaMalloc(&d_jackpotNonces[thr_id], sizeof(uint32_t)*throughput*2)); + CUDA_SAFE_CALL(cudaMalloc(&d_jackpotNonces, sizeof(uint32_t)*throughputmax * 2)); + mining_has_stopped[thr_id] = false; - init[thr_id] = true; + init = true; } uint32_t endiandata[22]; for (int k=0; k < 22; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); - jackpot_keccak512_cpu_setBlock((void*)endiandata, 80); - cuda_check_cpu_setTarget(ptarget); + jackpot_keccak512_cpu_setBlock(thr_id, (void*)endiandata, 80); + cuda_check_cpu_setTarget(ptarget, thr_id); do { - int order = 0; - // erstes Keccak512 Hash mit CUDA - jackpot_keccak512_cpu_hash(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + jackpot_keccak512_cpu_hash(thr_id, throughput, pdata[19], d_hash); uint32_t nrm1, nrm2, nrm3; // Runde 1 (ohne Gröstl) - jackpot_compactTest_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL, - d_branch1Nonces[thr_id], &nrm1, - d_branch3Nonces[thr_id], &nrm3, - order++); + jackpot_compactTest_cpu_hash_64(thr_id, throughput, pdata[19], d_hash, NULL, + d_branch1Nonces, &nrm1, + d_branch3Nonces, &nrm3); // verfolge den skein-pfad weiter - 
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces, d_hash); // noch schnell Blake & JH - jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], - d_branch1Nonces[thr_id], &nrm1, - d_branch2Nonces[thr_id], &nrm2, - order++); + jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash, d_branch3Nonces, + d_branch1Nonces, &nrm1, + d_branch2Nonces, &nrm2); if (nrm1+nrm2 == nrm3) { - quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); - quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); + quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces, d_hash); + quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces, d_hash); } // Runde 3 (komplett) // jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01) - jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], - d_branch1Nonces[thr_id], &nrm1, - d_branch2Nonces[thr_id], &nrm2, - order++); + jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash, d_branch3Nonces, + d_branch1Nonces, &nrm1, + d_branch2Nonces, &nrm2); if (nrm1+nrm2 == nrm3) { - quark_groestl512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); - quark_skein512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces, d_hash); + quark_skein512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces, d_hash); } // jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01) - jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], - d_branch1Nonces[thr_id], &nrm1, - d_branch2Nonces[thr_id], &nrm2, - order++); + 
jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash, d_branch3Nonces, + d_branch1Nonces, &nrm1, + d_branch2Nonces, &nrm2); if (nrm1+nrm2 == nrm3) { - quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); - quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); + quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces, d_hash); + quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces, d_hash); } // Runde 3 (komplett) // jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01) - jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], - d_branch1Nonces[thr_id], &nrm1, - d_branch2Nonces[thr_id], &nrm2, - order++); + jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash, d_branch3Nonces, + d_branch1Nonces, &nrm1, + d_branch2Nonces, &nrm2); if (nrm1+nrm2 == nrm3) { - quark_groestl512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); - quark_skein512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces, d_hash); + quark_skein512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces, d_hash); } // jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01) - jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], - d_branch1Nonces[thr_id], &nrm1, - d_branch2Nonces[thr_id], &nrm2, - order++); + jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash, d_branch3Nonces, + d_branch1Nonces, &nrm1, + d_branch2Nonces, &nrm2); if (nrm1+nrm2 == nrm3) { - quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); - quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); + quark_blake512_cpu_hash_64(thr_id, nrm1, 
pdata[19], d_branch1Nonces, d_hash); + quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces, d_hash); } - uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - if (foundNonce != 0xffffffff) + uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces, d_hash); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(foundNonce != 0xffffffff) { unsigned int rounds; - uint32_t vhash64[8]; + uint32_t vhash64[8]={0}; uint32_t Htarg = ptarget[7]; - be32enc(&endiandata[19], foundNonce); + if(opt_verify){ be32enc(&endiandata[19], foundNonce); // diese jackpothash Funktion gibt die Zahl der Runden zurück rounds = jackpothash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { int res = 1; - uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonce); + uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash, foundNonce); *hashes_done = pdata[19] - first_nonce + throughput; - if (secNonce != 0) { - pdata[21] = secNonce; - res++; + if (secNonce != 0) + { + if(opt_verify){ be32enc(&endiandata[19], secNonce); + rounds = jackpothash(vhash64, endiandata); + + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + pdata[21] = secNonce; + res++; + } + else + { + if(opt_verify) + applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU (%d rounds)!", device_map[thr_id], secNonce, rounds); + } } pdata[19] = foundNonce; return res; } else { - applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU (%d rounds)!", thr_id, foundNonce, rounds); + if(opt_verify) + applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU (%d rounds)!", device_map[thr_id], foundNonce, rounds); } } - pdata[19] += throughput; 
+ pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/Makefile.am b/Makefile.am index ff320b399c..e593739769 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,12 +1,13 @@ # allow to use Host cuda functions in C/C++ DEF_INCLUDES = @CUDA_INCLUDES@ -JANSSON_INCLUDES= if WANT_JANSSON -JANSSON_INCLUDES= -I$(top_srcdir)/compat/jansson +JANSSON_INCLUDES = -I$(top_srcdir)/compat/jansson +else +JANSSON_INCLUDES = endif -EXTRA_DIST = autogen.sh README.txt LICENSE.txt \ +EXTRA_DIST = autogen.sh README.txt LICENSE.txt \ cudaminer.sln cudaminer.vcxproj cudaminer.vcxproj.filters \ compat/gettimeofday.c compat/getopt/getopt_long.c cpuminer-config.h.in @@ -14,33 +15,31 @@ SUBDIRS = compat bin_PROGRAMS = ccminer -ccminer_SOURCES = elist.h miner.h compat.h \ +ccminer_SOURCES = elist.h miner.h compat.h \ compat/inttypes.h compat/stdbool.h compat/unistd.h \ compat/sys/time.h compat/getopt/getopt.h \ - crc32.c hefty1.c scrypt.c \ + crc32.c hefty1.c \ ccminer.cpp util.cpp \ api.cpp hashlog.cpp nvml.cpp stats.cpp sysinfos.cpp cuda.cpp \ - heavy/heavy.cu \ - heavy/cuda_blake512.cu heavy/cuda_blake512.h \ - heavy/cuda_combine.cu heavy/cuda_combine.h \ - heavy/cuda_groestl512.cu heavy/cuda_groestl512.h \ - heavy/cuda_hefty1.cu heavy/cuda_hefty1.h \ - heavy/cuda_keccak512.cu heavy/cuda_keccak512.h \ - heavy/cuda_sha256.cu heavy/cuda_sha256.h \ - fuguecoin.cpp Algo256/cuda_fugue256.cu sph/fugue.c uint256.h \ + cuda_helper.h cuda_vector.h \ + sph/neoscrypt.h sph/neoscrypt.cpp \ + sph/sha256_Y.h sph/sha256_Y.c sph/sph_sha2.c \ + fuguecoin.cpp Algo256/cuda_fugue256.cu sph/fugue.c \ groestlcoin.cpp cuda_groestlcoin.cu cuda_groestlcoin.h \ myriadgroestl.cpp cuda_myriadgroestl.cu \ lyra2/Lyra2.c lyra2/Sponge.c \ - lyra2/lyra2RE.cu lyra2/cuda_lyra2.cu \ + 
lyra2/lyra2REv2.cu lyra2/cuda_lyra2v2.cu \ + Algo256/cuda_blake256.cu Algo256/cuda_groestl256.cu Algo256/cuda_keccak256.cu Algo256/cuda_skein256.cu \ + Algo256/cuda_bmw256.cu Algo256/cuda_cubehash256.cu \ + Algo256/blake256.cu Algo256/keccak256.cu \ JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \ JHA/cuda_jha_compactionTest.cu cuda_checkhash.cu \ quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \ quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu quark/cuda_jh512keccak512.cu \ - quark/quarkcoin.cu quark/animecoin.cu \ + quark/quarkcoin.cu \ quark/cuda_quark_compactionTest.cu \ - cuda_nist5.cu pentablake.cu \ + cuda_nist5.cu pentablake.cu skein.cu \ + Sia/sia.cu Sia/cuda_sia.cu \ sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \ sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \ sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \ @@ -53,25 +52,36 @@ ccminer_SOURCES = elist.h miner.h compat.h \ x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \ x15/whirlpool.cu \ x17/x17.cu x17/cuda_x17_haval512.cu x17/cuda_x17_sha512.cu \ - x11/s3.cu \ - bitcoin.cu cuda_bitcoin.cu - + x11/s3.cu x11/c11.cu \ + bitcoin.cu cuda_bitcoin.cu \ + x15/cuda_whirlpoolx.cu x15/whirlpoolx.cu \ + neoscrypt/neoscrypt.cu neoscrypt/cuda_neoscrypt.cu \ + neoscrypt/cuda_neoscrypt_tpruvot.cu \ + neoscrypt/cuda_vector_tpruvot.cuh neoscrypt/cuda_vector_uint2x4.cuh + +# scrypt +# ccminer_SOURCES += scrypt.cpp scrypt-jane.cpp \ +# scrypt/blake.cu scrypt/keccak.cu scrypt/sha256.cu \ +# scrypt/salsa_kernel.cu scrypt/test_kernel.cu \ +# scrypt/fermi_kernel.cu scrypt/kepler_kernel.cu \ +# scrypt/nv_kernel.cu scrypt/nv_kernel2.cu scrypt/titan_kernel.cu + + if HAVE_NVML nvml_defs = -DUSE_WRAPNVML nvml_libs = -ldl endif -if HAVE_WINDOWS -ccminer_SOURCES += compat/winansi.c -endif - ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@ ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@
@CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@ $(nvml_libs) ccminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(CPPFLAGS) $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) $(DEF_INCLUDES) $(nvml_defs) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME -nvcc_ARCH = -gencode=arch=compute_50,code=\"sm_50,compute_50\" -#nvcc_ARCH += -gencode=arch=compute_35,code=\"sm_35,compute_35\" -#nvcc_ARCH += -gencode=arch=compute_30,code=\"sm_30,compute_30\" +nvcc_ARCH = -gencode=arch=compute_61,code=sm_61 +nvcc_ARCH += -gencode=arch=compute_52,code=sm_52 +nvcc_ARCH += -gencode=arch=compute_50,code=sm_50 +nvcc_ARCH += -gencode=arch=compute_37,code=sm_37 +nvcc_ARCH += -gencode=arch=compute_35,code=sm_35 +nvcc_ARCH += -gencode=arch=compute_30,code=sm_30 nvcc_FLAGS = $(nvcc_ARCH) @CUDA_INCLUDES@ -I. @CUDA_CFLAGS@ nvcc_FLAGS += $(JANSSON_INCLUDES) --ptxas-options="-v" @@ -113,3 +123,14 @@ quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu $(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $< + +# This kernel need also an older SM to be able to autotune kernels +# scrypt/salsa_kernel.o: scrypt/salsa_kernel.cu +# $(NVCC) $(nvcc_FLAGS) -gencode=arch=compute_20,code=\"sm_21,compute_20\" --maxrregcount=80 -o $@ -c $< + +skein.o: skein.cu + $(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $< + +# This kernel requires at least sm_35 +neoscrypt/cuda_neoscrypt.o: neoscrypt/cuda_neoscrypt.cu + $(NVCC) -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_37,code=sm_37 @CUDA_INCLUDES@ -I. 
@CUDA_CFLAGS@ $(JANSSON_INCLUDES) --ptxas-options="-v" -o $@ -c $< diff --git a/README.md b/README.md index b03f3ee719..1177648cc7 100644 --- a/README.md +++ b/README.md @@ -1,33 +1,23 @@ -ccminer -======= +# ccminer Based on Christian Buchner's & Christian H.'s CUDA project based on the Fork by tpruvot@github with X14,X15,X17,WHIRL,Blake256 and LYRA2 support , and some others, check the [README.txt](README.txt) Reforked and optimized by sp-hash@github and KlausT@github -SP-HASH: BTC donation address: 1CTiNJyoUmbdMRACtteRWXhGqtSETYd6Vd +* KlausT: BTC donation address: 1H2BHSyuwLP9vqt2p3bK9G3mDJsAi7qChw +* sp-hash: BTC donation address: 1CTiNJyoUmbdMRACtteRWXhGqtSETYd6Vd +* tpruvot: BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo -A part of the recent algos were originally wrote by [djm34](https://github.com/djm34). +A part of the recent algos were originally written by [djm34](https://github.com/djm34). -This variant was tested and built on Linux (ubuntu server 14.04) and VStudio 2013 on Windows 7. +This variant was tested and built with Visual Studio 2015 on Windows 10 -Note that the x86 releases are generally faster than x64 ones on Windows. - -About source code dependencies ------------------------------- +## About source code dependencies This project requires some libraries to be built : +* OpenSSL (prebuilt for win) +* Curl (prebuilt for win) +* pthreads (prebuilt for win) -- OpenSSL (prebuilt for win) - -- Curl (prebuilt for win) - -- pthreads (prebuilt for win) - -The tree now contains recent prebuilt openssl and curl .lib for both x86 and x64 platforms (windows). - -To rebuild them, you need to clone this repository and its submodules : - git clone https://github.com/peters/curl-for-windows.git compat/curl-for-windows - -There is also a [Tutorial for windows](http://cudamining.co.uk/url/tutorials/id/3) on [CudaMining](http://cudamining.co.uk) website. +This fork now contains these libraries for both x86 and x64 platforms (windows). 
diff --git a/README.txt b/README.txt index 910aad4850..9b70c7f6e1 100644 --- a/README.txt +++ b/README.txt @@ -1,11 +1,13 @@ - -ccMiner release 1.5.2-tpruvot (SP_MOD) (Jan 2015) - "Happy new Year!" +ccMiner release 8.12(KlausT-mod) (August 17th, 2017) --------------------------------------------------------------- *************************************************************** If you find this tool useful and like to support its continued development, then consider a donation. +KlausT @github: + BTC 1H2BHSyuwLP9vqt2p3bK9G3mDJsAi7qChw + tpruvot@github: BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo DRK : XeVrkPrWB7pDbdFLfKhF1Z3xpqhsx6wkH3 @@ -29,6 +31,7 @@ cbuchner v1.2: This is a CUDA accelerated mining application which handle : +Bitcoin HeavyCoin & MjollnirCoin FugueCoin GroestlCoin & Myriad-Groestl @@ -38,11 +41,13 @@ TalkCoin DarkCoin and other X11 coins NEOS blake (256 14-rounds) BlakeCoin (256 8-rounds) -Keccak (Maxcoin) Deep, Doom and Qubit +Keccak (Maxcoin) Pentablake (Blake 512 x5) S3 (OneCoin) +Skein (Skein + SHA) Lyra2RE (new VertCoin algo) +Neoscrypt where some of these coins have a VERY NOTABLE nVidia advantage over competing AMD (OpenCL Only) implementations. @@ -60,71 +65,85 @@ that the most of our comments are in german. This code is based on the pooler cpuminer 2.3.2 release and inherits its command line interface and options. 
- -a, --algo=ALGO specify the algorithm to use - anime use to mine Animecoin - blake use to mine NEOS (Blake 256) - blakecoin use to mine Old Blake 256 - deep use to mine Deepcoin - dmd-gr use to mine Diamond-Groestl - fresh use to mine Freshcoin - fugue256 use to mine Fuguecoin - groestl use to mine Groestlcoin - heavy use to mine Heavycoin - jackpot use to mine Jackpotcoin - keccak use to mine Maxcoin - luffa use to mine Doomcoin - lyra2 use to mine Vertcoin - mjollnir use to mine Mjollnircoin - myr-gr use to mine Myriad-Groest - nist5 use to mine TalkCoin - penta use to mine Joincoin / Pentablake - quark use to mine Quarkcoin - qubit use to mine Qubit Algo - s3 use to mine 1coin - whirl use to mine Whirlcoin - x11 use to mine DarkCoin - x14 use to mine X14Coin - x15 use to mine Halcyon - x17 use to mine X17 - - -d, --devices gives a comma separated list of CUDA device IDs - to operate on. Device IDs start counting from 0! - Alternatively give string names of your card like - gtx780ti or gt640#2 (matching 2nd gt640 in the PC). 
- - -i, --intensity GPU threads per call 8-31 (default: 0=auto) - Decimals are allowed for fine tuning - -f, --diff Divide difficulty by this factor (std is 1) - -v, --vote Heavycoin block vote (default: 512) + -a, --algo=ALGO specify the hash algorithm to use + bitcoin Bitcoin + blake Blake 256 (SFR/NEOS) + blakecoin Fast Blake 256 (8 rounds) + c11 X11 variant + deep Deepcoin + dmd-gr Diamond-Groestl + fresh Freshcoin (shavite 80) + fugue256 Fuguecoin + groestl Groestlcoin + jackpot Jackpot + keccak Keccak-256 (Maxcoin) + luffa Doomcoin + lyra2v2 VertCoin + myr-gr Myriad-Groestl + neoscrypt neoscrypt (FeatherCoin) + nist5 NIST5 (TalkCoin) + penta Pentablake hash (5x Blake 512) + quark Quark + qubit Qubit + sia Siacoin (at pools compatible to siamining.com) + skein Skein SHA2 (Skeincoin) + s3 S3 (1Coin) + spread Spread + x11 X11 (DarkCoin) + x13 X13 (MaruCoin) + x14 X14 + x15 X15 + x17 X17 (peoplecurrency) + vanilla Blake 256 8 rounds + yescrypt yescrypt + whirl Whirlcoin (old whirlpool) + whirlpoolx Vanillacoin + -d, --devices Comma separated list of CUDA devices to use. + Device IDs start counting from 0! 
Alternatively takes + string names of your cards like gtx780ti or gt640#2 + (matching 2nd gt640 in the PC) + -i --intensity=N GPU intensity 8-31 (default: auto) + Decimals are allowed for fine tuning + -f, --diff-factor Divide difficulty by this factor (default 1.0) + -m, --diff-multiplier Multiply difficulty by this value (default 1.0) + -v, --vote=VOTE block reward vote (for HeavyCoin) -o, --url=URL URL of mining server -O, --userpass=U:P username:password pair for mining server -u, --user=USERNAME username for mining server -p, --pass=PASSWORD password for mining server --cert=FILE certificate for mining server using SSL -x, --proxy=[PROTOCOL://]HOST[:PORT] connect through a proxy - -t, --threads=N number of miner threads (default: number of nVidia GPUs in your system) + -t, --threads=N number of miner threads (default: number of nVidia GPUs) -r, --retries=N number of times to retry if a network call fails (default: retry indefinitely) - -R, --retry-pause=N time to pause between retries, in seconds (default: 15) + -R, --retry-pause=N time to pause between retries, in seconds (default: 30) -T, --timeout=N network timeout, in seconds (default: 270) -s, --scantime=N upper bound on time spent scanning current work when long polling is unavailable, in seconds (default: 5) + -n, --ndevs list cuda devices -N, --statsavg number of samples used to display hashrate (default: 30) --no-gbt disable getblocktemplate support (height check in solo) --no-longpoll disable X-Long-Polling support --no-stratum disable X-Stratum support + -e disable extranonce -q, --quiet disable per-thread hashmeter output + --no-color disable colored output -D, --debug enable debug output -P, --protocol-dump verbose dump of protocol-level activities + --cpu-affinity set process affinity to cpu core(s), mask 0x3 for cores 0 and 1 + --cpu-priority set process priority (default: 0 idle, 2 normal to 5 highest) -b, --api-bind IP/Port for the miner API (default: 127.0.0.1:4068) + -S, --syslog use system 
log for output messages + --syslog-prefix=... allow to change syslog tool name + -B, --background run the miner in the background --benchmark run in offline benchmark mode --cputest debug hashes from cpu algorithms - --cpu-affinity set process affinity to specific cpu core(s) mask - --cpu-priority set process priority (default: 0 idle, 2 normal to 5 highest) + --no-cpu-verify don't verify the found results -c, --config=FILE load a JSON-format configuration file - --no-color disable colored console output + --plimit=N Set the gpu power limit to N Watt (driver version >=352.21) + (needs administrator rights under Windows) -V, --version display version information and exit - -h, --help display this help text and exit + -h, --help display this help text and exit >>> Examples <<< @@ -179,142 +198,45 @@ features. >>> RELEASE HISTORY <<< - Jan. 2015 v1.5.2 - Allow per device intensity, example: -i 20,19.5 - Add process CPU priority and affinity mask parameters - Intelligent duplicate shares check feature (enabled if needed) - api: Fan RPM (windows), Cuda threads count, linux kernel ver. - More X11 optimisations from sp and KlausT - SM 3.0 enhancements - - Dec. 16th 2014 v1.5.1 - Add lyra2RE algo for Vertcoin based on djm34/vtc code - Multiple shares support (2 for the moment) - X11 optimisations (From klaust and sp-hash) - HTML5 WebSocket api compatibility (see api/websocket.htm) - Solo mode height checks with getblocktemplate rpc calls - - Nov. 27th 2014 v1.5.0 - Upgrade compat jansson to 2.6 (for windows) - Add pool mining.set_extranonce support - Allow intermediate intensity with decimals - Update prebuilt x86 openssl lib to 1.0.1i - Fix heavy algo on linux (broken since 1.4) - Some internal changes to use the C++ compiler - New API 1.2 with some new commands (read only) - Add some of sp x11/x15 optimisations (and tsiv x13) - - Nov.
15th 2014 v1.4.9 - Support of nvml and nvapi(windows) to monitor gpus - Fix (again) displayed hashrate for multi gpus systems - Average is now made by card (30 scans of the card) - Final API v1.1 (new fields + histo command) - Add support of telnet queries "telnet 127.0.0.1 4068" - add histo api command to get performance debug details - Add a rig sample php ui using json wrapper (php) - Restore quark/jackpot previous speed (differently) - - Nov. 12th 2014 v1.4.8 - Add a basic API and a sample php json wrapper - Add statsavg (def 20) and api-bind parameters - - Nov. 11th 2014 v1.4.7 - Average hashrate (based on the 20 last scans) - Rewrite blake algo - Add the -i (gpu threads/intensity parameter) - Add some X11 optimisations based on sp_ commits - Fix quark reported hashrate and benchmark mode for some algos - Enhance json config file param (int/float/false) (-c config.json) - Update windows prebuilt curl to 7.38.0 - - Oct. 26th 2014 v1.4.6 - Add S3 algo reusing existing code (onecoin) - Small X11 (simd512) enhancement - - Oct. 20th 2014 v1.4.5 - Add keccak algo from djm34 repo (maxcoin) - Curl 7.35 and OpenSSL are now included in the binary (and win tree) - Enhance windows terminal support (--help was broken) - - Sep. 27th 2014 v1.4.4 - First SM 5.2 Release (GTX 970 & 980) - CUDA Runtime included in binary - Colors enabled by default - - Sep. 10th 2014 v1.4.3 - Add algos from djm34 repo (deep, doom, qubit) - Goalcoin seems to be dead, not imported. - Create also the pentablake algo (5x Blake 512) - - Sept 6th 2014 Almost twice the speed on blake256 algos with the "midstate" cache - - Sep. 1st 2014 add X17, optimized x15 and whirl - add blake (256 variant) - color support on Windows, - remove some dll dependencies (pthreads, msvcp) - - Aug. 18th 2014 add X14, X15, Whirl, and Fresh algos, - also add colors and nvprof cmd line support - - June 15th 2014 add X13 and Diamond Groestl support. - Thanks to tsiv and to Bombadil for the contributions! 
- - June 14th 2014 released Killer Groestl quad version which I deem - sufficiently hard to port over to AMD. It isn't - the fastest option for Compute 3.5 and 5.0 cards, - but it is still much faster than the table based - versions. - - May 10th 2014 added X11, but without the bells & whistles - (no killer Groestl, SIMD hash quite slow still) - - May 6th 2014 this adds the quark and animecoin algorithms. - - May 3rd 2014 add the MjollnirCoin hash algorithm for the upcomin - MjollnirCoin relaunch. - - Add the -f (--diff) option to adjust the difficulty - e.g. for the erebor Dwarfpool myr-gr SaffronCoin pool. - Use -f 256 there. - - May 1st 2014 adapt the Jackpot algorithms to changes made by the - coin developers. We keep our unique nVidia advantage - because we have a way to break up the divergence. - NOTE: Jackpot Hash now requires Compute 3.0 or later. - - April, 27 2014 this release adds Myriad-Groestl and Jackpot Coin. - we apply an optimization to Jackpot that turns this - into a Keccak-only CUDA coin ;) Jackpot is tested with - solo--mining only at the moment. - - March, 27 2014 Heavycoin exchange rates soar, and as a result this coin - gets some love: We greatly optimized the Hefty1 kernel - for speed. Expect some hefty gains, especially on 750Ti's! - - By popular demand, we added the -d option as known from - cudaminer. - - different compute capability builds are now provided until - we figure out how to pack everything into a single executable - in a Windows build. - - March, 24 2014 fixed Groestl pool support - - went back to Compute 1.x for cuda_hefty1.cu kernel by - default after numerous reports of ccminer v0.2/v0.3 - not working with HeavyCoin for some people. - - March, 23 2014 added Groestlcoin support. stratum status unknown - (the only pool is currently down for fixing issues) - - March, 21 2014 use of shared memory in Fugue256 kernel boosts hash rates - on Fermi and Maxwell devices. 
Kepler may suffer slightly - (3-5%) - - Fixed Stratum for Fuguecoin. Tested on dwarfpool. - - March, 18 2014 initial release. - +2015-02-01 Release 1.0, forked from tpruvot and sp-hash +2015-02-03 v1.01: bug fix for cards with compute capability 3.0 (untested) +2015-02-09 v1.02: various bug fixes and optimizations +2015-03-08 v2.00: added whirlpoolx algo (Vanillacoin), also various optimizations and bug fixes +2015-03-30 v3.00: added skein (for Myriadcoin for example) +2015-05-06 v4.00: added Neoscrypt +2015-05-15 v4.01: fixed crash after ctrl-c (Windows), fixed -g option +2015-07-06 v5.00: -g option removed, some bug fixes and optimizations +2015-07-08 v5.01: lyra2 optimization +2015-08-22 v6.00: remove Lyra2RE, add Lyra2REv2, remove Animecoin, remove yescrypt +2016-05-03 v6.01: various bug fixes and optimizations +2016-05-12 v6.02: faster x17 and quark +2016-05-16 v7.00: added Vanillacoin, optimized blake and blakecoin, + added stratum methods used by yiimp.ccminer.org +2016-05-16 v7.01: stratum.get_stats bug fix +2016-06-02 v7.02: fix default intensity for Nist5 + fix power usage statistics +2016-06-11 v7.03: faster lyra2v2 +2016-06-18 v7.04: Neoscrypt optimization + Bug Fixes +2016-08-11 v8.00: added Siacoin +2016-08-12 v8.01: increase default intensity for Sia + fix Linux build +2016-09-29 v8.02: change to CUDA 8.0 on Windows + various small changes +2016-12-08 v8.03: fix memory leak in Neoscrypt +2016-12-13 v8.04: fix illegal memory access in X11-X17 + fix duplicate shares in skein +2016-12-17 v8.05: fix Skein bug +2017-03-12 v8.06: Heavy and Mjollnir algos removed +2017-05-18 v8.07: Bitcredit algo removed + fixed bugs in bitcoin and jackpot algo +2017-05-19 v8.08: fix Makefile and configure.ac for Linux +2017-06-07 v8.09: some minor bug fixes +2017-07-17 v8.10: fix Orbitcoin solo mining (Neoscrypt) +2017-07-25 v8.11: change some timeout values + fix Feathercoin solo mining (Neoscrypt) + show chance to find a block while solo mining +2017-08-17 v8.12: fix
Myriad-Groestl speed bug >>> AUTHORS <<< @@ -322,7 +244,7 @@ Notable contributors to this application are: Christian Buchner, Christian H. (Germany): Initial CUDA implementation -djm34, tsiv, sp for cuda algos implementation and optimisation +djm34, tsiv, sp and KlausT for cuda algos implementation and optimisation Tanguy Pruvot : 750Ti tuning, blake, colors, general code cleanup/opts API monitoring, linux Config/Makefile and vstudio stuff... diff --git a/Sia/cuda_sia.cu b/Sia/cuda_sia.cu new file mode 100644 index 0000000000..81f403e588 --- /dev/null +++ b/Sia/cuda_sia.cu @@ -0,0 +1,314 @@ +/* +Copyright (c) 2015 KlausT and Vorksholk + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH +REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY +AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, +INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM +LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THIS SOFTWARE. 
+*/ + + +#include +#include "cuda_helper.h" +#include "sia.h" + +#ifdef _MSC_VER +#define THREAD __declspec(thread) +#else +#define THREAD __thread +#endif + +#ifdef __INTELLISENSE__ +#define __launch_bounds__(blocksize) +#endif + +static THREAD uint64_t *vpre_h; +static THREAD uint32_t *nonceOut_d; +static THREAD uint64_t *hash_d; +__constant__ uint64_t vpre[16]; +__constant__ uint64_t header[10]; + +__device__ __forceinline__ +static uint64_t __byte_perm_64(const uint64_t source, const uint32_t grab1, const uint32_t grab2) +{ + uint64_t r; + uint32_t r1; + uint32_t r2; + + uint32_t i1; + uint32_t i2; + + asm("mov.b64 {%0, %1}, %2;" : "=r"(i1), "=r"(i2) : "l"(source)); + asm("prmt.b32 %0, %1, %2, %3;" : "=r"(r1) : "r"(i1), "r"(i2), "r"(grab1)); + asm("prmt.b32 %0, %1, %2, %3;" : "=r"(r2) : "r"(i1), "r"(i2), "r"(grab2)); + asm("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(r1), "r"(r2)); + + return r; +} + +__device__ __forceinline__ +static uint64_t __swap_hilo(const uint64_t source) +{ + uint64_t r; + uint32_t s1; + uint32_t s2; + + asm("mov.b64 {%0, %1}, %2;" : "=r"(s1), "=r"(s2) : "l"(source)); + asm("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(s2), "r"(s1)); + + return r; +} + +__device__ unsigned int numberofresults; + +__global__ void __launch_bounds__(blocksize, 3) siakernel(uint32_t * __restrict__ nonceOut, uint64_t target, uint64_t startnonce) +{ + uint64_t v[16]; + const uint64_t start = startnonce + (blockDim.x * blockIdx.x + threadIdx.x)*npt; + const uint64_t end = start + npt; + + numberofresults = 0; + + for(uint64_t n = start; n < end; n++) + { + v[2] = 0x5BF2CD1EF9D6B596u + n; v[14] = __swap_hilo(~0x1f83d9abfb41bd6bu ^ v[2]); v[10] = 0x3c6ef372fe94f82bu + v[14]; v[6] = __byte_perm_64(0x1f83d9abfb41bd6bu ^ v[10], 0x6543, 0x2107); + v[2] = v[2] + v[6] + header[5]; v[14] = __byte_perm_64(v[14] ^ v[2], 0x5432, 0x1076); v[10] = v[10] + v[14]; v[6] = ROTR64(v[6] ^ v[10], 63); + v[3] = 0x130C253729B586Au + header[6]; v[15] = __swap_hilo(0x5be0cd19137e2179u ^ v[3]); 
v[11] = 0xa54ff53a5f1d36f1u + v[15]; v[7] = __byte_perm_64(0x5be0cd19137e2179u ^ v[11], 0x6543, 0x2107); + v[3] = v[3] + v[7] + header[7]; v[15] = __byte_perm_64(v[15] ^ v[3], 0x5432, 0x1076); v[11] = v[11] + v[15]; v[7] = ROTR64(v[7] ^ v[11], 63); + v[0] = vpre[0] + vpre[5] + header[8]; v[15] = __swap_hilo(v[15] ^ v[0]); v[10] = v[10] + v[15]; v[5] = __byte_perm_64(vpre[5] ^ v[10], 0x6543, 0x2107); + v[0] = v[0] + v[5] + header[9]; v[15] = __byte_perm_64(v[15] ^ v[0], 0x5432, 0x1076); v[10] = v[10] + v[15]; v[5] = ROTR64(v[5] ^ v[10], 63); + v[1] = vpre[1] + v[6]; v[12] = __swap_hilo(vpre[12] ^ v[1]); v[11] = v[11] + v[12]; v[6] = __byte_perm_64(v[6] ^ v[11], 0x6543, 0x2107); + v[1] = v[1] + v[6]; v[12] = __byte_perm_64(v[12] ^ v[1], 0x5432, 0x1076); v[11] = v[11] + v[12]; v[6] = ROTR64(v[6] ^ v[11], 63); + v[2] = v[2] + v[7]; v[13] = __swap_hilo(vpre[13] ^ v[2]); v[8] = vpre[8] + v[13]; v[7] = __byte_perm_64(v[7] ^ v[8], 0x6543, 0x2107); + v[2] = v[2] + v[7]; v[13] = __byte_perm_64(v[13] ^ v[2], 0x5432, 0x1076); v[8] = v[8] + v[13]; v[7] = ROTR64(v[7] ^ v[8], 63); + v[3] = v[3] + vpre[4]; v[14] = __swap_hilo(v[14] ^ v[3]); v[9] = vpre[9] + v[14]; v[4] = __byte_perm_64(vpre[4] ^ v[9], 0x6543, 0x2107); + v[3] = v[3] + v[4]; v[14] = __byte_perm_64(v[14] ^ v[3], 0x5432, 0x1076); v[9] = v[9] + v[14]; v[4] = ROTR64(v[4] ^ v[9], 63); + + v[0] = v[0] + v[4]; v[12] = __swap_hilo(v[12] ^ v[0]); v[8] = v[8] + v[12]; v[4] = __byte_perm_64(v[4] ^ v[8], 0x6543, 0x2107); + v[0] = v[0] + v[4]; v[12] = __byte_perm_64(v[12] ^ v[0], 0x5432, 0x1076); v[8] = v[8] + v[12]; v[4] = ROTR64(v[4] ^ v[8], 63); + v[1] = v[1] + v[5] + n; v[13] = __swap_hilo(v[13] ^ v[1]); v[9] = v[9] + v[13]; v[5] = __byte_perm_64(v[5] ^ v[9], 0x6543, 0x2107); + v[1] = v[1] + v[5] + header[8]; v[13] = __byte_perm_64(v[13] ^ v[1], 0x5432, 0x1076); v[9] = v[9] + v[13]; v[5] = ROTR64(v[5] ^ v[9], 63); + v[2] = v[2] + v[6] + header[9]; v[14] = __swap_hilo(v[14] ^ v[2]); v[10] = v[10] + v[14]; v[6] = 
__byte_perm_64(v[6] ^ v[10], 0x6543, 0x2107); + v[2] = v[2] + v[6]; v[14] = __byte_perm_64(v[14] ^ v[2], 0x5432, 0x1076); v[10] = v[10] + v[14]; v[6] = ROTR64(v[6] ^ v[10], 63); + v[3] = v[3] + v[7]; v[15] = __swap_hilo(v[15] ^ v[3]); v[11] = v[11] + v[15]; v[7] = __byte_perm_64(v[7] ^ v[11], 0x6543, 0x2107); + v[3] = v[3] + v[7] + header[6]; v[15] = __byte_perm_64(v[15] ^ v[3], 0x5432, 0x1076); v[11] = v[11] + v[15]; v[7] = ROTR64(v[7] ^ v[11], 63); + v[0] = v[0] + v[5] + header[1]; v[15] = __swap_hilo(v[15] ^ v[0]); v[10] = v[10] + v[15]; v[5] = __byte_perm_64(v[5] ^ v[10], 0x6543, 0x2107); + v[0] = v[0] + v[5]; v[15] = __byte_perm_64(v[15] ^ v[0], 0x5432, 0x1076); v[10] = v[10] + v[15]; v[5] = ROTR64(v[5] ^ v[10], 63); + v[1] = v[1] + v[6] + header[0]; v[12] = __swap_hilo(v[12] ^ v[1]); v[11] = v[11] + v[12]; v[6] = __byte_perm_64(v[6] ^ v[11], 0x6543, 0x2107); + v[1] = v[1] + v[6] + header[2]; v[12] = __byte_perm_64(v[12] ^ v[1], 0x5432, 0x1076); v[11] = v[11] + v[12]; v[6] = ROTR64(v[6] ^ v[11], 63); + v[2] = v[2] + v[7]; v[13] = __swap_hilo(v[13] ^ v[2]); v[8] = v[8] + v[13]; v[7] = __byte_perm_64(v[7] ^ v[8], 0x6543, 0x2107); + v[2] = v[2] + v[7] + header[7]; v[13] = __byte_perm_64(v[13] ^ v[2], 0x5432, 0x1076); v[8] = v[8] + v[13]; v[7] = ROTR64(v[7] ^ v[8], 63); + v[3] = v[3] + v[4] + header[5]; v[14] = __swap_hilo(v[14] ^ v[3]); v[9] = v[9] + v[14]; v[4] = __byte_perm_64(v[4] ^ v[9], 0x6543, 0x2107); + v[3] = v[3] + v[4] + header[3]; v[14] = __byte_perm_64(v[14] ^ v[3], 0x5432, 0x1076); v[9] = v[9] + v[14]; v[4] = ROTR64(v[4] ^ v[9], 63); + + v[0] = v[0] + v[4]; v[12] = __swap_hilo(v[12] ^ v[0]); v[8] = v[8] + v[12]; v[4] = __byte_perm_64(v[4] ^ v[8], 0x6543, 0x2107); + v[0] = v[0] + v[4] + header[8]; v[12] = __byte_perm_64(v[12] ^ v[0], 0x5432, 0x1076); v[8] = v[8] + v[12]; v[4] = ROTR64(v[4] ^ v[8], 63); + v[1] = v[1] + v[5]; v[13] = __swap_hilo(v[13] ^ v[1]); v[9] = v[9] + v[13]; v[5] = __byte_perm_64(v[5] ^ v[9], 0x6543, 0x2107); + v[1] = v[1] + v[5] 
+ header[0]; v[13] = __byte_perm_64(v[13] ^ v[1], 0x5432, 0x1076); v[9] = v[9] + v[13]; v[5] = ROTR64(v[5] ^ v[9], 63); + v[2] = v[2] + v[6] + header[5]; v[14] = __swap_hilo(v[14] ^ v[2]); v[10] = v[10] + v[14]; v[6] = __byte_perm_64(v[6] ^ v[10], 0x6543, 0x2107); + v[2] = v[2] + v[6] + header[2]; v[14] = __byte_perm_64(v[14] ^ v[2], 0x5432, 0x1076); v[10] = v[10] + v[14]; v[6] = ROTR64(v[6] ^ v[10], 63); + v[3] = v[3] + v[7]; v[15] = __swap_hilo(v[15] ^ v[3]); v[11] = v[11] + v[15]; v[7] = __byte_perm_64(v[7] ^ v[11], 0x6543, 0x2107); + v[3] = v[3] + v[7]; v[15] = __byte_perm_64(v[15] ^ v[3], 0x5432, 0x1076); v[11] = v[11] + v[15]; v[7] = ROTR64(v[7] ^ v[11], 63); + v[0] = v[0] + v[5]; v[15] = __swap_hilo(v[15] ^ v[0]); v[10] = v[10] + v[15]; v[5] = __byte_perm_64(v[5] ^ v[10], 0x6543, 0x2107); + v[0] = v[0] + v[5]; v[15] = __byte_perm_64(v[15] ^ v[0], 0x5432, 0x1076); v[10] = v[10] + v[15]; v[5] = ROTR64(v[5] ^ v[10], 63); + v[1] = v[1] + v[6] + header[3]; v[12] = __swap_hilo(v[12] ^ v[1]); v[11] = v[11] + v[12]; v[6] = __byte_perm_64(v[6] ^ v[11], 0x6543, 0x2107); + v[1] = v[1] + v[6] + header[6]; v[12] = __byte_perm_64(v[12] ^ v[1], 0x5432, 0x1076); v[11] = v[11] + v[12]; v[6] = ROTR64(v[6] ^ v[11], 63); + v[2] = v[2] + v[7] + header[7]; v[13] = __swap_hilo(v[13] ^ v[2]); v[8] = v[8] + v[13]; v[7] = __byte_perm_64(v[7] ^ v[8], 0x6543, 0x2107); + v[2] = v[2] + v[7] + header[1]; v[13] = __byte_perm_64(v[13] ^ v[2], 0x5432, 0x1076); v[8] = v[8] + v[13]; v[7] = ROTR64(v[7] ^ v[8], 63); + v[3] = v[3] + v[4] + header[9]; v[14] = __swap_hilo(v[14] ^ v[3]); v[9] = v[9] + v[14]; v[4] = __byte_perm_64(v[4] ^ v[9], 0x6543, 0x2107); + v[3] = v[3] + v[4] + n; v[14] = __byte_perm_64(v[14] ^ v[3], 0x5432, 0x1076); v[9] = v[9] + v[14]; v[4] = ROTR64(v[4] ^ v[9], 63); + + v[0] = v[0] + v[4] + header[7]; v[12] = __swap_hilo(v[12] ^ v[0]); v[8] = v[8] + v[12]; v[4] = __byte_perm_64(v[4] ^ v[8], 0x6543, 0x2107); + v[0] = v[0] + v[4] + header[9]; v[12] = __byte_perm_64(v[12] ^ 
v[0], 0x5432, 0x1076); v[8] = v[8] + v[12]; v[4] = ROTR64(v[4] ^ v[8], 63); + v[1] = v[1] + v[5] + header[3]; v[13] = __swap_hilo(v[13] ^ v[1]); v[9] = v[9] + v[13]; v[5] = __byte_perm_64(v[5] ^ v[9], 0x6543, 0x2107); + v[1] = v[1] + v[5] + header[1]; v[13] = __byte_perm_64(v[13] ^ v[1], 0x5432, 0x1076); v[9] = v[9] + v[13]; v[5] = ROTR64(v[5] ^ v[9], 63); + v[2] = v[2] + v[6]; v[14] = __swap_hilo(v[14] ^ v[2]); v[10] = v[10] + v[14]; v[6] = __byte_perm_64(v[6] ^ v[10], 0x6543, 0x2107); + v[2] = v[2] + v[6]; v[14] = __byte_perm_64(v[14] ^ v[2], 0x5432, 0x1076); v[10] = v[10] + v[14]; v[6] = ROTR64(v[6] ^ v[10], 63); + v[3] = v[3] + v[7]; v[15] = __swap_hilo(v[15] ^ v[3]); v[11] = v[11] + v[15]; v[7] = __byte_perm_64(v[7] ^ v[11], 0x6543, 0x2107); + v[3] = v[3] + v[7]; v[15] = __byte_perm_64(v[15] ^ v[3], 0x5432, 0x1076); v[11] = v[11] + v[15]; v[7] = ROTR64(v[7] ^ v[11], 63); + v[0] = v[0] + v[5] + header[2]; v[15] = __swap_hilo(v[15] ^ v[0]); v[10] = v[10] + v[15]; v[5] = __byte_perm_64(v[5] ^ v[10], 0x6543, 0x2107); + v[0] = v[0] + v[5] + header[6]; v[15] = __byte_perm_64(v[15] ^ v[0], 0x5432, 0x1076); v[10] = v[10] + v[15]; v[5] = ROTR64(v[5] ^ v[10], 63); + v[1] = v[1] + v[6] + header[5]; v[12] = __swap_hilo(v[12] ^ v[1]); v[11] = v[11] + v[12]; v[6] = __byte_perm_64(v[6] ^ v[11], 0x6543, 0x2107); + v[1] = v[1] + v[6]; v[12] = __byte_perm_64(v[12] ^ v[1], 0x5432, 0x1076); v[11] = v[11] + v[12]; v[6] = ROTR64(v[6] ^ v[11], 63); + v[2] = v[2] + v[7] + n; v[13] = __swap_hilo(v[13] ^ v[2]); v[8] = v[8] + v[13]; v[7] = __byte_perm_64(v[7] ^ v[8], 0x6543, 0x2107); + v[2] = v[2] + v[7] + header[0]; v[13] = __byte_perm_64(v[13] ^ v[2], 0x5432, 0x1076); v[8] = v[8] + v[13]; v[7] = ROTR64(v[7] ^ v[8], 63); + v[3] = v[3] + v[4]; v[14] = __swap_hilo(v[14] ^ v[3]); v[9] = v[9] + v[14]; v[4] = __byte_perm_64(v[4] ^ v[9], 0x6543, 0x2107); + v[3] = v[3] + v[4] + header[8]; v[14] = __byte_perm_64(v[14] ^ v[3], 0x5432, 0x1076); v[9] = v[9] + v[14]; v[4] = ROTR64(v[4] ^ v[9], 
63); + + v[0] = v[0] + v[4] + header[9]; v[12] = __swap_hilo(v[12] ^ v[0]); v[8] = v[8] + v[12]; v[4] = __byte_perm_64(v[4] ^ v[8], 0x6543, 0x2107); + v[0] = v[0] + v[4] + header[0]; v[12] = __byte_perm_64(v[12] ^ v[0], 0x5432, 0x1076); v[8] = v[8] + v[12]; v[4] = ROTR64(v[4] ^ v[8], 63); + v[1] = v[1] + v[5] + header[5]; v[13] = __swap_hilo(v[13] ^ v[1]); v[9] = v[9] + v[13]; v[5] = __byte_perm_64(v[5] ^ v[9], 0x6543, 0x2107); + v[1] = v[1] + v[5] + header[7]; v[13] = __byte_perm_64(v[13] ^ v[1], 0x5432, 0x1076); v[9] = v[9] + v[13]; v[5] = ROTR64(v[5] ^ v[9], 63); + v[2] = v[2] + v[6] + header[2]; v[14] = __swap_hilo(v[14] ^ v[2]); v[10] = v[10] + v[14]; v[6] = __byte_perm_64(v[6] ^ v[10], 0x6543, 0x2107); + v[2] = v[2] + v[6] + n; v[14] = __byte_perm_64(v[14] ^ v[2], 0x5432, 0x1076); v[10] = v[10] + v[14]; v[6] = ROTR64(v[6] ^ v[10], 63); + v[3] = v[3] + v[7]; v[15] = __swap_hilo(v[15] ^ v[3]); v[11] = v[11] + v[15]; v[7] = __byte_perm_64(v[7] ^ v[11], 0x6543, 0x2107); + v[3] = v[3] + v[7]; v[15] = __byte_perm_64(v[15] ^ v[3], 0x5432, 0x1076); v[11] = v[11] + v[15]; v[7] = ROTR64(v[7] ^ v[11], 63); + v[0] = v[0] + v[5]; v[15] = __swap_hilo(v[15] ^ v[0]); v[10] = v[10] + v[15]; v[5] = __byte_perm_64(v[5] ^ v[10], 0x6543, 0x2107); + v[0] = v[0] + v[5] + header[1]; v[15] = __byte_perm_64(v[15] ^ v[0], 0x5432, 0x1076); v[10] = v[10] + v[15]; v[5] = ROTR64(v[5] ^ v[10], 63); + v[1] = v[1] + v[6]; v[12] = __swap_hilo(v[12] ^ v[1]); v[11] = v[11] + v[12]; v[6] = __byte_perm_64(v[6] ^ v[11], 0x6543, 0x2107); + v[1] = v[1] + v[6]; v[12] = __byte_perm_64(v[12] ^ v[1], 0x5432, 0x1076); v[11] = v[11] + v[12]; v[6] = ROTR64(v[6] ^ v[11], 63); + v[2] = v[2] + v[7] + header[6]; v[13] = __swap_hilo(v[13] ^ v[2]); v[8] = v[8] + v[13]; v[7] = __byte_perm_64(v[7] ^ v[8], 0x6543, 0x2107); + v[2] = v[2] + v[7] + header[8]; v[13] = __byte_perm_64(v[13] ^ v[2], 0x5432, 0x1076); v[8] = v[8] + v[13]; v[7] = ROTR64(v[7] ^ v[8], 63); + v[3] = v[3] + v[4] + header[3]; v[14] = 
__swap_hilo(v[14] ^ v[3]); v[9] = v[9] + v[14]; v[4] = __byte_perm_64(v[4] ^ v[9], 0x6543, 0x2107); + v[3] = v[3] + v[4]; v[14] = __byte_perm_64(v[14] ^ v[3], 0x5432, 0x1076); v[9] = v[9] + v[14]; v[4] = ROTR64(v[4] ^ v[9], 63); + + v[0] = v[0] + v[4] + header[2]; v[12] = __swap_hilo(v[12] ^ v[0]); v[8] = v[8] + v[12]; v[4] = __byte_perm_64(v[4] ^ v[8], 0x6543, 0x2107); + v[0] = v[0] + v[4]; v[12] = __byte_perm_64(v[12] ^ v[0], 0x5432, 0x1076); v[8] = v[8] + v[12]; v[4] = ROTR64(v[4] ^ v[8], 63); + v[1] = v[1] + v[5] + header[6]; v[13] = __swap_hilo(v[13] ^ v[1]); v[9] = v[9] + v[13]; v[5] = __byte_perm_64(v[5] ^ v[9], 0x6543, 0x2107); + v[1] = v[1] + v[5]; v[13] = __byte_perm_64(v[13] ^ v[1], 0x5432, 0x1076); v[9] = v[9] + v[13]; v[5] = ROTR64(v[5] ^ v[9], 63); + v[2] = v[2] + v[6] + header[0]; v[14] = __swap_hilo(v[14] ^ v[2]); v[10] = v[10] + v[14]; v[6] = __byte_perm_64(v[6] ^ v[10], 0x6543, 0x2107); + v[2] = v[2] + v[6]; v[14] = __byte_perm_64(v[14] ^ v[2], 0x5432, 0x1076); v[10] = v[10] + v[14]; v[6] = ROTR64(v[6] ^ v[10], 63); + v[3] = v[3] + v[7] + header[8]; v[15] = __swap_hilo(v[15] ^ v[3]); v[11] = v[11] + v[15]; v[7] = __byte_perm_64(v[7] ^ v[11], 0x6543, 0x2107); + v[3] = v[3] + v[7] + header[3]; v[15] = __byte_perm_64(v[15] ^ v[3], 0x5432, 0x1076); v[11] = v[11] + v[15]; v[7] = ROTR64(v[7] ^ v[11], 63); + v[0] = v[0] + v[5] + n; v[15] = __swap_hilo(v[15] ^ v[0]); v[10] = v[10] + v[15]; v[5] = __byte_perm_64(v[5] ^ v[10], 0x6543, 0x2107); + v[0] = v[0] + v[5]; v[15] = __byte_perm_64(v[15] ^ v[0], 0x5432, 0x1076); v[10] = v[10] + v[15]; v[5] = ROTR64(v[5] ^ v[10], 63); + v[1] = v[1] + v[6] + header[7]; v[12] = __swap_hilo(v[12] ^ v[1]); v[11] = v[11] + v[12]; v[6] = __byte_perm_64(v[6] ^ v[11], 0x6543, 0x2107); + v[1] = v[1] + v[6] + header[5]; v[12] = __byte_perm_64(v[12] ^ v[1], 0x5432, 0x1076); v[11] = v[11] + v[12]; v[6] = ROTR64(v[6] ^ v[11], 63); + v[2] = v[2] + v[7]; v[13] = __swap_hilo(v[13] ^ v[2]); v[8] = v[8] + v[13]; v[7] = 
__byte_perm_64(v[7] ^ v[8], 0x6543, 0x2107); + v[2] = v[2] + v[7]; v[13] = __byte_perm_64(v[13] ^ v[2], 0x5432, 0x1076); v[8] = v[8] + v[13]; v[7] = ROTR64(v[7] ^ v[8], 63); + v[3] = v[3] + v[4] + header[1]; v[14] = __swap_hilo(v[14] ^ v[3]); v[9] = v[9] + v[14]; v[4] = __byte_perm_64(v[4] ^ v[9], 0x6543, 0x2107); + v[3] = v[3] + v[4] + header[9]; v[14] = __byte_perm_64(v[14] ^ v[3], 0x5432, 0x1076); v[9] = v[9] + v[14]; v[4] = ROTR64(v[4] ^ v[9], 63); + + v[0] = v[0] + v[4]; v[12] = __swap_hilo(v[12] ^ v[0]); v[8] = v[8] + v[12]; v[4] = __byte_perm_64(v[4] ^ v[8], 0x6543, 0x2107); + v[0] = v[0] + v[4] + header[5]; v[12] = __byte_perm_64(v[12] ^ v[0], 0x5432, 0x1076); v[8] = v[8] + v[12]; v[4] = ROTR64(v[4] ^ v[8], 63); + v[1] = v[1] + v[5] + header[1]; v[13] = __swap_hilo(v[13] ^ v[1]); v[9] = v[9] + v[13]; v[5] = __byte_perm_64(v[5] ^ v[9], 0x6543, 0x2107); + v[1] = v[1] + v[5]; v[13] = __byte_perm_64(v[13] ^ v[1], 0x5432, 0x1076); v[9] = v[9] + v[13]; v[5] = ROTR64(v[5] ^ v[9], 63); + v[2] = v[2] + v[6]; v[14] = __swap_hilo(v[14] ^ v[2]); v[10] = v[10] + v[14]; v[6] = __byte_perm_64(v[6] ^ v[10], 0x6543, 0x2107); + v[2] = v[2] + v[6]; v[14] = __byte_perm_64(v[14] ^ v[2], 0x5432, 0x1076); v[10] = v[10] + v[14]; v[6] = ROTR64(v[6] ^ v[10], 63); + v[3] = v[3] + v[7] + n; v[15] = __swap_hilo(v[15] ^ v[3]); v[11] = v[11] + v[15]; v[7] = __byte_perm_64(v[7] ^ v[11], 0x6543, 0x2107); + v[3] = v[3] + v[7]; v[15] = __byte_perm_64(v[15] ^ v[3], 0x5432, 0x1076); v[11] = v[11] + v[15]; v[7] = ROTR64(v[7] ^ v[11], 63); + v[0] = v[0] + v[5] + header[0]; v[15] = __swap_hilo(v[15] ^ v[0]); v[10] = v[10] + v[15]; v[5] = __byte_perm_64(v[5] ^ v[10], 0x6543, 0x2107); + v[0] = v[0] + v[5] + header[7]; v[15] = __byte_perm_64(v[15] ^ v[0], 0x5432, 0x1076); v[10] = v[10] + v[15]; v[5] = ROTR64(v[5] ^ v[10], 63); + v[1] = v[1] + v[6] + header[6]; v[12] = __swap_hilo(v[12] ^ v[1]); v[11] = v[11] + v[12]; v[6] = __byte_perm_64(v[6] ^ v[11], 0x6543, 0x2107); + v[1] = v[1] + v[6] + 
header[3]; v[12] = __byte_perm_64(v[12] ^ v[1], 0x5432, 0x1076); v[11] = v[11] + v[12]; v[6] = ROTR64(v[6] ^ v[11], 63); + v[2] = v[2] + v[7] + header[9]; v[13] = __swap_hilo(v[13] ^ v[2]); v[8] = v[8] + v[13]; v[7] = __byte_perm_64(v[7] ^ v[8], 0x6543, 0x2107); + v[2] = v[2] + v[7] + header[2]; v[13] = __byte_perm_64(v[13] ^ v[2], 0x5432, 0x1076); v[8] = v[8] + v[13]; v[7] = ROTR64(v[7] ^ v[8], 63); + v[3] = v[3] + v[4] + header[8]; v[14] = __swap_hilo(v[14] ^ v[3]); v[9] = v[9] + v[14]; v[4] = __byte_perm_64(v[4] ^ v[9], 0x6543, 0x2107); + v[3] = v[3] + v[4]; v[14] = __byte_perm_64(v[14] ^ v[3], 0x5432, 0x1076); v[9] = v[9] + v[14]; v[4] = ROTR64(v[4] ^ v[9], 63); + + v[0] = v[0] + v[4]; v[12] = __swap_hilo(v[12] ^ v[0]); v[8] = v[8] + v[12]; v[4] = __byte_perm_64(v[4] ^ v[8], 0x6543, 0x2107); + v[0] = v[0] + v[4]; v[12] = __byte_perm_64(v[12] ^ v[0], 0x5432, 0x1076); v[8] = v[8] + v[12]; v[4] = ROTR64(v[4] ^ v[8], 63); + v[1] = v[1] + v[5] + header[7]; v[13] = __swap_hilo(v[13] ^ v[1]); v[9] = v[9] + v[13]; v[5] = __byte_perm_64(v[5] ^ v[9], 0x6543, 0x2107); + v[1] = v[1] + v[5]; v[13] = __byte_perm_64(v[13] ^ v[1], 0x5432, 0x1076); v[9] = v[9] + v[13]; v[5] = ROTR64(v[5] ^ v[9], 63); + v[2] = v[2] + v[6]; v[14] = __swap_hilo(v[14] ^ v[2]); v[10] = v[10] + v[14]; v[6] = __byte_perm_64(v[6] ^ v[10], 0x6543, 0x2107); + v[2] = v[2] + v[6] + header[1]; v[14] = __byte_perm_64(v[14] ^ v[2], 0x5432, 0x1076); v[10] = v[10] + v[14]; v[6] = ROTR64(v[6] ^ v[10], 63); + v[3] = v[3] + v[7] + header[3]; v[15] = __swap_hilo(v[15] ^ v[3]); v[11] = v[11] + v[15]; v[7] = __byte_perm_64(v[7] ^ v[11], 0x6543, 0x2107); + v[3] = v[3] + v[7] + header[9]; v[15] = __byte_perm_64(v[15] ^ v[3], 0x5432, 0x1076); v[11] = v[11] + v[15]; v[7] = ROTR64(v[7] ^ v[11], 63); + v[0] = v[0] + v[5] + header[5]; v[15] = __swap_hilo(v[15] ^ v[0]); v[10] = v[10] + v[15]; v[5] = __byte_perm_64(v[5] ^ v[10], 0x6543, 0x2107); + v[0] = v[0] + v[5] + header[0]; v[15] = __byte_perm_64(v[15] ^ v[0], 0x5432, 
0x1076); v[10] = v[10] + v[15]; v[5] = ROTR64(v[5] ^ v[10], 63); + v[1] = v[1] + v[6]; v[12] = __swap_hilo(v[12] ^ v[1]); v[11] = v[11] + v[12]; v[6] = __byte_perm_64(v[6] ^ v[11], 0x6543, 0x2107); + v[1] = v[1] + v[6] + n; v[12] = __byte_perm_64(v[12] ^ v[1], 0x5432, 0x1076); v[11] = v[11] + v[12]; v[6] = ROTR64(v[6] ^ v[11], 63); + v[2] = v[2] + v[7] + header[8]; v[13] = __swap_hilo(v[13] ^ v[2]); v[8] = v[8] + v[13]; v[7] = __byte_perm_64(v[7] ^ v[8], 0x6543, 0x2107); + v[2] = v[2] + v[7] + header[6]; v[13] = __byte_perm_64(v[13] ^ v[2], 0x5432, 0x1076); v[8] = v[8] + v[13]; v[7] = ROTR64(v[7] ^ v[8], 63); + v[3] = v[3] + v[4] + header[2]; v[14] = __swap_hilo(v[14] ^ v[3]); v[9] = v[9] + v[14]; v[4] = __byte_perm_64(v[4] ^ v[9], 0x6543, 0x2107); + v[3] = v[3] + v[4]; v[14] = __byte_perm_64(v[14] ^ v[3], 0x5432, 0x1076); v[9] = v[9] + v[14]; v[4] = ROTR64(v[4] ^ v[9], 63); + + v[0] = v[0] + v[4] + header[6]; v[12] = __swap_hilo(v[12] ^ v[0]); v[8] = v[8] + v[12]; v[4] = __byte_perm_64(v[4] ^ v[8], 0x6543, 0x2107); + v[0] = v[0] + v[4]; v[12] = __byte_perm_64(v[12] ^ v[0], 0x5432, 0x1076); v[8] = v[8] + v[12]; v[4] = ROTR64(v[4] ^ v[8], 63); + v[1] = v[1] + v[5]; v[13] = __swap_hilo(v[13] ^ v[1]); v[9] = v[9] + v[13]; v[5] = __byte_perm_64(v[5] ^ v[9], 0x6543, 0x2107); + v[1] = v[1] + v[5] + header[9]; v[13] = __byte_perm_64(v[13] ^ v[1], 0x5432, 0x1076); v[9] = v[9] + v[13]; v[5] = ROTR64(v[5] ^ v[9], 63); + v[2] = v[2] + v[6]; v[14] = __swap_hilo(v[14] ^ v[2]); v[10] = v[10] + v[14]; v[6] = __byte_perm_64(v[6] ^ v[10], 0x6543, 0x2107); + v[2] = v[2] + v[6] + header[3]; v[14] = __byte_perm_64(v[14] ^ v[2], 0x5432, 0x1076); v[10] = v[10] + v[14]; v[6] = ROTR64(v[6] ^ v[10], 63); + v[3] = v[3] + v[7] + header[0]; v[15] = __swap_hilo(v[15] ^ v[3]); v[11] = v[11] + v[15]; v[7] = __byte_perm_64(v[7] ^ v[11], 0x6543, 0x2107); + v[3] = v[3] + v[7] + header[8]; v[15] = __byte_perm_64(v[15] ^ v[3], 0x5432, 0x1076); v[11] = v[11] + v[15]; v[7] = ROTR64(v[7] ^ v[11], 63); + 
v[0] = v[0] + v[5]; v[15] = __swap_hilo(v[15] ^ v[0]); v[10] = v[10] + v[15]; v[5] = __byte_perm_64(v[5] ^ v[10], 0x6543, 0x2107); + v[0] = v[0] + v[5] + header[2]; v[15] = __byte_perm_64(v[15] ^ v[0], 0x5432, 0x1076); v[10] = v[10] + v[15]; v[5] = ROTR64(v[5] ^ v[10], 63); + v[1] = v[1] + v[6]; v[12] = __swap_hilo(v[12] ^ v[1]); v[11] = v[11] + v[12]; v[6] = __byte_perm_64(v[6] ^ v[11], 0x6543, 0x2107); + v[1] = v[1] + v[6] + header[7]; v[12] = __byte_perm_64(v[12] ^ v[1], 0x5432, 0x1076); v[11] = v[11] + v[12]; v[6] = ROTR64(v[6] ^ v[11], 63); + v[2] = v[2] + v[7] + header[1]; v[13] = __swap_hilo(v[13] ^ v[2]); v[8] = v[8] + v[13]; v[7] = __byte_perm_64(v[7] ^ v[8], 0x6543, 0x2107); + v[2] = v[2] + v[7] + n; v[13] = __byte_perm_64(v[13] ^ v[2], 0x5432, 0x1076); v[8] = v[8] + v[13]; v[7] = ROTR64(v[7] ^ v[8], 63); + v[3] = v[3] + v[4]; v[14] = __swap_hilo(v[14] ^ v[3]); v[9] = v[9] + v[14]; v[4] = __byte_perm_64(v[4] ^ v[9], 0x6543, 0x2107); + v[3] = v[3] + v[4] + header[5]; v[14] = __byte_perm_64(v[14] ^ v[3], 0x5432, 0x1076); v[9] = v[9] + v[14]; v[4] = ROTR64(v[4] ^ v[9], 63); + + v[0] = v[0] + v[4]; v[12] = __swap_hilo(v[12] ^ v[0]); v[8] = v[8] + v[12]; v[4] = __byte_perm_64(v[4] ^ v[8], 0x6543, 0x2107); + v[0] = v[0] + v[4] + header[2]; v[12] = __byte_perm_64(v[12] ^ v[0], 0x5432, 0x1076); v[8] = v[8] + v[12]; v[4] = ROTR64(v[4] ^ v[8], 63); + v[1] = v[1] + v[5] + header[8]; v[13] = __swap_hilo(v[13] ^ v[1]); v[9] = v[9] + v[13]; v[5] = __byte_perm_64(v[5] ^ v[9], 0x6543, 0x2107); + v[1] = v[1] + v[5] + n; v[13] = __byte_perm_64(v[13] ^ v[1], 0x5432, 0x1076); v[9] = v[9] + v[13]; v[5] = ROTR64(v[5] ^ v[9], 63); + v[2] = v[2] + v[6] + header[7]; v[14] = __swap_hilo(v[14] ^ v[2]); v[10] = v[10] + v[14]; v[6] = __byte_perm_64(v[6] ^ v[10], 0x6543, 0x2107); + v[2] = v[2] + v[6] + header[6]; v[14] = __byte_perm_64(v[14] ^ v[2], 0x5432, 0x1076); v[10] = v[10] + v[14]; v[6] = ROTR64(v[6] ^ v[10], 63); + v[3] = v[3] + v[7] + header[1]; v[15] = __swap_hilo(v[15] ^ 
v[3]); v[11] = v[11] + v[15]; v[7] = __byte_perm_64(v[7] ^ v[11], 0x6543, 0x2107); + v[3] = v[3] + v[7] + header[5]; v[15] = __byte_perm_64(v[15] ^ v[3], 0x5432, 0x1076); v[11] = v[11] + v[15]; v[7] = ROTR64(v[7] ^ v[11], 63); + v[0] = v[0] + v[5]; v[15] = __swap_hilo(v[15] ^ v[0]); v[10] = v[10] + v[15]; v[5] = __byte_perm_64(v[5] ^ v[10], 0x6543, 0x2107); + v[0] = v[0] + v[5]; v[15] = __byte_perm_64(v[15] ^ v[0], 0x5432, 0x1076); v[10] = v[10] + v[15]; v[5] = ROTR64(v[5] ^ v[10], 63); + v[1] = v[1] + v[6] + header[9]; v[12] = __swap_hilo(v[12] ^ v[1]); v[11] = v[11] + v[12]; v[6] = __byte_perm_64(v[6] ^ v[11], 0x6543, 0x2107); + v[1] = v[1] + v[6]; v[12] = __byte_perm_64(v[12] ^ v[1], 0x5432, 0x1076); v[11] = v[11] + v[12]; v[6] = ROTR64(v[6] ^ v[11], 63); + v[2] = v[2] + v[7] + header[3]; v[13] = __swap_hilo(v[13] ^ v[2]); v[8] = v[8] + v[13]; v[7] = __byte_perm_64(v[7] ^ v[8], 0x6543, 0x2107); + v[2] = v[2] + v[7]; v[13] = __byte_perm_64(v[13] ^ v[2], 0x5432, 0x1076); v[8] = v[8] + v[13]; v[7] = ROTR64(v[7] ^ v[8], 63); + v[3] = v[3] + v[4]; v[14] = __swap_hilo(v[14] ^ v[3]); v[9] = v[9] + v[14]; v[4] = __byte_perm_64(v[4] ^ v[9], 0x6543, 0x2107); + v[3] = v[3] + v[4] + header[0]; v[14] = __byte_perm_64(v[14] ^ v[3], 0x5432, 0x1076); v[9] = v[9] + v[14]; v[4] = ROTR64(v[4] ^ v[9], 63); + + v[0] = v[0] + v[4] + header[0]; v[12] = __swap_hilo(v[12] ^ v[0]); v[8] = v[8] + v[12]; v[4] = __byte_perm_64(v[4] ^ v[8], 0x6543, 0x2107); + v[0] = v[0] + v[4] + header[1]; v[12] = __byte_perm_64(v[12] ^ v[0], 0x5432, 0x1076); v[8] = v[8] + v[12]; v[4] = ROTR64(v[4] ^ v[8], 63); + v[1] = v[1] + v[5] + header[2]; v[13] = __swap_hilo(v[13] ^ v[1]); v[9] = v[9] + v[13]; v[5] = __byte_perm_64(v[5] ^ v[9], 0x6543, 0x2107); + v[1] = v[1] + v[5] + header[3]; v[13] = __byte_perm_64(v[13] ^ v[1], 0x5432, 0x1076); v[9] = v[9] + v[13]; v[5] = ROTR64(v[5] ^ v[9], 63); + v[2] = v[2] + v[6] + n; v[14] = __swap_hilo(v[14] ^ v[2]); v[10] = v[10] + v[14]; v[6] = __byte_perm_64(v[6] ^ v[10], 
0x6543, 0x2107); + v[2] = v[2] + v[6] + header[5]; v[14] = __byte_perm_64(v[14] ^ v[2], 0x5432, 0x1076); v[10] = v[10] + v[14]; v[6] = ROTR64(v[6] ^ v[10], 63); + v[3] = v[3] + v[7] + header[6]; v[15] = __swap_hilo(v[15] ^ v[3]); v[11] = v[11] + v[15]; v[7] = __byte_perm_64(v[7] ^ v[11], 0x6543, 0x2107); + v[3] = v[3] + v[7] + header[7]; v[15] = __byte_perm_64(v[15] ^ v[3], 0x5432, 0x1076); v[11] = v[11] + v[15]; v[7] = ROTR64(v[7] ^ v[11], 63); + v[0] = v[0] + v[5] + header[8]; v[15] = __swap_hilo(v[15] ^ v[0]); v[10] = v[10] + v[15]; v[5] = __byte_perm_64(v[5] ^ v[10], 0x6543, 0x2107); + v[0] = v[0] + v[5] + header[9]; v[15] = __byte_perm_64(v[15] ^ v[0], 0x5432, 0x1076); v[10] = v[10] + v[15]; v[5] = ROTR64(v[5] ^ v[10], 63); + v[1] = v[1] + v[6]; v[12] = __swap_hilo(v[12] ^ v[1]); v[11] = v[11] + v[12]; v[6] = __byte_perm_64(v[6] ^ v[11], 0x6543, 0x2107); + v[1] = v[1] + v[6]; v[12] = __byte_perm_64(v[12] ^ v[1], 0x5432, 0x1076); v[11] = v[11] + v[12]; v[6] = ROTR64(v[6] ^ v[11], 63); + v[2] = v[2] + v[7]; v[13] = __swap_hilo(v[13] ^ v[2]); v[8] = v[8] + v[13]; v[7] = __byte_perm_64(v[7] ^ v[8], 0x6543, 0x2107); + v[2] = v[2] + v[7]; v[13] = __byte_perm_64(v[13] ^ v[2], 0x5432, 0x1076); v[8] = v[8] + v[13]; v[7] = ROTR64(v[7] ^ v[8], 63); + v[3] = v[3] + v[4]; v[14] = __swap_hilo(v[14] ^ v[3]); v[9] = v[9] + v[14]; v[4] = __byte_perm_64(v[4] ^ v[9], 0x6543, 0x2107); + v[3] = v[3] + v[4]; v[14] = __byte_perm_64(v[14] ^ v[3], 0x5432, 0x1076); v[9] = v[9] + v[14]; v[4] = ROTR64(v[4] ^ v[9], 63); + + v[0] = v[0] + v[4]; v[12] = __swap_hilo(v[12] ^ v[0]); v[8] = v[8] + v[12]; v[4] = __byte_perm_64(v[4] ^ v[8], 0x6543, 0x2107); + v[0] = v[0] + v[4]; v[12] = __byte_perm_64(v[12] ^ v[0], 0x5432, 0x1076); v[8] = v[8] + v[12]; v[4] = ROTR64(v[4] ^ v[8], 63); + v[1] = v[1] + v[5] + n; v[13] = __swap_hilo(v[13] ^ v[1]); v[9] = v[9] + v[13]; v[5] = __byte_perm_64(v[5] ^ v[9], 0x6543, 0x2107); + v[1] = v[1] + v[5] + header[8]; v[13] = __byte_perm_64(v[13] ^ v[1], 0x5432, 
0x1076); v[9] = v[9] + v[13]; v[5] = ROTR64(v[5] ^ v[9], 63);
+		v[2] = v[2] + v[6] + header[9]; v[14] = __swap_hilo(v[14] ^ v[2]); v[10] = v[10] + v[14]; v[6] = __byte_perm_64(v[6] ^ v[10], 0x6543, 0x2107);
+		v[2] = v[2] + v[6]; v[14] = __byte_perm_64(v[14] ^ v[2], 0x5432, 0x1076); v[10] = v[10] + v[14]; v[6] = ROTR64(v[6] ^ v[10], 63);
+		v[3] = v[3] + v[7]; v[15] = __swap_hilo(v[15] ^ v[3]); v[11] = v[11] + v[15]; v[7] = __byte_perm_64(v[7] ^ v[11], 0x6543, 0x2107);
+		v[3] = v[3] + v[7] + header[6]; v[15] = __byte_perm_64(v[15] ^ v[3], 0x5432, 0x1076); v[11] = v[11] + v[15]; v[7] = ROTR64(v[7] ^ v[11], 63);
+		// final round is only partially evaluated: just the lanes that feed
+		// the first 64 bits of the digest (h[0]) are computed below
+		v[0] = v[0] + v[5] + header[1];
+		v[0] = v[0] + __byte_perm_64(v[5] ^ (v[10] + __swap_hilo(v[15] ^ v[0])), 0x6543, 0x2107);
+		v[2] = v[2] + v[7];
+		v[13] = __swap_hilo(v[13] ^ v[2]);
+		v[8] = v[8] + v[13];
+		v[2] = v[2] + __byte_perm_64(v[7] ^ v[8], 0x6543, 0x2107) + header[7];
+
+		if(cuda_swab64(0x6A09E667F2BDC928 ^ v[0] ^ (v[8] + __byte_perm_64(v[13] ^ v[2], 0x5432, 0x1076))) < target)
+		{
+			int i = atomicAdd(&numberofresults, 1);
+			if(i < MAXRESULTS)
+				nonceOut[i] = n & 0xffffffff;
+			return;
+		}
+	}
+}
+
+// Host wrapper: searches `threads` nonces starting at `startnonce` on the GPU
+// and copies the result slots back into pinned host memory `nonceOut`.
+// `threads` must be a multiple of blocksize*npt (the caller rounds it down).
+void sia_gpu_hash(cudaStream_t cudastream, int thr_id, uint32_t threads, uint32_t *nonceOut, uint64_t target, uint64_t startnonce)
+{
+	// each block runs `blocksize` threads and each thread tests `npt` nonces,
+	// so the grid needs threads / (blocksize * npt) blocks
+	siakernel<<<threads / (blocksize * npt), blocksize, 0, cudastream>>>(nonceOut_d, target, startnonce);
+	CUDA_SAFE_CALL(cudaGetLastError());
+	CUDA_SAFE_CALL(cudaMemcpyAsync(nonceOut, nonceOut_d, 4 * MAXRESULTS, cudaMemcpyDeviceToHost, cudastream));
+	CUDA_SAFE_CALL(cudaStreamSynchronize(cudastream));
+}
+
+// Per-thread one-time allocation of host/device buffers used by the scanner.
+void sia_gpu_init(int thr_id)
+{
+	CUDA_SAFE_CALL(cudaMallocHost(&vpre_h, 16 * 8));
+	CUDA_SAFE_CALL(cudaMalloc(&nonceOut_d, MAXRESULTS * 4));
+	CUDA_SAFE_CALL(cudaMalloc(&hash_d, 4 * 8));
+}
+
+// Precompute the nonce-independent part of the BLAKE2b state from the block
+// header and upload it (plus the header) to constant memory.
+void sia_precalc(cudaStream_t cudastream, const uint64_t *blockHeader)
+{
+	vpre_h[0] = 0xBB1838E7A0A44BF9u + blockHeader[0]; vpre_h[12] = ROTR64(0x510E527FADE68281u ^ vpre_h[0], 32); vpre_h[8] = 0x6a09e667f3bcc908u + vpre_h[12]; vpre_h[4] = ROTR64(0x510e527fade682d1u ^
vpre_h[8], 24); + vpre_h[0] = vpre_h[0] + vpre_h[4] + blockHeader[1]; vpre_h[12] = ROTR64(vpre_h[12] ^ vpre_h[0], 16); vpre_h[8] = vpre_h[8] + vpre_h[12]; vpre_h[4] = ROTR64(vpre_h[4] ^ vpre_h[8], 63); + vpre_h[1] = 0x566D1711B009135Au + blockHeader[2]; vpre_h[13] = ROTR64(0x9b05688c2b3e6c1fu ^ vpre_h[1], 32); vpre_h[9] = 0xbb67ae8584caa73bu + vpre_h[13]; vpre_h[5] = ROTR64(0x9b05688c2b3e6c1fu ^ vpre_h[9], 24); + vpre_h[1] = vpre_h[1] + vpre_h[5] + blockHeader[3]; vpre_h[13] = ROTR64(vpre_h[13] ^ vpre_h[1], 16); vpre_h[9] = vpre_h[9] + vpre_h[13]; vpre_h[5] = ROTR64(vpre_h[5] ^ vpre_h[9], 63); + + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(vpre, vpre_h, 16 * 8, 0, cudaMemcpyHostToDevice, cudastream)); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(header, blockHeader, 10 * 8, 0, cudaMemcpyHostToDevice, cudastream)); + CUDA_SAFE_CALL(cudaMemsetAsync(nonceOut_d, 0, 4 * MAXRESULTS, cudastream)); +} \ No newline at end of file diff --git a/Sia/sia.cu b/Sia/sia.cu new file mode 100644 index 0000000000..ad02c1d569 --- /dev/null +++ b/Sia/sia.cu @@ -0,0 +1,306 @@ +/*- +* blake2b C code from https://github.com/SiaMining/sgminer/blob/master/algorithm/sia.c +* +* Copyright 2009 Colin Percival, 2014 savale +* All rights reserved. +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* 1. Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* 2. Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the distribution. +* +* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +* ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +* SUCH DAMAGE. +* +* This file was originally written by Colin Percival as part of the Tarsnap +* online backup system. +*/ +#include "miner.h" +#include "cuda_helper.h" +#include +using namespace std; +#include +#include "sia.h" + +extern void applog_hex(void *data, int len); +extern bool fulltest_sia(const uint64_t *hash, const uint64_t *target); + +#define B2B_GET64(p) \ + (((uint64_t) ((uint8_t *) (p))[0]) ^ \ + (((uint64_t) ((uint8_t *) (p))[1]) << 8) ^ \ + (((uint64_t) ((uint8_t *) (p))[2]) << 16) ^ \ + (((uint64_t) ((uint8_t *) (p))[3]) << 24) ^ \ + (((uint64_t) ((uint8_t *) (p))[4]) << 32) ^ \ + (((uint64_t) ((uint8_t *) (p))[5]) << 40) ^ \ + (((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \ + (((uint64_t) ((uint8_t *) (p))[7]) << 56)) + +#define B2B_G(a, b, c, d, x, y) { \ + v[a] = v[a] + v[b] + x; \ + v[d] = ROTR64(v[d] ^ v[a], 32); \ + v[c] = v[c] + v[d]; \ + v[b] = ROTR64(v[b] ^ v[c], 24); \ + v[a] = v[a] + v[b] + y; \ + v[d] = ROTR64(v[d] ^ v[a], 16); \ + v[c] = v[c] + v[d]; \ + v[b] = ROTR64(v[b] ^ v[c], 63); } + +static const uint64_t blake2b_iv[8] = +{ + 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, + 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1, + 0x510E527FADE682D1, 0x9B05688C2B3E6C1F, + 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179 +}; + +typedef struct +{ + uint8_t b[128]; // input buffer + uint64_t h[8]; // chained state + uint64_t t[2]; // total number of bytes + size_t c; // pointer for b[] + size_t outlen; // digest size +} blake2b_ctx; + +void 
blake2b_update(blake2b_ctx *ctx, const void *in, size_t inlen);
+
+// BLAKE2b compression function; processes the 128-byte buffer in ctx->b.
+// "last" indicates the final block (sets the finalization flag on v[14]).
+static void blake2b_compress(blake2b_ctx *ctx, int last)
+{
+	// message word permutation schedule (rounds 10/11 repeat rounds 0/1);
+	// static so the table is not rebuilt on every call
+	static const uint8_t sigma[12][16] =
+	{
+		{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+		{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
+		{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
+		{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
+		{9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
+		{2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
+		{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
+		{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
+		{6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
+		{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
+		{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+		{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}
+	};
+	int i;
+	uint64_t v[16], m[16];
+
+	for(i = 0; i < 8; i++)
+	{	// init work variables
+		v[i] = ctx->h[i];
+		v[i + 8] = blake2b_iv[i];
+	}
+
+	v[12] ^= ctx->t[0];		// low 64 bits of offset
+	v[13] ^= ctx->t[1];		// high 64 bits
+	if(last)				// last block flag set ?
+		v[14] = ~v[14];
+
+	for(i = 0; i < 16; i++)	// get little-endian words
+		m[i] = B2B_GET64(&ctx->b[8 * i]);
+
+	for(i = 0; i < 12; i++)
+	{	// twelve rounds
+		B2B_G(0, 4, 8, 12, m[sigma[i][0]], m[sigma[i][1]]);
+		B2B_G(1, 5, 9, 13, m[sigma[i][2]], m[sigma[i][3]]);
+		B2B_G(2, 6, 10, 14, m[sigma[i][4]], m[sigma[i][5]]);
+		B2B_G(3, 7, 11, 15, m[sigma[i][6]], m[sigma[i][7]]);
+		B2B_G(0, 5, 10, 15, m[sigma[i][8]], m[sigma[i][9]]);
+		B2B_G(1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
+		B2B_G(2, 7, 8, 13, m[sigma[i][12]], m[sigma[i][13]]);
+		B2B_G(3, 4, 9, 14, m[sigma[i][14]], m[sigma[i][15]]);
+	}
+
+	for(i = 0; i < 8; ++i)
+		ctx->h[i] ^= v[i] ^ v[i + 8];
+}
+
+// Initialize the hashing context "ctx" with optional key "key".
+// 1 <= outlen <= 64 gives the digest size in bytes.
+// Secret key (also <= 64 bytes) is optional (keylen = 0).
+int blake2b_init(blake2b_ctx *ctx, size_t outlen, const void *key, size_t keylen)	// (keylen=0: no key)
+{
+	size_t i;
+
+	if(outlen == 0 || outlen > 64 || keylen > 64)
+		return -1;				// illegal parameters
+
+	for(i = 0; i < 8; i++)		// state, "param block"
+		ctx->h[i] = blake2b_iv[i];
+	ctx->h[0] ^= 0x01010000 ^ (keylen << 8) ^ outlen;
+
+	ctx->t[0] = 0;				// input count low word
+	ctx->t[1] = 0;				// input count high word
+	ctx->c = 0;					// pointer within buffer
+	ctx->outlen = outlen;
+
+	for(i = keylen; i < 128; i++)	// zero input block
+		ctx->b[i] = 0;
+	if(keylen > 0)
+	{
+		blake2b_update(ctx, key, keylen);
+		ctx->c = 128;			// at the end
+	}
+
+	return 0;
+}
+
+// Add "inlen" bytes from "in" into the hash.
+void blake2b_update(blake2b_ctx *ctx, const void *in, size_t inlen)
+{
+	size_t i;
+
+	for(i = 0; i < inlen; i++)
+	{
+		if(ctx->c == 128)
+		{	// buffer full ?
+			ctx->t[0] += ctx->c;		// add counters
+			if(ctx->t[0] < ctx->c)		// carry overflow ?
+				ctx->t[1]++;			// high word
+			blake2b_compress(ctx, 0);	// compress (not last)
+			ctx->c = 0;					// counter to zero
+		}
+		ctx->b[ctx->c++] = ((const uint8_t *)in)[i];
+	}
+}
+
+// Generate the message digest (size given in init).
+// Result placed in "out".
+void blake2b_final(blake2b_ctx *ctx, void *out) +{ + size_t i; + + ctx->t[0] += ctx->c; // mark last block offset + if(ctx->t[0] < ctx->c) // carry overflow + ctx->t[1]++; // high word + + while(ctx->c < 128) // fill up with zeros + ctx->b[ctx->c++] = 0; + blake2b_compress(ctx, 1); // final block flag = 1 + + // little endian convert and store + for(i = 0; i < ctx->outlen; i++) + { + ((uint8_t *)out)[i] = + (ctx->h[i >> 3] >> (8 * (i & 7))) & 0xFF; + } +} + +void siahash(const void *data, unsigned int len, void *hash) +{ + blake2b_ctx ctx; + blake2b_init(&ctx, 32, NULL, 0); + blake2b_update(&ctx, data, len); + blake2b_final(&ctx, hash); +} + +/***************************************************************************/ + +int scanhash_sia(int thr_id, uint32_t *pdata, uint32_t *ptarget, uint32_t max_nonce, uint32_t *hashes_done) +{ + static THREAD uint32_t *h_nounce = nullptr; + const uint32_t first_nonce = pdata[8]; + static THREAD uint32_t throughputmax; + + if(opt_benchmark) + ptarget[7] = 0x00000001; + + static THREAD volatile bool init = false; + if(!init) + { + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + CUDA_SAFE_CALL(cudaMallocHost(&h_nounce, MAXRESULTS * sizeof(uint32_t))); + sia_gpu_init(thr_id); + + throughputmax = device_intensity(device_map[thr_id], __func__, 1U << 28); + mining_has_stopped[thr_id] = false; + init = true; + } + uint32_t throughput = min(throughputmax, (max_nonce - first_nonce)); + throughput -= throughput % (blocksize*npt); + + sia_precalc(gpustream[thr_id], (uint64_t *)pdata); + + uint32_t endiandata[20]; + for(int k = 0; k < 20; k++) + le32enc(&endiandata[k], pdata[k]); + + do + { + sia_gpu_hash(gpustream[thr_id], thr_id, throughput, h_nounce, ((uint64_t*)ptarget)[3], ((uint64_t*)pdata)[4]); + if(stop_mining) + { + cudaDeviceSynchronize(); + 
cudaStreamDestroy(gpustream[thr_id]); + cudaProfilerStop(); + mining_has_stopped[thr_id] = true; + pthread_exit(nullptr); + } + if(h_nounce[0] != 0) + { + const uint64_t Htarg = ((uint64_t*)ptarget)[3]; + uint64_t vhash64[4] = {0}; + if(opt_verify) + { + le32enc(&endiandata[8], h_nounce[0]); + siahash(endiandata, 80, vhash64); + } + if(swab64(vhash64[0]) <= Htarg && fulltest_sia(vhash64, (uint64_t*)ptarget)) + { + int res = 1; + *hashes_done = pdata[8] - first_nonce + throughput; + if(opt_benchmark || opt_debug) applog(LOG_INFO, "GPU #%d: Found nonce %08x", device_map[thr_id], h_nounce[0]); + // check if there was some other ones... + if(h_nounce[1] != 0) + { + if(opt_verify) + { + le32enc(&endiandata[8], h_nounce[1]); + siahash(vhash64, 80, endiandata); + + } + if(swab64(vhash64[0]) <= Htarg && fulltest_sia(vhash64, (uint64_t*)ptarget)) + { + pdata[20] = h_nounce[1]; + res++; + if(opt_benchmark || opt_debug) applog(LOG_INFO, "GPU #%d: Found second nonce", device_map[thr_id]); + } + else + { + if(vhash64[0] != Htarg) // don't show message if it is equal but fails fulltest + applog(LOG_INFO, "GPU #%d: result does not validate on CPU!", device_map[thr_id]); + } + } + pdata[8] = h_nounce[0]; +// applog(LOG_INFO, "hashes done = %08x", *hashes_done); + return res; + } + else + { + if(vhash64[0] != Htarg) // don't show message if it is equal but fails fulltest + applog(LOG_INFO, "GPU #%d: result does not validate on CPU!", device_map[thr_id]); + } + } + pdata[8] += throughput; + CUDA_SAFE_CALL(cudaGetLastError()); + + } while(!work_restart[thr_id].restart && ((uint64_t)max_nonce >((uint64_t)pdata[8] + (uint64_t)throughput))); + *hashes_done = pdata[8] - first_nonce; + return 0; +} \ No newline at end of file diff --git a/Sia/sia.h b/Sia/sia.h new file mode 100644 index 0000000000..e180cf6363 --- /dev/null +++ b/Sia/sia.h @@ -0,0 +1,10 @@ +#pragma once + +#define MAXRESULTS 8 + +#define npt 1 +#define blocksize 512 + +void sia_gpu_init(int thr_id); +void 
sia_precalc(cudaStream_t cudastream, const uint64_t *blockHeader); +void sia_gpu_hash(cudaStream_t cudastream, int thr_id, uint32_t threads, uint32_t *nonceOut, uint64_t target, uint64_t startnonce); diff --git a/api.cpp b/api.cpp index c1860f6d18..38572f1456 100644 --- a/api.cpp +++ b/api.cpp @@ -15,19 +15,18 @@ //# include #endif -#include +#include #include -#include -#include -#include +#include +#include #include #include #include -#include -#include -#include -#include - +#include +#include +#include +#include +using namespace std; #include #include @@ -90,7 +89,7 @@ static time_t startup = 0; static int bye = 0; extern char *opt_api_allow; -extern int opt_api_listen; /* port */ +extern uint16_t opt_api_listen; /* port */ extern uint32_t accepted_count; extern uint32_t rejected_count; extern int num_cpus; @@ -122,6 +121,7 @@ static void gpustatus(int thr_id) cgpu->gpu_temp = gpu_temp(cgpu); cgpu->gpu_fan = (uint16_t) gpu_fanpercent(cgpu); cgpu->gpu_fan_rpm = (uint16_t) gpu_fanrpm(cgpu); + cgpu->gpu_power = gpu_power(cgpu); #endif cuda_gpu_clocks(cgpu); @@ -270,12 +270,7 @@ static const char* os_name() #ifdef WIN32 return "windows"; #else - FILE *fd = fopen("/proc/version", "r"); - if (!fd || !fscanf(fd, "Linux version %48s", &os_version[6])) return "linux"; - fclose(fd); - os_version[48] = '\0'; - return (const char*) os_version; #endif } @@ -426,8 +421,10 @@ static size_t base64_encode(const uchar *indata, size_t insize, char *outptr, si memset(outptr, 0, outlen); outbuf = output = (char*)calloc(1, inlen * 4 / 3 + 4); - if (outbuf == NULL) { - return -1; + if(outbuf == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(1); } while (inlen > 0) { @@ -479,7 +476,7 @@ static size_t base64_encode(const uchar *indata, size_t insize, char *outptr, si return len; } -#include "compat/curl-for-windows/openssl/openssl/crypto/sha/sha.h" +#include "openssl/sha.h" /* websocket handshake (tested in Chrome) */ static int websocket_handshake(SOCKETTYPE c, char 
*result, char *clientkey) @@ -539,8 +536,11 @@ static int websocket_handshake(SOCKETTYPE c, char *result, char *clientkey) size_t handlen = strlen(answer); uchar *data = (uchar*) calloc(1, handlen + frames + (size_t) datalen + 1); - if (data == NULL) - return -1; + if(data == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(1); + } else { uchar *p = data; // HTTP header 101 @@ -565,8 +565,11 @@ static void setup_ipaccess() char group; buf = (char*) calloc(1, strlen(opt_api_allow) + 1); - if (unlikely(!buf)) - proper_exit(1);//, "Failed to malloc ipaccess buf"); + if(buf == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(1); + } strcpy(buf, opt_api_allow); ipcount = 1; @@ -576,8 +579,11 @@ static void setup_ipaccess() // possibly more than needed, but never less ipaccess = (struct IP4ACCESS *) calloc(ipcount, sizeof(struct IP4ACCESS)); - if (unlikely(!ipaccess)) - proper_exit(1);//, "Failed to calloc ipaccess"); + if(ipaccess == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(1); + } ips = 0; ptr = buf; @@ -676,11 +682,11 @@ static bool check_connect(struct sockaddr_in *cli, char **connectaddr, char *gro static void api() { const char *addr = opt_api_allow; - short int port = opt_api_listen; // 4068 + uint16_t port = opt_api_listen; // 4068 char buf[MYBUFSIZ]; int c, n, bound; - char *connectaddr; - char *binderror; + char *connectaddr = nullptr; + char *binderror = nullptr; char group; time_t bindstart; struct sockaddr_in serv; @@ -688,11 +694,11 @@ static void api() socklen_t clisiz; bool addrok = false; long long counter; - char *result; - char *params; + char *result = nullptr; + char *params = nullptr; int i; - SOCKETTYPE *apisock; + SOCKETTYPE *apisock = nullptr; if (!opt_api_listen && opt_debug) { applog(LOG_DEBUG, "API disabled"); return; @@ -706,6 +712,11 @@ static void api() } apisock = (SOCKETTYPE*) calloc(1, sizeof(*apisock)); + if(apisock == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(1); + } 
*apisock = INVSOCK; sleep(1); @@ -748,14 +759,29 @@ static void api() binderror = strerror(errno); if ((time(NULL) - bindstart) > 61) break; - else { + else if (opt_api_listen == 4068) { + /* when port is default one, use first available */ + if (opt_debug) + applog(LOG_DEBUG, "API bind to port %d failed, trying port %u", + port, (uint32_t) port+1); + port++; + serv.sin_port = htons(port); + sleep(1); + } else { if (!opt_quiet || opt_debug) - applog(LOG_WARNING, "API bind to port %d failed - trying again in 20sec", port); + applog(LOG_WARNING, "API bind to port %u failed - trying again in 20sec", + (uint32_t) port); sleep(20); } } - else + else { bound = 1; + if (opt_api_listen != port) { + applog(LOG_WARNING, "API bind to port %d failed - using port %u", + opt_api_listen, (uint32_t) port); + opt_api_listen = port; + } + } } if (bound == 0) { @@ -772,13 +798,19 @@ static void api() } buffer = (char *) calloc(1, MYBUFSIZ + 1); + if(buffer == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(1); + } counter = 0; while (bye == 0) { counter++; clisiz = sizeof(cli); - if (SOCKETFAIL(c = accept(*apisock, (struct sockaddr *)(&cli), &clisiz))) { + c = accept(*apisock, (struct sockaddr*) (&cli), &clisiz); + if (SOCKETFAIL(c)) { applog(LOG_ERR, "API failed (%s)%s", strerror(errno), UNAVAILABLE); CLOSESOCKET(*apisock); free(apisock); diff --git a/autogen.sh b/autogen.sh index 8261a2c136..a4768b525a 100755 --- a/autogen.sh +++ b/autogen.sh @@ -1 +1 @@ -aclocal && autoheader && automake --add-missing --gnu --copy && autoconf +aclocal && autoheader && automake --add-missing --gnu --copy && autoconf diff --git a/bitcoin.cu b/bitcoin.cu index 6f8b2b1107..aecf49c35c 100644 --- a/bitcoin.cu +++ b/bitcoin.cu @@ -1,8 +1,6 @@ #include "miner.h" #include "cuda_helper.h" -static uint32_t *h_nounce[MAX_GPUS]; - extern void bitcoin_cpu_init(int thr_id); extern void bitcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *const ms, uint32_t merkle, 
uint32_t time, uint32_t compacttarget, uint32_t *const h_nounce); extern void bitcoin_midstate(const uint32_t *data, uint32_t *midstate); @@ -112,39 +110,42 @@ void bitcoin_hash(uint32_t *output, const uint32_t *data, uint32_t nonce, const b = a; a = t1 + t2; } - output[0] = a + hc[0]; - output[1] = b + hc[1]; - output[2] = c + hc[2]; - output[3] = d + hc[3]; - output[4] = e + hc[4]; - output[5] = f + hc[5]; - output[6] = g + hc[6]; - output[7] = h + hc[7]; + be32enc(&output[0], a + hc[0]); + be32enc(&output[1], b + hc[1]); + be32enc(&output[2], c + hc[2]); + be32enc(&output[3], d + hc[3]); + be32enc(&output[4], e + hc[4]); + be32enc(&output[5], f + hc[5]); + be32enc(&output[6], g + hc[6]); + be32enc(&output[7], h + hc[7]); } -static bool init[MAX_GPUS] = { 0 }; int scanhash_bitcoin(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { + static THREAD uint32_t *h_nounce = nullptr; + const uint32_t first_nonce = pdata[19]; - uint32_t throughput = device_intensity(thr_id, __func__, 1U << 24); - throughput = min(throughput, (max_nonce - first_nonce)); + uint32_t throughput = device_intensity(device_map[thr_id], __func__, 1U << 28); + throughput = min(throughput, (max_nonce - first_nonce)) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0005; + ptarget[7] = 0x0005; - if (!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); bitcoin_cpu_init(thr_id); - CUDA_SAFE_CALL(cudaMallocHost(&h_nounce[thr_id], 2 * sizeof(uint32_t))); - init[thr_id] = true; + CUDA_SAFE_CALL(cudaMallocHost(&h_nounce, 2 * sizeof(uint32_t))); + mining_has_stopped[thr_id] = false; + init 
= true; } uint32_t ms[8]; @@ -152,40 +153,52 @@ int scanhash_bitcoin(int thr_id, uint32_t *pdata, do { - bitcoin_cpu_hash(thr_id, (int)throughput, pdata[19], ms, pdata[16], pdata[17], pdata[18], h_nounce[thr_id]); - if (h_nounce[thr_id][0] != UINT32_MAX) + bitcoin_cpu_hash(thr_id, throughput, pdata[19], ms, pdata[16], pdata[17], pdata[18], h_nounce); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(h_nounce[0] != UINT32_MAX) { - uint32_t vhash64[8]; - bitcoin_hash(vhash64, pdata, h_nounce[thr_id][0], ms); - if (vhash64[7] == 0 && fulltest(vhash64, ptarget)) + uint32_t vhash64[8]={0}; + bitcoin_hash(vhash64, pdata, h_nounce[0], ms); + if (!opt_verify || (vhash64[7] == 0 && fulltest(vhash64, ptarget))) { int res = 1; // check if there was some other ones... *hashes_done = pdata[19] - first_nonce + throughput; - if (h_nounce[thr_id][1] != 0xffffffff) + if (h_nounce[1] != 0xffffffff) { - pdata[21] = h_nounce[thr_id][1]; - res++; - if (opt_benchmark) - applog(LOG_INFO, "GPU #%d Found second nounce %08x", thr_id, h_nounce[thr_id][1]); + bitcoin_hash(vhash64, pdata, h_nounce[1], ms); + if (!opt_verify || (vhash64[7] == 0 && fulltest(vhash64, ptarget))) + { + pdata[21] = h_nounce[1]; + res++; + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d Found second nounce %08x", device_map[thr_id], h_nounce[1]); + } + else + { + if (vhash64[7] > 0) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_nounce[1]); + } + } } - pdata[19] = h_nounce[thr_id][0]; + pdata[19] = h_nounce[0]; if (opt_benchmark) - applog(LOG_INFO, "GPU #%d Found nounce %08x", thr_id, h_nounce[thr_id][0]); + applog(LOG_INFO, "GPU #%d Found nounce %08x", device_map[thr_id], h_nounce[0]); return res; } else { if (vhash64[7] > 0) { - applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, h_nounce[thr_id][0]); + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate 
on CPU!", device_map[thr_id], h_nounce[0]); } } } - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/bitslice_transformations_quad.cu b/bitslice_transformations_quad.cu index ddbeb1aa81..acfd6e17a2 100644 --- a/bitslice_transformations_quad.cu +++ b/bitslice_transformations_quad.cu @@ -1,72 +1,81 @@ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 -/** - * __shfl() returns the value of var held by the thread whose ID is given by srcLane. - * If srcLane is outside the range 0..width-1, the thread's own value of var is returned. - */ -#undef __shfl -#define __shfl(var, srcLane, width) (uint32_t)(var) -#endif -#define merge8(z,x,y)\ - z=__byte_perm(x, y, 0x5140); \ +#define merge8(z, x, y, b)\ + z=__byte_perm(x, y, b); \ #define SWAP8(x,y)\ x=__byte_perm(x, y, 0x5410); \ y=__byte_perm(x, y, 0x7632); #define SWAP4(x,y)\ - t = (y<<4); \ - t = (x ^ t); \ - t = 0xf0f0f0f0UL & t; \ + t = 0xf0f0f0f0UL & (x ^ (y<<4)); \ x = (x ^ t); \ t= t>>4;\ y= y ^ t; +#ifndef NOASM +#define SWAP4_final(x,y)\ + asm("and.b32 %0, %0, 0x0f0f0f0f;"\ + "and.b32 %1, %1, 0x0f0f0f0f;"\ + "vshl.u32.u32.u32.clamp.add %0, %1, 4, %0;\n\t"\ + : "+r"(x) : "r"(y)); +#else +#define SWAP4_final(x,y)\ + t = 0xf0f0f0f0UL & (x ^ (y << 4)); \ + x = (x ^ (0xf0f0f0f0UL & (x ^ (y << 4)))); +#endif + #define SWAP2(x,y)\ - t = (y<<2); \ - t = (x ^ t); \ - t = 0xccccccccUL & t; \ + t = 0xccccccccUL & (x ^ (y<<2)); \ x = (x ^ t); \ t= t>>2;\ y= y ^ t; #define SWAP1(x,y)\ - t = (y+y); \ - t = (x ^ t); \ - t = 0xaaaaaaaaUL & t; \ + t = 0xaaaaaaaaUL & (x ^ (y<<1)); \ x = (x ^ t); \ - t= t>>1;\ - y= y ^ t; - + t = t>>1;\ + y = y ^ t; __device__ __forceinline__ void to_bitslice_quad(uint32_t *const __restrict__ input, uint32_t *const __restrict__ output) { - 
uint32_t other[8]; + uint32_t other[8]; uint32_t t; - #pragma unroll - for (int i = 0; i < 8; i++) + const uint32_t perm = (threadIdx.x & 1) ? 0x7362 : 0x5140; + const unsigned int n = threadIdx.x & 3; +#pragma unroll + for(int i = 0; i < 4; i++) { - const unsigned int n = threadIdx.x & 3; input[i] = __shfl((int)input[i], n ^ (3 * (n >= 1 && n <= 2)), 4); - other[i] = __shfl((int)input[i], (threadIdx.x + 1) & 3, 4); - input[i] = __shfl((int)input[i], threadIdx.x & 2, 4); - other[i] = __shfl((int)other[i], threadIdx.x & 2, 4); - if (threadIdx.x & 1) { - input[i] = __byte_perm(input[i], 0, 0x1032); - other[i] = __byte_perm(other[i], 0, 0x1032); - } - } - - merge8(output[0], input[0], input[4]); - merge8(output[1], other[0], other[4]); - merge8(output[2], input[1], input[5]); - merge8(output[3], other[1], other[5]); - merge8(output[4], input[2], input[6]); - merge8(output[5], other[2], other[6]); - merge8(output[6], input[3], input[7]); - merge8(output[7], other[3], other[7]); + other[i] = __shfl((int)input[i], (threadIdx.x + 1) & 3, 4); + input[i] = __shfl((int)input[i], threadIdx.x & 2, 4); + other[i] = __shfl((int)other[i], threadIdx.x & 2, 4); + } + + if((threadIdx.x & 3) < 2) + { + input[4] = 0x80; + } + else + { + input[4] = 0; + } + + if((threadIdx.x & 3) > 1) + other[7] = 0x01000000; + else + other[7] = 0; + input[7] = 0; + + merge8(output[0], input[0], input[4], perm); + merge8(output[1], other[0], 0, perm); + merge8(output[2], input[1], 0, perm); + merge8(output[3], other[1], 0, perm); + merge8(output[4], input[2], 0, perm); + merge8(output[5], other[2], 0, perm); + merge8(output[6], input[3], 0, perm); + merge8(output[7], other[3], other[7], perm); SWAP1(output[0], output[1]); SWAP1(output[2], output[3]); @@ -85,15 +94,67 @@ void to_bitslice_quad(uint32_t *const __restrict__ input, uint32_t *const __rest } __device__ __forceinline__ -void from_bitslice_quad(const uint32_t *const __restrict__ input, uint32_t *const __restrict__ output) +void 
myr_to_bitslice_quad(uint32_t *const __restrict__ input, uint32_t *const __restrict__ output) { + uint32_t other[8]; + uint32_t t; + + const uint32_t perm = (threadIdx.x & 1) ? 0x7362 : 0x5140; + const unsigned int n = threadIdx.x & 3; +#pragma unroll + for(int i = 0; i < 5; i++) + { + input[i] = __shfl((int)input[i], n ^ (3 * (n >= 1 && n <= 2)), 4); + other[i] = __shfl((int)input[i], (threadIdx.x + 1) & 3, 4); + input[i] = __shfl((int)input[i], threadIdx.x & 2, 4); + other[i] = __shfl((int)other[i], threadIdx.x & 2, 4); + } + if(n < 2) + { + input[5] = 0x80; + other[7] = 0; + } + else + { + input[5] = 0; + other[7] = 0x01000000; + } + + merge8(output[0], input[0], input[4], perm); + merge8(output[1], other[0], other[4], perm); + merge8(output[2], input[1], input[5], perm); + output[3] = __byte_perm(other[1], 0, perm); + output[4] = __byte_perm(input[2], 0, perm); + output[5] = __byte_perm(other[2], 0, perm); + output[6] = __byte_perm(input[3], 0, perm); + merge8(output[7], other[3], other[7], perm); + SWAP1(output[0], output[1]); + SWAP1(output[2], output[3]); + SWAP1(output[4], output[5]); + SWAP1(output[6], output[7]); + + SWAP2(output[0], output[2]); + SWAP2(output[1], output[3]); + SWAP2(output[4], output[6]); + SWAP2(output[5], output[7]); + + SWAP4(output[0], output[4]); + SWAP4(output[1], output[5]); + SWAP4(output[2], output[6]); + SWAP4(output[3], output[7]); +} + +__device__ __forceinline__ +void from_bitslice_quad(const uint32_t *const __restrict__ input, uint32_t *const __restrict__ output) +{ uint32_t t; + const uint32_t perm = 0x7531;//(threadIdx.x & 1) ? 
0x3175 : 0x7531; - output[0] = __byte_perm(input[0], input[4], 0x7531); - output[2] = __byte_perm(input[1], input[5], 0x7531); - output[8] = __byte_perm(input[2], input[6], 0x7531); - output[10] = __byte_perm(input[3], input[7], 0x7531); + output[0] = __byte_perm(input[0], input[4], perm); + output[2] = __byte_perm(input[1], input[5], perm); + output[8] = __byte_perm(input[2], input[6], perm); + output[10] = __byte_perm(input[3], input[7], perm); SWAP1(output[0], output[2]); SWAP1(output[8], output[10]); @@ -112,57 +173,87 @@ void from_bitslice_quad(const uint32_t *const __restrict__ input, uint32_t *cons SWAP4(output[0], output[8]); SWAP4(output[2], output[10]); - output[4] = output[0]; - output[6] = output[2]; - output[12] = output[8]; - output[14] = output[10]; - - if (threadIdx.x & 1) + if(threadIdx.x & 1) { + output[14] = __byte_perm(output[10], 0, 0x3232); + output[12] = __byte_perm(output[8], 0, 0x3232); + output[6] = __byte_perm(output[2], 0, 0x3232); + output[4] = __byte_perm(output[0], 0, 0x3232); + output[0] = __byte_perm(output[0], 0, 0x1032); output[2] = __byte_perm(output[2], 0, 0x1032); - output[4] = __byte_perm(output[4], 0, 0x3232); - output[6] = __byte_perm(output[6], 0, 0x3232); output[8] = __byte_perm(output[8], 0, 0x1032); output[10] = __byte_perm(output[10], 0, 0x1032); - output[12] = __byte_perm(output[12], 0, 0x3232); - output[14] = __byte_perm(output[14], 0, 0x3232); + } + else + { + output[4] = output[0]; + output[6] = output[2]; + output[12] = output[8]; + output[14] = output[10]; } output[0] = __byte_perm(output[0], __shfl((int)output[0], (threadIdx.x + 1) & 3, 4), 0x7610); - output[0 + 1] = __shfl((int)output[0], (threadIdx.x + 2) & 3, 4); - output[2] = __byte_perm(output[2], __shfl((int)output[2], (threadIdx.x + 1) & 3, 4), 0x7610); - output[2 + 1] = __shfl((int)output[2], (threadIdx.x + 2) & 3, 4); - output[4] = __byte_perm(output[4], __shfl((int)output[4], (threadIdx.x + 1) & 3, 4), 0x7632); - output[4 + 1] = __shfl((int)output[4], 
(threadIdx.x + 2) & 3, 4); - output[6] = __byte_perm(output[6], __shfl((int)output[6], (threadIdx.x + 1) & 3, 4), 0x7632); - output[6 + 1] = __shfl((int)output[6], (threadIdx.x + 2) & 3, 4); - output[8] = __byte_perm(output[8], __shfl((int)output[8], (threadIdx.x + 1) & 3, 4), 0x7610); - output[8 + 1] = __shfl((int)output[8], (threadIdx.x + 2) & 3, 4); - output[10] = __byte_perm(output[10], __shfl((int)output[10], (threadIdx.x + 1) & 3, 4), 0x7610); - output[10 + 1] = __shfl((int)output[10], (threadIdx.x + 2) & 3, 4); - output[12] = __byte_perm(output[12], __shfl((int)output[12], (threadIdx.x + 1) & 3, 4), 0x7632); - output[12 + 1] = __shfl((int)output[12], (threadIdx.x + 2) & 3, 4); - output[14] = __byte_perm(output[14], __shfl((int)output[14], (threadIdx.x + 1) & 3, 4), 0x7632); + + output[0 + 1] = __shfl((int)output[0], (threadIdx.x + 2) & 3, 4); + output[2 + 1] = __shfl((int)output[2], (threadIdx.x + 2) & 3, 4); + output[4 + 1] = __shfl((int)output[4], (threadIdx.x + 2) & 3, 4); + output[6 + 1] = __shfl((int)output[6], (threadIdx.x + 2) & 3, 4); + output[8 + 1] = __shfl((int)output[8], (threadIdx.x + 2) & 3, 4); + output[10 + 1] = __shfl((int)output[10], (threadIdx.x + 2) & 3, 4); + output[12 + 1] = __shfl((int)output[12], (threadIdx.x + 2) & 3, 4); output[14 + 1] = __shfl((int)output[14], (threadIdx.x + 2) & 3, 4); -/* if (threadIdx.x & 3) +} + +__device__ __forceinline__ +void from_bitslice_quad_final(const uint32_t *const __restrict__ input, uint32_t *const __restrict__ output) +{ + uint32_t t; + const uint32_t perm = 0x7531;//(threadIdx.x & 1) ? 
0x3175 : 0x7531; + + output[0] = __byte_perm(input[0], input[4], perm); + output[2] = __byte_perm(input[1], input[5], perm); + output[8] = __byte_perm(input[2], input[6], perm); + output[10] = __byte_perm(input[3], input[7], perm); + + SWAP1(output[0], output[2]); + SWAP1(output[8], output[10]); + + SWAP2(output[2], output[10]); + + output[6] = __byte_perm(output[2], output[10], 0x5410); + output[10] = __byte_perm(output[2], output[10], 0x7632); + + if(threadIdx.x & 3) { - output[0] = output[0 + 1] = 0; - output[2] = output[2 + 1] = 0; - output[4] = output[4 + 1] = 0; - output[6] = output[6 + 1] = 0; - output[8] = output[8 + 1] = 0; - output[10] = output[10 + 1] = 0; - output[12] = output[12 + 1] = 0; - output[14] = output[14 + 1] = 0; + SWAP4_final(output[6], output[10]); + output[6] = __byte_perm(output[6], 0, 0x3232); } -*/ + else + { + output[2] = output[6]; + + SWAP4(output[2], output[10]); + + if(threadIdx.x & 1) + { + output[6] = __byte_perm(output[2], 0, 0x3232); + } + else + { + output[6] = output[2]; + } + } + + output[6] = __byte_perm(output[6], __shfl((int)output[6], (threadIdx.x + 1) & 3, 4), 0x7632); + output[7] = __shfl((int)output[6], (threadIdx.x + 2) & 3, 4); + } diff --git a/build.sh b/build.sh index 17935f3968..9a15d5c01d 100755 --- a/build.sh +++ b/build.sh @@ -13,4 +13,5 @@ rm -f config.status # CFLAGS="-O2" ./configure ./configure.sh -make -j 4 +make -j4 + diff --git a/cpuminer-config.h b/ccminer-config-win.h similarity index 93% rename from cpuminer-config.h rename to ccminer-config-win.h index 51fca9fe5d..661e032627 100644 --- a/cpuminer-config.h +++ b/ccminer-config-win.h @@ -14,7 +14,7 @@ /* Define to 1 if you have and it should be used (not on Ultrix). */ -#define HAVE_ALLOCA_H 1 +//#define HAVE_ALLOCA_H 1 /* Define to 1 if you have the declaration of `be32dec', and to 0 if you don't. */ @@ -39,7 +39,7 @@ #define HAVE_INTTYPES_H 1 /* Define to 1 if you have the `crypto' library (-lcrypto). 
*/ -#define HAVE_LIBCRYPTO 1 +//#define HAVE_LIBCRYPTO 1 /* Define to 1 if you have a functional curl library. */ #define HAVE_LIBCURL 1 @@ -57,31 +57,31 @@ #define HAVE_STDLIB_H 1 /* Define to 1 if you have the header file. */ -#define HAVE_STRINGS_H 1 +//#define HAVE_STRINGS_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STRING_H 1 /* Define to 1 if you have the header file. */ -#define HAVE_SYSLOG_H 1 +//#define HAVE_SYSLOG_H 1 /* Define to 1 if you have the header file. */ /* #undef HAVE_SYS_ENDIAN_H */ /* Define to 1 if you have the header file. */ -#define HAVE_SYS_PARAM_H 1 +//#define HAVE_SYS_PARAM_H 1 /* Define to 1 if you have the header file. */ #define HAVE_SYS_STAT_H 1 /* Define to 1 if you have the header file. */ -#define HAVE_SYS_SYSCTL_H 1 +//#define HAVE_SYS_SYSCTL_H 1 /* Define to 1 if you have the header file. */ #define HAVE_SYS_TYPES_H 1 /* Define to 1 if you have the header file. */ -#define HAVE_UNISTD_H 1 +//#define HAVE_UNISTD_H 1 /* Defined if libcurl supports AsynchDNS */ #define LIBCURL_FEATURE_ASYNCHDNS 1 @@ -156,7 +156,7 @@ #define PACKAGE_NAME "ccminer" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "ccminer 1.5.31-git(SP-MOD)" +#define PACKAGE_STRING "ccminer 8.12-KlausT" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "ccminer" @@ -165,7 +165,7 @@ #define PACKAGE_URL "" /* Define to the version of this package. */ -#define PACKAGE_VERSION "1.5.31-git(SP-MOD)" +#define PACKAGE_VERSION "8.12-KlausT" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be @@ -179,16 +179,16 @@ #define STDC_HEADERS 1 /* Define to 1 if AVX assembly is available. */ -#define USE_AVX 1 +#define USE_AVX 0 /* Define to 1 if AVX2 assembly is available. */ -#define USE_AVX2 1 +#define USE_AVX2 0 /* Define to 1 if XOP assembly is available. 
*/ -#define USE_XOP 1 +//#define USE_XOP 1 /* Version number of package */ -#define VERSION "1.5.31-git(SP-MOD)" +#define VERSION "8.12-KlausT" /* Define curl_free() as free() if our version of curl lacks curl_free. */ /* #undef curl_free */ diff --git a/ccminer.cpp b/ccminer.cpp index 6445654c25..74e919d3b4 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -1,6 +1,7 @@ /* * Copyright 2010 Jeff Garzik * Copyright 2012-2014 pooler + * Copyright 2014-2015 tpruvot * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -8,18 +9,22 @@ * any later version. See COPYING for more details. */ -#include "cpuminer-config.h" +#ifndef WIN32 +#include "ccminer-config.h" +#else +#include "ccminer-config-win.h" +#endif +#include "cuda_runtime_api.h" -#include -#include -#include -#include -#include +#include +#include +#include +#include #include -#include +#include #include -#include -#include +#include +#include #include #include @@ -27,7 +32,7 @@ #ifdef WIN32 #include -#include +#include #else #include #include @@ -39,6 +44,7 @@ #include #endif #endif +using namespace std; #include "miner.h" @@ -50,15 +56,17 @@ BOOL WINAPI ConsoleHandler(DWORD); #endif #define PROGRAM_NAME "ccminer" -#define LP_SCANTIME 60 -#define HEAVYCOIN_BLKHDR_SZ 84 +#define LP_SCANTIME 25 #define MNR_BLKHDR_SZ 80 +double expectedblocktime(const uint32_t *target); + // from cuda.cpp int cuda_num_devices(); void cuda_devicenames(); void cuda_devicereset(); int cuda_finddevice(char *name); +void cuda_print_devices(); #include "nvml.h" #ifdef USE_WRAPNVML @@ -78,70 +86,43 @@ struct workio_cmd { } u; }; -enum sha_algos { - ALGO_ANIME, - ALGO_BITCOIN, - ALGO_BLAKE, - ALGO_BLAKECOIN, - ALGO_DEEP, - ALGO_DMD_GR, - ALGO_DOOM, - ALGO_FRESH, - ALGO_FUGUE256, /* Fugue256 */ - ALGO_GROESTL, - ALGO_HEAVY, /* Heavycoin hash */ - ALGO_KECCAK, - ALGO_JACKPOT, - ALGO_LUFFA_DOOM, - ALGO_LYRA2, - ALGO_MJOLLNIR, /* Hefty hash */ - 
ALGO_MYR_GR, - ALGO_NIST5, - ALGO_PENTABLAKE, - ALGO_QUARK, - ALGO_QUBIT, - ALGO_S3, - ALGO_SPREADX11, - ALGO_WHC, - ALGO_X11, - ALGO_X13, - ALGO_X14, - ALGO_X15, - ALGO_X17, -}; - static const char *algo_names[] = { - "anime", "bitcoin", "blake", "blakecoin", + "c11", "deep", "dmd-gr", "doom", /* is luffa */ "fresh", "fugue256", "groestl", - "heavy", "keccak", "jackpot", "luffa", - "lyra2", - "mjollnir", + "lyra2v2", "myr-gr", "nist5", "penta", "quark", "qubit", + "sia", + "skein", "s3", "spread", "whirl", + "whirlpoolx", "x11", "x13", "x14", "x15", "x17", + "vanilla", + "neoscrypt" }; +char curl_err_str[CURL_ERROR_SIZE]; +bool opt_verify = true; bool opt_debug = false; bool opt_protocol = false; bool opt_benchmark = false; @@ -150,6 +131,7 @@ bool have_longpoll = false; bool want_stratum = true; bool have_stratum = false; bool allow_gbt = true; +bool allow_mininginfo = true; bool check_dups = false; static bool submit_old = false; bool use_syslog = false; @@ -157,54 +139,63 @@ bool use_colors = true; static bool opt_background = false; bool opt_quiet = false; static int opt_retries = -1; -static int opt_fail_pause = 30; +static int opt_fail_pause = 10; int opt_timeout = 270; -static int opt_scantime = 5; -static json_t *opt_config; +static int opt_scantime = 25; +static json_t *opt_config = nullptr; static const bool opt_time = true; -static enum sha_algos opt_algo = ALGO_X11; +enum sha_algos opt_algo; int opt_n_threads = 0; int opt_affinity = -1; int opt_priority = 0; static double opt_difficulty = 1; // CH +static bool opt_extranonce = true; bool opt_trust_pool = false; -uint16_t opt_vote = 9999; int num_cpus; int active_gpus; -char * device_name[MAX_GPUS]; -int device_map[MAX_GPUS] = { 0, 1, 2, 3, 4, 5, 6, 7,8,9,10,11,12,13,14,15 }; +char * device_name[MAX_GPUS] = { nullptr }; +int device_map[MAX_GPUS] = { 0 }; long device_sm[MAX_GPUS] = { 0 }; -uint32_t gpus_intensity[MAX_GPUS] = { 0 }; +uint32_t gpus_intensity[MAX_GPUS] = {0}; +uint32_t 
device_gpu_clocks[MAX_GPUS] = {0}; +uint32_t device_mem_clocks[MAX_GPUS] = {0}; +uint32_t device_plimit[MAX_GPUS] = {0}; +int8_t device_pstate[MAX_GPUS]; char *rpc_user = NULL; -static char *rpc_url; -static char *rpc_userpass; -static char *rpc_pass; +static char *rpc_url = nullptr; +static char *rpc_userpass = nullptr; +static char *rpc_pass = nullptr; static char *short_url = NULL; -char *opt_cert; -char *opt_proxy; +char *opt_cert = nullptr; +char *opt_proxy = nullptr; long opt_proxy_type; -struct thr_info *thr_info; +struct thr_info *thr_info = nullptr; static int work_thr_id; -struct thr_api *thr_api; +struct thr_api *thr_api = nullptr; int longpoll_thr_id = -1; int stratum_thr_id = -1; int api_thr_id = -1; bool stratum_need_reset = false; struct work_restart *work_restart = NULL; struct stratum_ctx stratum = { 0 }; +bool stop_mining = false; +volatile bool mining_has_stopped[MAX_GPUS]; -pthread_mutex_t applog_lock; -static pthread_mutex_t stats_lock; +pthread_mutex_t applog_lock = PTHREAD_MUTEX_INITIALIZER; +static pthread_mutex_t stats_lock = PTHREAD_MUTEX_INITIALIZER; uint32_t accepted_count = 0L; uint32_t rejected_count = 0L; -static double *thr_hashrates; +static double thr_hashrates[MAX_GPUS]; uint64_t global_hashrate = 0; double global_diff = 0.0; +uint64_t net_hashrate = 0; +uint64_t net_blocks = 0; + int opt_statsavg = 30; -// strdup on char* to allow a common free() if used -static char* opt_syslog_pfx = strdup(PROGRAM_NAME); -char *opt_api_allow = strdup("127.0.0.1"); /* 0.0.0.0 for all ips */ -int opt_api_listen = 4068; /* 0 to disable */ +uint16_t opt_api_listen = 4068; /* 0 to disable */ +bool opt_stratum_stats = true; +static char* opt_syslog_pfx = nullptr; +char *opt_api_allow = nullptr; #ifdef HAVE_GETOPT_LONG #include @@ -220,26 +211,28 @@ struct option { static char const usage[] = "\ Usage: " PROGRAM_NAME " [OPTIONS]\n\ Options:\n\ - -a, --algo=ALGO specify the hash algorithm to use\n\ - anime Animecoin\n\ + -a, --algo=ALGO specify the 
hash algorithm to use\n\ + bitcoin Bitcoin\n\ blake Blake 256 (SFR/NEOS)\n\ blakecoin Fast Blake 256 (8 rounds)\n\ + c11 X11 variant\n\ deep Deepcoin\n\ dmd-gr Diamond-Groestl\n\ fresh Freshcoin (shavite 80)\n\ fugue256 Fuguecoin\n\ groestl Groestlcoin\n\ - heavy Heavycoin\n\ jackpot Jackpot\n\ keccak Keccak-256 (Maxcoin)\n\ luffa Doomcoin\n\ - lyra2 VertCoin\n\ - mjollnir Mjollnircoin\n\ + lyra2v2 VertCoin\n\ myr-gr Myriad-Groestl\n\ + neoscrypt neoscrypt (FeatherCoin)\n\ nist5 NIST5 (TalkCoin)\n\ penta Pentablake hash (5x Blake 512)\n\ quark Quark\n\ qubit Qubit\n\ + sia Siacoin (at pools compatible to siamining.com) \n\ + skein Skein SHA2 (Skeincoin)\n\ s3 S3 (1Coin)\n\ spread Spread\n\ x11 X11 (DarkCoin)\n\ @@ -247,16 +240,18 @@ Options:\n\ x14 X14\n\ x15 X15\n\ x17 X17 (peoplecurrency)\n\ + vanilla Blake 256 8 rounds\n\ + yescrypt yescrypt\n\ whirl Whirlcoin (old whirlpool)\n\ - -d, --devices Comma separated list of CUDA devices to use.\n\ + whirlpoolx Vanillacoin \n\ + -d, --devices Comma separated list of CUDA devices to use. \n\ Device IDs start counting from 0! 
Alternatively takes\n\ string names of your cards like gtx780ti or gt640#2\n\ (matching 2nd gt640 in the PC)\n\ -i --intensity=N GPU intensity 8-31 (default: auto) \n\ Decimals are allowed for fine tuning \n\ - -f, --diff Divide difficulty by this factor (std is 1) \n\ - -v, --vote=VOTE block reward vote (for HeavyCoin)\n\ - -m, --trust-pool trust the max block reward vote (maxvote) sent by the pool\n\ + -f, --diff-factor Divide difficulty by this factor (default 1.0) \n\ + -m, --diff-multiplier Multiply difficulty by this value (default 1.0) \n\ -o, --url=URL URL of mining server\n\ -O, --userpass=U:P username:password pair for mining server\n\ -u, --user=USERNAME username for mining server\n\ @@ -270,45 +265,48 @@ Options:\n\ -T, --timeout=N network timeout, in seconds (default: 270)\n\ -s, --scantime=N upper bound on time spent scanning current work when\n\ long polling is unavailable, in seconds (default: 5)\n\ + -n, --ndevs list cuda devices\n\ -N, --statsavg number of samples used to display hashrate (default: 30)\n\ --no-gbt disable getblocktemplate support (height check in solo)\n\ --no-longpoll disable X-Long-Polling support\n\ --no-stratum disable X-Stratum support\n\ + -e disable extranonce\n\ -q, --quiet disable per-thread hashmeter output\n\ --no-color disable colored output\n\ -D, --debug enable debug output\n\ -P, --protocol-dump verbose dump of protocol-level activities\n\ --cpu-affinity set process affinity to cpu core(s), mask 0x3 for cores 0 and 1\n\ --cpu-priority set process priority (default: 0 idle, 2 normal to 5 highest)\n\ - -b, --api-bind IP/Port for the miner API (default: 127.0.0.1:4068)\n" - -#ifdef HAVE_SYSLOG_H -"\ + -b, --api-bind IP/Port for the miner API (default: 127.0.0.1:4068)\n\ -S, --syslog use system log for output messages\n\ - --syslog-prefix=... allow to change syslog tool name\n" -#endif -#ifndef WIN32 -"\ - -B, --background run the miner in the background\n" -#endif -"\ + --syslog-prefix=... 
allow to change syslog tool name\n\ + -B, --background run the miner in the background\n\ --benchmark run in offline benchmark mode\n\ --cputest debug hashes from cpu algorithms\n\ + --no-cpu-verify don't verify the found results\n\ -c, --config=FILE load a JSON-format configuration file\n\ -V, --version display version information and exit\n\ - -h, --help display this help text and exit\n\ -"; + -h, --help display this help text and exit\n" +#if defined(USE_WRAPNVML) && (defined(__linux) || defined(_WIN64)) /* via nvml */ +"\ + --mem-clock=N Set the gpu memory max clock (346.72+ driver)\n\ + --gpu-clock=N Set the gpu engine max clock (346.72+ driver)\n\ + --pstate=N Set the gpu power state (352.21+ driver)\n\ + --plimit=N Set the gpu power limit(352.21 + driver)\n" +#endif +""; static char const short_options[] = #ifndef WIN32 - "B" +"B" #endif #ifdef HAVE_SYSLOG_H - "S" +"S" #endif - "a:c:i:Dhp:Px:qr:R:s:t:T:o:u:O:Vd:f:mv:N:b:"; +"a:c:i:Dhp:Px:nqr:R:s:t:T:o:u:O:Vd:f:m:N:b:e"; -static struct option const options[] = { +static struct option const options[] = +{ { "algo", 1, NULL, 'a' }, { "api-bind", 1, NULL, 'b' }, #ifndef WIN32 @@ -316,6 +314,7 @@ static struct option const options[] = { #endif { "benchmark", 0, NULL, 1005 }, { "cert", 1, NULL, 1001 }, + { "no-cpu-verify", 0, NULL, 1022 }, { "config", 1, NULL, 'c' }, { "cputest", 0, NULL, 1006 }, { "cpu-affinity", 1, NULL, 1020 }, @@ -323,6 +322,7 @@ static struct option const options[] = { { "debug", 0, NULL, 'D' }, { "help", 0, NULL, 'h' }, { "intensity", 1, NULL, 'i' }, + { "ndevs", 0, NULL, 'n' }, { "no-color", 0, NULL, 1002 }, { "no-gbt", 0, NULL, 1011 }, { "no-longpoll", 0, NULL, 1003 }, @@ -340,65 +340,84 @@ static struct option const options[] = { { "syslog-prefix", 1, NULL, 1008 }, #endif { "threads", 1, NULL, 't' }, - { "vote", 1, NULL, 'v' }, - { "trust-pool", 0, NULL, 'm' }, + { "Disable extranounce support", 1, NULL, 'e' }, { "timeout", 1, NULL, 'T' }, { "url", 1, NULL, 'o' }, { "user", 1, NULL, 'u' 
}, { "userpass", 1, NULL, 'O' }, { "version", 0, NULL, 'V' }, { "devices", 1, NULL, 'd' }, - { "diff", 1, NULL, 'f' }, - { 0, 0, 0, 0 } + { "diff-multiplier", 1, NULL, 'm' }, + { "diff-factor", 1, NULL, 'f' }, + { "diff", 1, NULL, 'f' }, // compat + {"gpu-clock", 1, NULL, 1070}, + {"mem-clock", 1, NULL, 1071}, + {"pstate", 1, NULL, 1072}, + {"plimit", 1, NULL, 1073}, + {0, 0, 0, 0} }; -static struct work _ALIGN(64) g_work; -static time_t g_work_time; -static pthread_mutex_t g_work_lock; +struct work _ALIGN(64) g_work; +time_t g_work_time; +static pthread_mutex_t g_work_lock = PTHREAD_MUTEX_INITIALIZER; #ifdef __linux /* Linux specific policy and affinity management */ #include -static inline void drop_policy(void) { +static inline void drop_policy(void) +{ struct sched_param param; param.sched_priority = 0; #ifdef SCHED_IDLE - if (unlikely(sched_setscheduler(0, SCHED_IDLE, ¶m) == -1)) + if(unlikely(sched_setscheduler(0, SCHED_IDLE, ¶m) == -1)) #endif #ifdef SCHED_BATCH sched_setscheduler(0, SCHED_BATCH, ¶m); #endif } -static void affine_to_cpu_mask(int id, uint8_t mask) { +static void affine_to_cpu_mask(int id, uint8_t mask) +{ cpu_set_t set; CPU_ZERO(&set); - for (uint8_t i = 0; i < num_cpus; i++) { + for(uint8_t i = 0; i < num_cpus; i++) + { // cpu mask - if (mask & (1< -static inline void drop_policy(void) { } -static void affine_to_cpu_mask(int id, uint8_t mask) { +static inline void drop_policy(void) +{} +static void affine_to_cpu_mask(int id, uint8_t mask) +{ cpuset_t set; CPU_ZERO(&set); - for (uint8_t i = 0; i < num_cpus; i++) { - if (mask & (1<data), target_size = sizeof(work->target); - int adata_sz = ARRAY_SIZE(work->data), atarget_sz = ARRAY_SIZE(work->target); + int target_size; + int midstate_size = sizeof(work->midstate); + int atarget_sz = ARRAY_SIZE(work->target); int i; - if (unlikely(!jobj_binary(val, "data", work->data, data_size))) { - applog(LOG_ERR, "JSON inval data"); + size_t data_size = jobj_binary(val, "data", work->data, 
sizeof(work->data)); + + if(opt_algo != ALGO_NEO && data_size != 128) + { + applog(LOG_ERR, "JSON invalid data"); return false; } - if (unlikely(!jobj_binary(val, "target", work->target, target_size))) { - applog(LOG_ERR, "JSON inval target"); + work->datasize = data_size; + int adata_sz = (int)data_size / 4; + + target_size = (int)jobj_binary(val, "target", work->target, sizeof(work->target)); + if(target_size != sizeof(work->target)) + { + applog(LOG_ERR, "JSON invalid target", target_size); return false; } - if (opt_algo == ALGO_HEAVY) { - if (unlikely(!jobj_binary(val, "maxvote", &work->maxvote, sizeof(work->maxvote)))) { - work->maxvote = 2048; - } - } else work->maxvote = 0; - - for (i = 0; i < adata_sz; i++) + for(i = 0; i < adata_sz; i++) work->data[i] = le32dec(work->data + i); - for (i = 0; i < atarget_sz; i++) + for(i = 0; i < atarget_sz; i++) work->target[i] = le32dec(work->target + i); json_t *jr = json_object_get(val, "noncerange"); - if (jr) { + if(jr) + { const char * hexstr = json_string_value(jr); - if (likely(hexstr)) { + if(likely(hexstr)) + { // never seen yet... 
hex2bin((uchar*)work->noncerange.u64, hexstr, 8); applog(LOG_DEBUG, "received noncerange: %08x-%08x", - work->noncerange.u32[0], work->noncerange.u32[1]); + work->noncerange.u32[0], work->noncerange.u32[1]); } } @@ -513,9 +541,9 @@ static bool work_decode(const json_t *val, struct work *work) } /** - * Calculate the work difficulty as double - * Not sure it works with pools - */ +* Calculate the work difficulty as double +* Not sure it works with pools +*/ static void calc_diff(struct work *work, int known) { // sample for diff 32.53 : 00000007de5f0000 @@ -526,52 +554,52 @@ static void calc_diff(struct work *work, int known) swab256(rtarget, work->target); data64 = (uint64_t *)(rtarget + 3); /* todo: index (3) can be tuned here */ - if (opt_algo == ALGO_HEAVY) { - data64 = (uint64_t *)(rtarget + 2); - } - d64 = swab64(*data64); - if (unlikely(!d64)) + if(unlikely(!d64)) d64 = 1; work->difficulty = (double)diffone / d64; - if (opt_difficulty > 0.) { + if(opt_difficulty > 0.) + { work->difficulty /= opt_difficulty; } } static int share_result(int result, const char *reason) { - char s[345]; + char s[32] = { 0 }; double hashrate = 0.; pthread_mutex_lock(&stats_lock); - for (int i = 0; i < opt_n_threads; i++) { + for(int i = 0; i < opt_n_threads; i++) + { hashrate += stats_get_speed(i, thr_hashrates[i]); } - result ? accepted_count++ : rejected_count++; pthread_mutex_unlock(&stats_lock); global_hashrate = llround(hashrate); - sprintf(s, hashrate >= 1e6 ? "%.0f" : "%.2f", 1e-3 * hashrate); - applog(LOG_NOTICE, "accepted: %lu/%lu (%.2f%%), %s khash/s %s", - accepted_count, - accepted_count + rejected_count, - 100. * accepted_count / (accepted_count + rejected_count), - s, - use_colors ? - (result ? CL_GRN "yay!!!" : CL_RED "booooo") - : (result ? "(yay!!!)" : "(booooo)")); - - if (reason) { + format_hashrate(hashrate, s); + applog(LOG_NOTICE, "accepted: %lu/%lu (%.2f%%), %s %s", + accepted_count, + accepted_count + rejected_count, + 100. 
* accepted_count / (accepted_count + rejected_count), + s, + use_colors ? + (result ? CL_GRN "yay!!!" : CL_RED "booooo") + : (result ? "(yay!!!)" : "(booooo)")); + + if(reason) + { applog(LOG_WARNING, "reject reason: %s", reason); - return 0; - if (strncmp(reason, "Duplicate share", 15) == 0 && !check_dups) { + if(strncmp(reason, "Duplicate share", 15) == 0 && !check_dups) + { applog(LOG_WARNING, "enabling duplicates check feature"); check_dups = true; } + return 0; + } return 1; } @@ -582,113 +610,133 @@ static bool submit_upstream_work(CURL *curl, struct work *work) bool stale_work = false; char s[384]; + /* discard if a newer block was received */ + /* + stale_work = work->height && work->height < g_work.height; + if (have_stratum && !stale_work) { + pthread_mutex_lock(&g_work_lock); + if (strlen(work->job_id + 8)) + stale_work = strcmp(work->job_id + 8, g_work.job_id + 8); + pthread_mutex_unlock(&g_work_lock); + } + */ + if(!have_stratum && !stale_work && allow_gbt) + { + struct work wheight = { 0 }; + if(get_blocktemplate(curl, &wheight)) + { + if(work->height && work->height < wheight.height) + { + if(opt_debug) + applog(LOG_WARNING, "block %u was already solved", work->height, wheight.height); + return true; + } + } + } + + if(stale_work) + { + if(opt_debug) + applog(LOG_WARNING, "stale work detected, discarding"); + return true; + } calc_diff(work, 0); - if (have_stratum) { + if(have_stratum) + { uint32_t sent = 0; uint32_t ntime, nonce; - uint16_t nvote; - char *ntimestr, *noncestr, *xnonce2str, *nvotestr; + char *ntimestr, *noncestr, *xnonce2str; - le32enc(&ntime, work->data[17]); - le32enc(&nonce, work->data[19]); + if(opt_algo != ALGO_SIA) + { + le32enc(&ntime, work->data[17]); + le32enc(&nonce, work->data[19]); + noncestr = bin2hex((const uchar*)(&nonce), 4); + ntimestr = bin2hex((const uchar*)(&ntime), 4); + } + else + { + le32enc(&ntime, work->data[10]); + uint64_t ntime64 = ntime; + le32enc(&nonce, work->data[8]); + uint64_t nonce64 = nonce; + 
le32enc(&nonce, work->data[9]); + nonce64 += (uint64_t)nonce << 32; + noncestr = bin2hex((const uchar*)(&nonce64), 8); + ntimestr = bin2hex((const uchar*)(&ntime64), 8); + } - noncestr = bin2hex((const uchar*)(&nonce), 4); - if (check_dups) + if(check_dups) sent = hashlog_already_submittted(work->job_id, nonce); - if (sent > 0) { - sent = (uint32_t) time(NULL) - sent; - if (!opt_quiet) { + if(sent > 0) + { + sent = (uint32_t)time(NULL) - sent; + if(!opt_quiet) + { applog(LOG_WARNING, "nonce %s was already sent %u seconds ago", noncestr, sent); hashlog_dump_job(work->job_id); } free(noncestr); // prevent useless computing on some pools - stratum_need_reset = true; - for (int i = 0; i < opt_n_threads; i++) - work_restart[i].restart = 1; + g_work_time = 0; + restart_threads(); return true; } - ntimestr = bin2hex((const uchar*)(&ntime), 4); xnonce2str = bin2hex(work->xnonce2, work->xnonce2_len); - if (opt_algo == ALGO_HEAVY) { - be16enc(&nvote, *((uint16_t*)&work->data[20])); - nvotestr = bin2hex((const uchar*)(&nvote), 2); - sprintf(s, - "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", - rpc_user, work->job_id + 8, xnonce2str, ntimestr, noncestr, nvotestr); - free(nvotestr); - } else { - sprintf(s, + sprintf(s, "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", rpc_user, work->job_id + 8, xnonce2str, ntimestr, noncestr); - } free(xnonce2str); free(ntimestr); free(noncestr); gettimeofday(&stratum.tv_submit, NULL); - -/* pthread_mutex_lock(&g_work_lock); - stale_work = work->height != g_work.height; - pthread_mutex_unlock(&g_work_lock); - if (stale_work) + if(unlikely(!stratum_send_line(&stratum, s))) { - applog(LOG_WARNING, "stale work detected, discarding"); - return true; - } - */ - if (unlikely(!stratum_send_line(&stratum, s))) { applog(LOG_ERR, "submit_upstream_work stratum_send_line failed"); return false; } - if (check_dups) + if(check_dups) 
hashlog_remember_submit(work, nonce); - } else + } + else { - /* - stale_work = work->height != g_work.height; - if (stale_work) - { - applog(LOG_WARNING, "stale work detected, discarding"); - return true; - } - */ /* build hex string */ char *str = NULL; - - if (opt_algo != ALGO_HEAVY && opt_algo != ALGO_MJOLLNIR) { - for (int i = 0; i < ARRAY_SIZE(work->data); i++) - le32enc(work->data + i, work->data[i]); - } - str = bin2hex((uchar*)work->data, sizeof(work->data)); - if (unlikely(!str)) { + for(int i = 0; i < (work->datasize >> 2); i++) + le32enc(work->data + i, work->data[i]); + str = bin2hex((uchar*)work->data, work->datasize); + if(unlikely(!str)) + { applog(LOG_ERR, "submit_upstream_work OOM"); return false; } /* build JSON-RPC request */ sprintf(s, - "{\"method\": \"getwork\", \"params\": [\"%s\"], \"id\":4}\r\n", - str); + "{\"method\": \"getwork\", \"params\": [\"%s\"], \"id\":4}\r\n", + str); /* issue JSON-RPC request */ val = json_rpc_call(curl, rpc_url, rpc_userpass, s, false, false, NULL); - if (unlikely(!val)) { + if(unlikely(!val)) + { applog(LOG_ERR, "submit_upstream_work json_rpc_call failed"); return false; } res = json_object_get(val, "result"); reason = json_object_get(val, "reject-reason"); - if (!share_result(json_is_true(res), reason ? json_string_value(reason) : NULL)) { - if (check_dups) + if(!share_result(json_is_true(res), reason ? 
json_string_value(reason) : NULL)) + { + if(check_dups) hashlog_purge_job(work->job_id); } @@ -704,21 +752,49 @@ static bool submit_upstream_work(CURL *curl, struct work *work) static bool gbt_work_decode(const json_t *val, struct work *work) { json_t *err = json_object_get(val, "error"); - if (err && !json_is_null(err)) { + if(err && !json_is_null(err)) + { allow_gbt = false; - applog(LOG_INFO, "GBT not supported, bloc height unavailable"); + applog(LOG_INFO, "GBT not supported, block height unavailable"); return false; } - if (!work->height) { + if(!work->height) + { // complete missing data from getwork json_t *key = json_object_get(val, "height"); - if (key && json_is_integer(key)) { - work->height = (uint32_t) json_integer_value(key); - if (!opt_quiet && work->height > g_work.height) { - applog(LOG_BLUE, "%s %s block %d", short_url, - algo_names[opt_algo], work->height); + if(key && json_is_integer(key)) + { + work->height = (uint32_t)json_integer_value(key); + if(!opt_quiet && work->height > g_work.height) + { + if(!have_stratum && allow_mininginfo && global_diff > 0) + { + char netinfo[64] = { 0 }; + char srate[32] = { 0 }; + sprintf(netinfo, "diff %.2f", global_diff); + if(net_hashrate) + { + format_hashrate((double)net_hashrate, srate); + strcat(netinfo, ", net "); + strcat(netinfo, srate); + } + applog(LOG_BLUE, "%s block %d, %s", + algo_names[opt_algo], work->height, netinfo); + } + else + { + applog(LOG_BLUE, "%s %s block %d", short_url, + algo_names[opt_algo], work->height); + } g_work.height = work->height; + if(!have_stratum) + { + double x = expectedblocktime(work->target); + if(x != 0.0) + applog(LOG_BLUE, "50%% chance to find a block in about %.2f days", x); + } + } } } @@ -728,20 +804,29 @@ static bool gbt_work_decode(const json_t *val, struct work *work) #define GBT_CAPABILITIES "[\"coinbasetxn\", \"coinbasevalue\", \"longpoll\", \"workid\"]" static const char *gbt_req = - "{\"method\": \"getblocktemplate\", \"params\": [" - // 
"{\"capabilities\": " GBT_CAPABILITIES "}" - "], \"id\":0}\r\n"; +"{\"method\": \"getblocktemplate\", \"params\": [" +// "{\"capabilities\": " GBT_CAPABILITIES "}" +"], \"id\":9}\r\n"; static bool get_blocktemplate(CURL *curl, struct work *work) { - if (!allow_gbt) + if(!allow_gbt) return false; + int curl_err = 0; json_t *val = json_rpc_call(curl, rpc_url, rpc_userpass, gbt_req, - want_longpoll, false, NULL); + want_longpoll, have_longpoll, &curl_err); - if (!val) + if(!val && curl_err == -1) + { + // when getblocktemplate is not supported, disable it + allow_gbt = false; + if(!opt_quiet) + { + applog(LOG_BLUE, "gbt not supported, block height notices disabled"); + } return false; + } bool rc = gbt_work_decode(json_object_get(val, "result"), work); @@ -750,8 +835,84 @@ static bool get_blocktemplate(CURL *curl, struct work *work) return rc; } +// good alternative for wallet mining, difficulty and net hashrate +static const char *info_req = +"{\"method\": \"getmininginfo\", \"params\": [], \"id\":8}\r\n"; + +static bool get_mininginfo(CURL *curl, struct work *work) +{ + if(have_stratum || !allow_mininginfo) + return false; + + int curl_err = 0; + json_t *val = json_rpc_call(curl, rpc_url, rpc_userpass, info_req, + want_longpoll, have_longpoll, &curl_err); + + if(!val && curl_err == -1) + { + allow_mininginfo = false; + if(opt_debug) + { + applog(LOG_DEBUG, "getmininginfo not supported"); + } + return false; + } + else + { + json_t *res = json_object_get(val, "result"); + // "blocks": 491493 (= current work height - 1) + // "difficulty": 0.99607860999999998 + // "networkhashps": 56475980 + if(res) + { + json_t *key = json_object_get(res, "powdifficulty"); + if(key && json_is_real(key)) + { + global_diff = json_real_value(key); + } + key = json_object_get(res, "difficulty"); + if(key && json_is_real(key)) + { + global_diff = json_real_value(key); + } + key = json_object_get(res, "networkhashps"); + if(key && json_is_integer(key)) + { + net_hashrate = 
json_integer_value(key); + } + key = json_object_get(res, "blocks"); + if(key && json_is_integer(key)) + { + net_blocks = json_integer_value(key); + } + } + } + json_decref(val); + return true; +} + +// time (in days) for a 50% chance to find a block +double expectedblocktime(const uint32_t *target) +{ + double x = 0.0; + if(global_hashrate == 0) + return 0; + else + { + for(int i = 0; i < 8; i++) + { + x *= 4294967296.0; + x += target[7 - i]; + } + if(x != 0.0) + return 115792089237316195423570985008687907853269984665640564039457584007913129639935.0 / x / (double)global_hashrate / 86400.0; + else + return 0.0; + } +} + static const char *rpc_req = - "{\"method\": \"getwork\", \"params\": [], \"id\":0}\r\n"; +"{\"method\": \"getwork\", \"params\": [], \"id\":0}\r\n"; static bool get_upstream_work(CURL *curl, struct work *work) { @@ -761,29 +922,32 @@ static bool get_upstream_work(CURL *curl, struct work *work) gettimeofday(&tv_start, NULL); val = json_rpc_call(curl, rpc_url, rpc_userpass, rpc_req, - want_longpoll, false, NULL); + want_longpoll, false, NULL); gettimeofday(&tv_end, NULL); - if (have_stratum) { - if (val) + if(have_stratum) + { + if(val) json_decref(val); return true; } - if (!val) + if(!val) return false; rc = work_decode(json_object_get(val, "result"), work); - if (opt_protocol && rc) { + if(opt_protocol && rc) + { timeval_subtract(&diff, &tv_end, &tv_start); /* show time because curl can be slower against versions/config */ applog(LOG_DEBUG, "got new work in %.2f ms", - (1000.0 * diff.tv_sec) + (0.001 * diff.tv_usec)); + (1000.0 * diff.tv_sec) + (0.001 * diff.tv_usec)); } json_decref(val); + get_mininginfo(curl, work); get_blocktemplate(curl, work); return rc; @@ -791,10 +955,11 @@ static bool get_upstream_work(CURL *curl, struct work *work) static void workio_cmd_free(struct workio_cmd *wc) { - if (!wc) + if(!wc) return; - switch (wc->cmd) { + switch(wc->cmd) + { case WC_SUBMIT_WORK: aligned_free(wc->u.work); break; @@ -812,12 +977,14 @@ static 
bool workio_get_work(struct workio_cmd *wc, CURL *curl) int failures = 0; ret_work = (struct work*)aligned_calloc(sizeof(*ret_work)); - if (!ret_work) + if(!ret_work) return false; /* obtain new work from bitcoin via JSON-RPC */ - while (!get_upstream_work(curl, ret_work)) { - if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) { + while(!get_upstream_work(curl, ret_work)) + { + if(unlikely((opt_retries >= 0) && (++failures > opt_retries))) + { applog(LOG_ERR, "json_rpc_call failed, terminating workio thread"); aligned_free(ret_work); return false; @@ -825,12 +992,12 @@ static bool workio_get_work(struct workio_cmd *wc, CURL *curl) /* pause, then restart work-request loop */ applog(LOG_ERR, "json_rpc_call failed, retry after %d seconds", - opt_fail_pause); + opt_fail_pause); sleep(opt_fail_pause); } /* send work to requesting thread */ - if (!tq_push(wc->thr->q, ret_work)) + if(!tq_push(wc->thr->q, ret_work)) aligned_free(ret_work); return true; @@ -841,14 +1008,16 @@ static bool workio_submit_work(struct workio_cmd *wc, CURL *curl) int failures = 0; /* submit solution to bitcoin via JSON-RPC */ - while (!submit_upstream_work(curl, wc->u.work)) { - if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) { + while(!submit_upstream_work(curl, wc->u.work)) + { + if(unlikely((opt_retries >= 0) && (++failures > opt_retries))) + { applog(LOG_ERR, "...terminating workio thread"); return false; } /* pause, then restart work-request loop */ - if (!opt_benchmark) + if(!opt_benchmark) applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); sleep(opt_fail_pause); @@ -864,23 +1033,27 @@ static void *workio_thread(void *userdata) bool ok = true; curl = curl_easy_init(); - if (unlikely(!curl)) { + if(unlikely(!curl)) + { applog(LOG_ERR, "CURL initialization failed"); return NULL; } - while (ok) { + while(ok) + { struct workio_cmd *wc; /* wait for workio_cmd sent to us, on our queue */ wc = (struct workio_cmd *)tq_pop(mythr->q, NULL); - if (!wc) { + 
if(!wc) + { ok = false; break; } /* process workio_cmd */ - switch (wc->cmd) { + switch(wc->cmd) + { case WC_GET_WORK: ok = workio_get_work(wc, curl); break; @@ -907,33 +1080,54 @@ static bool get_work(struct thr_info *thr, struct work *work) struct workio_cmd *wc; struct work *work_heap; - if (opt_benchmark) { - memset(work->data, 0x55, 76); - //work->data[17] = swab32((uint32_t)time(NULL)); - memset(work->data + 19, 0x00, 52); - work->data[20] = 0x80000000; - work->data[31] = 0x00000280; - memset(work->target, 0x00, sizeof(work->target)); + if(opt_benchmark) + { + if(opt_algo != ALGO_SIA) + { + memset(work->data, 0x55, 76); + memset(work->data + 19, 0x00, 52); + work->data[1] = (uint32_t)((double)rand() / (1ULL + RAND_MAX) * 0xffffffffu); + work->data[20] = 0x80000000; + work->data[31] = 0x00000280; + memset(work->target, 0x00, sizeof(work->target)); + work->datasize = 128; + } + else + { + memset(work->data, 0, 4); + work->data[1] = (uint32_t)((double)rand() / (1ULL + RAND_MAX) * 0xffffffffu); + memset(work->data+2, 0x55, 24); + memset(work->data + 8, 0, 8); + memset(work->data + 10, 0, 4); + memset(work->data + 11, 0x55, 4); + memset(work->data + 12, 0x55, 32); + memset(work->target, 0x00, sizeof(work->target)); + work->datasize = 128; + } return true; } /* fill out work request message */ wc = (struct workio_cmd *)calloc(1, sizeof(*wc)); - if (!wc) - return false; + if(wc == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } wc->cmd = WC_GET_WORK; wc->thr = thr; /* send work request to workio thread */ - if (!tq_push(thr_info[work_thr_id].q, wc)) { + if(!tq_push(thr_info[work_thr_id].q, wc)) + { workio_cmd_free(wc); return false; } /* wait for response, a unit of work */ work_heap = (struct work *)tq_pop(thr->q, NULL); - if (!work_heap) + if(!work_heap) return false; /* copy returned work into storage provided by caller */ @@ -948,19 +1142,25 @@ static bool submit_work(struct thr_info *thr, const struct work *work_in) struct workio_cmd *wc; 
/* fill out work request message */ wc = (struct workio_cmd *)calloc(1, sizeof(*wc)); - if (!wc) - return false; + if(wc == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } wc->u.work = (struct work *)aligned_calloc(sizeof(*work_in)); - if (!wc->u.work) - goto err_out; + if(wc->u.work == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } wc->cmd = WC_SUBMIT_WORK; wc->thr = thr; memcpy(wc->u.work, work_in, sizeof(*work_in)); /* send solution to workio thread */ - if (!tq_push(thr_info[work_thr_id].q, wc)) + if(!tq_push(thr_info[work_thr_id].q, wc)) goto err_out; return true; @@ -970,33 +1170,32 @@ static bool submit_work(struct thr_info *thr, const struct work *work_in) return false; } -static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) +static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) { - uchar merkle_root[64]; + extern void siahash(const void *data, unsigned int len, void *hash); + uchar merkle_root[1024]; int i; - if (!sctx->job.job_id) { + if(!sctx->job.job_id) + { // applog(LOG_WARNING, "stratum_gen_work: job not yet retrieved"); - return; + return false; } pthread_mutex_lock(&sctx->work_lock); // store the job ntime as high part of jobid snprintf(work->job_id, sizeof(work->job_id), "%07x %s", - be32dec(sctx->job.ntime) & 0xfffffff, sctx->job.job_id); + be32dec(sctx->job.ntime) & 0xfffffff, sctx->job.job_id); work->xnonce2_len = sctx->xnonce2_size; memcpy(work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size); - // also store the bloc number + // also store the block number work->height = sctx->job.height; /* Generate merkle root */ - switch (opt_algo) { - case ALGO_HEAVY: - case ALGO_MJOLLNIR: - heavycoin_hash(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); - break; + switch(opt_algo) + { case ALGO_FUGUE256: case ALGO_GROESTL: case ALGO_KECCAK: @@ -1004,87 +1203,112 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) case ALGO_WHC: 
SHA256((uchar*)sctx->job.coinbase, sctx->job.coinbase_size, (uchar*)merkle_root); break; + case ALGO_SIA: + { + merkle_root[0] = (uchar)0; + memcpy(merkle_root + 1, sctx->job.coinbase, sctx->job.coinbase_size); + siahash(merkle_root, (unsigned int)sctx->job.coinbase_size + 1, merkle_root + 33); + break; + } default: sha256d(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); } + if(opt_algo == ALGO_SIA) + merkle_root[0] = (uchar)1; - for (i = 0; i < sctx->job.merkle_count; i++) { - memcpy(merkle_root + 32, sctx->job.merkle[i], 32); - if (opt_algo == ALGO_HEAVY || opt_algo == ALGO_MJOLLNIR) - heavycoin_hash(merkle_root, merkle_root, 64); + for(i = 0; i < sctx->job.merkle_count; i++) + { + if(opt_algo == ALGO_SIA) + { + memcpy(merkle_root + 1, sctx->job.merkle[i], 32); + siahash(merkle_root, 65, merkle_root + 33); + } else + { + memcpy(merkle_root + 32, sctx->job.merkle[i], 32); sha256d(merkle_root, merkle_root, 64); + } } - + /* Increment extranonce2 */ - for (i = 0; i < (int)sctx->xnonce2_size && !++sctx->job.xnonce2[i]; i++); + if(opt_extranonce) + { + for(i = 0; i < (int)sctx->xnonce2_size && !++sctx->job.xnonce2[i]; i++); + { + sctx->job.xnonce2[i]++; + } + } + static uint32_t highnonce = 0; + if(opt_algo == ALGO_SIA) + highnonce++; /* Assemble block header */ memset(work->data, 0, sizeof(work->data)); - work->data[0] = le32dec(sctx->job.version); - for (i = 0; i < 8; i++) - work->data[1 + i] = le32dec((uint32_t *)sctx->job.prevhash + i); - for (i = 0; i < 8; i++) - work->data[9 + i] = be32dec((uint32_t *)merkle_root + i); - work->data[17] = le32dec(sctx->job.ntime); - work->data[18] = le32dec(sctx->job.nbits); - if (opt_algo == ALGO_MJOLLNIR || opt_algo == ALGO_HEAVY) + if(opt_algo != ALGO_SIA) { - for (i = 0; i < 20; i++) - work->data[i] = be32dec((uint32_t *)&work->data[i]); + work->data[0] = le32dec(sctx->job.version); + for(i = 0; i < 8; i++) + work->data[1 + i] = le32dec((uint32_t *)sctx->job.prevhash + i); + for(i = 0; i < 8; i++) + 
work->data[9 + i] = be32dec((uint32_t *)merkle_root + i); + work->data[17] = le32dec(sctx->job.ntime); + work->data[18] = le32dec(sctx->job.nbits); + work->data[20] = 0x80000000; + work->data[31] = 0x00000280; } - - work->data[20] = 0x80000000; - work->data[31] = (opt_algo == ALGO_MJOLLNIR) ? 0x000002A0 : 0x00000280; - - // HeavyCoin (vote / reward) - if (opt_algo == ALGO_HEAVY) { - work->maxvote = 2048; - uint16_t *ext = (uint16_t*)(&work->data[20]); - ext[0] = opt_vote; - ext[1] = be16dec(sctx->job.nreward); - // applog(LOG_DEBUG, "DEBUG: vote=%hx reward=%hx", ext[0], ext[1]); + else + { + for(i = 0; i < 8; i++) + work->data[i] = le32dec((uint32_t *)sctx->job.prevhash + i); + work->data[8] = 0; // nonce + work->data[9] = highnonce; + work->data[10] = le32dec(sctx->job.ntime); + work->data[11] = 0; + for(i = 0; i < 8; i++) + work->data[12 + i] = le32dec((uint32_t *)(merkle_root + 33) + i); } pthread_mutex_unlock(&sctx->work_lock); - - if (opt_debug) { - char *tm = atime2str(swab32(work->data[17]) - sctx->srvtime_diff); + if(opt_debug) + { + char *tm; + if(opt_algo != ALGO_SIA) + tm = atime2str(swab32(work->data[17]) - sctx->srvtime_diff); + else + tm = atime2str(work->data[10] - sctx->srvtime_diff); char *xnonce2str = bin2hex(work->xnonce2, sctx->xnonce2_size); applog(LOG_DEBUG, "DEBUG: job_id=%s xnonce2=%s time=%s", - work->job_id, xnonce2str, tm); + work->job_id, xnonce2str, tm); free(tm); free(xnonce2str); } - switch (opt_algo) { + switch(opt_algo) + { case ALGO_JACKPOT: + case ALGO_NEO: diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty)); break; case ALGO_DMD_GR: case ALGO_FRESH: case ALGO_FUGUE256: case ALGO_GROESTL: - diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty)); - break; case ALGO_KECCAK: + case ALGO_LYRA2v2: diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty)); break; - case ALGO_LYRA2: - case ALGO_QUBIT: - diff_to_target(work->target, sctx->job.diff / (128.0 * opt_difficulty)); - break; 
default: diff_to_target(work->target, sctx->job.diff / opt_difficulty); } + return true; } -static void restart_threads(void) +void restart_threads(void) { - if (opt_debug && !opt_quiet) - applog(LOG_DEBUG,"%s", __FUNCTION__); + if(opt_debug && !opt_quiet) + applog(LOG_DEBUG, "%s", __FUNCTION__); - for (int i = 0; i < opt_n_threads; i++) + for(int i = 0; i < opt_n_threads; i++) work_restart[i].restart = 1; } @@ -1095,110 +1319,117 @@ static void *miner_thread(void *userdata) struct work work; uint64_t loopcnt = 0; uint32_t max_nonce; - uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - (thr_id + 1); - bool work_done = false; + uint32_t end_nonce = UINT32_MAX / opt_n_threads * (thr_id + 1) - (thr_id + 1); bool extrajob = false; char s[16]; int rc = 0; memset(&work, 0, sizeof(work)); // prevent work from being used uninitialized - /* Set worker threads to nice 19 and then preferentially to SCHED_IDLE - * and if that fails, then SCHED_BATCH. No need for this to be an - * error if it fails */ - if (!opt_benchmark && opt_priority == 0) { - setpriority(PRIO_PROCESS, 0, 18); - drop_policy(); - } else { - int prio = 0; + if(opt_priority > 0) + { + int prio = 2; // default to normal #ifndef WIN32 - prio = 18; + prio = 0; // note: different behavior on linux (-19 to 19) - switch (opt_priority) { - case 1: - prio = 5; - break; - case 2: - prio = 0; - break; - case 3: - prio = -5; - break; - case 4: - prio = -10; - break; - case 5: - prio = -15; + switch(opt_priority) + { + case 0: + prio = 15; + break; + case 1: + prio = 5; + break; + case 2: + prio = 0; // normal process + break; + case 3: + prio = -1; // above + break; + case 4: + prio = -10; + break; + case 5: + prio = -15; } - applog(LOG_DEBUG, "Thread %d priority %d (set to %d)", thr_id, - opt_priority, prio); + if(opt_debug) + applog(LOG_DEBUG, "Thread %d priority %d (nice %d)", + thr_id, opt_priority, prio); #endif - int ret = setpriority(PRIO_PROCESS, 0, prio); - if (opt_priority == 0) { - 
drop_policy(); - } + setpriority(PRIO_PROCESS, 0, prio); + drop_policy(); } + /* Cpu thread affinity */ - if (num_cpus > 1) { - if (opt_affinity == -1 && opt_n_threads > 1) { - if (!opt_quiet) + if(num_cpus > 1) + { + if(opt_affinity == -1) + { + if(opt_debug) applog(LOG_DEBUG, "Binding thread %d to cpu %d (mask %x)", thr_id, - thr_id % num_cpus, (1 << (thr_id % num_cpus))); - affine_to_cpu_mask(thr_id, 1 << (thr_id % num_cpus)); - } else if (opt_affinity != -1) { - if (!opt_quiet) - applog(LOG_DEBUG, "Binding thread %d to cpu mask %x", thr_id, - opt_affinity); + thr_id%num_cpus, (1 << (thr_id))); + affine_to_cpu_mask(thr_id, 1 << (thr_id)); + } + else if(opt_affinity != -1) + { + if(opt_debug) + applog(LOG_DEBUG, "Binding thread %d to gpu mask %x", thr_id, + opt_affinity); affine_to_cpu_mask(thr_id, opt_affinity); } } - while (1) + while(1) { - if (opt_benchmark) + // &work.data[19] + int wcmplen; + switch(opt_algo) { - work.data[19] = work.data[19] & 0xfffffffU; //reset Hashcounters - work.data[21] = work.data[21] & 0xfffffffU; + case ALGO_SIA: + wcmplen = 80; + break; + default: + wcmplen = 76; } + uint32_t *nonceptr; + if(opt_algo!=ALGO_SIA) + nonceptr = (uint32_t*)(((char*)work.data) + wcmplen); + else + nonceptr = (uint32_t*)(((char*)work.data) + 8*4); + struct timeval tv_start, tv_end, diff; - unsigned long hashes_done=0; + uint32_t hashes_done = 0; uint32_t start_nonce; uint32_t scan_time = have_longpoll ? 
LP_SCANTIME : opt_scantime; - uint64_t max64, minmax = 0x100000; + uint64_t max64, minmax; - // &work.data[19] - int wcmplen = 76; - uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen); - - if (have_stratum) { - uint32_t sleeptime = 0; - while (!work_done && time(NULL) >= (g_work_time + opt_scantime)) { - usleep(100*1000); - if (sleeptime > 4) { - extrajob = true; - break; - } - sleeptime++; - } - if (sleeptime && opt_debug && !opt_quiet) - applog(LOG_DEBUG, "sleeptime: %u ms", sleeptime*100); - nonceptr = (uint32_t*) (((char*)work.data) + wcmplen); + if(have_stratum) + { + if(loopcnt == 0 || time(NULL) >= (g_work_time + opt_scantime)) + extrajob = true; pthread_mutex_lock(&g_work_lock); - extrajob |= work_done; - if (nonceptr[0] >= end_nonce || extrajob) { - work_done = false; + if(nonceptr[0] >= end_nonce - 0x00010000 || extrajob) + { extrajob = false; - stratum_gen_work(&stratum, &g_work); + while(!stratum_gen_work(&stratum, &g_work)) + { + applog(LOG_WARNING, "GPU #%d: waiting for data", device_map[thr_id]); + sleep(3); + } } - } else { + } + else + { pthread_mutex_lock(&g_work_lock); - if ((time(NULL) - g_work_time) >= scan_time || nonceptr[0] >= (end_nonce - 0x100)) { - if (opt_debug && g_work_time && !opt_quiet) + if((time(NULL) - g_work_time) >= scan_time || nonceptr[0] >= (end_nonce - 0x10000)) + { + if(opt_debug && g_work_time && !opt_quiet) applog(LOG_DEBUG, "work time %u/%us nonce %x/%x", time(NULL) - g_work_time, - scan_time, nonceptr[0], end_nonce); + scan_time, nonceptr[0], end_nonce); /* obtain new work from internal workio thread */ - if (unlikely(!get_work(mythr, &g_work))) { + if(unlikely(!get_work(mythr, &g_work))) + { pthread_mutex_unlock(&g_work_lock); applog(LOG_ERR, "work retrieval failed, exiting mining thread %d", mythr->id); goto out; @@ -1206,242 +1437,301 @@ static void *miner_thread(void *userdata) g_work_time = time(NULL); } } - - if (!opt_benchmark && (g_work.height != work.height || memcmp(work.target, g_work.target, 
sizeof(work.target)))) + if(!opt_benchmark && (g_work.height != work.height || memcmp(work.target, g_work.target, sizeof(work.target)))) { calc_diff(&g_work, 0); - if (!have_stratum) + if(!have_stratum && !allow_mininginfo) global_diff = g_work.difficulty; - if (opt_debug) { + if(opt_debug) + { uint64_t target64 = g_work.target[7] * 0x100000000ULL + g_work.target[6]; applog(LOG_DEBUG, "job %s target change: %llx (%.1f)", g_work.job_id, target64, g_work.difficulty); } memcpy(work.target, g_work.target, sizeof(work.target)); work.difficulty = g_work.difficulty; work.height = g_work.height; - nonceptr[0] = (UINT32_MAX / opt_n_threads) * thr_id; // 0 if single thr - /* on new target, ignoring nonce, clear sent data (hashlog) */ - if (memcmp(work.target, g_work.target, sizeof(work.target))) { - if (check_dups) - hashlog_purge_job(work.job_id); - } } - if (memcmp(work.data, g_work.data, wcmplen)) { - #if 0 - if (opt_debug) { - for (int n=0; n <= (wcmplen-8); n+=8) { - if (memcmp(work.data + n, g_work.data + n, 8)) { + + int different; + if(opt_algo != ALGO_SIA) + different = memcmp(work.data, g_work.data, wcmplen); + else + different = memcmp(work.data, g_work.data, 7*4) || memcmp(work.data + 9, g_work.data + 9, 44); + if(different) + { + if(opt_debug) + applog(LOG_DEBUG, "thread %d: new work", thr_id); +#if 0 + if(opt_debug) + { + for(int n = 0; n <= (wcmplen - 8); n += 8) + { + if(memcmp(work.data + n, g_work.data + n, 8)) + { applog(LOG_DEBUG, "job %s work updated at offset %d:", g_work.job_id, n); - applog_hash((uchar*) &work.data[n]); - applog_compare_hash((uchar*) &g_work.data[n], (uchar*) &work.data[n]); + applog_hash((uchar*)&work.data[n]); + applog_compare_hash((uchar*)&g_work.data[n], (uchar*)&work.data[n]); } } } - #endif +#endif + if(opt_debug && opt_algo == ALGO_SIA) + applog(LOG_DEBUG, "thread %d: high nonce = %08X", thr_id, work.data[9]); memcpy(&work, &g_work, sizeof(struct work)); nonceptr[0] = (UINT32_MAX / opt_n_threads) * thr_id; // 0 if single thr - 
} else - nonceptr[0]++; //?? - + } + else + { + if(opt_debug) + applog(LOG_DEBUG, "thread %d: continue with old work", thr_id); + } work_restart[thr_id].restart = 0; pthread_mutex_unlock(&g_work_lock); - /* adjust max_nonce to meet target scan time */ - if (have_stratum) - max64 = LP_SCANTIME; + uint32_t max64time; + if(have_stratum) + max64time = LP_SCANTIME; else - max64 = max(1, scan_time + g_work_time - time(NULL)); + max64time = (uint32_t)max(1, scan_time + g_work_time - time(NULL)); - max64 *= (uint32_t)thr_hashrates[thr_id]; + max64 = max64time * (uint32_t)thr_hashrates[thr_id]; /* on start, max64 should not be 0, - * before hashrate is computed */ - if (max64 < minmax) { - switch (opt_algo) { - case ALGO_BLAKECOIN: + * before hashrate is computed */ + switch(opt_algo) + { + case ALGO_KECCAK: + minmax = 83000000 * max64time; + break; case ALGO_BLAKE: - minmax = 0x80000000U; + case ALGO_SIA: + minmax = 260000000 * max64time; + break; + case ALGO_BLAKECOIN: + case ALGO_VANILLA: + minmax = 470000000 * max64time; break; case ALGO_BITCOIN: - case ALGO_KECCAK: - minmax = 0x40000000U; + minmax = 100000000 * max64time; + break; + case ALGO_QUBIT: + case ALGO_QUARK: + minmax = 3100000 * max64time; + break; + case ALGO_JACKPOT: + minmax = 2800000 * max64time; break; + case ALGO_SKEIN: + case ALGO_WHCX: case ALGO_DOOM: - case ALGO_JACKPOT: case ALGO_LUFFA_DOOM: - minmax = 0x2000000; + minmax = 38000000 * max64time; break; + case ALGO_NIST5: case ALGO_S3: + minmax = 4600000 * max64time; + break; case ALGO_X11: + case ALGO_C11: + minmax = 1500000 * max64time; + break; case ALGO_X13: - minmax = 0x400000; + minmax = 1200000 * max64time; break; - case ALGO_LYRA2: - minmax = 0x100000; + case ALGO_X17: + case ALGO_X15: + minmax = 1000000 * max64time; break; - } - max64 = max(minmax-1, max64); + case ALGO_LYRA2v2: + minmax = 1900000 * max64time; + break; + case ALGO_NEO: + minmax = 90000 * max64time; + break; + default: + minmax = 4000 * max64time; } + max64 = max(minmax, 
max64); // we can't scan more than uint capacity max64 = min(UINT32_MAX, max64); - start_nonce = nonceptr[0]; /* never let small ranges at end */ - if (end_nonce >= UINT32_MAX - 256) + if(end_nonce >= UINT32_MAX - 256) end_nonce = UINT32_MAX; - if ((max64 + start_nonce) >= end_nonce) + if((max64 + start_nonce) >= end_nonce) max_nonce = end_nonce; else - max_nonce = (uint32_t) (max64 + start_nonce); + max_nonce = (uint32_t)(max64 + start_nonce); // todo: keep it rounded for gpu threads ? work.scanned_from = start_nonce; - nonceptr[0] = start_nonce; - if (opt_debug) + if(opt_debug) applog(LOG_DEBUG, "GPU #%d: start=%08x end=%08x range=%08x", - device_map[thr_id], start_nonce, max_nonce, (max_nonce-start_nonce)); + device_map[thr_id], start_nonce, max_nonce, (max_nonce - start_nonce + 1)); hashes_done = 0; gettimeofday(&tv_start, NULL); - + uint32_t databackup; + if(opt_algo != ALGO_SIA) + databackup = nonceptr[2]; + else + databackup = nonceptr[12]; /* scan nonces for a proof-of-work hash */ - switch (opt_algo) { - - case ALGO_HEAVY: - rc = scanhash_heavy(thr_id, work.data, work.target, - max_nonce, &hashes_done, work.maxvote, HEAVYCOIN_BLKHDR_SZ); - break; + switch(opt_algo) + { case ALGO_KECCAK: rc = scanhash_keccak256(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_MJOLLNIR: - rc = scanhash_heavy(thr_id, work.data, work.target, - max_nonce, &hashes_done, 0, MNR_BLKHDR_SZ); + max_nonce, &hashes_done); break; case ALGO_DEEP: rc = scanhash_deep(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; case ALGO_DOOM: case ALGO_LUFFA_DOOM: rc = scanhash_doom(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); + break; + + case ALGO_C11: + rc = scanhash_c11(thr_id, work.data, work.target, + max_nonce, &hashes_done); break; case ALGO_FUGUE256: rc = scanhash_fugue256(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; case 
ALGO_GROESTL: case ALGO_DMD_GR: rc = scanhash_groestlcoin(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; case ALGO_MYR_GR: rc = scanhash_myriad(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; case ALGO_JACKPOT: rc = scanhash_jackpot(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; case ALGO_QUARK: rc = scanhash_quark(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; case ALGO_QUBIT: rc = scanhash_qubit(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; - case ALGO_ANIME: - rc = scanhash_anime(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; case ALGO_BITCOIN: rc = scanhash_bitcoin(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); + break; + + case ALGO_VANILLA: + rc = scanhash_blake256(thr_id, work.data, work.target, + max_nonce, &hashes_done, 8); break; case ALGO_BLAKECOIN: rc = scanhash_blake256(thr_id, work.data, work.target, - max_nonce, &hashes_done, 8); + max_nonce, &hashes_done, 8); break; case ALGO_BLAKE: rc = scanhash_blake256(thr_id, work.data, work.target, - max_nonce, &hashes_done, 14); + max_nonce, &hashes_done, 14); break; case ALGO_FRESH: rc = scanhash_fresh(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; - case ALGO_LYRA2: - rc = scanhash_lyra2(thr_id, work.data, work.target, - max_nonce, &hashes_done); + case ALGO_LYRA2v2: + rc = scanhash_lyra2v2(thr_id, work.data, work.target, + max_nonce, &hashes_done); break; case ALGO_NIST5: rc = scanhash_nist5(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; case ALGO_PENTABLAKE: rc = scanhash_pentablake(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); + break; + + case ALGO_SKEIN: + rc = 
scanhash_skeincoin(thr_id, work.data, work.target, + max_nonce, &hashes_done); break; case ALGO_S3: rc = scanhash_s3(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; case ALGO_WHC: rc = scanhash_whc(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); + break; + + case ALGO_WHCX: + rc = scanhash_whirlpoolx(thr_id, work.data, work.target, + max_nonce, &hashes_done); break; case ALGO_X11: rc = scanhash_x11(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; case ALGO_X13: rc = scanhash_x13(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; case ALGO_X14: rc = scanhash_x14(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; case ALGO_X15: rc = scanhash_x15(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; case ALGO_X17: rc = scanhash_x17(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); + break; + + case ALGO_NEO: + if(!have_stratum && work.datasize == 128) + rc = scanhash_neoscrypt(true, thr_id, work.data, work.target, max_nonce, &hashes_done); + else + rc = scanhash_neoscrypt(have_stratum, thr_id, work.data, work.target, max_nonce, &hashes_done); + break; + + case ALGO_SIA: + rc = scanhash_sia(thr_id, work.data, work.target, max_nonce, &hashes_done); break; default: @@ -1451,67 +1741,78 @@ static void *miner_thread(void *userdata) /* record scanhash elapsed time */ gettimeofday(&tv_end, NULL); - - if (rc && opt_debug) + if(rc && opt_debug) applog(LOG_NOTICE, CL_CYN "found => %08x" CL_GRN " %08x", nonceptr[0], swab32(nonceptr[0])); // data[19] - if (rc > 1 && opt_debug) - applog(LOG_NOTICE, CL_CYN "found => %08x" CL_GRN " %08x", nonceptr[2], swab32(nonceptr[2])); // data[21] - + if(opt_algo != ALGO_SIA) + { + if(rc > 1 && opt_debug) + applog(LOG_NOTICE, CL_CYN "found => %08x" 
CL_GRN " %08x", nonceptr[2], swab32(nonceptr[2])); // data[21] + } + else + { + if(rc > 1 && opt_debug) + applog(LOG_NOTICE, CL_CYN "found => %08x" CL_GRN " %08x", nonceptr[12], swab32(nonceptr[12])); // data[21] + } timeval_subtract(&diff, &tv_end, &tv_start); - if (diff.tv_sec > 0 || (diff.tv_sec==0 && diff.tv_usec>2000)) // avoid totally wrong hash rates + if(diff.tv_sec > 0 || (diff.tv_sec == 0 && diff.tv_usec>2000)) // avoid totally wrong hash rates { - double dtime = (double) diff.tv_sec + 1e-6 * diff.tv_usec; + double dtime = (double)diff.tv_sec + 1e-6 * diff.tv_usec; /* hashrate factors for some algos */ double rate_factor = 1.0; - switch (opt_algo) { - case ALGO_JACKPOT: - case ALGO_QUARK: - // to stay comparable to other ccminer forks or pools - rate_factor = 0.5; - break; + switch(opt_algo) + { + case ALGO_JACKPOT: + case ALGO_QUARK: + // to stay comparable to other ccminer forks or pools + rate_factor = 0.5; + break; } /* store thread hashrate */ - if (dtime > 0.0) { + if(dtime > 0.0) + { pthread_mutex_lock(&stats_lock); thr_hashrates[thr_id] = hashes_done / dtime; thr_hashrates[thr_id] *= rate_factor; - stats_remember_speed(thr_id, hashes_done, thr_hashrates[thr_id], (uint8_t) rc, work.height); + stats_remember_speed(thr_id, hashes_done, thr_hashrates[thr_id], (uint8_t)rc, work.height); pthread_mutex_unlock(&stats_lock); } } work.scanned_to = start_nonce + hashes_done - 1; - if (opt_debug && opt_benchmark) + if(opt_debug && opt_benchmark) { // to debug nonce ranges - applog(LOG_DEBUG, "GPU #%d: ends=%08x range=%llx", device_map[thr_id], - start_nonce + hashes_done - 1, hashes_done); + applog(LOG_DEBUG, "GPU #%d: ends=%08x range=%08x", device_map[thr_id], + start_nonce + hashes_done - 1, hashes_done); } - if (check_dups) + if(check_dups) hashlog_remember_scan_range(&work); - /* output */ - if (!opt_quiet && (loopcnt > 0)) { - sprintf(s, thr_hashrates[thr_id] >= 1e6 ? 
"%.0f" : "%.2f", - 1e-3 * thr_hashrates[thr_id]); - applog(LOG_INFO, "GPU #%d: %s, %s kH/s", - device_map[thr_id], device_name[device_map[thr_id]], s); + if(!opt_quiet && loopcnt > 0) + { + double hashrate; + + hashrate = thr_hashrates[thr_id]; + format_hashrate(hashrate, s); + applog(LOG_INFO, "GPU #%d: %s, %s", device_map[thr_id], device_name[device_map[thr_id]], s); } /* loopcnt: ignore first loop hashrate */ - if ((loopcnt>0) && thr_id == (opt_n_threads - 1)) { + if((loopcnt>0) && thr_id == (opt_n_threads - 1)) + { double hashrate = 0.; pthread_mutex_lock(&stats_lock); - for (int i = 0; i < opt_n_threads && thr_hashrates[i]; i++) + for(int i = 0; i < opt_n_threads; i++) hashrate += stats_get_speed(i, thr_hashrates[i]); pthread_mutex_unlock(&stats_lock); - if (opt_benchmark) { - sprintf(s, hashrate >= 1e6 ? "%.0f" : "%.2f", hashrate / 1000.); - applog(LOG_NOTICE, "Total: %s kH/s", s); + if(opt_benchmark) + { + format_hashrate(hashrate, s); + applog(LOG_NOTICE, "Total: %s", s); } // X-Mining-Hashrate @@ -1519,13 +1820,26 @@ static void *miner_thread(void *userdata) } /* if nonce found, submit work */ - if (rc && !opt_benchmark) { - if (!submit_work(mythr, &work)) + if(rc && !opt_benchmark) + { + uint32_t found2; + if(opt_algo != ALGO_SIA) + { + found2 = nonceptr[2]; + nonceptr[2] = databackup; + } + else + { + found2 = nonceptr[12]; + nonceptr[12] = databackup; + } + if(!submit_work(mythr, &work)) break; // prevent stale work in solo // we can't submit twice a block! - if (!have_stratum) { + if(!have_stratum && !have_longpoll) + { pthread_mutex_lock(&g_work_lock); // will force getwork g_work_time = 0; @@ -1534,14 +1848,26 @@ static void *miner_thread(void *userdata) } // second nonce found, submit too (on pool only!) 
- if (rc > 1 && work.data[21]) { - work.data[19] = work.data[21]; - work.data[21] = 0; - if (!submit_work(mythr, &work)) - break; + if(opt_algo != ALGO_SIA) + { + if(rc > 1 && nonceptr[2]) + { + nonceptr[0] = found2; + if(!submit_work(mythr, &work)) + break; + } } + else + if(rc > 1 && nonceptr[12]) + { + nonceptr[0] = found2; + if(!submit_work(mythr, &work)) + break; + } + } - work.data[19] = start_nonce + hashes_done; + nonceptr[0] = start_nonce + hashes_done; + loopcnt++; } @@ -1559,67 +1885,79 @@ static void *longpoll_thread(void *userdata) bool need_slash = false; curl = curl_easy_init(); - if (unlikely(!curl)) { + if(unlikely(!curl)) + { applog(LOG_ERR, "CURL initialization failed"); goto out; } start: hdr_path = (char*)tq_pop(mythr->q, NULL); - if (!hdr_path) + if(!hdr_path) goto out; /* full URL */ - if (strstr(hdr_path, "://")) { + if(strstr(hdr_path, "://")) + { lp_url = hdr_path; hdr_path = NULL; } - + /* absolute path, on current server */ - else { + else + { copy_start = (*hdr_path == '/') ? (hdr_path + 1) : hdr_path; - if (rpc_url[strlen(rpc_url) - 1] != '/') + if(rpc_url[strlen(rpc_url) - 1] != '/') need_slash = true; lp_url = (char*)malloc(strlen(rpc_url) + strlen(copy_start) + 2); - if (!lp_url) - goto out; + if(lp_url == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } sprintf(lp_url, "%s%s%s", rpc_url, need_slash ? "/" : "", copy_start); } - applog(LOG_INFO, "Long-polling activated for %s", lp_url); + applog(LOG_INFO, "Long-polling enabled on %s", lp_url); - while (1) { + while(1) + { json_t *val, *soval; int err; val = json_rpc_call(curl, lp_url, rpc_userpass, rpc_req, - false, true, &err); - if (have_stratum) { - if (val) + false, true, &err); + if(have_stratum) + { + if(val) json_decref(val); goto out; } - if (likely(val)) { - if (!opt_quiet) applog(LOG_INFO, "LONGPOLL detected new block"); + if(likely(val)) + { soval = json_object_get(json_object_get(val, "result"), "submitold"); submit_old = soval ? 
json_is_true(soval) : false; pthread_mutex_lock(&g_work_lock); - if (work_decode(json_object_get(val, "result"), &g_work)) { - if (opt_debug) - applog(LOG_BLUE, "LONGPOLL pushed new work"); + if(work_decode(json_object_get(val, "result"), &g_work)) + { + if(!opt_quiet) + applog(LOG_BLUE, "%s detected new block", short_url); g_work_time = time(NULL); restart_threads(); } pthread_mutex_unlock(&g_work_lock); json_decref(val); - } else { + } + else + { pthread_mutex_lock(&g_work_lock); g_work_time -= LP_SCANTIME; pthread_mutex_unlock(&g_work_lock); restart_threads(); - if (err != CURLE_OPERATION_TIMEDOUT) { + if(err != CURLE_OPERATION_TIMEDOUT) + { have_longpoll = false; free(hdr_path); free(lp_url); @@ -1634,7 +1972,7 @@ static void *longpoll_thread(void *userdata) free(hdr_path); free(lp_url); tq_freeze(mythr->q); - if (curl) + if(curl) curl_easy_cleanup(curl); return NULL; @@ -1648,7 +1986,8 @@ static bool stratum_handle_response(char *buf) bool ret = false; val = JSON_LOADS(buf, &err); - if (!val) { + if(!val) + { applog(LOG_INFO, "JSON decode failed(%d): %s", err.line, err.text); goto out; } @@ -1657,24 +1996,24 @@ static bool stratum_handle_response(char *buf) err_val = json_object_get(val, "error"); id_val = json_object_get(val, "id"); - if (!id_val || json_is_null(id_val) || !res_val) + if(!id_val || json_is_null(id_val) || !res_val) goto out; // ignore subscribe late answer (yaamp) - if (json_integer_value(id_val) < 4) + if(json_integer_value(id_val) < 4) goto out; gettimeofday(&tv_answer, NULL); timeval_subtract(&diff, &tv_answer, &stratum.tv_submit); // store time required to the pool to answer to a submit - stratum.answer_msec = (1000 * diff.tv_sec) + (uint32_t) (0.001 * diff.tv_usec); + stratum.answer_msec = (1000 * diff.tv_sec) + (uint32_t)(0.001 * diff.tv_usec); share_result(json_is_true(res_val), - err_val ? json_string_value(json_array_get(err_val, 1)) : NULL); + err_val ? 
json_string_value(json_array_get(err_val, 1)) : NULL); ret = true; out: - if (val) + if(val) json_decref(val); return ret; @@ -1686,72 +2025,83 @@ static void *stratum_thread(void *userdata) char *s; stratum.url = (char*)tq_pop(mythr->q, NULL); - if (!stratum.url) + if(!stratum.url) goto out; applog(LOG_BLUE, "Starting Stratum on %s", stratum.url); - - while (1) { + stratum.curl = NULL; + while(1) + { int failures = 0; - if (stratum_need_reset) { + if(stratum_need_reset) + { stratum_need_reset = false; stratum_disconnect(&stratum); applog(LOG_DEBUG, "stratum connection reset"); } - while (!stratum.curl) { + while(!stratum.curl) + { pthread_mutex_lock(&g_work_lock); g_work_time = 0; pthread_mutex_unlock(&g_work_lock); restart_threads(); - if (!stratum_connect(&stratum, stratum.url) || - !stratum_subscribe(&stratum) || - !stratum_authorize(&stratum, rpc_user, rpc_pass)) { + if(!stratum_connect(&stratum, stratum.url) || + !stratum_subscribe(&stratum) || + !stratum_authorize(&stratum, rpc_user, rpc_pass, opt_extranonce)) + { stratum_disconnect(&stratum); - if (opt_retries >= 0 && ++failures > opt_retries) { + if(opt_retries >= 0 && ++failures > opt_retries) + { applog(LOG_ERR, "...terminating workio thread"); tq_push(thr_info[work_thr_id].q, NULL); goto out; } - if (!opt_benchmark) + if(!opt_benchmark) applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); sleep(opt_fail_pause); } } - if (stratum.job.job_id && - (!g_work_time || strncmp(stratum.job.job_id, g_work.job_id + 8, 120))) { + if(stratum.job.job_id && + (!g_work_time || strncmp(stratum.job.job_id, g_work.job_id + 8, 120))) + { pthread_mutex_lock(&g_work_lock); stratum_gen_work(&stratum, &g_work); g_work_time = time(NULL); - if (stratum.job.clean) + if(stratum.job.clean) { - if (!opt_quiet) + if(!opt_quiet) applog(LOG_BLUE, "%s %s block %d", short_url, algo_names[opt_algo], - stratum.job.height); + stratum.job.height); restart_threads(); - if (check_dups) + if(check_dups) hashlog_purge_old(); 
stats_purge_old(); - } else if (opt_debug && !opt_quiet) { - applog(LOG_BLUE, "%s asks job %d for block %d", short_url, - strtoul(stratum.job.job_id, NULL, 16), stratum.job.height); + } + else if(opt_debug && !opt_quiet) + { + applog(LOG_BLUE, "%s asks job %s for block %d", short_url, + stratum.job.job_id, stratum.job.height); } pthread_mutex_unlock(&g_work_lock); } - - if (!stratum_socket_full(&stratum, 120)) { + + if(!stratum_socket_full(&stratum, 120)) + { applog(LOG_ERR, "Stratum connection timed out"); s = NULL; - } else + } + else s = stratum_recv_line(&stratum); - if (!s) { + if(!s) + { stratum_disconnect(&stratum); applog(LOG_ERR, "Stratum connection interrupted"); continue; } - if (!stratum_handle_method(&stratum, s)) + if(!stratum_handle_method(&stratum, s)) stratum_handle_response(s); free(s); } @@ -1764,20 +2114,20 @@ static void show_version_and_exit(void) { printf("%s v%s\n" #ifdef WIN32 - "pthreads static %s\n" + "pthreads static %s\n" #endif - "%s\n", - PACKAGE_NAME, PACKAGE_VERSION, + "%s\n", + PACKAGE_NAME, PACKAGE_VERSION, #ifdef WIN32 - PTW32_VERSION_STRING, + PTW32_VERSION_STRING, #endif - curl_version()); + curl_version()); proper_exit(0); } static void show_usage_and_exit(int status) { - if (status) + if(status) fprintf(stderr, "Try `" PROGRAM_NAME " --help' for more information.\n"); else printf(usage); @@ -1790,35 +2140,42 @@ static void parse_arg(int key, char *arg) int v, i; double d; - switch(key) { + switch(key) + { case 'a': - for (i = 0; i < ARRAY_SIZE(algo_names); i++) { - if (algo_names[i] && - !strcmp(arg, algo_names[i])) { + for(i = 0; i < ARRAY_SIZE(algo_names); i++) + { + if(algo_names[i] && + !strcasecmp(arg, algo_names[i])) + { opt_algo = (enum sha_algos)i; break; } } - if (i == ARRAY_SIZE(algo_names)) + if(i == ARRAY_SIZE(algo_names)) show_usage_and_exit(1); break; case 'b': p = strstr(arg, ":"); - if (p) { + if(p) + { /* ip:port */ - if (p - arg > 0) { + if(p - arg > 0) + { free(opt_api_allow); opt_api_allow = strdup(arg); 
opt_api_allow[p - arg] = '\0'; } opt_api_listen = atoi(p + 1); } - else if (arg && strstr(arg, ".")) { + else if(arg && strstr(arg, ".")) + { /* ip only */ free(opt_api_allow); opt_api_allow = strdup(arg); } - else if (arg) { + else if(arg) + { /* port or 0 to disable */ opt_api_listen = atoi(arg); } @@ -1828,14 +2185,15 @@ static void parse_arg(int key, char *arg) break; case 'c': { json_error_t err; - if (opt_config) + if(opt_config) json_decref(opt_config); #if JANSSON_VERSION_HEX >= 0x020000 opt_config = json_load_file(arg, 0, &err); #else opt_config = json_load_file(arg, &err); #endif - if (!json_is_object(opt_config)) { + if(!json_is_object(opt_config)) + { applog(LOG_ERR, "JSON decode of %s failed", arg); proper_exit(1); } @@ -1843,36 +2201,43 @@ static void parse_arg(int key, char *arg) } case 'i': d = atof(arg); - v = (uint32_t) d; - if (v < 0 || v > 31) + v = (uint32_t)d; + if(v < 0 || v > 31) show_usage_and_exit(1); + else { - int n = 0, adds = 0; + int n = 0; int ngpus = cuda_num_devices(); - char * pch = strtok(arg,","); - if (pch == NULL) { - for (n=0; n < ngpus; n++) - gpus_intensity[n] = (1 << v); - break; - } - while (pch != NULL) { + uint32_t last = 0; + char *pch = arg; + do + { d = atof(pch); - v = (uint32_t) d; - if (v > 7) { /* 0 = default */ - gpus_intensity[n] = (1 << v); - if ((d - v) > 0.0) { - adds = (uint32_t) floor((d - v) * (1 << (v-8))) * 256; - gpus_intensity[n] += adds; + v = (uint32_t)d; + if(v > 7) + { /* 0 = default */ + if((d - v) > 0.0) + { + uint32_t adds = (uint32_t)floor((d - v) * (1 << (v - 8))) * 256; + gpus_intensity[n] = (1 << v) + adds; applog(LOG_INFO, "Adding %u threads to intensity %u, %u cuda threads", - adds, v, gpus_intensity[n]); - } else { + adds, v, gpus_intensity[n]); + } + else if(gpus_intensity[n] != (1 << v)) + { + gpus_intensity[n] = (1 << v); applog(LOG_INFO, "Intensity set to %u, %u cuda threads", - v, gpus_intensity[n]); + v, gpus_intensity[n]); } } + last = gpus_intensity[n]; n++; - pch = strtok(NULL, 
","); - } + pch = strpbrk(pch, ","); + if(pch != NULL) + pch++; + } while(pch != NULL); + while(n < MAX_GPUS) + gpus_intensity[n++] = last; } break; case 'D': @@ -1880,10 +2245,14 @@ static void parse_arg(int key, char *arg) break; case 'N': v = atoi(arg); - if (v < 1) + if(v < 1) opt_statsavg = INT_MAX; opt_statsavg = v; break; + case 'n': /* --ndevs */ + cuda_print_devices(); + proper_exit(0); + break; case 'q': opt_quiet = true; break; @@ -1896,79 +2265,87 @@ static void parse_arg(int key, char *arg) break; case 'r': v = atoi(arg); - if (v < -1 || v > 9999) /* sanity check */ + if(v < -1 || v > 9999) /* sanity check */ show_usage_and_exit(1); opt_retries = v; break; case 'R': v = atoi(arg); - if (v < 1 || v > 9999) /* sanity check */ + if(v < 1 || v > 9999) /* sanity check */ show_usage_and_exit(1); opt_fail_pause = v; break; case 's': v = atoi(arg); - if (v < 1 || v > 9999) /* sanity check */ + if(v < 1 || v > 9999) /* sanity check */ show_usage_and_exit(1); opt_scantime = v; break; case 'T': v = atoi(arg); - if (v < 1 || v > 99999) /* sanity check */ + if(v < 1 || v > 99999) /* sanity check */ show_usage_and_exit(1); opt_timeout = v; break; case 't': v = atoi(arg); - if (v < 1 || v > 9999) /* sanity check */ + if(v < 1 || v > 9999) /* sanity check */ show_usage_and_exit(1); opt_n_threads = v; break; - case 'v': - v = atoi(arg); - if (v < 0 || v > 8192) /* sanity check */ - show_usage_and_exit(1); - opt_vote = (uint16_t)v; - break; - case 'm': - opt_trust_pool = true; - break; case 'u': free(rpc_user); rpc_user = strdup(arg); break; case 'o': /* --url */ p = strstr(arg, "://"); - if (p) { - if (strncasecmp(arg, "http://", 7) && strncasecmp(arg, "https://", 8) && - strncasecmp(arg, "stratum+tcp://", 14)) - show_usage_and_exit(1); + if(p) + { + if(strncasecmp(arg, "http://", 7) && strncasecmp(arg, "https://", 8) && + strncasecmp(arg, "stratum+tcp://", 14)) + show_usage_and_exit(1); free(rpc_url); rpc_url = strdup(arg); short_url = &rpc_url[(p - arg) + 3]; - } 
else { - if (!strlen(arg) || *arg == '/') + } + else + { + if(!strlen(arg) || *arg == '/') show_usage_and_exit(1); free(rpc_url); rpc_url = (char*)malloc(strlen(arg) + 8); + if(rpc_url == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(1); + } sprintf(rpc_url, "http://%s", arg); short_url = &rpc_url[7]; } p = strrchr(rpc_url, '@'); - if (p) { + if(p) + { char *sp, *ap; *p = '\0'; ap = strstr(rpc_url, "://") + 3; sp = strchr(ap, ':'); - if (sp) { + if(sp) + { free(rpc_userpass); rpc_userpass = strdup(ap); free(rpc_user); rpc_user = (char*)calloc(sp - ap + 1, 1); + if(rpc_user == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(1); + } strncpy(rpc_user, ap, sp - ap); free(rpc_pass); rpc_pass = strdup(sp + 1); - } else { + } + else + { free(rpc_user); rpc_user = strdup(ap); } @@ -1979,25 +2356,30 @@ static void parse_arg(int key, char *arg) break; case 'O': /* --userpass */ p = strchr(arg, ':'); - if (!p) + if(!p) show_usage_and_exit(1); free(rpc_userpass); rpc_userpass = strdup(arg); free(rpc_user); rpc_user = (char*)calloc(p - arg + 1, 1); + if(rpc_user == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(1); + } strncpy(rpc_user, arg, p - arg); free(rpc_pass); rpc_pass = strdup(p + 1); break; case 'x': /* --proxy */ - if (!strncasecmp(arg, "socks4://", 9)) + if(!strncasecmp(arg, "socks4://", 9)) opt_proxy_type = CURLPROXY_SOCKS4; - else if (!strncasecmp(arg, "socks5://", 9)) + else if(!strncasecmp(arg, "socks5://", 9)) opt_proxy_type = CURLPROXY_SOCKS5; #if LIBCURL_VERSION_NUM >= 0x071200 - else if (!strncasecmp(arg, "socks4a://", 10)) + else if(!strncasecmp(arg, "socks4a://", 10)) opt_proxy_type = CURLPROXY_SOCKS4A; - else if (!strncasecmp(arg, "socks5h://", 10)) + else if(!strncasecmp(arg, "socks5h://", 10)) opt_proxy_type = CURLPROXY_SOCKS5_HOSTNAME; #endif else @@ -2035,118 +2417,189 @@ static void parse_arg(int key, char *arg) case 1008: applog(LOG_INFO, "Now logging to syslog..."); use_syslog = true; - if (arg && 
strlen(arg)) { + if(arg && strlen(arg)) + { free(opt_syslog_pfx); opt_syslog_pfx = strdup(arg); } break; case 1020: v = atoi(arg); - if (v < -1) + if(v < -1) v = -1; - if (v > (1<(1 << num_cpus) - 1) v = -1; opt_affinity = v; break; case 1021: v = atoi(arg); - if (v < 0 || v > 5) /* sanity check */ + if(v < 0 || v > 5) /* sanity check */ show_usage_and_exit(1); opt_priority = v; break; + case 1022: + opt_verify = false; + break; case 'd': // CB + { + int ngpus = cuda_num_devices(); + char * pch = strtok(arg, ","); + opt_n_threads = 0; + while(pch != NULL) { - int ngpus = cuda_num_devices(); - char * pch = strtok (arg,","); - opt_n_threads = 0; - while (pch != NULL) { - if (pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0') + if(pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0') + { + if(atoi(pch) < ngpus) + device_map[opt_n_threads++] = atoi(pch); + else { - if (atoi(pch) < ngpus) - device_map[opt_n_threads++] = atoi(pch); - else { - applog(LOG_ERR, "Non-existant CUDA device #%d specified in -d option", atoi(pch)); - proper_exit(1); - } - } else { - int device = cuda_finddevice(pch); - if (device >= 0 && device < ngpus) - device_map[opt_n_threads++] = device; - else { - applog(LOG_ERR, "Non-existant CUDA device '%s' specified in -d option", pch); - proper_exit(1); - } + applog(LOG_ERR, "Non-existant CUDA device #%d specified in -d option", atoi(pch)); + proper_exit(1); + } + } + else + { + int device = cuda_finddevice(pch); + if(device >= 0 && device < ngpus) + device_map[opt_n_threads++] = device; + else + { + applog(LOG_ERR, "Non-existant CUDA device '%s' specified in -d option", pch); + proper_exit(1); } - // set number of active gpus - active_gpus = opt_n_threads; - pch = strtok (NULL, ","); } + // set number of active gpus + active_gpus = opt_n_threads; + pch = strtok(NULL, ","); } - break; + } + break; case 'f': // CH - Divisor for Difficulty d = atof(arg); - if (d == 0) /* sanity check */ + if(d == 0) /* sanity check */ show_usage_and_exit(1); opt_difficulty = 
d; break; + case 'm': // --diff-multiplier + d = atof(arg); + if(d <= 0.) + show_usage_and_exit(1); + opt_difficulty = 1.0/d; + break; + case 'e': + opt_extranonce = false; + break; case 'V': show_version_and_exit(); case 'h': show_usage_and_exit(0); + case 1070: /* --gpu-clock */ + { + char *pch = strtok(arg, ","); + int n = 0; + while(pch != NULL && n < MAX_GPUS) + { + int dev_id = device_map[n++]; + device_gpu_clocks[dev_id] = atoi(pch); + pch = strtok(NULL, ","); + } + } + break; + case 1071: /* --mem-clock */ + { + char *pch = strtok(arg, ","); + int n = 0; + while(pch != NULL && n < MAX_GPUS) + { + int dev_id = device_map[n++]; + device_mem_clocks[dev_id] = atoi(pch); + pch = strtok(NULL, ","); + } + } + break; + case 1072: /* --pstate */ + { + char *pch = strtok(arg, ","); + int n = 0; + while(pch != NULL && n < MAX_GPUS) + { + int dev_id = device_map[n++]; + device_pstate[dev_id] = (int8_t)atoi(pch); + pch = strtok(NULL, ","); + } + } + break; + case 1073: /* --plimit */ + { + char *pch = strtok(arg, ","); + int n = 0; + while(pch != NULL && n < MAX_GPUS) + { + int dev_id = device_map[n++]; + device_plimit[dev_id] = atoi(pch); + pch = strtok(NULL, ","); + } + } + break; default: show_usage_and_exit(1); } - if (use_syslog) + if(use_syslog) use_colors = false; } /** - * Parse json config file - */ +* Parse json config file +*/ static void parse_config(void) { int i; json_t *val; - if (!json_is_object(opt_config)) + if(!json_is_object(opt_config)) return; - for (i = 0; i < ARRAY_SIZE(options); i++) { + for(i = 0; i < ARRAY_SIZE(options); i++) + { - if (!options[i].name) + if(!options[i].name) break; - if (!strcmp(options[i].name, "config")) + if(!strcmp(options[i].name, "config")) continue; val = json_object_get(opt_config, options[i].name); - if (!val) + if(!val) continue; - if (options[i].has_arg && json_is_string(val)) { + if(options[i].has_arg && json_is_string(val)) + { char *s = strdup(json_string_value(val)); - if (!s) + if(!s) continue; 
parse_arg(options[i].val, s); free(s); } - else if (options[i].has_arg && json_is_integer(val)) { + else if(options[i].has_arg && json_is_integer(val)) + { char buf[16]; - sprintf(buf, "%d", (int) json_integer_value(val)); + sprintf(buf, "%d", (int)json_integer_value(val)); parse_arg(options[i].val, buf); } - else if (options[i].has_arg && json_is_real(val)) { + else if(options[i].has_arg && json_is_real(val)) + { char buf[16]; sprintf(buf, "%f", json_real_value(val)); parse_arg(options[i].val, buf); } - else if (!options[i].has_arg) { - if (json_is_true(val)) + else if(!options[i].has_arg) + { + if(json_is_true(val)) parse_arg(options[i].val, (char*) ""); } else applog(LOG_ERR, "JSON option %s invalid", - options[i].name); + options[i].name); } } @@ -2154,61 +2607,60 @@ static void parse_cmdline(int argc, char *argv[]) { int key; - while (1) { + while(1) + { #if HAVE_GETOPT_LONG key = getopt_long(argc, argv, short_options, options, NULL); #else key = getopt(argc, argv, short_options); #endif - if (key < 0) + if(key < 0) break; parse_arg(key, optarg); } - if (optind < argc) { + if(optind < argc) + { fprintf(stderr, "%s: unsupported non-option argument '%s'\n", - argv[0], argv[optind]); + argv[0], argv[optind]); show_usage_and_exit(1); } parse_config(); - if (opt_algo == ALGO_HEAVY && opt_vote == 9999) { - fprintf(stderr, "%s: Heavycoin hash requires block reward vote parameter (see --vote)\n", - argv[0]); - show_usage_and_exit(1); - } } #ifndef WIN32 static void signal_handler(int sig) { - switch (sig) { + switch(sig) + { case SIGHUP: applog(LOG_INFO, "SIGHUP received"); break; case SIGINT: signal(sig, SIG_IGN); applog(LOG_INFO, "SIGINT received, exiting"); - proper_exit(0); + proper_exit(2); break; case SIGTERM: applog(LOG_INFO, "SIGTERM received, exiting"); - proper_exit(0); + proper_exit(2); break; } } #else BOOL WINAPI ConsoleHandler(DWORD dwType) { - switch (dwType) { + switch(dwType) + { case CTRL_C_EVENT: applog(LOG_INFO, "CTRL_C_EVENT received, exiting"); - 
proper_exit(0); + proper_exit(2); break; case CTRL_BREAK_EVENT: applog(LOG_INFO, "CTRL_BREAK_EVENT received, exiting"); - proper_exit(0); + proper_exit(2); break; default: return false; @@ -2217,26 +2669,62 @@ BOOL WINAPI ConsoleHandler(DWORD dwType) } #endif +static int msver(void) +{ + int version; +#ifdef _MSC_VER + switch(_MSC_VER) + { + case 1500: version = 2008; break; + case 1600: version = 2010; break; + case 1700: version = 2012; break; + case 1800: version = 2013; break; + case 1900: version = 2015; break; + default: version = _MSC_VER / 100; + } +#else + version = 0; +#endif + return version; +} + int main(int argc, char *argv[]) { struct thr_info *thr; - long flags; int i; + + // strdup on char* to allow a common free() if used + opt_syslog_pfx = strdup(PROGRAM_NAME); + opt_api_allow = strdup("127.0.0.1"); /* 0.0.0.0 for all ips */ - printf("*** ccminer " PACKAGE_VERSION " for nVidia GPUs by sp-hash@github ***\n"); -#ifdef WIN32 - printf("\tBuilt with VC++ 2013 and nVidia CUDA SDK 6.5\n\n"); +#if defined _WIN64 || defined _LP64 + printf("ccminer " PACKAGE_VERSION " (64bit) for nVidia GPUs\n"); +#else + printf("ccminer " PACKAGE_VERSION " (32bit) for nVidia GPUs\n"); +#endif +#ifdef _MSC_VER + printf("Compiled with Visual Studio %d ", msver()); +#else +#ifdef __clang__ + printf("Compiled with Clang %s ", __clang_version__); +#else +#ifdef __GNUC__ + printf("Compiled with GCC %d.%d ", __GNUC__, __GNUC_MINOR__); #else - printf("\tBuilt with the nVidia CUDA SDK 6.5\n\n"); + printf("Compiled with an unusual compiler "); #endif - printf(" Based on pooler cpuminer 2.3.2 and the tpruvot@github fork\n "); - printf(" CUDA support by Christian Buchner, Christian H. and DJM34\n"); - printf(" Includes optimizations implemented by sp , klaust, tpruvot and tsiv. 
\n\n"); +#endif +#endif + printf("using Nvidia CUDA Toolkit %d.%d\n\n", CUDART_VERSION / 1000, (CUDART_VERSION % 1000) / 10); + printf("Based on pooler cpuminer 2.3.2 and the tpruvot@github fork\n"); + printf("CUDA support by Christian Buchner, Christian H. and DJM34\n"); + printf("Includes optimizations implemented by sp-hash, klaust, tpruvot and tsiv.\n\n"); rpc_user = strdup(""); rpc_pass = strdup(""); - pthread_mutex_init(&applog_lock, NULL); + for(int i = 0; i < MAX_GPUS; i++) + device_pstate[i] = -1; // number of cpus for thread affinity #if defined(WIN32) @@ -2248,63 +2736,83 @@ int main(int argc, char *argv[]) #elif defined(CTL_HW) && defined(HW_NCPU) int req[] = { CTL_HW, HW_NCPU }; size_t len = sizeof(num_cpus); - sysctl(req, 2, &num_cpus, &len, NULL, 0); + sysctl(req, 2, &num_cpus, &len, NULL, 0); #else num_cpus = 1; #endif - if (num_cpus < 1) - num_cpus = 1; + // number of gpus + active_gpus = cuda_num_devices(); +// cuda_devicereset(); - // default thread to device map - for (i = 0; i < MAX_GPUS; i++) { - device_map[i] = i; + if(active_gpus > 1) + { + // default thread to device map + for(i = 0; i < MAX_GPUS; i++) + { + device_map[i] = i; + } } - // number of gpus - active_gpus = cuda_num_devices(); cuda_devicenames(); /* parse command line */ parse_cmdline(argc, argv); - if (!opt_benchmark && !rpc_url) { + if(opt_protocol) + { + curl_version_info_data *info; + + info = curl_version_info(CURLVERSION_NOW); + applog(LOG_DEBUG, "using libcurl %s", info->version); + int features = info->features; + if(features&CURL_VERSION_IPV6) + applog(LOG_DEBUG, "libcurl supports IPv6"); + if(features&CURL_VERSION_SSL) + applog(LOG_DEBUG, "libcurl supports SSL"); + if(features&CURL_VERSION_IDN) + applog(LOG_DEBUG, "libcurl supports international domain names"); + } + if(!opt_benchmark && !rpc_url) + { fprintf(stderr, "%s: no URL supplied\n", argv[0]); show_usage_and_exit(1); } + cuda_devicereset(); - if (!rpc_userpass) { + if(!rpc_userpass) + { rpc_userpass = 
(char*)malloc(strlen(rpc_user) + strlen(rpc_pass) + 2); - if (!rpc_userpass) - return 1; + if(rpc_userpass == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass); } /* init stratum data.. */ memset(&stratum.url, 0, sizeof(stratum)); - pthread_mutex_init(&stats_lock, NULL); - pthread_mutex_init(&g_work_lock, NULL); pthread_mutex_init(&stratum.sock_lock, NULL); pthread_mutex_init(&stratum.work_lock, NULL); - flags = !opt_benchmark && strncmp(rpc_url, "https:", 6) - ? (CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL) - : CURL_GLOBAL_ALL; - if (curl_global_init(flags)) { + if(curl_global_init(CURL_GLOBAL_ALL)) + { applog(LOG_ERR, "CURL initialization failed"); return 1; } #ifndef WIN32 - if (opt_background) { + if(opt_background) + { i = fork(); - if (i < 0) exit(1); - if (i > 0) exit(0); + if(i < 0) exit(1); + if(i > 0) exit(0); i = setsid(); - if (i < 0) + if(i < 0) applog(LOG_ERR, "setsid() failed (errno = %d)", errno); i = chdir("/"); - if (i < 0) + if(i < 0) applog(LOG_ERR, "chdir() failed (errno = %d)", errno); signal(SIGHUP, signal_handler); signal(SIGTERM, signal_handler); @@ -2313,10 +2821,12 @@ int main(int argc, char *argv[]) signal(SIGINT, signal_handler); #else SetConsoleCtrlHandler((PHANDLER_ROUTINE)ConsoleHandler, TRUE); - if (opt_priority > 0) { + if(opt_priority > 0) + { DWORD prio = NORMAL_PRIORITY_CLASS; - SetPriorityClass(NULL, prio); - switch (opt_priority) { + // SetPriorityClass(NULL, prio); + switch(opt_priority) + { case 1: prio = BELOW_NORMAL_PRIORITY_CLASS; break; @@ -2329,36 +2839,64 @@ int main(int argc, char *argv[]) case 5: prio = REALTIME_PRIORITY_CLASS; } - SetPriorityClass(GetCurrentProcess(), prio); + if(SetPriorityClass(GetCurrentProcess(), prio) == 0) + { + LPSTR messageBuffer = nullptr; + size_t size = FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), 
(LPSTR)&messageBuffer, 0, NULL); + applog(LOG_ERR, "Error while trying to set the priority:"); + applog(LOG_ERR, "%s", messageBuffer); + LocalFree(messageBuffer); + } + prio = GetPriorityClass(GetCurrentProcess()); + switch(prio) + { + case NORMAL_PRIORITY_CLASS: + applog(LOG_INFO, "CPU priority: %s", "normal"); + break; + case BELOW_NORMAL_PRIORITY_CLASS: + applog(LOG_INFO, "CPU priority: %s", "below normal"); + break; + case ABOVE_NORMAL_PRIORITY_CLASS: + applog(LOG_INFO, "CPU priority: %s", "above normal"); + break; + case HIGH_PRIORITY_CLASS: + applog(LOG_INFO, "CPU priority: %s", "high"); + break; + case REALTIME_PRIORITY_CLASS: + applog(LOG_INFO, "CPU priority: %s", "realtime"); + break; + case IDLE_PRIORITY_CLASS: + applog(LOG_INFO, "CPU priority: %s", "idle"); + break; + default: + applog(LOG_INFO, "CPU priority class: %d", prio); + } } #endif - if (opt_affinity != -1) { - if (!opt_quiet) + if(opt_affinity != -1) + { + if(!opt_quiet) applog(LOG_DEBUG, "Binding process to cpu mask %x", opt_affinity); affine_to_cpu_mask(-1, opt_affinity); } - if (active_gpus == 0) { - applog(LOG_ERR, "No CUDA devices found! 
terminating."); - exit(1); - } - if (!opt_n_threads) + if(!opt_n_threads) opt_n_threads = active_gpus; #ifdef HAVE_SYSLOG_H - if (use_syslog) + if(use_syslog) openlog(opt_syslog_pfx, LOG_PID, LOG_USER); #endif work_restart = (struct work_restart *)calloc(opt_n_threads, sizeof(*work_restart)); - if (!work_restart) - return 1; + if(work_restart == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } thr_info = (struct thr_info *)calloc(opt_n_threads + 4, sizeof(*thr)); - if (!thr_info) - return 1; - - thr_hashrates = (double *) calloc(opt_n_threads, sizeof(double)); - if (!thr_hashrates) + if(!thr_info) return 1; /* init workio thread info */ @@ -2366,102 +2904,134 @@ int main(int argc, char *argv[]) thr = &thr_info[work_thr_id]; thr->id = work_thr_id; thr->q = tq_new(); - if (!thr->q) + if(!thr->q) return 1; + for(int i = 0; i < MAX_GPUS; i++) + mining_has_stopped[i] = true; + /* start work I/O thread */ - if (pthread_create(&thr->pth, NULL, workio_thread, thr)) { + if(pthread_create(&thr->pth, NULL, workio_thread, thr)) + { applog(LOG_ERR, "workio thread create failed"); return 1; } - if (want_longpoll && !have_stratum) { + if(want_longpoll && !have_stratum) + { /* init longpoll thread info */ longpoll_thr_id = opt_n_threads + 1; thr = &thr_info[longpoll_thr_id]; thr->id = longpoll_thr_id; thr->q = tq_new(); - if (!thr->q) + if(!thr->q) return 1; /* start longpoll thread */ - if (unlikely(pthread_create(&thr->pth, NULL, longpoll_thread, thr))) { + if(unlikely(pthread_create(&thr->pth, NULL, longpoll_thread, thr))) + { applog(LOG_ERR, "longpoll thread create failed"); return 1; } } - if (want_stratum) { + if(want_stratum) + { /* init stratum thread info */ stratum_thr_id = opt_n_threads + 2; thr = &thr_info[stratum_thr_id]; thr->id = stratum_thr_id; thr->q = tq_new(); - if (!thr->q) + if(!thr->q) return 1; /* start stratum thread */ - if (unlikely(pthread_create(&thr->pth, NULL, stratum_thread, thr))) { + if(unlikely(pthread_create(&thr->pth, NULL, 
stratum_thread, thr))) + { applog(LOG_ERR, "stratum thread create failed"); return 1; } - if (have_stratum) + if(have_stratum) tq_push(thr_info[stratum_thr_id].q, strdup(rpc_url)); } #ifdef USE_WRAPNVML -#ifndef WIN32 +#if defined(__linux__) || defined(_WIN64) /* nvml is currently not the best choice on Windows (only in x64) */ hnvml = nvml_create(); - if (hnvml) + if (hnvml) { + bool gpu_reinit = false;// (opt_cudaschedule >= 0); + cuda_devicenames(); // refresh gpu vendor name applog(LOG_INFO, "NVML GPU monitoring enabled."); -#else - if (nvapi_init() == 0) + for(int n = 0; n < active_gpus; n++) + { + if(nvml_set_pstate(hnvml, device_map[n]) == 1) + gpu_reinit = true; + if(nvml_set_plimit(hnvml, device_map[n]) == 1) + gpu_reinit = true; + if(nvml_set_clocks(hnvml, device_map[n]) == 1) + gpu_reinit = true; + if(gpu_reinit) + { +// cuda_reset_device(n, NULL); + } + } + } +#endif +#ifdef WIN32 + if(!hnvml && nvapi_init() == 0) + { applog(LOG_INFO, "NVAPI GPU monitoring enabled."); + cuda_devicenames(); // refresh gpu vendor name + } #endif - else + else if(!hnvml) applog(LOG_INFO, "GPU monitoring is not available."); #endif - if (opt_api_listen) { + if(opt_api_listen) + { /* api thread */ api_thr_id = opt_n_threads + 3; thr = &thr_info[api_thr_id]; thr->id = api_thr_id; thr->q = tq_new(); - if (!thr->q) + if(!thr->q) return 1; /* start stratum thread */ - if (unlikely(pthread_create(&thr->pth, NULL, api_thread, thr))) { + if(unlikely(pthread_create(&thr->pth, NULL, api_thread, thr))) + { applog(LOG_ERR, "api thread create failed"); return 1; } } /* start mining threads */ - for (i = 0; i < opt_n_threads; i++) { + for(i = 0; i < opt_n_threads; i++) + { thr = &thr_info[i]; thr->id = i; thr->gpu.thr_id = i; - thr->gpu.gpu_id = (uint8_t) device_map[i]; - thr->gpu.gpu_arch = (uint16_t) device_sm[device_map[i]]; + thr->gpu.gpu_id = (uint8_t)device_map[i]; + thr->gpu.gpu_arch = (uint16_t)device_sm[device_map[i]]; thr->q = tq_new(); - if (!thr->q) + if(!thr->q) return 1; - 
if (unlikely(pthread_create(&thr->pth, NULL, miner_thread, thr))) { + if(unlikely(pthread_create(&thr->pth, NULL, miner_thread, thr))) + { applog(LOG_ERR, "thread %d create failed", i); return 1; } } applog(LOG_INFO, "%d miner thread%s started, " - "using '%s' algorithm.", - opt_n_threads, opt_n_threads > 1 ? "s":"", - algo_names[opt_algo]); + "using '%s' algorithm.", + opt_n_threads, opt_n_threads > 1 ? "s" : "", + algo_names[opt_algo]); #ifdef WIN32 timeBeginPeriod(1); // enable high timer precision (similar to Google Chrome Trick) @@ -2470,10 +3040,6 @@ int main(int argc, char *argv[]) /* main loop - simply wait for workio thread to exit */ pthread_join(thr_info[work_thr_id].pth, NULL); -#ifdef WIN32 - timeEndPeriod(1); // be nice and forego high timer precision -#endif - applog(LOG_INFO, "workio thread dead, exiting."); proper_exit(0); diff --git a/ccminer.opensdf b/ccminer.opensdf deleted file mode 100644 index 303c7000d4..0000000000 Binary files a/ccminer.opensdf and /dev/null differ diff --git a/ccminer.v12.suo b/ccminer.v12.suo deleted file mode 100644 index 3f464a9307..0000000000 Binary files a/ccminer.v12.suo and /dev/null differ diff --git a/ccminer.vcxproj b/ccminer.vcxproj index ccd4af33b6..f96d83ffb8 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -1,5 +1,5 @@  - + Release @@ -27,7 +27,7 @@ Application false MultiByte - v120 + v140 false true @@ -35,26 +35,27 @@ Application true MultiByte - v120 + v140 false Application false MultiByte - v120 + v140 false + true Application true MultiByte - v120 + v140 false - + @@ -77,9 +78,15 @@ false + false + $(SolutionDir)$(Configuration)\temp\$(PlatformTarget)\ + $(SolutionDir)$(Configuration)\$(PlatformTarget)\ false + $(SolutionDir)$(Configuration)\$(PlatformTarget)\ + $(SolutionDir)$(Configuration)\temp\$(PlatformTarget)\ + .exe @@ -94,7 +101,7 @@ true Console - 
cudart_static.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;pthreadVC2.lib;libcurl.x86.lib;openssl.x86.lib;zlib.x86.lib;ws2_32.lib;Wldap32.lib;nvapi.lib;%(AdditionalDependencies) + normaliz.lib;cudart_static.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;pthreadVC2.lib;libcurl.x86.lib;openssl.x86.lib;zlib.x86.lib;ws2_32.lib;Wldap32.lib;nvapi.lib;%(AdditionalDependencies) libcmt.lib;msvcrt.lib compat\pthreads\x86;compat\curl-for-windows\out\x86\Release\lib;compat\nvapi\x86;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) /NODEFAULTLIB:LIBCMT %(AdditionalOptions) @@ -122,7 +129,7 @@ true Console - cudart_static.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;pthreadVC2.lib;libcurl.x64.lib;openssl.x64.lib;zlib.x64.lib;ws2_32.lib;Wldap32.lib;nvapi64.lib;%(AdditionalDependencies) + normaliz.lib;cudart_static.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;pthreadVC2.lib;libcurl.x64.lib;openssl.x64.lib;zlib.x64.lib;ws2_32.lib;Wldap32.lib;nvapi64.lib;%(AdditionalDependencies) libcmt.lib compat\pthreads\x64;compat\curl-for-windows\out\x64\Release\lib;compat\nvapi\amd64;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) /NODEFAULTLIB:LIBCMTD %(AdditionalOptions) @@ -143,26 +150,33 @@ Level3 - MaxSpeed + Full MultiThreaded Speed - StreamingSIMDExtensions2 false true true - WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;USE_WRAPNVML;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) - 
.;compat;compat\curl-for-windows\curl\include;compat\jansson;compat\getopt;compat\pthreads;compat\curl-for-windows\openssl\openssl\include;compat\curl-for-windows\zlib;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) + HAVE_STRUCT_TIMESPEC;WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;USE_WRAPNVML;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) + .;compat;compat\includes;compat\includes-x86;compat\getopt;compat\includes\pthreads;compat\includes\zlib;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) true SyncCThrow + true + Precise + false + true + false + false + true + true false true true Console - cudart_static.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.x86.lib;openssl.x86.lib;zlib.x86.lib;ws2_32.lib;Wldap32.lib;nvapi.lib;%(AdditionalDependencies) + normaliz.lib;cudart.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.lib;libcrypto.lib;zlibstat.lib;ws2_32.lib;Wldap32.lib;nvapi.lib;jansson.lib;%(AdditionalDependencies) libcmt.lib - compat\pthreads\x86;compat\curl-for-windows\out\x86\Release\lib;compat\nvapi\x86;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) + compat\libs\x86;compat\nvapi\x86;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) /NODEFAULTLIB:LIBCMT %(AdditionalOptions) false false @@ -173,40 +187,55 @@ 80 true true - compute_52,sm_52;compute_50,sm_50 + compute_62,sm_62;compute_60,sm_60;compute_61,sm_61;compute_53,sm_53;compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30;compute_37,sm_37; --ptxas-options="-O2" %(AdditionalOptions) O3 + MT + Default + false + $(CudaIntDir)\temp\$(PlatformTarget) + Shared false - O2 + O3 + false Level3 - MaxSpeed + Full MultiThreaded Speed - AdvancedVectorExtensions false true true - 
WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;USE_WRAPNVML;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) - .;compat;compat\curl-for-windows\curl\include;compat\jansson;compat\getopt;compat\pthreads;compat\curl-for-windows\openssl\openssl\include;compat\curl-for-windows\zlib;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) + HAVE_STRUCT_TIMESPEC;WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;USE_WRAPNVML;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) + .;compat;compat\includes;compat\includes-x64;compat\getopt;compat\includes\pthreads;compat\curl-for-windows\zlib;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) + true + Precise + false + true + SyncCThrow + true + false + true + true + false + true - false true true Console - kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.x64.lib;openssl.x64.lib;zlib.x64.lib;ws2_32.lib;Wldap32.lib;cudart_static.lib;cuda.lib;nvapi64.lib;%(AdditionalDependencies) + normaliz.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.lib;libcrypto.lib;zlibstat.lib;ws2_32.lib;Wldap32.lib;cudart.lib;nvapi64.lib;jansson.lib;%(AdditionalDependencies) libcmt.lib - compat\pthreads\x64;compat\curl-for-windows\out\x64\Release\lib;compat\nvapi\amd64;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) + compat\libs\x64;compat\nvapi\amd64;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) /NODEFAULTLIB:LIBCMT %(AdditionalOptions) false true @@ -215,32 +244,34 @@ false 80 true - false - compute_50,sm_50 + true + compute_60,sm_60;compute_61,sm_61;compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30;compute_37,sm_37; + 64 + O3 + MT + Default + false + $(CudaIntDir)\temp\$(PlatformTarget) + 
Shared + -Wno-deprecated-gpu-targets %(AdditionalOptions) false + false + O3 + + CppCode - - - - - - - - - - - + false Full @@ -248,6 +279,14 @@ CppCode + + compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_37,sm_37; + compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_37,sm_37; + + + + + @@ -259,10 +298,6 @@ - - Full - /Tp %(AdditionalOptions) - @@ -293,6 +328,7 @@ + @@ -310,14 +346,20 @@ - + - + + + + + + + @@ -336,7 +378,7 @@ - + @@ -353,24 +395,15 @@ + + + + true + true + true - - - - - - - - 80 - - - - - - - -Xptxas "-abi=yes" %(AdditionalOptions) -Xptxas "-abi=yes" %(AdditionalOptions) @@ -392,15 +425,22 @@ 92 + + compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_37,sm_37; + compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_37,sm_37; + + + compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_37,sm_37; + compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_37,sm_37; + 80 --ptxas-options="-dlcm=cg" %(AdditionalOptions) true - - 128 + 80 80 @@ -433,15 +473,17 @@ - - + + + 64 + + true - 80 @@ -466,10 +508,12 @@ + + @@ -486,7 +530,7 @@ - + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 665a14b16c..a5138399e1 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -40,9 +40,6 @@ {7c2a98c6-064c-4a69-b803-d6f6ff5edd0b} - - {c3222908-22ba-4586-a637-6363f455b06d} - {f3ed23a2-8ce7-41a5-b051-6da56047dc35} @@ -58,38 +55,29 @@ {85dfae6a-66ca-4332-8cec-98ee70cbdf2f} - - {17b56151-79ec-4a32-bac3-9d94ae7f68fe} - {ef6f9983-bda5-4fb2-adfa-ac4f29b74f25} {9762c92c-9677-4044-8292-ff6ba4bfdd89} - + + {03b56ddb-6ebb-40b7-9a62-0a22c8c2865f} + + + {0e14317b-d054-4f9e-8f6f-3bd91b3aa160} + + {2ff6e4ce-7c92-4cb2-a3ad-c331e94fd81d} + + {62428d9b-4cac-44ca-a0c9-4b91f6c249d0} + + + {85b17b96-98a4-4fc5-baa8-c1a6b10f2d99} + - - Source Files\jansson - - - Source Files\jansson - - - Source Files\jansson - - - Source Files\jansson - - - Source Files\jansson - - - Source Files\jansson - Source Files\getopt @@ 
-108,15 +96,6 @@ Source Files - - Source Files - - - Source Files - - - Source Files - Source Files\sph @@ -162,9 +141,6 @@ Source Files\sph - - Source Files - Source Files\sph @@ -186,42 +162,48 @@ Source Files\sph - - Source Files - Source Files Source Files - - Source Files - Source Files Source Files - - Source Files\jansson + + Source Files\sph - - Source Files\jansson + + Source Files\sph - - Source Files\jansson + + Source Files\CUDA\neoscrypt - - Source Files\jansson + + Source Files\CUDA - - Source Files\sph + + Source Files\CUDA - + + Source Files\CUDA + + + Source Files + + Source Files\sph + + Source Files + + + Source Files\CUDA\neoscrypt + @@ -248,15 +230,9 @@ Header Files\compat - - Header Files - Header Files - - Header Files - Header Files\CUDA @@ -299,9 +275,6 @@ Header Files\sph - - Header Files\CUDA - Header Files\CUDA @@ -357,10 +330,40 @@ Header Files\compat - Header Files\lyra2 + Header Files\CUDA\lyra2 - Header Files\lyra2 + Header Files\CUDA\lyra2 + + + Header Files + + + Header Files + + + Header Files + + + Header Files\CUDA + + + Header Files\CUDA\lyra2 + + + Header Files\CUDA\lyra2 + + + Header Files\CUDA\sia + + + Header Files + + + Header Files\CUDA + + + Header Files\CUDA @@ -400,30 +403,6 @@ Source Files\CUDA\quark - - Source Files\CUDA\heavy - - - Source Files\CUDA\heavy - - - Source Files\CUDA\heavy - - - Source Files\CUDA\heavy - - - Source Files\CUDA\heavy - - - Source Files\CUDA\heavy - - - Source Files\CUDA\heavy - - - Source Files\CUDA\quark - Source Files\CUDA\quark @@ -460,9 +439,6 @@ Source Files\CUDA\x11 - - Source Files\CUDA\x11 - Source Files\CUDA\x11 @@ -544,12 +520,6 @@ Source Files\CUDA\Algo256 - - Source Files\CUDA - - - Source Files\CUDA - Source Files\CUDA\quark @@ -559,5 +529,47 @@ Source Files\CUDA + + Source Files\CUDA + + + Source Files\CUDA\Algo256 + + + Source Files\CUDA\Algo256 + + + Source Files\CUDA\Lyra2 + + + Source Files\CUDA\Lyra2 + + + Source Files\CUDA\x11 + + + Source Files\CUDA\x11 + + + Source 
Files\CUDA\neoscrypt + + + Source Files\CUDA\neoscrypt + + + Source Files\CUDA + + + Source Files\CUDA + + + Source Files\CUDA\Sia + + + Source Files\CUDA\Sia + + + Source Files\CUDA\neoscrypt + \ No newline at end of file diff --git a/compat/curl-for-windows/openssl/config/opensslconf.h b/compat/curl-for-windows/openssl/config/opensslconf.h deleted file mode 100644 index 9bf23692d6..0000000000 --- a/compat/curl-for-windows/openssl/config/opensslconf.h +++ /dev/null @@ -1,333 +0,0 @@ -/* opensslconf.h */ -/* WARNING: Generated automatically from opensslconf.h.in by Configure. */ - -/* OpenSSL was configured with the following options: */ -#undef OPENSSL_SYSNAME_WIN32 -#if defined(_WIN32) -# define OPENSSL_SYSNAME_WIN32 -#endif - -#ifndef OPENSSL_DOING_MAKEDEPEND -# ifndef OPENSSL_NO_CAPIENG -# define OPENSSL_NO_CAPIENG -# endif -# ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 -# define OPENSSL_NO_EC_NISTP_64_GCC_128 -# endif -# ifndef OPENSSL_NO_GMP -# define OPENSSL_NO_GMP -# endif -# ifndef OPENSSL_NO_GOST -# define OPENSSL_NO_GOST -# endif -# ifndef OPENSSL_NO_HW_PADLOCK -# define OPENSSL_NO_HW_PADLOCK -# endif -# ifndef OPENSSL_NO_JPAKE -# define OPENSSL_NO_JPAKE -# endif -# ifndef OPENSSL_NO_KRB5 -# define OPENSSL_NO_KRB5 -# endif -# ifndef OPENSSL_NO_MD2 -# define OPENSSL_NO_MD2 -# endif -# ifndef OPENSSL_NO_RC5 -# define OPENSSL_NO_RC5 -# endif -# ifndef OPENSSL_NO_RFC3779 -# define OPENSSL_NO_RFC3779 -# endif -# ifndef OPENSSL_NO_SCTP -# define OPENSSL_NO_SCTP -# endif -# ifndef OPENSSL_NO_STORE -# define OPENSSL_NO_STORE -# endif -#endif /* OPENSSL_DOING_MAKEDEPEND */ - -#ifndef OPENSSL_THREADS -# define OPENSSL_THREADS -#endif -#ifndef OPENSSL_NO_DYNAMIC_ENGINE -# define OPENSSL_NO_DYNAMIC_ENGINE -#endif - -/* The OPENSSL_NO_* macros are also defined as NO_* if the application - asks for it. This is a transient feature that is provided for those - who haven't had the time to do the appropriate changes in their - applications. 
*/ -#ifdef OPENSSL_ALGORITHM_DEFINES -# if defined(OPENSSL_NO_CAMELLIA) && !defined(NO_CAMELLIA) -# define NO_CAMELLIA -# endif -# if defined(OPENSSL_NO_CAPIENG) && !defined(NO_CAPIENG) -# define NO_CAPIENG -# endif -# if defined(OPENSSL_NO_CAST) && !defined(NO_CAST) -# define NO_CAST -# endif -# if defined(OPENSSL_NO_CMS) && !defined(NO_CMS) -# define NO_CMS -# endif -# if defined(OPENSSL_NO_FIPS) && !defined(NO_FIPS) -# define NO_FIPS -# endif -# if defined(OPENSSL_NO_GMP) && !defined(NO_GMP) -# define NO_GMP -# endif -# if defined(OPENSSL_NO_IDEA) && !defined(NO_IDEA) -# define NO_IDEA -# endif -# if defined(OPENSSL_NO_JPAKE) && !defined(NO_JPAKE) -# define NO_JPAKE -# endif -# if defined(OPENSSL_NO_KRB5) && !defined(NO_KRB5) -# define NO_KRB5 -# endif -# if defined(OPENSSL_NO_MD2) && !defined(NO_MD2) -# define NO_MD2 -# endif -# if defined(OPENSSL_NO_MDC2) && !defined(NO_MDC2) -# define NO_MDC2 -# endif -# if defined(OPENSSL_NO_RC5) && !defined(NO_RC5) -# define NO_RC5 -# endif -# if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779) -# define NO_RFC3779 -# endif -# if defined(OPENSSL_NO_SEED) && !defined(NO_SEED) -# define NO_SEED -# endif -# if defined(OPENSSL_NO_SHA0) && !defined(NO_SHA0) -# define NO_SHA0 -# endif -# if defined(OPENSSL_NO_STORE) && !defined(NO_STORE) -# define NO_STORE -# endif -# if defined(OPENSSL_NO_WHRLPOOL) && !defined(NO_WHRLPOOL) -# define NO_WHRLPOOL -# endif -# if defined(OPENSSL_NO_MDC2) && !defined(NO_MDC2) -# define NO_MDC2 -# endif -#endif - -/* crypto/opensslconf.h.in */ - -#ifdef OPENSSL_DOING_MAKEDEPEND - /* Include any symbols here that have to be explicitly set to enable a feature - * that should be visible to makedepend. - * - * [Our "make depend" doesn't actually look at this, we use actual build settings - * instead; we want to make it easy to remove subdirectories with disabled algorithms.] - */ -# ifndef OPENSSL_FIPS -# define OPENSSL_FIPS -# endif -#endif - -/* Generate 80386 code? 
*/ -#undef I386_ONLY - -#if !(defined(VMS) || defined(__VMS)) /* VMS uses logical names instead */ -# if defined(HEADER_CRYPTLIB_H) && !defined(OPENSSLDIR) -# if defined(_WIN32) -# define ENGINESDIR "ssl/lib/engines" -# define OPENSSLDIR "ssl" -# else -# define ENGINESDIR "/usr/local/ssl/lib/engines" -# define OPENSSLDIR "/usr/local/ssl" -# endif -# endif -#endif - -#undef OPENSSL_UNISTD -#define OPENSSL_UNISTD -#if !defined(_WIN32) && !defined(__arm__) && !defined(__mips__) && !defined(SWIG) -# include -#endif - -#undef OPENSSL_EXPORT_VAR_AS_FUNCTION -#if defined(_WIN32) -# define OPENSSL_EXPORT_VAR_AS_FUNCTION -#endif - -#if defined(HEADER_IDEA_H) -# undef IDEA_INT -# define IDEA_INT unsigned int -#endif - -#if defined(HEADER_MD2_H) -# undef MD2_INT -# define MD2_INT unsigned int -#endif - -#if defined(HEADER_RC2_H) -/* I need to put in a mod for the alpha - eay */ -# undef RC2_INT -# define RC2_INT unsigned int -#endif - -#if defined(HEADER_RC4_H) - /* using int types make the structure larger but make the code faster - * on most boxes I have tested - up to %20 faster. */ - /* - * I don't know what does "most" mean, but declaring "int" is a must on: - * - Intel P6 because partial register stalls are very expensive; - * - elder Alpha because it lacks byte load/store instructions; - */ -# undef RC4_INT -# if defined(__arm__) -# define RC4_INT unsigned char -# else -# define RC4_INT unsigned int -# endif - - /* - * This enables code handling data aligned at natural CPU word - * boundary. See crypto/rc4/rc4_enc.c for further details. 
- */ -# undef RC4_CHUNK -# if (defined(_M_X64) || defined(__x86_64__)) && defined(_WIN32) -# define RC4_CHUNK unsigned long long -# elif (defined(_M_X64) || defined(__x86_64__)) && !defined(_WIN32) -# define RC4_CHUNK unsigned long -# elif defined(__arm__) -# define RC4_CHUNK unsigned long -# else - /* On x86 RC4_CHUNK is not defined */ -# endif -#endif - -#if defined(HEADER_NEW_DES_H) || defined(HEADER_DES_H) - /* If this is set to 'unsigned int' on a DEC Alpha, this gives about a - * %20 speed up (longs are 8 bytes, int's are 4). */ -# undef DES_LONG -# if defined(_M_X64) || defined(__x86_64__) || defined(__arm__) || defined(__mips__) -# define DES_LONG unsigned int -# elif defined(_M_IX86) || defined(__i386__) -# define DES_LONG unsigned long -# endif -#endif - -#if defined(HEADER_BN_H) && !defined(CONFIG_HEADER_BN_H) -# define CONFIG_HEADER_BN_H - -# undef BL_LLONG -# if defined(_M_IX86) || defined(__i386__) || defined(__arm__) -# define BL_LLONG -# endif - - /* Should we define BN_DIV2W here? 
*/ - - /* Only one for the following should be defined */ - /* The prime number generation stuff may not work when - * EIGHT_BIT but I don't care since I've only used this mode - * for debuging the bignum libraries */ -# undef SIXTY_FOUR_BIT_LONG -# undef SIXTY_FOUR_BIT -# undef THIRTY_TWO_BIT -# undef SIXTEEN_BIT -# undef EIGHT_BIT -# if (defined(_M_X64) || defined(__x86_64__)) && defined(_WIN32) -# define SIXTY_FOUR_BIT -# elif (defined(_M_X64) || defined(__x86_64__)) && !defined(_WIN32) -# define SIXTY_FOUR_BIT_LONG -# elif defined(_M_IX86) || defined(__i386__) || defined(__arm__) || defined(__mips__) -# define THIRTY_TWO_BIT -# endif -#endif - -#if defined(HEADER_RC4_LOCL_H) && !defined(CONFIG_HEADER_RC4_LOCL_H) -# define CONFIG_HEADER_RC4_LOCL_H - /* if this is defined data[i] is used instead of *data, this is a %20 - * speedup on x86 */ -# undef RC4_INDEX -# if defined(_M_IX86) || defined(__i386__) -# define RC4_INDEX -# endif -#endif - -#if defined(HEADER_BF_LOCL_H) && !defined(CONFIG_HEADER_BF_LOCL_H) -# define CONFIG_HEADER_BF_LOCL_H -# undef BF_PTR -# if defined(__arm__) -# define BF_PTR -# endif -#endif /* HEADER_BF_LOCL_H */ - -#if defined(HEADER_DES_LOCL_H) && !defined(CONFIG_HEADER_DES_LOCL_H) -# define CONFIG_HEADER_DES_LOCL_H - -# ifndef DES_DEFAULT_OPTIONS - /* the following is tweaked from a config script, that is why it is a - * protected undef/define */ -# undef DES_PTR -# if !defined(_WIN32) && (defined(_M_IX86) || defined(__i386__)) -# define DES_PTR -# endif - - /* This helps C compiler generate the correct code for multiple functional - * units. It reduces register dependancies at the expense of 2 more - * registers */ -# undef DES_RISC1 -# if !defined(_WIN32) && (defined(_M_IX86) || defined(__i386__)) -# define DES_RISC1 -# endif - -# undef DES_RISC2 - -# if defined(DES_RISC1) && defined(DES_RISC2) -# error YOU SHOULD NOT HAVE BOTH DES_RISC1 AND DES_RISC2 DEFINED!!!!! 
-# endif - - /* Unroll the inner loop, this sometimes helps, sometimes hinders. - * Very mucy CPU dependant */ -# undef DES_UNROLL -# if !defined(_WIN32) -# define DES_UNROLL -# endif - - /* These default values were supplied by - * Peter Gutman - * They are only used if nothing else has been defined */ -# if !defined(DES_PTR) && !defined(DES_RISC1) && !defined(DES_RISC2) && !defined(DES_UNROLL) - /* Special defines which change the way the code is built depending on the - CPU and OS. For SGI machines you can use _MIPS_SZLONG (32 or 64) to find - even newer MIPS CPU's, but at the moment one size fits all for - optimization options. Older Sparc's work better with only UNROLL, but - there's no way to tell at compile time what it is you're running on */ -# if defined( sun ) /* Newer Sparc's */ -# define DES_PTR -# define DES_RISC1 -# define DES_UNROLL -# elif defined( __ultrix ) /* Older MIPS */ -# define DES_PTR -# define DES_RISC2 -# define DES_UNROLL -# elif defined( __osf1__ ) /* Alpha */ -# define DES_PTR -# define DES_RISC2 -# elif defined ( _AIX ) /* RS6000 */ - /* Unknown */ -# elif defined( __hpux ) /* HP-PA */ - /* Unknown */ -# elif defined( __aux ) /* 68K */ - /* Unknown */ -# elif defined( __dgux ) /* 88K (but P6 in latest boxes) */ -# define DES_UNROLL -# elif defined( __sgi ) /* Newer MIPS */ -# define DES_PTR -# define DES_RISC2 -# define DES_UNROLL -# elif defined(i386) || defined(__i386__) /* x86 boxes, should be gcc */ -# define DES_PTR -# define DES_RISC1 -# define DES_UNROLL -# endif /* Systems-specific speed defines */ -# endif - -# endif /* DES_DEFAULT_OPTIONS */ -#endif /* HEADER_DES_LOCL_H */ diff --git a/compat/curl-for-windows/openssl/openssl/crypto/opensslconf.h b/compat/curl-for-windows/openssl/openssl/crypto/opensslconf.h deleted file mode 100644 index 76c99d433a..0000000000 --- a/compat/curl-for-windows/openssl/openssl/crypto/opensslconf.h +++ /dev/null @@ -1 +0,0 @@ -#include "../../config/opensslconf.h" diff --git 
a/compat/curl-for-windows/openssl/openssl/crypto/sha/sha.h b/compat/curl-for-windows/openssl/openssl/crypto/sha/sha.h deleted file mode 100644 index 8a6bf4bbbb..0000000000 --- a/compat/curl-for-windows/openssl/openssl/crypto/sha/sha.h +++ /dev/null @@ -1,214 +0,0 @@ -/* crypto/sha/sha.h */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ - -#ifndef HEADER_SHA_H -#define HEADER_SHA_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#if defined(OPENSSL_NO_SHA) || (defined(OPENSSL_NO_SHA0) && defined(OPENSSL_NO_SHA1)) -#error SHA is disabled. -#endif - -#if defined(OPENSSL_FIPS) -#define FIPS_SHA_SIZE_T size_t -#endif - -/* - * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - * ! 
SHA_LONG has to be at least 32 bits wide. If it's wider, then ! - * ! SHA_LONG_LOG2 has to be defined along. ! - * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - */ - -#if defined(__LP32__) -#define SHA_LONG unsigned long -#elif defined(OPENSSL_SYS_CRAY) || defined(__ILP64__) -#define SHA_LONG unsigned long -#define SHA_LONG_LOG2 3 -#else -#define SHA_LONG unsigned int -#endif - -#define SHA_LBLOCK 16 -#define SHA_CBLOCK (SHA_LBLOCK*4) /* SHA treats input data as a - * contiguous array of 32 bit - * wide big-endian values. */ -#define SHA_LAST_BLOCK (SHA_CBLOCK-8) -#define SHA_DIGEST_LENGTH 20 - -typedef struct SHAstate_st - { - SHA_LONG h0,h1,h2,h3,h4; - SHA_LONG Nl,Nh; - SHA_LONG data[SHA_LBLOCK]; - unsigned int num; - } SHA_CTX; - -#ifndef OPENSSL_NO_SHA0 -#ifdef OPENSSL_FIPS -int private_SHA_Init(SHA_CTX *c); -#endif -int SHA_Init(SHA_CTX *c); -int SHA_Update(SHA_CTX *c, const void *data, size_t len); -int SHA_Final(unsigned char *md, SHA_CTX *c); -unsigned char *SHA(const unsigned char *d, size_t n, unsigned char *md); -void SHA_Transform(SHA_CTX *c, const unsigned char *data); -#endif -#ifndef OPENSSL_NO_SHA1 -#ifdef OPENSSL_FIPS -int private_SHA1_Init(SHA_CTX *c); -#endif -int SHA1_Init(SHA_CTX *c); -int SHA1_Update(SHA_CTX *c, const void *data, size_t len); -int SHA1_Final(unsigned char *md, SHA_CTX *c); -unsigned char *SHA1(const unsigned char *d, size_t n, unsigned char *md); -void SHA1_Transform(SHA_CTX *c, const unsigned char *data); -#endif - -#define SHA256_CBLOCK (SHA_LBLOCK*4) /* SHA-256 treats input data as a - * contiguous array of 32 bit - * wide big-endian values. 
*/ -#define SHA224_DIGEST_LENGTH 28 -#define SHA256_DIGEST_LENGTH 32 - -typedef struct SHA256state_st - { - SHA_LONG h[8]; - SHA_LONG Nl,Nh; - SHA_LONG data[SHA_LBLOCK]; - unsigned int num,md_len; - } SHA256_CTX; - -#ifndef OPENSSL_NO_SHA256 -#ifdef OPENSSL_FIPS -int private_SHA224_Init(SHA256_CTX *c); -int private_SHA256_Init(SHA256_CTX *c); -#endif -int SHA224_Init(SHA256_CTX *c); -int SHA224_Update(SHA256_CTX *c, const void *data, size_t len); -int SHA224_Final(unsigned char *md, SHA256_CTX *c); -unsigned char *SHA224(const unsigned char *d, size_t n,unsigned char *md); -int SHA256_Init(SHA256_CTX *c); -int SHA256_Update(SHA256_CTX *c, const void *data, size_t len); -int SHA256_Final(unsigned char *md, SHA256_CTX *c); -unsigned char *SHA256(const unsigned char *d, size_t n,unsigned char *md); -void SHA256_Transform(SHA256_CTX *c, const unsigned char *data); -#endif - -#define SHA384_DIGEST_LENGTH 48 -#define SHA512_DIGEST_LENGTH 64 - -#ifndef OPENSSL_NO_SHA512 -/* - * Unlike 32-bit digest algorithms, SHA-512 *relies* on SHA_LONG64 - * being exactly 64-bit wide. See Implementation Notes in sha512.c - * for further details. - */ -#define SHA512_CBLOCK (SHA_LBLOCK*8) /* SHA-512 treats input data as a - * contiguous array of 64 bit - * wide big-endian values. 
*/ -#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__) -#define SHA_LONG64 unsigned __int64 -#define U64(C) C##UI64 -#elif defined(__arch64__) -#define SHA_LONG64 unsigned long -#define U64(C) C##UL -#else -#define SHA_LONG64 unsigned long long -#define U64(C) C##ULL -#endif - -typedef struct SHA512state_st - { - SHA_LONG64 h[8]; - SHA_LONG64 Nl,Nh; - union { - SHA_LONG64 d[SHA_LBLOCK]; - unsigned char p[SHA512_CBLOCK]; - } u; - unsigned int num,md_len; - } SHA512_CTX; -#endif - -#ifndef OPENSSL_NO_SHA512 -#ifdef OPENSSL_FIPS -int private_SHA384_Init(SHA512_CTX *c); -int private_SHA512_Init(SHA512_CTX *c); -#endif -int SHA384_Init(SHA512_CTX *c); -int SHA384_Update(SHA512_CTX *c, const void *data, size_t len); -int SHA384_Final(unsigned char *md, SHA512_CTX *c); -unsigned char *SHA384(const unsigned char *d, size_t n,unsigned char *md); -int SHA512_Init(SHA512_CTX *c); -int SHA512_Update(SHA512_CTX *c, const void *data, size_t len); -int SHA512_Final(unsigned char *md, SHA512_CTX *c); -unsigned char *SHA512(const unsigned char *d, size_t n,unsigned char *md); -void SHA512_Transform(SHA512_CTX *c, const unsigned char *data); -#endif - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/compat/curl-for-windows/openssl/openssl/e_os2.h b/compat/curl-for-windows/openssl/openssl/e_os2.h deleted file mode 100644 index d22c0368f8..0000000000 --- a/compat/curl-for-windows/openssl/openssl/e_os2.h +++ /dev/null @@ -1,315 +0,0 @@ -/* e_os2.h */ -/* ==================================================================== - * Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. 
- * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). - * - */ - -#include - -#ifndef HEADER_E_OS2_H -#define HEADER_E_OS2_H - -#ifdef __cplusplus -extern "C" { -#endif - -/****************************************************************************** - * Detect operating systems. This probably needs completing. - * The result is that at least one OPENSSL_SYS_os macro should be defined. - * However, if none is defined, Unix is assumed. - **/ - -#define OPENSSL_SYS_UNIX - -/* ----------------------- Macintosh, before MacOS X ----------------------- */ -#if defined(__MWERKS__) && defined(macintosh) || defined(OPENSSL_SYSNAME_MAC) -# undef OPENSSL_SYS_UNIX -# define OPENSSL_SYS_MACINTOSH_CLASSIC -#endif - -/* ----------------------- NetWare ----------------------------------------- */ -#if defined(NETWARE) || defined(OPENSSL_SYSNAME_NETWARE) -# undef OPENSSL_SYS_UNIX -# define OPENSSL_SYS_NETWARE -#endif - -/* ---------------------- Microsoft operating systems ---------------------- */ - -/* Note that MSDOS actually denotes 32-bit environments running on top of - MS-DOS, such as DJGPP one. */ -#if defined(OPENSSL_SYSNAME_MSDOS) -# undef OPENSSL_SYS_UNIX -# define OPENSSL_SYS_MSDOS -#endif - -/* For 32 bit environment, there seems to be the CygWin environment and then - all the others that try to do the same thing Microsoft does... 
*/ -#if defined(OPENSSL_SYSNAME_UWIN) -# undef OPENSSL_SYS_UNIX -# define OPENSSL_SYS_WIN32_UWIN -#else -# if defined(__CYGWIN32__) || defined(OPENSSL_SYSNAME_CYGWIN32) -# undef OPENSSL_SYS_UNIX -# define OPENSSL_SYS_WIN32_CYGWIN -# else -# if defined(_WIN32) || defined(OPENSSL_SYSNAME_WIN32) -# undef OPENSSL_SYS_UNIX -# define OPENSSL_SYS_WIN32 -# endif -# if defined(OPENSSL_SYSNAME_WINNT) -# undef OPENSSL_SYS_UNIX -# define OPENSSL_SYS_WINNT -# endif -# if defined(OPENSSL_SYSNAME_WINCE) -# undef OPENSSL_SYS_UNIX -# define OPENSSL_SYS_WINCE -# endif -# endif -#endif - -/* Anything that tries to look like Microsoft is "Windows" */ -#if defined(OPENSSL_SYS_WIN32) || defined(OPENSSL_SYS_WINNT) || defined(OPENSSL_SYS_WINCE) -# undef OPENSSL_SYS_UNIX -# define OPENSSL_SYS_WINDOWS -# ifndef OPENSSL_SYS_MSDOS -# define OPENSSL_SYS_MSDOS -# endif -#endif - -/* DLL settings. This part is a bit tough, because it's up to the application - implementor how he or she will link the application, so it requires some - macro to be used. 
*/ -#ifdef OPENSSL_SYS_WINDOWS -# ifndef OPENSSL_OPT_WINDLL -# if defined(_WINDLL) /* This is used when building OpenSSL to indicate that - DLL linkage should be used */ -# define OPENSSL_OPT_WINDLL -# endif -# endif -#endif - -/* -------------------------------- OpenVMS -------------------------------- */ -#if defined(__VMS) || defined(VMS) || defined(OPENSSL_SYSNAME_VMS) -# undef OPENSSL_SYS_UNIX -# define OPENSSL_SYS_VMS -# if defined(__DECC) -# define OPENSSL_SYS_VMS_DECC -# elif defined(__DECCXX) -# define OPENSSL_SYS_VMS_DECC -# define OPENSSL_SYS_VMS_DECCXX -# else -# define OPENSSL_SYS_VMS_NODECC -# endif -#endif - -/* --------------------------------- OS/2 ---------------------------------- */ -#if defined(__EMX__) || defined(__OS2__) -# undef OPENSSL_SYS_UNIX -# define OPENSSL_SYS_OS2 -#endif - -/* --------------------------------- Unix ---------------------------------- */ -#ifdef OPENSSL_SYS_UNIX -# if defined(linux) || defined(__linux__) || defined(OPENSSL_SYSNAME_LINUX) -# define OPENSSL_SYS_LINUX -# endif -# ifdef OPENSSL_SYSNAME_MPE -# define OPENSSL_SYS_MPE -# endif -# ifdef OPENSSL_SYSNAME_SNI -# define OPENSSL_SYS_SNI -# endif -# ifdef OPENSSL_SYSNAME_ULTRASPARC -# define OPENSSL_SYS_ULTRASPARC -# endif -# ifdef OPENSSL_SYSNAME_NEWS4 -# define OPENSSL_SYS_NEWS4 -# endif -# ifdef OPENSSL_SYSNAME_MACOSX -# define OPENSSL_SYS_MACOSX -# endif -# ifdef OPENSSL_SYSNAME_MACOSX_RHAPSODY -# define OPENSSL_SYS_MACOSX_RHAPSODY -# define OPENSSL_SYS_MACOSX -# endif -# ifdef OPENSSL_SYSNAME_SUNOS -# define OPENSSL_SYS_SUNOS -#endif -# if defined(_CRAY) || defined(OPENSSL_SYSNAME_CRAY) -# define OPENSSL_SYS_CRAY -# endif -# if defined(_AIX) || defined(OPENSSL_SYSNAME_AIX) -# define OPENSSL_SYS_AIX -# endif -#endif - -/* --------------------------------- VOS ----------------------------------- */ -#if defined(__VOS__) || defined(OPENSSL_SYSNAME_VOS) -# define OPENSSL_SYS_VOS -#ifdef __HPPA__ -# define OPENSSL_SYS_VOS_HPPA -#endif -#ifdef __IA32__ -# define 
OPENSSL_SYS_VOS_IA32 -#endif -#endif - -/* ------------------------------- VxWorks --------------------------------- */ -#ifdef OPENSSL_SYSNAME_VXWORKS -# define OPENSSL_SYS_VXWORKS -#endif - -/* --------------------------------- BeOS ---------------------------------- */ -#if defined(__BEOS__) -# define OPENSSL_SYS_BEOS -# include -# if defined(BONE_VERSION) -# define OPENSSL_SYS_BEOS_BONE -# else -# define OPENSSL_SYS_BEOS_R5 -# endif -#endif - -/** - * That's it for OS-specific stuff - *****************************************************************************/ - - -/* Specials for I/O an exit */ -#ifdef OPENSSL_SYS_MSDOS -# define OPENSSL_UNISTD_IO -# define OPENSSL_DECLARE_EXIT extern void exit(int); -#else -# define OPENSSL_UNISTD_IO OPENSSL_UNISTD -# define OPENSSL_DECLARE_EXIT /* declared in unistd.h */ -#endif - -/* Definitions of OPENSSL_GLOBAL and OPENSSL_EXTERN, to define and declare - certain global symbols that, with some compilers under VMS, have to be - defined and declared explicitely with globaldef and globalref. - Definitions of OPENSSL_EXPORT and OPENSSL_IMPORT, to define and declare - DLL exports and imports for compilers under Win32. These are a little - more complicated to use. Basically, for any library that exports some - global variables, the following code must be present in the header file - that declares them, before OPENSSL_EXTERN is used: - - #ifdef SOME_BUILD_FLAG_MACRO - # undef OPENSSL_EXTERN - # define OPENSSL_EXTERN OPENSSL_EXPORT - #endif - - The default is to have OPENSSL_EXPORT, OPENSSL_IMPORT and OPENSSL_GLOBAL - have some generally sensible values, and for OPENSSL_EXTERN to have the - value OPENSSL_IMPORT. 
-*/ - -#if defined(OPENSSL_SYS_VMS_NODECC) -# define OPENSSL_EXPORT globalref -# define OPENSSL_IMPORT globalref -# define OPENSSL_GLOBAL globaldef -#elif defined(OPENSSL_SYS_WINDOWS) && defined(OPENSSL_OPT_WINDLL) -# define OPENSSL_EXPORT extern __declspec(dllexport) -# define OPENSSL_IMPORT extern __declspec(dllimport) -# define OPENSSL_GLOBAL -#else -# define OPENSSL_EXPORT extern -# define OPENSSL_IMPORT extern -# define OPENSSL_GLOBAL -#endif -#define OPENSSL_EXTERN OPENSSL_IMPORT - -/* Macros to allow global variables to be reached through function calls when - required (if a shared library version requires it, for example. - The way it's done allows definitions like this: - - // in foobar.c - OPENSSL_IMPLEMENT_GLOBAL(int,foobar,0) - // in foobar.h - OPENSSL_DECLARE_GLOBAL(int,foobar); - #define foobar OPENSSL_GLOBAL_REF(foobar) -*/ -#ifdef OPENSSL_EXPORT_VAR_AS_FUNCTION -# define OPENSSL_IMPLEMENT_GLOBAL(type,name,value) \ - type *_shadow_##name(void) \ - { static type _hide_##name=value; return &_hide_##name; } -# define OPENSSL_DECLARE_GLOBAL(type,name) type *_shadow_##name(void) -# define OPENSSL_GLOBAL_REF(name) (*(_shadow_##name())) -#else -# define OPENSSL_IMPLEMENT_GLOBAL(type,name,value) OPENSSL_GLOBAL type _shadow_##name=value; -# define OPENSSL_DECLARE_GLOBAL(type,name) OPENSSL_EXPORT type _shadow_##name -# define OPENSSL_GLOBAL_REF(name) _shadow_##name -#endif - -#if defined(OPENSSL_SYS_MACINTOSH_CLASSIC) && macintosh==1 && !defined(MAC_OS_GUSI_SOURCE) -# define ossl_ssize_t long -#endif - -#ifdef OPENSSL_SYS_MSDOS -# define ossl_ssize_t long -#endif - -#if defined(NeXT) || defined(OPENSSL_SYS_NEWS4) || defined(OPENSSL_SYS_SUNOS) -# define ssize_t int -#endif - -#if defined(__ultrix) && !defined(ssize_t) -# define ossl_ssize_t int -#endif - -#ifndef ossl_ssize_t -# define ossl_ssize_t ssize_t -#endif - -#ifdef __cplusplus -} -#endif -#endif diff --git a/compat/curl-for-windows/openssl/openssl/include/openssl/e_os2.h 
b/compat/curl-for-windows/openssl/openssl/include/openssl/e_os2.h deleted file mode 100644 index ab3f1ee44f..0000000000 --- a/compat/curl-for-windows/openssl/openssl/include/openssl/e_os2.h +++ /dev/null @@ -1 +0,0 @@ -#include "../../e_os2.h" diff --git a/compat/curl-for-windows/openssl/openssl/include/openssl/opensslconf.h b/compat/curl-for-windows/openssl/openssl/include/openssl/opensslconf.h deleted file mode 100644 index 221be629b7..0000000000 --- a/compat/curl-for-windows/openssl/openssl/include/openssl/opensslconf.h +++ /dev/null @@ -1 +0,0 @@ -#include "../../crypto/opensslconf.h" diff --git a/compat/curl-for-windows/openssl/openssl/include/openssl/sha.h b/compat/curl-for-windows/openssl/openssl/include/openssl/sha.h deleted file mode 100644 index ab9d94c386..0000000000 --- a/compat/curl-for-windows/openssl/openssl/include/openssl/sha.h +++ /dev/null @@ -1 +0,0 @@ -#include "../../crypto/sha/sha.h" diff --git a/compat/curl-for-windows/out/x64/Release/lib/libcurl.x64.lib b/compat/curl-for-windows/out/x64/Release/lib/libcurl.x64.lib deleted file mode 100644 index 1cb88e7bd0..0000000000 Binary files a/compat/curl-for-windows/out/x64/Release/lib/libcurl.x64.lib and /dev/null differ diff --git a/compat/curl-for-windows/out/x64/Release/lib/openssl.x64.lib b/compat/curl-for-windows/out/x64/Release/lib/openssl.x64.lib deleted file mode 100644 index 26ed21b4dd..0000000000 Binary files a/compat/curl-for-windows/out/x64/Release/lib/openssl.x64.lib and /dev/null differ diff --git a/compat/curl-for-windows/out/x64/Release/lib/zlib.x64.lib b/compat/curl-for-windows/out/x64/Release/lib/zlib.x64.lib deleted file mode 100644 index 22947086cb..0000000000 Binary files a/compat/curl-for-windows/out/x64/Release/lib/zlib.x64.lib and /dev/null differ diff --git a/compat/curl-for-windows/out/x86/Release/lib/libcurl.x86.lib b/compat/curl-for-windows/out/x86/Release/lib/libcurl.x86.lib deleted file mode 100644 index e86173f881..0000000000 Binary files 
a/compat/curl-for-windows/out/x86/Release/lib/libcurl.x86.lib and /dev/null differ diff --git a/compat/curl-for-windows/out/x86/Release/lib/openssl.x86.lib b/compat/curl-for-windows/out/x86/Release/lib/openssl.x86.lib deleted file mode 100644 index 1f75262911..0000000000 Binary files a/compat/curl-for-windows/out/x86/Release/lib/openssl.x86.lib and /dev/null differ diff --git a/compat/curl-for-windows/out/x86/Release/lib/zlib.x86.lib b/compat/curl-for-windows/out/x86/Release/lib/zlib.x86.lib deleted file mode 100644 index 10dc5baca1..0000000000 Binary files a/compat/curl-for-windows/out/x86/Release/lib/zlib.x86.lib and /dev/null differ diff --git a/compat/includes-x64/jansson.h b/compat/includes-x64/jansson.h new file mode 100644 index 0000000000..a5927bd630 --- /dev/null +++ b/compat/includes-x64/jansson.h @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2009-2016 Petri Lehtinen + * + * Jansson is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. + */ + +#ifndef JANSSON_H +#define JANSSON_H + +#include +#include /* for size_t */ +#include + +#include "jansson_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* version */ + +#define JANSSON_MAJOR_VERSION 2 +#define JANSSON_MINOR_VERSION 10 +#define JANSSON_MICRO_VERSION 0 + +/* Micro version is omitted if it's 0 */ +#define JANSSON_VERSION "2.10" + +/* Version as a 3-byte hex number, e.g. 0x010201 == 1.2.1. Use this + for numeric comparisons, e.g. #if JANSSON_VERSION_HEX >= ... 
*/ +#define JANSSON_VERSION_HEX ((JANSSON_MAJOR_VERSION << 16) | \ + (JANSSON_MINOR_VERSION << 8) | \ + (JANSSON_MICRO_VERSION << 0)) + + +/* types */ + +typedef enum { + JSON_OBJECT, + JSON_ARRAY, + JSON_STRING, + JSON_INTEGER, + JSON_REAL, + JSON_TRUE, + JSON_FALSE, + JSON_NULL +} json_type; + +typedef struct json_t { + json_type type; + size_t refcount; +} json_t; + +#ifndef JANSSON_USING_CMAKE /* disabled if using cmake */ +#if JSON_INTEGER_IS_LONG_LONG +#ifdef _WIN32 +#define JSON_INTEGER_FORMAT "I64d" +#else +#define JSON_INTEGER_FORMAT "lld" +#endif +typedef long long json_int_t; +#else +#define JSON_INTEGER_FORMAT "ld" +typedef long json_int_t; +#endif /* JSON_INTEGER_IS_LONG_LONG */ +#endif + +#define json_typeof(json) ((json)->type) +#define json_is_object(json) ((json) && json_typeof(json) == JSON_OBJECT) +#define json_is_array(json) ((json) && json_typeof(json) == JSON_ARRAY) +#define json_is_string(json) ((json) && json_typeof(json) == JSON_STRING) +#define json_is_integer(json) ((json) && json_typeof(json) == JSON_INTEGER) +#define json_is_real(json) ((json) && json_typeof(json) == JSON_REAL) +#define json_is_number(json) (json_is_integer(json) || json_is_real(json)) +#define json_is_true(json) ((json) && json_typeof(json) == JSON_TRUE) +#define json_is_false(json) ((json) && json_typeof(json) == JSON_FALSE) +#define json_boolean_value json_is_true +#define json_is_boolean(json) (json_is_true(json) || json_is_false(json)) +#define json_is_null(json) ((json) && json_typeof(json) == JSON_NULL) + +/* construction, destruction, reference counting */ + +json_t *json_object(void); +json_t *json_array(void); +json_t *json_string(const char *value); +json_t *json_stringn(const char *value, size_t len); +json_t *json_string_nocheck(const char *value); +json_t *json_stringn_nocheck(const char *value, size_t len); +json_t *json_integer(json_int_t value); +json_t *json_real(double value); +json_t *json_true(void); +json_t *json_false(void); +#define 
json_boolean(val) ((val) ? json_true() : json_false()) +json_t *json_null(void); + +static JSON_INLINE +json_t *json_incref(json_t *json) +{ + if(json && json->refcount != (size_t)-1) + ++json->refcount; + return json; +} + +/* do not call json_delete directly */ +void json_delete(json_t *json); + +static JSON_INLINE +void json_decref(json_t *json) +{ + if(json && json->refcount != (size_t)-1 && --json->refcount == 0) + json_delete(json); +} + +#if defined(__GNUC__) || defined(__clang__) +static JSON_INLINE +void json_decrefp(json_t **json) +{ + if(json) { + json_decref(*json); + *json = NULL; + } +} + +#define json_auto_t json_t __attribute__((cleanup(json_decrefp))) +#endif + + +/* error reporting */ + +#define JSON_ERROR_TEXT_LENGTH 160 +#define JSON_ERROR_SOURCE_LENGTH 80 + +typedef struct { + int line; + int column; + int position; + char source[JSON_ERROR_SOURCE_LENGTH]; + char text[JSON_ERROR_TEXT_LENGTH]; +} json_error_t; + + +/* getters, setters, manipulation */ + +void json_object_seed(size_t seed); +size_t json_object_size(const json_t *object); +json_t *json_object_get(const json_t *object, const char *key); +int json_object_set_new(json_t *object, const char *key, json_t *value); +int json_object_set_new_nocheck(json_t *object, const char *key, json_t *value); +int json_object_del(json_t *object, const char *key); +int json_object_clear(json_t *object); +int json_object_update(json_t *object, json_t *other); +int json_object_update_existing(json_t *object, json_t *other); +int json_object_update_missing(json_t *object, json_t *other); +void *json_object_iter(json_t *object); +void *json_object_iter_at(json_t *object, const char *key); +void *json_object_key_to_iter(const char *key); +void *json_object_iter_next(json_t *object, void *iter); +const char *json_object_iter_key(void *iter); +json_t *json_object_iter_value(void *iter); +int json_object_iter_set_new(json_t *object, void *iter, json_t *value); + +#define json_object_foreach(object, key, value) 
\ + for(key = json_object_iter_key(json_object_iter(object)); \ + key && (value = json_object_iter_value(json_object_key_to_iter(key))); \ + key = json_object_iter_key(json_object_iter_next(object, json_object_key_to_iter(key)))) + +#define json_object_foreach_safe(object, n, key, value) \ + for(key = json_object_iter_key(json_object_iter(object)), \ + n = json_object_iter_next(object, json_object_key_to_iter(key)); \ + key && (value = json_object_iter_value(json_object_key_to_iter(key))); \ + key = json_object_iter_key(n), \ + n = json_object_iter_next(object, json_object_key_to_iter(key))) + +#define json_array_foreach(array, index, value) \ + for(index = 0; \ + index < json_array_size(array) && (value = json_array_get(array, index)); \ + index++) + +static JSON_INLINE +int json_object_set(json_t *object, const char *key, json_t *value) +{ + return json_object_set_new(object, key, json_incref(value)); +} + +static JSON_INLINE +int json_object_set_nocheck(json_t *object, const char *key, json_t *value) +{ + return json_object_set_new_nocheck(object, key, json_incref(value)); +} + +static JSON_INLINE +int json_object_iter_set(json_t *object, void *iter, json_t *value) +{ + return json_object_iter_set_new(object, iter, json_incref(value)); +} + +size_t json_array_size(const json_t *array); +json_t *json_array_get(const json_t *array, size_t index); +int json_array_set_new(json_t *array, size_t index, json_t *value); +int json_array_append_new(json_t *array, json_t *value); +int json_array_insert_new(json_t *array, size_t index, json_t *value); +int json_array_remove(json_t *array, size_t index); +int json_array_clear(json_t *array); +int json_array_extend(json_t *array, json_t *other); + +static JSON_INLINE +int json_array_set(json_t *array, size_t ind, json_t *value) +{ + return json_array_set_new(array, ind, json_incref(value)); +} + +static JSON_INLINE +int json_array_append(json_t *array, json_t *value) +{ + return json_array_append_new(array, 
json_incref(value)); +} + +static JSON_INLINE +int json_array_insert(json_t *array, size_t ind, json_t *value) +{ + return json_array_insert_new(array, ind, json_incref(value)); +} + +const char *json_string_value(const json_t *string); +size_t json_string_length(const json_t *string); +json_int_t json_integer_value(const json_t *integer); +double json_real_value(const json_t *real); +double json_number_value(const json_t *json); + +int json_string_set(json_t *string, const char *value); +int json_string_setn(json_t *string, const char *value, size_t len); +int json_string_set_nocheck(json_t *string, const char *value); +int json_string_setn_nocheck(json_t *string, const char *value, size_t len); +int json_integer_set(json_t *integer, json_int_t value); +int json_real_set(json_t *real, double value); + +/* pack, unpack */ + +json_t *json_pack(const char *fmt, ...); +json_t *json_pack_ex(json_error_t *error, size_t flags, const char *fmt, ...); +json_t *json_vpack_ex(json_error_t *error, size_t flags, const char *fmt, va_list ap); + +#define JSON_VALIDATE_ONLY 0x1 +#define JSON_STRICT 0x2 + +int json_unpack(json_t *root, const char *fmt, ...); +int json_unpack_ex(json_t *root, json_error_t *error, size_t flags, const char *fmt, ...); +int json_vunpack_ex(json_t *root, json_error_t *error, size_t flags, const char *fmt, va_list ap); + + +/* equality */ + +int json_equal(json_t *value1, json_t *value2); + + +/* copying */ + +json_t *json_copy(json_t *value); +json_t *json_deep_copy(const json_t *value); + + +/* decoding */ + +#define JSON_REJECT_DUPLICATES 0x1 +#define JSON_DISABLE_EOF_CHECK 0x2 +#define JSON_DECODE_ANY 0x4 +#define JSON_DECODE_INT_AS_REAL 0x8 +#define JSON_ALLOW_NUL 0x10 + +typedef size_t (*json_load_callback_t)(void *buffer, size_t buflen, void *data); + +json_t *json_loads(const char *input, size_t flags, json_error_t *error); +json_t *json_loadb(const char *buffer, size_t buflen, size_t flags, json_error_t *error); +json_t *json_loadf(FILE *input, 
size_t flags, json_error_t *error); +json_t *json_loadfd(int input, size_t flags, json_error_t *error); +json_t *json_load_file(const char *path, size_t flags, json_error_t *error); +json_t *json_load_callback(json_load_callback_t callback, void *data, size_t flags, json_error_t *error); + + +/* encoding */ + +#define JSON_MAX_INDENT 0x1F +#define JSON_INDENT(n) ((n) & JSON_MAX_INDENT) +#define JSON_COMPACT 0x20 +#define JSON_ENSURE_ASCII 0x40 +#define JSON_SORT_KEYS 0x80 +#define JSON_PRESERVE_ORDER 0x100 +#define JSON_ENCODE_ANY 0x200 +#define JSON_ESCAPE_SLASH 0x400 +#define JSON_REAL_PRECISION(n) (((n) & 0x1F) << 11) +#define JSON_EMBED 0x10000 + +typedef int (*json_dump_callback_t)(const char *buffer, size_t size, void *data); + +char *json_dumps(const json_t *json, size_t flags); +size_t json_dumpb(const json_t *json, char *buffer, size_t size, size_t flags); +int json_dumpf(const json_t *json, FILE *output, size_t flags); +int json_dumpfd(const json_t *json, int output, size_t flags); +int json_dump_file(const json_t *json, const char *path, size_t flags); +int json_dump_callback(const json_t *json, json_dump_callback_t callback, void *data, size_t flags); + +/* custom memory allocation */ + +typedef void *(*json_malloc_t)(size_t); +typedef void (*json_free_t)(void *); + +void json_set_alloc_funcs(json_malloc_t malloc_fn, json_free_t free_fn); +void json_get_alloc_funcs(json_malloc_t *malloc_fn, json_free_t *free_fn); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/compat/includes-x64/jansson_config.h b/compat/includes-x64/jansson_config.h new file mode 100644 index 0000000000..35eee9381d --- /dev/null +++ b/compat/includes-x64/jansson_config.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2010-2016 Petri Lehtinen + * + * Jansson is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. 
+ * + * + * This file specifies a part of the site-specific configuration for + * Jansson, namely those things that affect the public API in + * jansson.h. + * + * The CMake system will generate the jansson_config.h file and + * copy it to the build and install directories. + */ + +#ifndef JANSSON_CONFIG_H +#define JANSSON_CONFIG_H + +/* Define this so that we can disable scattered automake configuration in source files */ +#ifndef JANSSON_USING_CMAKE +#define JANSSON_USING_CMAKE +#endif + +/* Note: when using cmake, JSON_INTEGER_IS_LONG_LONG is not defined nor used, + * as we will also check for __int64 etc types. + * (the definition was used in the automake system) */ + +/* Bring in the cmake-detected defines */ +#define HAVE_STDINT_H 1 +/* #undef HAVE_INTTYPES_H */ +/* #undef HAVE_SYS_TYPES_H */ + +/* Include our standard type header for the integer typedef */ + +#if defined(HAVE_STDINT_H) +# include +#elif defined(HAVE_INTTYPES_H) +# include +#elif defined(HAVE_SYS_TYPES_H) +# include +#endif + + +/* If your compiler supports the inline keyword in C, JSON_INLINE is + defined to `inline', otherwise empty. In C++, the inline is always + supported. */ +#ifdef __cplusplus +#define JSON_INLINE inline +#else +#define JSON_INLINE inline +#endif + + +#define json_int_t int64_t +#define json_strtoint strtoll +#define JSON_INTEGER_FORMAT "I64d" + + +/* If locale.h and localeconv() are available, define to 1, otherwise to 0. */ +#define JSON_HAVE_LOCALECONV 1 + + +/* Maximum recursion depth for parsing JSON input. + This limits the depth of e.g. array-within-array constructions. */ +#define JSON_PARSER_MAX_DEPTH 2048 + + +#endif diff --git a/compat/includes-x64/openssl/e_os2.h b/compat/includes-x64/openssl/e_os2.h new file mode 100644 index 0000000000..99ea3477d7 --- /dev/null +++ b/compat/includes-x64/openssl/e_os2.h @@ -0,0 +1,311 @@ +/* + * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). 
You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#ifndef HEADER_E_OS2_H +# define HEADER_E_OS2_H + +# include + +#ifdef __cplusplus +extern "C" { +#endif + +/****************************************************************************** + * Detect operating systems. This probably needs completing. + * The result is that at least one OPENSSL_SYS_os macro should be defined. + * However, if none is defined, Unix is assumed. + **/ + +# define OPENSSL_SYS_UNIX + +/* --------------------- Microsoft operating systems ---------------------- */ + +/* + * Note that MSDOS actually denotes 32-bit environments running on top of + * MS-DOS, such as DJGPP one. + */ +# if defined(OPENSSL_SYS_MSDOS) +# undef OPENSSL_SYS_UNIX +# endif + +/* + * For 32 bit environment, there seems to be the CygWin environment and then + * all the others that try to do the same thing Microsoft does... + */ +/* + * UEFI lives here because it might be built with a Microsoft toolchain and + * we need to avoid the false positive match on Windows. 
+ */ +# if defined(OPENSSL_SYS_UEFI) +# undef OPENSSL_SYS_UNIX +# elif defined(OPENSSL_SYS_UWIN) +# undef OPENSSL_SYS_UNIX +# define OPENSSL_SYS_WIN32_UWIN +# else +# if defined(__CYGWIN__) || defined(OPENSSL_SYS_CYGWIN) +# undef OPENSSL_SYS_UNIX +# define OPENSSL_SYS_WIN32_CYGWIN +# else +# if defined(_WIN32) || defined(OPENSSL_SYS_WIN32) +# undef OPENSSL_SYS_UNIX +# if !defined(OPENSSL_SYS_WIN32) +# define OPENSSL_SYS_WIN32 +# endif +# endif +# if defined(_WIN64) || defined(OPENSSL_SYS_WIN64) +# undef OPENSSL_SYS_UNIX +# if !defined(OPENSSL_SYS_WIN64) +# define OPENSSL_SYS_WIN64 +# endif +# endif +# if defined(OPENSSL_SYS_WINNT) +# undef OPENSSL_SYS_UNIX +# endif +# if defined(OPENSSL_SYS_WINCE) +# undef OPENSSL_SYS_UNIX +# endif +# endif +# endif + +/* Anything that tries to look like Microsoft is "Windows" */ +# if defined(OPENSSL_SYS_WIN32) || defined(OPENSSL_SYS_WIN64) || defined(OPENSSL_SYS_WINNT) || defined(OPENSSL_SYS_WINCE) +# undef OPENSSL_SYS_UNIX +# define OPENSSL_SYS_WINDOWS +# ifndef OPENSSL_SYS_MSDOS +# define OPENSSL_SYS_MSDOS +# endif +# endif + +/* + * DLL settings. This part is a bit tough, because it's up to the + * application implementor how he or she will link the application, so it + * requires some macro to be used. 
+ */ +# ifdef OPENSSL_SYS_WINDOWS +# ifndef OPENSSL_OPT_WINDLL +# if defined(_WINDLL) /* This is used when building OpenSSL to + * indicate that DLL linkage should be used */ +# define OPENSSL_OPT_WINDLL +# endif +# endif +# endif + +/* ------------------------------- OpenVMS -------------------------------- */ +# if defined(__VMS) || defined(VMS) || defined(OPENSSL_SYS_VMS) +# if !defined(OPENSSL_SYS_VMS) +# undef OPENSSL_SYS_UNIX +# endif +# define OPENSSL_SYS_VMS +# if defined(__DECC) +# define OPENSSL_SYS_VMS_DECC +# elif defined(__DECCXX) +# define OPENSSL_SYS_VMS_DECC +# define OPENSSL_SYS_VMS_DECCXX +# else +# define OPENSSL_SYS_VMS_NODECC +# endif +# endif + +/* -------------------------------- Unix ---------------------------------- */ +# ifdef OPENSSL_SYS_UNIX +# if defined(linux) || defined(__linux__) && !defined(OPENSSL_SYS_LINUX) +# define OPENSSL_SYS_LINUX +# endif +# if defined(_AIX) && !defined(OPENSSL_SYS_AIX) +# define OPENSSL_SYS_AIX +# endif +# endif + +/* -------------------------------- VOS ----------------------------------- */ +# if defined(__VOS__) && !defined(OPENSSL_SYS_VOS) +# define OPENSSL_SYS_VOS +# ifdef __HPPA__ +# define OPENSSL_SYS_VOS_HPPA +# endif +# ifdef __IA32__ +# define OPENSSL_SYS_VOS_IA32 +# endif +# endif + +/** + * That's it for OS-specific stuff + *****************************************************************************/ + +/* Specials for I/O an exit */ +# ifdef OPENSSL_SYS_MSDOS +# define OPENSSL_UNISTD_IO +# define OPENSSL_DECLARE_EXIT extern void exit(int); +# else +# define OPENSSL_UNISTD_IO OPENSSL_UNISTD +# define OPENSSL_DECLARE_EXIT /* declared in unistd.h */ +# endif + +/*- + * Definitions of OPENSSL_GLOBAL and OPENSSL_EXTERN, to define and declare + * certain global symbols that, with some compilers under VMS, have to be + * defined and declared explicitly with globaldef and globalref. 
+ * Definitions of OPENSSL_EXPORT and OPENSSL_IMPORT, to define and declare + * DLL exports and imports for compilers under Win32. These are a little + * more complicated to use. Basically, for any library that exports some + * global variables, the following code must be present in the header file + * that declares them, before OPENSSL_EXTERN is used: + * + * #ifdef SOME_BUILD_FLAG_MACRO + * # undef OPENSSL_EXTERN + * # define OPENSSL_EXTERN OPENSSL_EXPORT + * #endif + * + * The default is to have OPENSSL_EXPORT, OPENSSL_EXTERN and OPENSSL_GLOBAL + * have some generally sensible values. + */ + +# if defined(OPENSSL_SYS_VMS_NODECC) +# define OPENSSL_EXPORT globalref +# define OPENSSL_EXTERN globalref +# define OPENSSL_GLOBAL globaldef +# elif defined(OPENSSL_SYS_WINDOWS) && defined(OPENSSL_OPT_WINDLL) +# define OPENSSL_EXPORT extern __declspec(dllexport) +# define OPENSSL_EXTERN extern __declspec(dllimport) +# define OPENSSL_GLOBAL +# else +# define OPENSSL_EXPORT extern +# define OPENSSL_EXTERN extern +# define OPENSSL_GLOBAL +# endif + +/*- + * Macros to allow global variables to be reached through function calls when + * required (if a shared library version requires it, for example. 
+ * The way it's done allows definitions like this: + * + * // in foobar.c + * OPENSSL_IMPLEMENT_GLOBAL(int,foobar,0) + * // in foobar.h + * OPENSSL_DECLARE_GLOBAL(int,foobar); + * #define foobar OPENSSL_GLOBAL_REF(foobar) + */ +# ifdef OPENSSL_EXPORT_VAR_AS_FUNCTION +# define OPENSSL_IMPLEMENT_GLOBAL(type,name,value) \ + type *_shadow_##name(void) \ + { static type _hide_##name=value; return &_hide_##name; } +# define OPENSSL_DECLARE_GLOBAL(type,name) type *_shadow_##name(void) +# define OPENSSL_GLOBAL_REF(name) (*(_shadow_##name())) +# else +# define OPENSSL_IMPLEMENT_GLOBAL(type,name,value) OPENSSL_GLOBAL type _shadow_##name=value; +# define OPENSSL_DECLARE_GLOBAL(type,name) OPENSSL_EXPORT type _shadow_##name +# define OPENSSL_GLOBAL_REF(name) _shadow_##name +# endif + +# ifdef _WIN32 +# ifdef _WIN64 +# define ossl_ssize_t __int64 +# define OSSL_SSIZE_MAX _I64_MAX +# else +# define ossl_ssize_t int +# define OSSL_SSIZE_MAX INT_MAX +# endif +# endif + +# if defined(OPENSSL_SYS_UEFI) && !defined(ssize_t) +# define ossl_ssize_t int +# define OSSL_SSIZE_MAX INT_MAX +# endif + +# ifndef ossl_ssize_t +# define ossl_ssize_t ssize_t +# if defined(SSIZE_MAX) +# define OSSL_SSIZE_MAX SSIZE_MAX +# elif defined(_POSIX_SSIZE_MAX) +# define OSSL_SSIZE_MAX _POSIX_SSIZE_MAX +# endif +# endif + +# ifdef DEBUG_UNUSED +# define __owur __attribute__((__warn_unused_result__)) +# else +# define __owur +# endif + +/* Standard integer types */ +# if defined(OPENSSL_SYS_UEFI) +typedef INT8 int8_t; +typedef UINT8 uint8_t; +typedef INT16 int16_t; +typedef UINT16 uint16_t; +typedef INT32 int32_t; +typedef UINT32 uint32_t; +typedef INT64 int64_t; +typedef UINT64 uint64_t; +# define PRIu64 "%Lu" +# elif (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ + defined(__osf__) || defined(__sgi) || defined(__hpux) || \ + defined(OPENSSL_SYS_VMS) || defined (__OpenBSD__) +# include +# elif defined(_MSC_VER) && _MSC_VER<=1500 +/* + * minimally required typdefs for systems not supporting 
inttypes.h or + * stdint.h: currently just older VC++ + */ +typedef signed char int8_t; +typedef unsigned char uint8_t; +typedef short int16_t; +typedef unsigned short uint16_t; +typedef int int32_t; +typedef unsigned int uint32_t; +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +# else +# include +# endif + +/* + * We need a format operator for some client tools for uint64_t. If inttypes.h + * isn't available or did not define it, just go with hard-coded. + */ +# ifndef PRIu64 +# ifdef SIXTY_FOUR_BIT_LONG +# define PRIu64 "lu" +# else +# define PRIu64 "llu" +# endif +# endif + +/* ossl_inline: portable inline definition usable in public headers */ +# if !defined(inline) && !defined(__cplusplus) +# if defined(__STDC_VERSION__) && __STDC_VERSION__>=199901L + /* just use inline */ +# define ossl_inline inline +# elif defined(__GNUC__) && __GNUC__>=2 +# define ossl_inline __inline__ +# elif defined(_MSC_VER) + /* + * Visual Studio: inline is available in C++ only, however + * __inline is available for C, see + * http://msdn.microsoft.com/en-us/library/z8y1yy88.aspx + */ +# define ossl_inline __inline +# else +# define ossl_inline +# endif +# else +# define ossl_inline inline +# endif + +# if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L +# define ossl_noreturn _Noreturn +# elif defined(__GNUC__) && __GNUC__ >= 2 +# define ossl_noreturn __attribute__((noreturn)) +# else +# define ossl_noreturn +# endif + +#ifdef __cplusplus +} +#endif +#endif diff --git a/compat/includes-x64/openssl/opensslconf.h b/compat/includes-x64/openssl/opensslconf.h new file mode 100644 index 0000000000..8c82f7d9d5 --- /dev/null +++ b/compat/includes-x64/openssl/opensslconf.h @@ -0,0 +1,172 @@ +/* + * WARNING: do not edit! + * Generated by makefile from include\openssl\opensslconf.h.in + * + * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). 
You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef OPENSSL_ALGORITHM_DEFINES +# error OPENSSL_ALGORITHM_DEFINES no longer supported +#endif + +/* + * OpenSSL was configured with the following options: + */ + +#ifndef OPENSSL_SYS_WIN64A +# define OPENSSL_SYS_WIN64A 1 +#endif +#ifndef OPENSSL_NO_MD2 +# define OPENSSL_NO_MD2 +#endif +#ifndef OPENSSL_NO_RC5 +# define OPENSSL_NO_RC5 +#endif +#ifndef OPENSSL_THREADS +# define OPENSSL_THREADS +#endif +#ifndef OPENSSL_NO_ASAN +# define OPENSSL_NO_ASAN +#endif +#ifndef OPENSSL_NO_CRYPTO_MDEBUG +# define OPENSSL_NO_CRYPTO_MDEBUG +#endif +#ifndef OPENSSL_NO_CRYPTO_MDEBUG_BACKTRACE +# define OPENSSL_NO_CRYPTO_MDEBUG_BACKTRACE +#endif +#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 +# define OPENSSL_NO_EC_NISTP_64_GCC_128 +#endif +#ifndef OPENSSL_NO_EGD +# define OPENSSL_NO_EGD +#endif +#ifndef OPENSSL_NO_FUZZ_AFL +# define OPENSSL_NO_FUZZ_AFL +#endif +#ifndef OPENSSL_NO_FUZZ_LIBFUZZER +# define OPENSSL_NO_FUZZ_LIBFUZZER +#endif +#ifndef OPENSSL_NO_HEARTBEATS +# define OPENSSL_NO_HEARTBEATS +#endif +#ifndef OPENSSL_NO_MSAN +# define OPENSSL_NO_MSAN +#endif +#ifndef OPENSSL_NO_SCTP +# define OPENSSL_NO_SCTP +#endif +#ifndef OPENSSL_NO_SSL_TRACE +# define OPENSSL_NO_SSL_TRACE +#endif +#ifndef OPENSSL_NO_SSL3 +# define OPENSSL_NO_SSL3 +#endif +#ifndef OPENSSL_NO_SSL3_METHOD +# define OPENSSL_NO_SSL3_METHOD +#endif +#ifndef OPENSSL_NO_UBSAN +# define OPENSSL_NO_UBSAN +#endif +#ifndef OPENSSL_NO_UNIT_TEST +# define OPENSSL_NO_UNIT_TEST +#endif +#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS +# define OPENSSL_NO_WEAK_SSL_CIPHERS +#endif +#ifndef OPENSSL_NO_AFALGENG +# define OPENSSL_NO_AFALGENG +#endif + + +/* + * Sometimes OPENSSSL_NO_xxx ends up with an empty file and some compilers + * don't like that. This will hopefully silence them. 
+ */ +#define NON_EMPTY_TRANSLATION_UNIT static void *dummy = &dummy; + +/* + * Applications should use -DOPENSSL_API_COMPAT= to suppress the + * declarations of functions deprecated in or before . Otherwise, they + * still won't see them if the library has been built to disable deprecated + * functions. + */ +#if defined(OPENSSL_NO_DEPRECATED) +# define DECLARE_DEPRECATED(f) +#elif __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 0) +# define DECLARE_DEPRECATED(f) f __attribute__ ((deprecated)); +#else +# define DECLARE_DEPRECATED(f) f; +#endif + +#ifndef OPENSSL_FILE +# ifdef OPENSSL_NO_FILENAMES +# define OPENSSL_FILE "" +# define OPENSSL_LINE 0 +# else +# define OPENSSL_FILE __FILE__ +# define OPENSSL_LINE __LINE__ +# endif +#endif + +#ifndef OPENSSL_MIN_API +# define OPENSSL_MIN_API 0 +#endif + +#if !defined(OPENSSL_API_COMPAT) || OPENSSL_API_COMPAT < OPENSSL_MIN_API +# undef OPENSSL_API_COMPAT +# define OPENSSL_API_COMPAT OPENSSL_MIN_API +#endif + +#if OPENSSL_API_COMPAT < 0x10100000L +# define DEPRECATEDIN_1_1_0(f) DECLARE_DEPRECATED(f) +#else +# define DEPRECATEDIN_1_1_0(f) +#endif + +#if OPENSSL_API_COMPAT < 0x10000000L +# define DEPRECATEDIN_1_0_0(f) DECLARE_DEPRECATED(f) +#else +# define DEPRECATEDIN_1_0_0(f) +#endif + +#if OPENSSL_API_COMPAT < 0x00908000L +# define DEPRECATEDIN_0_9_8(f) DECLARE_DEPRECATED(f) +#else +# define DEPRECATEDIN_0_9_8(f) +#endif + +#define OPENSSL_CPUID_OBJ + +/* Generate 80386 code? */ +#undef I386_ONLY + +#undef OPENSSL_UNISTD +#define OPENSSL_UNISTD + +#define OPENSSL_EXPORT_VAR_AS_FUNCTION + +/* + * The following are cipher-specific, but are part of the public API. 
+ */ +#if !defined(OPENSSL_SYS_UEFI) +# undef BN_LLONG +/* Only one for the following should be defined */ +# undef SIXTY_FOUR_BIT_LONG +# define SIXTY_FOUR_BIT +# undef THIRTY_TWO_BIT +#endif + +#define RC4_INT unsigned int + +#ifdef __cplusplus +} +#endif diff --git a/compat/includes-x64/openssl/sha.h b/compat/includes-x64/openssl/sha.h new file mode 100644 index 0000000000..6a1eb0de8b --- /dev/null +++ b/compat/includes-x64/openssl/sha.h @@ -0,0 +1,119 @@ +/* + * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#ifndef HEADER_SHA_H +# define HEADER_SHA_H + +# include +# include + +#ifdef __cplusplus +extern "C" { +#endif + +/*- + * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + * ! SHA_LONG has to be at least 32 bits wide. ! + * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + */ +# define SHA_LONG unsigned int + +# define SHA_LBLOCK 16 +# define SHA_CBLOCK (SHA_LBLOCK*4)/* SHA treats input data as a + * contiguous array of 32 bit wide + * big-endian values. */ +# define SHA_LAST_BLOCK (SHA_CBLOCK-8) +# define SHA_DIGEST_LENGTH 20 + +typedef struct SHAstate_st { + SHA_LONG h0, h1, h2, h3, h4; + SHA_LONG Nl, Nh; + SHA_LONG data[SHA_LBLOCK]; + unsigned int num; +} SHA_CTX; + +int SHA1_Init(SHA_CTX *c); +int SHA1_Update(SHA_CTX *c, const void *data, size_t len); +int SHA1_Final(unsigned char *md, SHA_CTX *c); +unsigned char *SHA1(const unsigned char *d, size_t n, unsigned char *md); +void SHA1_Transform(SHA_CTX *c, const unsigned char *data); + +# define SHA256_CBLOCK (SHA_LBLOCK*4)/* SHA-256 treats input data as a + * contiguous array of 32 bit wide + * big-endian values. 
*/ + +typedef struct SHA256state_st { + SHA_LONG h[8]; + SHA_LONG Nl, Nh; + SHA_LONG data[SHA_LBLOCK]; + unsigned int num, md_len; +} SHA256_CTX; + +int SHA224_Init(SHA256_CTX *c); +int SHA224_Update(SHA256_CTX *c, const void *data, size_t len); +int SHA224_Final(unsigned char *md, SHA256_CTX *c); +unsigned char *SHA224(const unsigned char *d, size_t n, unsigned char *md); +int SHA256_Init(SHA256_CTX *c); +int SHA256_Update(SHA256_CTX *c, const void *data, size_t len); +int SHA256_Final(unsigned char *md, SHA256_CTX *c); +unsigned char *SHA256(const unsigned char *d, size_t n, unsigned char *md); +void SHA256_Transform(SHA256_CTX *c, const unsigned char *data); + +# define SHA224_DIGEST_LENGTH 28 +# define SHA256_DIGEST_LENGTH 32 +# define SHA384_DIGEST_LENGTH 48 +# define SHA512_DIGEST_LENGTH 64 + +/* + * Unlike 32-bit digest algorithms, SHA-512 *relies* on SHA_LONG64 + * being exactly 64-bit wide. See Implementation Notes in sha512.c + * for further details. + */ +/* + * SHA-512 treats input data as a + * contiguous array of 64 bit + * wide big-endian values. 
+ */ +# define SHA512_CBLOCK (SHA_LBLOCK*8) +# if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__) +# define SHA_LONG64 unsigned __int64 +# define U64(C) C##UI64 +# elif defined(__arch64__) +# define SHA_LONG64 unsigned long +# define U64(C) C##UL +# else +# define SHA_LONG64 unsigned long long +# define U64(C) C##ULL +# endif + +typedef struct SHA512state_st { + SHA_LONG64 h[8]; + SHA_LONG64 Nl, Nh; + union { + SHA_LONG64 d[SHA_LBLOCK]; + unsigned char p[SHA512_CBLOCK]; + } u; + unsigned int num, md_len; +} SHA512_CTX; + +int SHA384_Init(SHA512_CTX *c); +int SHA384_Update(SHA512_CTX *c, const void *data, size_t len); +int SHA384_Final(unsigned char *md, SHA512_CTX *c); +unsigned char *SHA384(const unsigned char *d, size_t n, unsigned char *md); +int SHA512_Init(SHA512_CTX *c); +int SHA512_Update(SHA512_CTX *c, const void *data, size_t len); +int SHA512_Final(unsigned char *md, SHA512_CTX *c); +unsigned char *SHA512(const unsigned char *d, size_t n, unsigned char *md); +void SHA512_Transform(SHA512_CTX *c, const unsigned char *data); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/compat/includes-x86/jansson.h b/compat/includes-x86/jansson.h new file mode 100644 index 0000000000..a5927bd630 --- /dev/null +++ b/compat/includes-x86/jansson.h @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2009-2016 Petri Lehtinen + * + * Jansson is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. + */ + +#ifndef JANSSON_H +#define JANSSON_H + +#include +#include /* for size_t */ +#include + +#include "jansson_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* version */ + +#define JANSSON_MAJOR_VERSION 2 +#define JANSSON_MINOR_VERSION 10 +#define JANSSON_MICRO_VERSION 0 + +/* Micro version is omitted if it's 0 */ +#define JANSSON_VERSION "2.10" + +/* Version as a 3-byte hex number, e.g. 0x010201 == 1.2.1. Use this + for numeric comparisons, e.g. #if JANSSON_VERSION_HEX >= ... 
*/ +#define JANSSON_VERSION_HEX ((JANSSON_MAJOR_VERSION << 16) | \ + (JANSSON_MINOR_VERSION << 8) | \ + (JANSSON_MICRO_VERSION << 0)) + + +/* types */ + +typedef enum { + JSON_OBJECT, + JSON_ARRAY, + JSON_STRING, + JSON_INTEGER, + JSON_REAL, + JSON_TRUE, + JSON_FALSE, + JSON_NULL +} json_type; + +typedef struct json_t { + json_type type; + size_t refcount; +} json_t; + +#ifndef JANSSON_USING_CMAKE /* disabled if using cmake */ +#if JSON_INTEGER_IS_LONG_LONG +#ifdef _WIN32 +#define JSON_INTEGER_FORMAT "I64d" +#else +#define JSON_INTEGER_FORMAT "lld" +#endif +typedef long long json_int_t; +#else +#define JSON_INTEGER_FORMAT "ld" +typedef long json_int_t; +#endif /* JSON_INTEGER_IS_LONG_LONG */ +#endif + +#define json_typeof(json) ((json)->type) +#define json_is_object(json) ((json) && json_typeof(json) == JSON_OBJECT) +#define json_is_array(json) ((json) && json_typeof(json) == JSON_ARRAY) +#define json_is_string(json) ((json) && json_typeof(json) == JSON_STRING) +#define json_is_integer(json) ((json) && json_typeof(json) == JSON_INTEGER) +#define json_is_real(json) ((json) && json_typeof(json) == JSON_REAL) +#define json_is_number(json) (json_is_integer(json) || json_is_real(json)) +#define json_is_true(json) ((json) && json_typeof(json) == JSON_TRUE) +#define json_is_false(json) ((json) && json_typeof(json) == JSON_FALSE) +#define json_boolean_value json_is_true +#define json_is_boolean(json) (json_is_true(json) || json_is_false(json)) +#define json_is_null(json) ((json) && json_typeof(json) == JSON_NULL) + +/* construction, destruction, reference counting */ + +json_t *json_object(void); +json_t *json_array(void); +json_t *json_string(const char *value); +json_t *json_stringn(const char *value, size_t len); +json_t *json_string_nocheck(const char *value); +json_t *json_stringn_nocheck(const char *value, size_t len); +json_t *json_integer(json_int_t value); +json_t *json_real(double value); +json_t *json_true(void); +json_t *json_false(void); +#define 
json_boolean(val) ((val) ? json_true() : json_false()) +json_t *json_null(void); + +static JSON_INLINE +json_t *json_incref(json_t *json) +{ + if(json && json->refcount != (size_t)-1) + ++json->refcount; + return json; +} + +/* do not call json_delete directly */ +void json_delete(json_t *json); + +static JSON_INLINE +void json_decref(json_t *json) +{ + if(json && json->refcount != (size_t)-1 && --json->refcount == 0) + json_delete(json); +} + +#if defined(__GNUC__) || defined(__clang__) +static JSON_INLINE +void json_decrefp(json_t **json) +{ + if(json) { + json_decref(*json); + *json = NULL; + } +} + +#define json_auto_t json_t __attribute__((cleanup(json_decrefp))) +#endif + + +/* error reporting */ + +#define JSON_ERROR_TEXT_LENGTH 160 +#define JSON_ERROR_SOURCE_LENGTH 80 + +typedef struct { + int line; + int column; + int position; + char source[JSON_ERROR_SOURCE_LENGTH]; + char text[JSON_ERROR_TEXT_LENGTH]; +} json_error_t; + + +/* getters, setters, manipulation */ + +void json_object_seed(size_t seed); +size_t json_object_size(const json_t *object); +json_t *json_object_get(const json_t *object, const char *key); +int json_object_set_new(json_t *object, const char *key, json_t *value); +int json_object_set_new_nocheck(json_t *object, const char *key, json_t *value); +int json_object_del(json_t *object, const char *key); +int json_object_clear(json_t *object); +int json_object_update(json_t *object, json_t *other); +int json_object_update_existing(json_t *object, json_t *other); +int json_object_update_missing(json_t *object, json_t *other); +void *json_object_iter(json_t *object); +void *json_object_iter_at(json_t *object, const char *key); +void *json_object_key_to_iter(const char *key); +void *json_object_iter_next(json_t *object, void *iter); +const char *json_object_iter_key(void *iter); +json_t *json_object_iter_value(void *iter); +int json_object_iter_set_new(json_t *object, void *iter, json_t *value); + +#define json_object_foreach(object, key, value) 
\ + for(key = json_object_iter_key(json_object_iter(object)); \ + key && (value = json_object_iter_value(json_object_key_to_iter(key))); \ + key = json_object_iter_key(json_object_iter_next(object, json_object_key_to_iter(key)))) + +#define json_object_foreach_safe(object, n, key, value) \ + for(key = json_object_iter_key(json_object_iter(object)), \ + n = json_object_iter_next(object, json_object_key_to_iter(key)); \ + key && (value = json_object_iter_value(json_object_key_to_iter(key))); \ + key = json_object_iter_key(n), \ + n = json_object_iter_next(object, json_object_key_to_iter(key))) + +#define json_array_foreach(array, index, value) \ + for(index = 0; \ + index < json_array_size(array) && (value = json_array_get(array, index)); \ + index++) + +static JSON_INLINE +int json_object_set(json_t *object, const char *key, json_t *value) +{ + return json_object_set_new(object, key, json_incref(value)); +} + +static JSON_INLINE +int json_object_set_nocheck(json_t *object, const char *key, json_t *value) +{ + return json_object_set_new_nocheck(object, key, json_incref(value)); +} + +static JSON_INLINE +int json_object_iter_set(json_t *object, void *iter, json_t *value) +{ + return json_object_iter_set_new(object, iter, json_incref(value)); +} + +size_t json_array_size(const json_t *array); +json_t *json_array_get(const json_t *array, size_t index); +int json_array_set_new(json_t *array, size_t index, json_t *value); +int json_array_append_new(json_t *array, json_t *value); +int json_array_insert_new(json_t *array, size_t index, json_t *value); +int json_array_remove(json_t *array, size_t index); +int json_array_clear(json_t *array); +int json_array_extend(json_t *array, json_t *other); + +static JSON_INLINE +int json_array_set(json_t *array, size_t ind, json_t *value) +{ + return json_array_set_new(array, ind, json_incref(value)); +} + +static JSON_INLINE +int json_array_append(json_t *array, json_t *value) +{ + return json_array_append_new(array, 
json_incref(value)); +} + +static JSON_INLINE +int json_array_insert(json_t *array, size_t ind, json_t *value) +{ + return json_array_insert_new(array, ind, json_incref(value)); +} + +const char *json_string_value(const json_t *string); +size_t json_string_length(const json_t *string); +json_int_t json_integer_value(const json_t *integer); +double json_real_value(const json_t *real); +double json_number_value(const json_t *json); + +int json_string_set(json_t *string, const char *value); +int json_string_setn(json_t *string, const char *value, size_t len); +int json_string_set_nocheck(json_t *string, const char *value); +int json_string_setn_nocheck(json_t *string, const char *value, size_t len); +int json_integer_set(json_t *integer, json_int_t value); +int json_real_set(json_t *real, double value); + +/* pack, unpack */ + +json_t *json_pack(const char *fmt, ...); +json_t *json_pack_ex(json_error_t *error, size_t flags, const char *fmt, ...); +json_t *json_vpack_ex(json_error_t *error, size_t flags, const char *fmt, va_list ap); + +#define JSON_VALIDATE_ONLY 0x1 +#define JSON_STRICT 0x2 + +int json_unpack(json_t *root, const char *fmt, ...); +int json_unpack_ex(json_t *root, json_error_t *error, size_t flags, const char *fmt, ...); +int json_vunpack_ex(json_t *root, json_error_t *error, size_t flags, const char *fmt, va_list ap); + + +/* equality */ + +int json_equal(json_t *value1, json_t *value2); + + +/* copying */ + +json_t *json_copy(json_t *value); +json_t *json_deep_copy(const json_t *value); + + +/* decoding */ + +#define JSON_REJECT_DUPLICATES 0x1 +#define JSON_DISABLE_EOF_CHECK 0x2 +#define JSON_DECODE_ANY 0x4 +#define JSON_DECODE_INT_AS_REAL 0x8 +#define JSON_ALLOW_NUL 0x10 + +typedef size_t (*json_load_callback_t)(void *buffer, size_t buflen, void *data); + +json_t *json_loads(const char *input, size_t flags, json_error_t *error); +json_t *json_loadb(const char *buffer, size_t buflen, size_t flags, json_error_t *error); +json_t *json_loadf(FILE *input, 
size_t flags, json_error_t *error); +json_t *json_loadfd(int input, size_t flags, json_error_t *error); +json_t *json_load_file(const char *path, size_t flags, json_error_t *error); +json_t *json_load_callback(json_load_callback_t callback, void *data, size_t flags, json_error_t *error); + + +/* encoding */ + +#define JSON_MAX_INDENT 0x1F +#define JSON_INDENT(n) ((n) & JSON_MAX_INDENT) +#define JSON_COMPACT 0x20 +#define JSON_ENSURE_ASCII 0x40 +#define JSON_SORT_KEYS 0x80 +#define JSON_PRESERVE_ORDER 0x100 +#define JSON_ENCODE_ANY 0x200 +#define JSON_ESCAPE_SLASH 0x400 +#define JSON_REAL_PRECISION(n) (((n) & 0x1F) << 11) +#define JSON_EMBED 0x10000 + +typedef int (*json_dump_callback_t)(const char *buffer, size_t size, void *data); + +char *json_dumps(const json_t *json, size_t flags); +size_t json_dumpb(const json_t *json, char *buffer, size_t size, size_t flags); +int json_dumpf(const json_t *json, FILE *output, size_t flags); +int json_dumpfd(const json_t *json, int output, size_t flags); +int json_dump_file(const json_t *json, const char *path, size_t flags); +int json_dump_callback(const json_t *json, json_dump_callback_t callback, void *data, size_t flags); + +/* custom memory allocation */ + +typedef void *(*json_malloc_t)(size_t); +typedef void (*json_free_t)(void *); + +void json_set_alloc_funcs(json_malloc_t malloc_fn, json_free_t free_fn); +void json_get_alloc_funcs(json_malloc_t *malloc_fn, json_free_t *free_fn); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/compat/includes-x86/jansson_config.h b/compat/includes-x86/jansson_config.h new file mode 100644 index 0000000000..35eee9381d --- /dev/null +++ b/compat/includes-x86/jansson_config.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2010-2016 Petri Lehtinen + * + * Jansson is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. 
+ * + * + * This file specifies a part of the site-specific configuration for + * Jansson, namely those things that affect the public API in + * jansson.h. + * + * The CMake system will generate the jansson_config.h file and + * copy it to the build and install directories. + */ + +#ifndef JANSSON_CONFIG_H +#define JANSSON_CONFIG_H + +/* Define this so that we can disable scattered automake configuration in source files */ +#ifndef JANSSON_USING_CMAKE +#define JANSSON_USING_CMAKE +#endif + +/* Note: when using cmake, JSON_INTEGER_IS_LONG_LONG is not defined nor used, + * as we will also check for __int64 etc types. + * (the definition was used in the automake system) */ + +/* Bring in the cmake-detected defines */ +#define HAVE_STDINT_H 1 +/* #undef HAVE_INTTYPES_H */ +/* #undef HAVE_SYS_TYPES_H */ + +/* Include our standard type header for the integer typedef */ + +#if defined(HAVE_STDINT_H) +# include +#elif defined(HAVE_INTTYPES_H) +# include +#elif defined(HAVE_SYS_TYPES_H) +# include +#endif + + +/* If your compiler supports the inline keyword in C, JSON_INLINE is + defined to `inline', otherwise empty. In C++, the inline is always + supported. */ +#ifdef __cplusplus +#define JSON_INLINE inline +#else +#define JSON_INLINE inline +#endif + + +#define json_int_t int64_t +#define json_strtoint strtoll +#define JSON_INTEGER_FORMAT "I64d" + + +/* If locale.h and localeconv() are available, define to 1, otherwise to 0. */ +#define JSON_HAVE_LOCALECONV 1 + + +/* Maximum recursion depth for parsing JSON input. + This limits the depth of e.g. array-within-array constructions. */ +#define JSON_PARSER_MAX_DEPTH 2048 + + +#endif diff --git a/compat/includes-x86/openssl/e_os2.h b/compat/includes-x86/openssl/e_os2.h new file mode 100644 index 0000000000..99ea3477d7 --- /dev/null +++ b/compat/includes-x86/openssl/e_os2.h @@ -0,0 +1,311 @@ +/* + * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). 
You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#ifndef HEADER_E_OS2_H +# define HEADER_E_OS2_H + +# include + +#ifdef __cplusplus +extern "C" { +#endif + +/****************************************************************************** + * Detect operating systems. This probably needs completing. + * The result is that at least one OPENSSL_SYS_os macro should be defined. + * However, if none is defined, Unix is assumed. + **/ + +# define OPENSSL_SYS_UNIX + +/* --------------------- Microsoft operating systems ---------------------- */ + +/* + * Note that MSDOS actually denotes 32-bit environments running on top of + * MS-DOS, such as DJGPP one. + */ +# if defined(OPENSSL_SYS_MSDOS) +# undef OPENSSL_SYS_UNIX +# endif + +/* + * For 32 bit environment, there seems to be the CygWin environment and then + * all the others that try to do the same thing Microsoft does... + */ +/* + * UEFI lives here because it might be built with a Microsoft toolchain and + * we need to avoid the false positive match on Windows. 
+ */ +# if defined(OPENSSL_SYS_UEFI) +# undef OPENSSL_SYS_UNIX +# elif defined(OPENSSL_SYS_UWIN) +# undef OPENSSL_SYS_UNIX +# define OPENSSL_SYS_WIN32_UWIN +# else +# if defined(__CYGWIN__) || defined(OPENSSL_SYS_CYGWIN) +# undef OPENSSL_SYS_UNIX +# define OPENSSL_SYS_WIN32_CYGWIN +# else +# if defined(_WIN32) || defined(OPENSSL_SYS_WIN32) +# undef OPENSSL_SYS_UNIX +# if !defined(OPENSSL_SYS_WIN32) +# define OPENSSL_SYS_WIN32 +# endif +# endif +# if defined(_WIN64) || defined(OPENSSL_SYS_WIN64) +# undef OPENSSL_SYS_UNIX +# if !defined(OPENSSL_SYS_WIN64) +# define OPENSSL_SYS_WIN64 +# endif +# endif +# if defined(OPENSSL_SYS_WINNT) +# undef OPENSSL_SYS_UNIX +# endif +# if defined(OPENSSL_SYS_WINCE) +# undef OPENSSL_SYS_UNIX +# endif +# endif +# endif + +/* Anything that tries to look like Microsoft is "Windows" */ +# if defined(OPENSSL_SYS_WIN32) || defined(OPENSSL_SYS_WIN64) || defined(OPENSSL_SYS_WINNT) || defined(OPENSSL_SYS_WINCE) +# undef OPENSSL_SYS_UNIX +# define OPENSSL_SYS_WINDOWS +# ifndef OPENSSL_SYS_MSDOS +# define OPENSSL_SYS_MSDOS +# endif +# endif + +/* + * DLL settings. This part is a bit tough, because it's up to the + * application implementor how he or she will link the application, so it + * requires some macro to be used. 
+ */ +# ifdef OPENSSL_SYS_WINDOWS +# ifndef OPENSSL_OPT_WINDLL +# if defined(_WINDLL) /* This is used when building OpenSSL to + * indicate that DLL linkage should be used */ +# define OPENSSL_OPT_WINDLL +# endif +# endif +# endif + +/* ------------------------------- OpenVMS -------------------------------- */ +# if defined(__VMS) || defined(VMS) || defined(OPENSSL_SYS_VMS) +# if !defined(OPENSSL_SYS_VMS) +# undef OPENSSL_SYS_UNIX +# endif +# define OPENSSL_SYS_VMS +# if defined(__DECC) +# define OPENSSL_SYS_VMS_DECC +# elif defined(__DECCXX) +# define OPENSSL_SYS_VMS_DECC +# define OPENSSL_SYS_VMS_DECCXX +# else +# define OPENSSL_SYS_VMS_NODECC +# endif +# endif + +/* -------------------------------- Unix ---------------------------------- */ +# ifdef OPENSSL_SYS_UNIX +# if defined(linux) || defined(__linux__) && !defined(OPENSSL_SYS_LINUX) +# define OPENSSL_SYS_LINUX +# endif +# if defined(_AIX) && !defined(OPENSSL_SYS_AIX) +# define OPENSSL_SYS_AIX +# endif +# endif + +/* -------------------------------- VOS ----------------------------------- */ +# if defined(__VOS__) && !defined(OPENSSL_SYS_VOS) +# define OPENSSL_SYS_VOS +# ifdef __HPPA__ +# define OPENSSL_SYS_VOS_HPPA +# endif +# ifdef __IA32__ +# define OPENSSL_SYS_VOS_IA32 +# endif +# endif + +/** + * That's it for OS-specific stuff + *****************************************************************************/ + +/* Specials for I/O an exit */ +# ifdef OPENSSL_SYS_MSDOS +# define OPENSSL_UNISTD_IO +# define OPENSSL_DECLARE_EXIT extern void exit(int); +# else +# define OPENSSL_UNISTD_IO OPENSSL_UNISTD +# define OPENSSL_DECLARE_EXIT /* declared in unistd.h */ +# endif + +/*- + * Definitions of OPENSSL_GLOBAL and OPENSSL_EXTERN, to define and declare + * certain global symbols that, with some compilers under VMS, have to be + * defined and declared explicitly with globaldef and globalref. 
+ * Definitions of OPENSSL_EXPORT and OPENSSL_IMPORT, to define and declare + * DLL exports and imports for compilers under Win32. These are a little + * more complicated to use. Basically, for any library that exports some + * global variables, the following code must be present in the header file + * that declares them, before OPENSSL_EXTERN is used: + * + * #ifdef SOME_BUILD_FLAG_MACRO + * # undef OPENSSL_EXTERN + * # define OPENSSL_EXTERN OPENSSL_EXPORT + * #endif + * + * The default is to have OPENSSL_EXPORT, OPENSSL_EXTERN and OPENSSL_GLOBAL + * have some generally sensible values. + */ + +# if defined(OPENSSL_SYS_VMS_NODECC) +# define OPENSSL_EXPORT globalref +# define OPENSSL_EXTERN globalref +# define OPENSSL_GLOBAL globaldef +# elif defined(OPENSSL_SYS_WINDOWS) && defined(OPENSSL_OPT_WINDLL) +# define OPENSSL_EXPORT extern __declspec(dllexport) +# define OPENSSL_EXTERN extern __declspec(dllimport) +# define OPENSSL_GLOBAL +# else +# define OPENSSL_EXPORT extern +# define OPENSSL_EXTERN extern +# define OPENSSL_GLOBAL +# endif + +/*- + * Macros to allow global variables to be reached through function calls when + * required (if a shared library version requires it, for example. 
+ * The way it's done allows definitions like this: + * + * // in foobar.c + * OPENSSL_IMPLEMENT_GLOBAL(int,foobar,0) + * // in foobar.h + * OPENSSL_DECLARE_GLOBAL(int,foobar); + * #define foobar OPENSSL_GLOBAL_REF(foobar) + */ +# ifdef OPENSSL_EXPORT_VAR_AS_FUNCTION +# define OPENSSL_IMPLEMENT_GLOBAL(type,name,value) \ + type *_shadow_##name(void) \ + { static type _hide_##name=value; return &_hide_##name; } +# define OPENSSL_DECLARE_GLOBAL(type,name) type *_shadow_##name(void) +# define OPENSSL_GLOBAL_REF(name) (*(_shadow_##name())) +# else +# define OPENSSL_IMPLEMENT_GLOBAL(type,name,value) OPENSSL_GLOBAL type _shadow_##name=value; +# define OPENSSL_DECLARE_GLOBAL(type,name) OPENSSL_EXPORT type _shadow_##name +# define OPENSSL_GLOBAL_REF(name) _shadow_##name +# endif + +# ifdef _WIN32 +# ifdef _WIN64 +# define ossl_ssize_t __int64 +# define OSSL_SSIZE_MAX _I64_MAX +# else +# define ossl_ssize_t int +# define OSSL_SSIZE_MAX INT_MAX +# endif +# endif + +# if defined(OPENSSL_SYS_UEFI) && !defined(ssize_t) +# define ossl_ssize_t int +# define OSSL_SSIZE_MAX INT_MAX +# endif + +# ifndef ossl_ssize_t +# define ossl_ssize_t ssize_t +# if defined(SSIZE_MAX) +# define OSSL_SSIZE_MAX SSIZE_MAX +# elif defined(_POSIX_SSIZE_MAX) +# define OSSL_SSIZE_MAX _POSIX_SSIZE_MAX +# endif +# endif + +# ifdef DEBUG_UNUSED +# define __owur __attribute__((__warn_unused_result__)) +# else +# define __owur +# endif + +/* Standard integer types */ +# if defined(OPENSSL_SYS_UEFI) +typedef INT8 int8_t; +typedef UINT8 uint8_t; +typedef INT16 int16_t; +typedef UINT16 uint16_t; +typedef INT32 int32_t; +typedef UINT32 uint32_t; +typedef INT64 int64_t; +typedef UINT64 uint64_t; +# define PRIu64 "%Lu" +# elif (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ + defined(__osf__) || defined(__sgi) || defined(__hpux) || \ + defined(OPENSSL_SYS_VMS) || defined (__OpenBSD__) +# include +# elif defined(_MSC_VER) && _MSC_VER<=1500 +/* + * minimally required typdefs for systems not supporting 
inttypes.h or + * stdint.h: currently just older VC++ + */ +typedef signed char int8_t; +typedef unsigned char uint8_t; +typedef short int16_t; +typedef unsigned short uint16_t; +typedef int int32_t; +typedef unsigned int uint32_t; +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +# else +# include +# endif + +/* + * We need a format operator for some client tools for uint64_t. If inttypes.h + * isn't available or did not define it, just go with hard-coded. + */ +# ifndef PRIu64 +# ifdef SIXTY_FOUR_BIT_LONG +# define PRIu64 "lu" +# else +# define PRIu64 "llu" +# endif +# endif + +/* ossl_inline: portable inline definition usable in public headers */ +# if !defined(inline) && !defined(__cplusplus) +# if defined(__STDC_VERSION__) && __STDC_VERSION__>=199901L + /* just use inline */ +# define ossl_inline inline +# elif defined(__GNUC__) && __GNUC__>=2 +# define ossl_inline __inline__ +# elif defined(_MSC_VER) + /* + * Visual Studio: inline is available in C++ only, however + * __inline is available for C, see + * http://msdn.microsoft.com/en-us/library/z8y1yy88.aspx + */ +# define ossl_inline __inline +# else +# define ossl_inline +# endif +# else +# define ossl_inline inline +# endif + +# if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L +# define ossl_noreturn _Noreturn +# elif defined(__GNUC__) && __GNUC__ >= 2 +# define ossl_noreturn __attribute__((noreturn)) +# else +# define ossl_noreturn +# endif + +#ifdef __cplusplus +} +#endif +#endif diff --git a/compat/includes-x86/openssl/opensslconf.h b/compat/includes-x86/openssl/opensslconf.h new file mode 100644 index 0000000000..1cb04a5a42 --- /dev/null +++ b/compat/includes-x86/openssl/opensslconf.h @@ -0,0 +1,172 @@ +/* + * WARNING: do not edit! + * Generated by makefile from include\openssl\opensslconf.h.in + * + * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). 
You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef OPENSSL_ALGORITHM_DEFINES +# error OPENSSL_ALGORITHM_DEFINES no longer supported +#endif + +/* + * OpenSSL was configured with the following options: + */ + +#ifndef OPENSSL_SYS_WIN32 +# define OPENSSL_SYS_WIN32 1 +#endif +#ifndef OPENSSL_NO_MD2 +# define OPENSSL_NO_MD2 +#endif +#ifndef OPENSSL_NO_RC5 +# define OPENSSL_NO_RC5 +#endif +#ifndef OPENSSL_THREADS +# define OPENSSL_THREADS +#endif +#ifndef OPENSSL_NO_ASAN +# define OPENSSL_NO_ASAN +#endif +#ifndef OPENSSL_NO_CRYPTO_MDEBUG +# define OPENSSL_NO_CRYPTO_MDEBUG +#endif +#ifndef OPENSSL_NO_CRYPTO_MDEBUG_BACKTRACE +# define OPENSSL_NO_CRYPTO_MDEBUG_BACKTRACE +#endif +#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 +# define OPENSSL_NO_EC_NISTP_64_GCC_128 +#endif +#ifndef OPENSSL_NO_EGD +# define OPENSSL_NO_EGD +#endif +#ifndef OPENSSL_NO_FUZZ_AFL +# define OPENSSL_NO_FUZZ_AFL +#endif +#ifndef OPENSSL_NO_FUZZ_LIBFUZZER +# define OPENSSL_NO_FUZZ_LIBFUZZER +#endif +#ifndef OPENSSL_NO_HEARTBEATS +# define OPENSSL_NO_HEARTBEATS +#endif +#ifndef OPENSSL_NO_MSAN +# define OPENSSL_NO_MSAN +#endif +#ifndef OPENSSL_NO_SCTP +# define OPENSSL_NO_SCTP +#endif +#ifndef OPENSSL_NO_SSL_TRACE +# define OPENSSL_NO_SSL_TRACE +#endif +#ifndef OPENSSL_NO_SSL3 +# define OPENSSL_NO_SSL3 +#endif +#ifndef OPENSSL_NO_SSL3_METHOD +# define OPENSSL_NO_SSL3_METHOD +#endif +#ifndef OPENSSL_NO_UBSAN +# define OPENSSL_NO_UBSAN +#endif +#ifndef OPENSSL_NO_UNIT_TEST +# define OPENSSL_NO_UNIT_TEST +#endif +#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS +# define OPENSSL_NO_WEAK_SSL_CIPHERS +#endif +#ifndef OPENSSL_NO_AFALGENG +# define OPENSSL_NO_AFALGENG +#endif + + +/* + * Sometimes OPENSSSL_NO_xxx ends up with an empty file and some compilers + * don't like that. This will hopefully silence them. 
+ */ +#define NON_EMPTY_TRANSLATION_UNIT static void *dummy = &dummy; + +/* + * Applications should use -DOPENSSL_API_COMPAT= to suppress the + * declarations of functions deprecated in or before . Otherwise, they + * still won't see them if the library has been built to disable deprecated + * functions. + */ +#if defined(OPENSSL_NO_DEPRECATED) +# define DECLARE_DEPRECATED(f) +#elif __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 0) +# define DECLARE_DEPRECATED(f) f __attribute__ ((deprecated)); +#else +# define DECLARE_DEPRECATED(f) f; +#endif + +#ifndef OPENSSL_FILE +# ifdef OPENSSL_NO_FILENAMES +# define OPENSSL_FILE "" +# define OPENSSL_LINE 0 +# else +# define OPENSSL_FILE __FILE__ +# define OPENSSL_LINE __LINE__ +# endif +#endif + +#ifndef OPENSSL_MIN_API +# define OPENSSL_MIN_API 0 +#endif + +#if !defined(OPENSSL_API_COMPAT) || OPENSSL_API_COMPAT < OPENSSL_MIN_API +# undef OPENSSL_API_COMPAT +# define OPENSSL_API_COMPAT OPENSSL_MIN_API +#endif + +#if OPENSSL_API_COMPAT < 0x10100000L +# define DEPRECATEDIN_1_1_0(f) DECLARE_DEPRECATED(f) +#else +# define DEPRECATEDIN_1_1_0(f) +#endif + +#if OPENSSL_API_COMPAT < 0x10000000L +# define DEPRECATEDIN_1_0_0(f) DECLARE_DEPRECATED(f) +#else +# define DEPRECATEDIN_1_0_0(f) +#endif + +#if OPENSSL_API_COMPAT < 0x00908000L +# define DEPRECATEDIN_0_9_8(f) DECLARE_DEPRECATED(f) +#else +# define DEPRECATEDIN_0_9_8(f) +#endif + +#define OPENSSL_CPUID_OBJ + +/* Generate 80386 code? */ +#undef I386_ONLY + +#undef OPENSSL_UNISTD +#define OPENSSL_UNISTD + +#define OPENSSL_EXPORT_VAR_AS_FUNCTION + +/* + * The following are cipher-specific, but are part of the public API. 
+ */ +#if !defined(OPENSSL_SYS_UEFI) +# define BN_LLONG +/* Only one for the following should be defined */ +# undef SIXTY_FOUR_BIT_LONG +# undef SIXTY_FOUR_BIT +# define THIRTY_TWO_BIT +#endif + +#define RC4_INT unsigned int + +#ifdef __cplusplus +} +#endif diff --git a/compat/includes-x86/openssl/sha.h b/compat/includes-x86/openssl/sha.h new file mode 100644 index 0000000000..6a1eb0de8b --- /dev/null +++ b/compat/includes-x86/openssl/sha.h @@ -0,0 +1,119 @@ +/* + * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#ifndef HEADER_SHA_H +# define HEADER_SHA_H + +# include +# include + +#ifdef __cplusplus +extern "C" { +#endif + +/*- + * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + * ! SHA_LONG has to be at least 32 bits wide. ! + * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + */ +# define SHA_LONG unsigned int + +# define SHA_LBLOCK 16 +# define SHA_CBLOCK (SHA_LBLOCK*4)/* SHA treats input data as a + * contiguous array of 32 bit wide + * big-endian values. */ +# define SHA_LAST_BLOCK (SHA_CBLOCK-8) +# define SHA_DIGEST_LENGTH 20 + +typedef struct SHAstate_st { + SHA_LONG h0, h1, h2, h3, h4; + SHA_LONG Nl, Nh; + SHA_LONG data[SHA_LBLOCK]; + unsigned int num; +} SHA_CTX; + +int SHA1_Init(SHA_CTX *c); +int SHA1_Update(SHA_CTX *c, const void *data, size_t len); +int SHA1_Final(unsigned char *md, SHA_CTX *c); +unsigned char *SHA1(const unsigned char *d, size_t n, unsigned char *md); +void SHA1_Transform(SHA_CTX *c, const unsigned char *data); + +# define SHA256_CBLOCK (SHA_LBLOCK*4)/* SHA-256 treats input data as a + * contiguous array of 32 bit wide + * big-endian values. 
*/ + +typedef struct SHA256state_st { + SHA_LONG h[8]; + SHA_LONG Nl, Nh; + SHA_LONG data[SHA_LBLOCK]; + unsigned int num, md_len; +} SHA256_CTX; + +int SHA224_Init(SHA256_CTX *c); +int SHA224_Update(SHA256_CTX *c, const void *data, size_t len); +int SHA224_Final(unsigned char *md, SHA256_CTX *c); +unsigned char *SHA224(const unsigned char *d, size_t n, unsigned char *md); +int SHA256_Init(SHA256_CTX *c); +int SHA256_Update(SHA256_CTX *c, const void *data, size_t len); +int SHA256_Final(unsigned char *md, SHA256_CTX *c); +unsigned char *SHA256(const unsigned char *d, size_t n, unsigned char *md); +void SHA256_Transform(SHA256_CTX *c, const unsigned char *data); + +# define SHA224_DIGEST_LENGTH 28 +# define SHA256_DIGEST_LENGTH 32 +# define SHA384_DIGEST_LENGTH 48 +# define SHA512_DIGEST_LENGTH 64 + +/* + * Unlike 32-bit digest algorithms, SHA-512 *relies* on SHA_LONG64 + * being exactly 64-bit wide. See Implementation Notes in sha512.c + * for further details. + */ +/* + * SHA-512 treats input data as a + * contiguous array of 64 bit + * wide big-endian values. 
+ */ +# define SHA512_CBLOCK (SHA_LBLOCK*8) +# if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__) +# define SHA_LONG64 unsigned __int64 +# define U64(C) C##UI64 +# elif defined(__arch64__) +# define SHA_LONG64 unsigned long +# define U64(C) C##UL +# else +# define SHA_LONG64 unsigned long long +# define U64(C) C##ULL +# endif + +typedef struct SHA512state_st { + SHA_LONG64 h[8]; + SHA_LONG64 Nl, Nh; + union { + SHA_LONG64 d[SHA_LBLOCK]; + unsigned char p[SHA512_CBLOCK]; + } u; + unsigned int num, md_len; +} SHA512_CTX; + +int SHA384_Init(SHA512_CTX *c); +int SHA384_Update(SHA512_CTX *c, const void *data, size_t len); +int SHA384_Final(unsigned char *md, SHA512_CTX *c); +unsigned char *SHA384(const unsigned char *d, size_t n, unsigned char *md); +int SHA512_Init(SHA512_CTX *c); +int SHA512_Update(SHA512_CTX *c, const void *data, size_t len); +int SHA512_Final(unsigned char *md, SHA512_CTX *c); +unsigned char *SHA512(const unsigned char *d, size_t n, unsigned char *md); +void SHA512_Transform(SHA512_CTX *c, const unsigned char *data); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/compat/curl-for-windows/curl/include/curl/curl.h b/compat/includes/curl/curl.h similarity index 87% rename from compat/curl-for-windows/curl/include/curl/curl.h rename to compat/includes/curl/curl.h index 44b1b7e391..1030712648 100644 --- a/compat/curl-for-windows/curl/include/curl/curl.h +++ b/compat/includes/curl/curl.h @@ -7,11 +7,11 @@ * | (__| |_| | _ <| |___ * \___|\___/|_| \_\_____| * - * Copyright (C) 1998 - 2014, Daniel Stenberg, , et al. + * Copyright (C) 1998 - 2017, Daniel Stenberg, , et al. * * This software is licensed as described in the file COPYING, which * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. + * are also available at https://curl.haxx.se/docs/copyright.html. 
* * You may opt to use, copy, modify, merge, publish, distribute and/or sell * copies of the Software, and permit persons to whom the Software is @@ -24,13 +24,18 @@ /* * If you have libcurl problems, all docs and details are found here: - * http://curl.haxx.se/libcurl/ + * https://curl.haxx.se/libcurl/ * * curl-library mailing list subscription and unsubscription web interface: - * http://cool.haxx.se/mailman/listinfo/curl-library/ + * https://cool.haxx.se/mailman/listinfo/curl-library/ */ +#ifdef CURL_NO_OLDIES +#define CURL_STRICTER +#endif + #include "curlver.h" /* libcurl version defines */ +#include "system.h" /* determine things run-time */ #include "curlbuild.h" /* libcurl build definitions */ #include "curlrules.h" /* libcurl rules enforcement */ @@ -56,7 +61,8 @@ #include #if defined(WIN32) && !defined(_WIN32_WCE) && !defined(__CYGWIN__) -#if !(defined(_WINSOCKAPI_) || defined(_WINSOCK_H) || defined(__LWIP_OPT_H__)) +#if !(defined(_WINSOCKAPI_) || defined(_WINSOCK_H) || \ + defined(__LWIP_OPT_H__) || defined(LWIP_HDR_OPT_H)) /* The check above prevents the winsock2 inclusion if winsock.h already was included, since they can't co-exist without problems */ #include @@ -90,7 +96,13 @@ extern "C" { #endif +#if defined(BUILDING_LIBCURL) || defined(CURL_STRICTER) +typedef struct Curl_easy CURL; +typedef struct Curl_share CURLSH; +#else typedef void CURL; +typedef void CURLSH; +#endif /* * libcurl external API function linkage decorations. 
@@ -112,7 +124,7 @@ typedef void CURL; #ifndef curl_socket_typedef /* socket typedef */ -#if defined(WIN32) && !defined(__LWIP_OPT_H__) +#if defined(WIN32) && !defined(__LWIP_OPT_H__) && !defined(LWIP_HDR_OPT_H) typedef SOCKET curl_socket_t; #define CURL_SOCKET_BAD INVALID_SOCKET #else @@ -127,33 +139,43 @@ struct curl_httppost { char *name; /* pointer to allocated name */ long namelength; /* length of name length */ char *contents; /* pointer to allocated data contents */ - long contentslength; /* length of contents field */ + long contentslength; /* length of contents field, see also + CURL_HTTPPOST_LARGE */ char *buffer; /* pointer to allocated buffer contents */ long bufferlength; /* length of buffer field */ char *contenttype; /* Content-Type */ - struct curl_slist* contentheader; /* list of extra headers for this form */ + struct curl_slist *contentheader; /* list of extra headers for this form */ struct curl_httppost *more; /* if one field name has more than one file, this link should link to following files */ long flags; /* as defined below */ -#define HTTPPOST_FILENAME (1<<0) /* specified content is a file name */ -#define HTTPPOST_READFILE (1<<1) /* specified content is a file name */ -#define HTTPPOST_PTRNAME (1<<2) /* name is only stored pointer - do not free in formfree */ -#define HTTPPOST_PTRCONTENTS (1<<3) /* contents is only stored pointer - do not free in formfree */ -#define HTTPPOST_BUFFER (1<<4) /* upload file from buffer */ -#define HTTPPOST_PTRBUFFER (1<<5) /* upload file from pointer contents */ -#define HTTPPOST_CALLBACK (1<<6) /* upload file contents by using the - regular read callback to get the data - and pass the given pointer as custom - pointer */ + +/* specified content is a file name */ +#define CURL_HTTPPOST_FILENAME (1<<0) +/* specified content is a file name */ +#define CURL_HTTPPOST_READFILE (1<<1) +/* name is only stored pointer do not free in formfree */ +#define CURL_HTTPPOST_PTRNAME (1<<2) +/* contents is only stored 
pointer do not free in formfree */ +#define CURL_HTTPPOST_PTRCONTENTS (1<<3) +/* upload file from buffer */ +#define CURL_HTTPPOST_BUFFER (1<<4) +/* upload file from pointer contents */ +#define CURL_HTTPPOST_PTRBUFFER (1<<5) +/* upload file contents by using the regular read callback to get the data and + pass the given pointer as custom pointer */ +#define CURL_HTTPPOST_CALLBACK (1<<6) +/* use size in 'contentlen', added in 7.46.0 */ +#define CURL_HTTPPOST_LARGE (1<<7) char *showfilename; /* The file name to show. If not set, the actual file name will be used (if this is a file part) */ void *userp; /* custom pointer used for HTTPPOST_CALLBACK posts */ + curl_off_t contentlen; /* alternative length of contents + field. Used if CURL_HTTPPOST_LARGE is + set. Added in 7.46.0 */ }; /* This is the CURLOPT_PROGRESSFUNCTION callback proto. It is now considered @@ -172,6 +194,11 @@ typedef int (*curl_xferinfo_callback)(void *clientp, curl_off_t ultotal, curl_off_t ulnow); +#ifndef CURL_MAX_READ_SIZE + /* The maximum receive buffer size configurable via CURLOPT_BUFFERSIZE. */ +#define CURL_MAX_READ_SIZE 524288 +#endif + #ifndef CURL_MAX_WRITE_SIZE /* Tests have proven that 20K is a very bad buffer size for uploads on Windows, while 16K for some odd reason performed a lot better. @@ -249,7 +276,7 @@ struct curl_fileinfo { unsigned int flags; /* used internally */ - char * b_data; + char *b_data; size_t b_size; size_t b_used; }; @@ -362,6 +389,7 @@ typedef curlioerr (*curl_ioctl_callback)(CURL *handle, int cmd, void *clientp); +#ifndef CURL_DID_MEMORY_FUNC_TYPEDEFS /* * The following typedef's are signatures of malloc, free, realloc, strdup and * calloc respectively. 
Function pointers of these types can be passed to the @@ -374,6 +402,9 @@ typedef void *(*curl_realloc_callback)(void *ptr, size_t size); typedef char *(*curl_strdup_callback)(const char *str); typedef void *(*curl_calloc_callback)(size_t nmemb, size_t size); +#define CURL_DID_MEMORY_FUNC_TYPEDEFS +#endif + /* the kind of data that is passed to information_callback*/ typedef enum { CURLINFO_TEXT = 0, @@ -410,7 +441,7 @@ typedef enum { CURLE_COULDNT_RESOLVE_PROXY, /* 5 */ CURLE_COULDNT_RESOLVE_HOST, /* 6 */ CURLE_COULDNT_CONNECT, /* 7 */ - CURLE_FTP_WEIRD_SERVER_REPLY, /* 8 */ + CURLE_WEIRD_SERVER_REPLY, /* 8 */ CURLE_REMOTE_ACCESS_DENIED, /* 9 a service was denied by the server due to lack of access - when login fails this is not returned. */ @@ -454,15 +485,15 @@ typedef enum { CURLE_LDAP_CANNOT_BIND, /* 38 */ CURLE_LDAP_SEARCH_FAILED, /* 39 */ CURLE_OBSOLETE40, /* 40 - NOT USED */ - CURLE_FUNCTION_NOT_FOUND, /* 41 */ + CURLE_FUNCTION_NOT_FOUND, /* 41 - NOT USED starting with 7.53.0 */ CURLE_ABORTED_BY_CALLBACK, /* 42 */ CURLE_BAD_FUNCTION_ARGUMENT, /* 43 */ CURLE_OBSOLETE44, /* 44 - NOT USED */ CURLE_INTERFACE_FAILED, /* 45 - CURLOPT_INTERFACE failed */ CURLE_OBSOLETE46, /* 46 - NOT USED */ - CURLE_TOO_MANY_REDIRECTS , /* 47 - catch endless re-direct loops */ + CURLE_TOO_MANY_REDIRECTS, /* 47 - catch endless re-direct loops */ CURLE_UNKNOWN_OPTION, /* 48 - User specified an unknown option */ - CURLE_TELNET_OPTION_SYNTAX , /* 49 - Malformed telnet option */ + CURLE_TELNET_OPTION_SYNTAX, /* 49 - Malformed telnet option */ CURLE_OBSOLETE50, /* 50 - NOT USED */ CURLE_PEER_FAILED_VERIFICATION, /* 51 - peer's certificate or fingerprint wasn't verified fine */ @@ -523,6 +554,9 @@ typedef enum { session will be queued */ CURLE_SSL_PINNEDPUBKEYNOTMATCH, /* 90 - specified pinned public key did not match */ + CURLE_SSL_INVALIDCERTSTATUS, /* 91 - invalid certificate status */ + CURLE_HTTP2_STREAM, /* 92 - stream error in HTTP/2 framing layer + */ CURL_LAST /* never use! 
*/ } CURLcode; @@ -538,6 +572,7 @@ typedef enum { /* compatibility with older names */ #define CURLOPT_ENCODING CURLOPT_ACCEPT_ENCODING +#define CURLE_FTP_WEIRD_SERVER_REPLY CURLE_WEIRD_SERVER_REPLY /* The following were added in 7.21.5, April 2011 */ #define CURLE_UNKNOWN_TELNET_OPTION CURLE_UNKNOWN_OPTION @@ -611,6 +646,7 @@ typedef enum { CONNECT HTTP/1.1 */ CURLPROXY_HTTP_1_0 = 1, /* added in 7.19.4, force to use CONNECT HTTP/1.0 */ + CURLPROXY_HTTPS = 2, /* added in 7.52.0 */ CURLPROXY_SOCKS4 = 4, /* support added in 7.15.2, enum existed already in 7.10 */ CURLPROXY_SOCKS5 = 5, /* added in 7.10 */ @@ -724,6 +760,10 @@ typedef enum { servers, a user can this way allow the vulnerability back. */ #define CURLSSLOPT_ALLOW_BEAST (1<<0) +/* - NO_REVOKE tells libcurl to disable certificate revocation checks for those + SSL backends where such behavior is present. */ +#define CURLSSLOPT_NO_REVOKE (1<<1) + #ifndef CURL_NO_OLDIES /* define this to test if your app builds with all the obsolete stuff removed! */ @@ -813,9 +853,13 @@ typedef enum { but 32 */ #define CURLOPTTYPE_LONG 0 #define CURLOPTTYPE_OBJECTPOINT 10000 +#define CURLOPTTYPE_STRINGPOINT 10000 #define CURLOPTTYPE_FUNCTIONPOINT 20000 #define CURLOPTTYPE_OFF_T 30000 +/* *STRINGPOINT is an alias for OBJECTPOINT to allow tools to extract the + string options from the header file */ + /* name is uppercase CURLOPT_, type is one of the defined CURLOPTTYPE_ number is unique identifier */ @@ -829,6 +873,7 @@ typedef enum { /* The macro "##" is ISO C, we assume pre-ISO C doesn't support it. 
*/ #define LONG CURLOPTTYPE_LONG #define OBJECTPOINT CURLOPTTYPE_OBJECTPOINT +#define STRINGPOINT CURLOPTTYPE_OBJECTPOINT #define FUNCTIONPOINT CURLOPTTYPE_FUNCTIONPOINT #define OFF_T CURLOPTTYPE_OFF_T #define CINIT(name,type,number) CURLOPT_/**/name = type + number @@ -845,22 +890,22 @@ typedef enum { CINIT(WRITEDATA, OBJECTPOINT, 1), /* The full URL to get/put */ - CINIT(URL, OBJECTPOINT, 2), + CINIT(URL, STRINGPOINT, 2), /* Port number to connect to, if other than default. */ CINIT(PORT, LONG, 3), /* Name of proxy to use. */ - CINIT(PROXY, OBJECTPOINT, 4), + CINIT(PROXY, STRINGPOINT, 4), /* "user:password;options" to use when fetching. */ - CINIT(USERPWD, OBJECTPOINT, 5), + CINIT(USERPWD, STRINGPOINT, 5), /* "user:password" to use with proxy. */ - CINIT(PROXYUSERPWD, OBJECTPOINT, 6), + CINIT(PROXYUSERPWD, STRINGPOINT, 6), /* Range to get, specified as an ASCII string. */ - CINIT(RANGE, OBJECTPOINT, 7), + CINIT(RANGE, STRINGPOINT, 7), /* not used */ @@ -897,14 +942,14 @@ typedef enum { CINIT(POSTFIELDS, OBJECTPOINT, 15), /* Set the referrer page (needed by some CGIs) */ - CINIT(REFERER, OBJECTPOINT, 16), + CINIT(REFERER, STRINGPOINT, 16), /* Set the FTP PORT string (interface name, named or numerical IP address) Use i.e '-' to use default address. */ - CINIT(FTPPORT, OBJECTPOINT, 17), + CINIT(FTPPORT, STRINGPOINT, 17), /* Set the User-Agent string (examined by some CGIs) */ - CINIT(USERAGENT, OBJECTPOINT, 18), + CINIT(USERAGENT, STRINGPOINT, 18), /* If the download receives less than "low speed limit" bytes/second * during "low speed time" seconds, the operations is aborted. @@ -927,7 +972,7 @@ typedef enum { CINIT(RESUME_FROM, LONG, 21), /* Set cookie in request: */ - CINIT(COOKIE, OBJECTPOINT, 22), + CINIT(COOKIE, STRINGPOINT, 22), /* This points to a linked list of headers, struct curl_slist kind. 
This list is also used for RTSP (in spite of its name) */ @@ -937,10 +982,10 @@ typedef enum { CINIT(HTTPPOST, OBJECTPOINT, 24), /* name of the file keeping your private SSL-certificate */ - CINIT(SSLCERT, OBJECTPOINT, 25), + CINIT(SSLCERT, STRINGPOINT, 25), /* password for the SSL or SSH private key */ - CINIT(KEYPASSWD, OBJECTPOINT, 26), + CINIT(KEYPASSWD, STRINGPOINT, 26), /* send TYPE parameter? */ CINIT(CRLF, LONG, 27), @@ -954,7 +999,7 @@ typedef enum { /* point to a file to read the initial cookies from, also enables "cookie awareness" */ - CINIT(COOKIEFILE, OBJECTPOINT, 31), + CINIT(COOKIEFILE, STRINGPOINT, 31), /* What version to specifically try to use. See CURL_SSLVERSION defines below. */ @@ -973,9 +1018,9 @@ typedef enum { HTTP: DELETE, TRACE and others FTP: to use a different list command */ - CINIT(CUSTOMREQUEST, OBJECTPOINT, 36), + CINIT(CUSTOMREQUEST, STRINGPOINT, 36), - /* HTTP request, for odd commands like DELETE, TRACE and others */ + /* FILE handle to use instead of stderr */ CINIT(STDERR, OBJECTPOINT, 37), /* 38 is not used */ @@ -1032,19 +1077,19 @@ typedef enum { CINIT(HTTPPROXYTUNNEL, LONG, 61), /* Set the interface string to use as outgoing network interface */ - CINIT(INTERFACE, OBJECTPOINT, 62), + CINIT(INTERFACE, STRINGPOINT, 62), /* Set the krb4/5 security level, this also enables krb4/5 awareness. This * is a string, 'clear', 'safe', 'confidential' or 'private'. If the string * is set but doesn't match one of these, 'private' will be used. */ - CINIT(KRBLEVEL, OBJECTPOINT, 63), + CINIT(KRBLEVEL, STRINGPOINT, 63), /* Set if we should verify the peer in ssl handshake, set 1 to verify. 
*/ CINIT(SSL_VERIFYPEER, LONG, 64), /* The CApath or CAfile used to validate the peer certificate this option is used only if SSL_VERIFYPEER is true */ - CINIT(CAINFO, OBJECTPOINT, 65), + CINIT(CAINFO, STRINGPOINT, 65), /* 66 = OBSOLETE */ /* 67 = OBSOLETE */ @@ -1078,10 +1123,10 @@ typedef enum { /* Set to a file name that contains random data for libcurl to use to seed the random engine when doing SSL connects. */ - CINIT(RANDOM_FILE, OBJECTPOINT, 76), + CINIT(RANDOM_FILE, STRINGPOINT, 76), /* Set to the Entropy Gathering Daemon socket pathname */ - CINIT(EGDSOCKET, OBJECTPOINT, 77), + CINIT(EGDSOCKET, STRINGPOINT, 77), /* Time-out connect operations after this amount of seconds, if connects are OK within this time, then fine... This only aborts the connect phase. */ @@ -1103,10 +1148,10 @@ typedef enum { /* Specify which file name to write all known cookies in after completed operation. Set file name to "-" (dash) to make it go to stdout. */ - CINIT(COOKIEJAR, OBJECTPOINT, 82), + CINIT(COOKIEJAR, STRINGPOINT, 82), /* Specify which SSL ciphers to use */ - CINIT(SSL_CIPHER_LIST, OBJECTPOINT, 83), + CINIT(SSL_CIPHER_LIST, STRINGPOINT, 83), /* Specify which HTTP version to use! This must be set to one of the CURL_HTTP_VERSION* enums set below. */ @@ -1118,16 +1163,16 @@ typedef enum { CINIT(FTP_USE_EPSV, LONG, 85), /* type of the file keeping your SSL-certificate ("DER", "PEM", "ENG") */ - CINIT(SSLCERTTYPE, OBJECTPOINT, 86), + CINIT(SSLCERTTYPE, STRINGPOINT, 86), /* name of the file keeping your private SSL-key */ - CINIT(SSLKEY, OBJECTPOINT, 87), + CINIT(SSLKEY, STRINGPOINT, 87), /* type of the file keeping your private SSL-key ("DER", "PEM", "ENG") */ - CINIT(SSLKEYTYPE, OBJECTPOINT, 88), + CINIT(SSLKEYTYPE, STRINGPOINT, 88), /* crypto engine for the SSL-sub system */ - CINIT(SSLENGINE, OBJECTPOINT, 89), + CINIT(SSLENGINE, STRINGPOINT, 89), /* set the crypto engine for the SSL-sub system as default the param has no meaning... 
@@ -1154,7 +1199,7 @@ typedef enum { /* The CApath directory used to validate the peer certificate this option is used only if SSL_VERIFYPEER is true */ - CINIT(CAPATH, OBJECTPOINT, 97), + CINIT(CAPATH, STRINGPOINT, 97), /* Instruct libcurl to use a smaller receive buffer */ CINIT(BUFFERSIZE, LONG, 98), @@ -1168,13 +1213,14 @@ typedef enum { CINIT(SHARE, OBJECTPOINT, 100), /* indicates type of proxy. accepted values are CURLPROXY_HTTP (default), - CURLPROXY_SOCKS4, CURLPROXY_SOCKS4A and CURLPROXY_SOCKS5. */ + CURLPROXY_HTTPS, CURLPROXY_SOCKS4, CURLPROXY_SOCKS4A and + CURLPROXY_SOCKS5. */ CINIT(PROXYTYPE, LONG, 101), /* Set the Accept-Encoding string. Use this to tell a server you would like the response to be compressed. Before 7.21.6, this was known as CURLOPT_ENCODING */ - CINIT(ACCEPT_ENCODING, OBJECTPOINT, 102), + CINIT(ACCEPT_ENCODING, STRINGPOINT, 102), /* Set pointer to private data */ CINIT(PRIVATE, OBJECTPOINT, 103), @@ -1255,7 +1301,7 @@ typedef enum { to parse (using the CURLOPT_NETRC option). If not set, libcurl will do a poor attempt to find the user's home directory and check for a .netrc file in there. */ - CINIT(NETRC_FILE, OBJECTPOINT, 118), + CINIT(NETRC_FILE, STRINGPOINT, 118), /* Enable SSL/TLS for FTP, pick one of: CURLUSESSL_TRY - try using SSL, proceed anyway otherwise @@ -1298,10 +1344,10 @@ typedef enum { /* zero terminated string for pass on to the FTP server when asked for "account" info */ - CINIT(FTP_ACCOUNT, OBJECTPOINT, 134), + CINIT(FTP_ACCOUNT, STRINGPOINT, 134), - /* feed cookies into cookie engine */ - CINIT(COOKIELIST, OBJECTPOINT, 135), + /* feed cookie into cookie engine */ + CINIT(COOKIELIST, STRINGPOINT, 135), /* ignore Content-Length */ CINIT(IGNORE_CONTENT_LENGTH, LONG, 136), @@ -1347,7 +1393,7 @@ typedef enum { CINIT(MAX_RECV_SPEED_LARGE, OFF_T, 146), /* Pointer to command string to send if USER/PASS fails. 
*/ - CINIT(FTP_ALTERNATIVE_TO_USER, OBJECTPOINT, 147), + CINIT(FTP_ALTERNATIVE_TO_USER, STRINGPOINT, 147), /* callback function for setting socket options */ CINIT(SOCKOPTFUNCTION, FUNCTIONPOINT, 148), @@ -1361,8 +1407,8 @@ typedef enum { CINIT(SSH_AUTH_TYPES, LONG, 151), /* Used by scp/sftp to do public/private key authentication */ - CINIT(SSH_PUBLIC_KEYFILE, OBJECTPOINT, 152), - CINIT(SSH_PRIVATE_KEYFILE, OBJECTPOINT, 153), + CINIT(SSH_PUBLIC_KEYFILE, STRINGPOINT, 152), + CINIT(SSH_PRIVATE_KEYFILE, STRINGPOINT, 153), /* Send CCC (Clear Command Channel) after authentication */ CINIT(FTP_SSL_CCC, LONG, 154), @@ -1386,7 +1432,7 @@ typedef enum { CINIT(POSTREDIR, LONG, 161), /* used by scp/sftp to verify the host's public key */ - CINIT(SSH_HOST_PUBLIC_KEY_MD5, OBJECTPOINT, 162), + CINIT(SSH_HOST_PUBLIC_KEY_MD5, STRINGPOINT, 162), /* Callback function for opening socket (instead of socket(2)). Optionally, callback is able change the address or refuse to connect returning @@ -1406,10 +1452,10 @@ typedef enum { CINIT(SEEKDATA, OBJECTPOINT, 168), /* CRL file */ - CINIT(CRLFILE, OBJECTPOINT, 169), + CINIT(CRLFILE, STRINGPOINT, 169), /* Issuer certificate */ - CINIT(ISSUERCERT, OBJECTPOINT, 170), + CINIT(ISSUERCERT, STRINGPOINT, 170), /* (IPv6) Address scope */ CINIT(ADDRESS_SCOPE, LONG, 171), @@ -1419,12 +1465,12 @@ typedef enum { CINIT(CERTINFO, LONG, 172), /* "name" and "pwd" to use when fetching. */ - CINIT(USERNAME, OBJECTPOINT, 173), - CINIT(PASSWORD, OBJECTPOINT, 174), + CINIT(USERNAME, STRINGPOINT, 173), + CINIT(PASSWORD, STRINGPOINT, 174), /* "name" and "pwd" to use with Proxy when fetching. */ - CINIT(PROXYUSERNAME, OBJECTPOINT, 175), - CINIT(PROXYPASSWORD, OBJECTPOINT, 176), + CINIT(PROXYUSERNAME, STRINGPOINT, 175), + CINIT(PROXYPASSWORD, STRINGPOINT, 176), /* Comma separated list of hostnames defining no-proxy zones. These should match both hostnames directly, and hostnames within a domain. 
For @@ -1433,13 +1479,13 @@ typedef enum { implementations of this, .local.com will be considered to be the same as local.com. A single * is the only valid wildcard, and effectively disables the use of proxy. */ - CINIT(NOPROXY, OBJECTPOINT, 177), + CINIT(NOPROXY, STRINGPOINT, 177), /* block size for TFTP transfers */ CINIT(TFTP_BLKSIZE, LONG, 178), /* Socks Service */ - CINIT(SOCKS5_GSSAPI_SERVICE, OBJECTPOINT, 179), + CINIT(SOCKS5_GSSAPI_SERVICE, STRINGPOINT, 179), /* DEPRECATED, do not use! */ /* Socks Service */ CINIT(SOCKS5_GSSAPI_NEC, LONG, 180), @@ -1457,7 +1503,7 @@ typedef enum { CINIT(REDIR_PROTOCOLS, LONG, 182), /* set the SSH knownhost file name to use */ - CINIT(SSH_KNOWNHOSTS, OBJECTPOINT, 183), + CINIT(SSH_KNOWNHOSTS, STRINGPOINT, 183), /* set the SSH host key callback, must point to a curl_sshkeycallback function */ @@ -1467,9 +1513,9 @@ typedef enum { CINIT(SSH_KEYDATA, OBJECTPOINT, 185), /* set the SMTP mail originator */ - CINIT(MAIL_FROM, OBJECTPOINT, 186), + CINIT(MAIL_FROM, STRINGPOINT, 186), - /* set the SMTP mail receiver(s) */ + /* set the list of SMTP mail receiver(s) */ CINIT(MAIL_RCPT, OBJECTPOINT, 187), /* FTP: send PRET before PASV */ @@ -1479,13 +1525,13 @@ typedef enum { CINIT(RTSP_REQUEST, LONG, 189), /* The RTSP session identifier */ - CINIT(RTSP_SESSION_ID, OBJECTPOINT, 190), + CINIT(RTSP_SESSION_ID, STRINGPOINT, 190), /* The RTSP stream URI */ - CINIT(RTSP_STREAM_URI, OBJECTPOINT, 191), + CINIT(RTSP_STREAM_URI, STRINGPOINT, 191), /* The Transport: header to use in RTSP requests */ - CINIT(RTSP_TRANSPORT, OBJECTPOINT, 192), + CINIT(RTSP_TRANSPORT, STRINGPOINT, 192), /* Manually initialize the client RTSP CSeq for this handle */ CINIT(RTSP_CLIENT_CSEQ, LONG, 193), @@ -1523,13 +1569,13 @@ typedef enum { CINIT(RESOLVE, OBJECTPOINT, 203), /* Set a username for authenticated TLS */ - CINIT(TLSAUTH_USERNAME, OBJECTPOINT, 204), + CINIT(TLSAUTH_USERNAME, STRINGPOINT, 204), /* Set a password for authenticated TLS */ - 
CINIT(TLSAUTH_PASSWORD, OBJECTPOINT, 205), + CINIT(TLSAUTH_PASSWORD, STRINGPOINT, 205), /* Set authentication type for authenticated TLS */ - CINIT(TLSAUTH_TYPE, OBJECTPOINT, 206), + CINIT(TLSAUTH_TYPE, STRINGPOINT, 206), /* Set to 1 to enable the "TE:" header in HTTP requests to ask for compressed transfer-encoded responses. Set to 0 to disable the use of TE: @@ -1552,10 +1598,10 @@ typedef enum { CINIT(GSSAPI_DELEGATION, LONG, 210), /* Set the name servers to use for DNS resolution */ - CINIT(DNS_SERVERS, OBJECTPOINT, 211), + CINIT(DNS_SERVERS, STRINGPOINT, 211), /* Time-out accept operations (currently for FTP only) after this amount - of miliseconds. */ + of milliseconds. */ CINIT(ACCEPTTIMEOUT_MS, LONG, 212), /* Set TCP keepalive */ @@ -1569,7 +1615,7 @@ typedef enum { CINIT(SSL_OPTIONS, LONG, 216), /* Set the SMTP auth originator */ - CINIT(MAIL_AUTH, OBJECTPOINT, 217), + CINIT(MAIL_AUTH, STRINGPOINT, 217), /* Enable/disable SASL initial response */ CINIT(SASL_IR, LONG, 218), @@ -1580,23 +1626,23 @@ typedef enum { CINIT(XFERINFOFUNCTION, FUNCTIONPOINT, 219), /* The XOAUTH2 bearer token */ - CINIT(XOAUTH2_BEARER, OBJECTPOINT, 220), + CINIT(XOAUTH2_BEARER, STRINGPOINT, 220), /* Set the interface string to use as outgoing network * interface for DNS requests. * Only supported by the c-ares DNS backend */ - CINIT(DNS_INTERFACE, OBJECTPOINT, 221), + CINIT(DNS_INTERFACE, STRINGPOINT, 221), /* Set the local IPv4 address to use for outgoing DNS requests. * Only supported by the c-ares DNS backend */ - CINIT(DNS_LOCAL_IP4, OBJECTPOINT, 222), + CINIT(DNS_LOCAL_IP4, STRINGPOINT, 222), /* Set the local IPv4 address to use for outgoing DNS requests. 
* Only supported by the c-ares DNS backend */ - CINIT(DNS_LOCAL_IP6, OBJECTPOINT, 223), + CINIT(DNS_LOCAL_IP6, STRINGPOINT, 223), /* Set authentication options directly */ - CINIT(LOGIN_OPTIONS, OBJECTPOINT, 224), + CINIT(LOGIN_OPTIONS, STRINGPOINT, 224), /* Enable/disable TLS NPN extension (http2 over ssl might fail without) */ CINIT(SSL_ENABLE_NPN, LONG, 225), @@ -1617,10 +1663,124 @@ typedef enum { /* The public key in DER form used to validate the peer public key this option is used only if SSL_VERIFYPEER is true */ - CINIT(PINNEDPUBLICKEY, OBJECTPOINT, 230), + CINIT(PINNEDPUBLICKEY, STRINGPOINT, 230), /* Path to Unix domain socket */ - CINIT(UNIX_SOCKET_PATH, OBJECTPOINT, 231), + CINIT(UNIX_SOCKET_PATH, STRINGPOINT, 231), + + /* Set if we should verify the certificate status. */ + CINIT(SSL_VERIFYSTATUS, LONG, 232), + + /* Set if we should enable TLS false start. */ + CINIT(SSL_FALSESTART, LONG, 233), + + /* Do not squash dot-dot sequences */ + CINIT(PATH_AS_IS, LONG, 234), + + /* Proxy Service Name */ + CINIT(PROXY_SERVICE_NAME, STRINGPOINT, 235), + + /* Service Name */ + CINIT(SERVICE_NAME, STRINGPOINT, 236), + + /* Wait/don't wait for pipe/mutex to clarify */ + CINIT(PIPEWAIT, LONG, 237), + + /* Set the protocol used when curl is given a URL without a protocol */ + CINIT(DEFAULT_PROTOCOL, STRINGPOINT, 238), + + /* Set stream weight, 1 - 256 (default is 16) */ + CINIT(STREAM_WEIGHT, LONG, 239), + + /* Set stream dependency on another CURL handle */ + CINIT(STREAM_DEPENDS, OBJECTPOINT, 240), + + /* Set E-xclusive stream dependency on another CURL handle */ + CINIT(STREAM_DEPENDS_E, OBJECTPOINT, 241), + + /* Do not send any tftp option requests to the server */ + CINIT(TFTP_NO_OPTIONS, LONG, 242), + + /* Linked-list of host:port:connect-to-host:connect-to-port, + overrides the URL's host:port (only for the network layer) */ + CINIT(CONNECT_TO, OBJECTPOINT, 243), + + /* Set TCP Fast Open */ + CINIT(TCP_FASTOPEN, LONG, 244), + + /* Continue to send data if the 
server responds early with an + * HTTP status code >= 300 */ + CINIT(KEEP_SENDING_ON_ERROR, LONG, 245), + + /* The CApath or CAfile used to validate the proxy certificate + this option is used only if PROXY_SSL_VERIFYPEER is true */ + CINIT(PROXY_CAINFO, STRINGPOINT, 246), + + /* The CApath directory used to validate the proxy certificate + this option is used only if PROXY_SSL_VERIFYPEER is true */ + CINIT(PROXY_CAPATH, STRINGPOINT, 247), + + /* Set if we should verify the proxy in ssl handshake, + set 1 to verify. */ + CINIT(PROXY_SSL_VERIFYPEER, LONG, 248), + + /* Set if we should verify the Common name from the proxy certificate in ssl + * handshake, set 1 to check existence, 2 to ensure that it matches + * the provided hostname. */ + CINIT(PROXY_SSL_VERIFYHOST, LONG, 249), + + /* What version to specifically try to use for proxy. + See CURL_SSLVERSION defines below. */ + CINIT(PROXY_SSLVERSION, LONG, 250), + + /* Set a username for authenticated TLS for proxy */ + CINIT(PROXY_TLSAUTH_USERNAME, STRINGPOINT, 251), + + /* Set a password for authenticated TLS for proxy */ + CINIT(PROXY_TLSAUTH_PASSWORD, STRINGPOINT, 252), + + /* Set authentication type for authenticated TLS for proxy */ + CINIT(PROXY_TLSAUTH_TYPE, STRINGPOINT, 253), + + /* name of the file keeping your private SSL-certificate for proxy */ + CINIT(PROXY_SSLCERT, STRINGPOINT, 254), + + /* type of the file keeping your SSL-certificate ("DER", "PEM", "ENG") for + proxy */ + CINIT(PROXY_SSLCERTTYPE, STRINGPOINT, 255), + + /* name of the file keeping your private SSL-key for proxy */ + CINIT(PROXY_SSLKEY, STRINGPOINT, 256), + + /* type of the file keeping your private SSL-key ("DER", "PEM", "ENG") for + proxy */ + CINIT(PROXY_SSLKEYTYPE, STRINGPOINT, 257), + + /* password for the SSL private key for proxy */ + CINIT(PROXY_KEYPASSWD, STRINGPOINT, 258), + + /* Specify which SSL ciphers to use for proxy */ + CINIT(PROXY_SSL_CIPHER_LIST, STRINGPOINT, 259), + + /* CRL file for proxy */ + CINIT(PROXY_CRLFILE, 
STRINGPOINT, 260), + + /* Enable/disable specific SSL features with a bitmask for proxy, see + CURLSSLOPT_* */ + CINIT(PROXY_SSL_OPTIONS, LONG, 261), + + /* Name of pre proxy to use. */ + CINIT(PRE_PROXY, STRINGPOINT, 262), + + /* The public key in DER form used to validate the proxy public key + this option is used only if PROXY_SSL_VERIFYPEER is true */ + CINIT(PROXY_PINNEDPUBLICKEY, STRINGPOINT, 263), + + /* Path to an abstract Unix domain socket */ + CINIT(ABSTRACT_UNIX_SOCKET, STRINGPOINT, 264), + + /* Suppress proxy CONNECT response headers from user callbacks */ + CINIT(SUPPRESS_CONNECT_HEADERS, LONG, 265), CURLOPT_LASTENTRY /* the last unused */ } CURLoption; @@ -1671,11 +1831,19 @@ enum { for us! */ CURL_HTTP_VERSION_1_0, /* please use HTTP 1.0 in the request */ CURL_HTTP_VERSION_1_1, /* please use HTTP 1.1 in the request */ - CURL_HTTP_VERSION_2_0, /* please use HTTP 2.0 in the request */ + CURL_HTTP_VERSION_2_0, /* please use HTTP 2 in the request */ + CURL_HTTP_VERSION_2TLS, /* use version 2 for HTTPS, version 1.1 for HTTP */ + CURL_HTTP_VERSION_2_PRIOR_KNOWLEDGE, /* please use HTTP 2 without HTTP/1.1 + Upgrade */ CURL_HTTP_VERSION_LAST /* *ILLEGAL* http version */ }; +/* Convenience definition simple because the name of the version is HTTP/2 and + not 2.0. The 2_0 version of the enum name was set while the version was + still planned to be 2.0 and we stick to it for compatibility. 
*/ +#define CURL_HTTP_VERSION_2 CURL_HTTP_VERSION_2_0 + /* * Public API enums for RTSP requests */ @@ -1715,10 +1883,23 @@ enum { CURL_SSLVERSION_TLSv1_0, CURL_SSLVERSION_TLSv1_1, CURL_SSLVERSION_TLSv1_2, + CURL_SSLVERSION_TLSv1_3, CURL_SSLVERSION_LAST /* never use, keep last */ }; +enum { + CURL_SSLVERSION_MAX_NONE = 0, + CURL_SSLVERSION_MAX_DEFAULT = (CURL_SSLVERSION_TLSv1 << 16), + CURL_SSLVERSION_MAX_TLSv1_0 = (CURL_SSLVERSION_TLSv1_0 << 16), + CURL_SSLVERSION_MAX_TLSv1_1 = (CURL_SSLVERSION_TLSv1_1 << 16), + CURL_SSLVERSION_MAX_TLSv1_2 = (CURL_SSLVERSION_TLSv1_2 << 16), + CURL_SSLVERSION_MAX_TLSv1_3 = (CURL_SSLVERSION_TLSv1_3 << 16), + + /* never use, keep last */ + CURL_SSLVERSION_MAX_LAST = (CURL_SSLVERSION_LAST << 16) +}; + enum CURL_TLSAUTH { CURL_TLSAUTH_NONE, CURL_TLSAUTH_SRP, @@ -1749,7 +1930,10 @@ typedef enum { /* curl_strequal() and curl_strnequal() are subject for removal in a future - libcurl, see lib/README.curlx for details */ + libcurl, see lib/README.curlx for details + + !checksrc! disable SPACEBEFOREPAREN 2 +*/ CURL_EXTERN int (curl_strequal)(const char *s1, const char *s2); CURL_EXTERN int (curl_strnequal)(const char *s1, const char *s2, size_t n); @@ -1791,6 +1975,7 @@ typedef enum { CFINIT(OBSOLETE2), CFINIT(STREAM), + CFINIT(CONTENTLEN), /* added in 7.46.0, provide a curl_off_t length */ CURLFORM_LASTENTRY /* the last unused */ } CURLformoption; @@ -2045,12 +2230,18 @@ typedef enum { CURLSSLBACKEND_CYASSL = 7, CURLSSLBACKEND_SCHANNEL = 8, CURLSSLBACKEND_DARWINSSL = 9, - CURLSSLBACKEND_AXTLS = 10 + CURLSSLBACKEND_AXTLS = 10, + CURLSSLBACKEND_MBEDTLS = 11 } curl_sslbackend; +/* aliases for library clones and renames */ +#define CURLSSLBACKEND_LIBRESSL 1 +#define CURLSSLBACKEND_BORINGSSL 1 +#define CURLSSLBACKEND_WOLFSSL 6 + /* Information about the SSL library used and the respective internal SSL handle, which can be used to obtain further information regarding the - connection. Asked for with CURLINFO_TLS_SESSION. */ + connection. 
Asked for with CURLINFO_TLS_SSL_PTR or CURLINFO_TLS_SESSION. */ struct curl_tlssessioninfo { curl_sslbackend backend; void *internals; @@ -2060,6 +2251,7 @@ struct curl_tlssessioninfo { #define CURLINFO_LONG 0x200000 #define CURLINFO_DOUBLE 0x300000 #define CURLINFO_SLIST 0x400000 +#define CURLINFO_SOCKET 0x500000 #define CURLINFO_MASK 0x0fffff #define CURLINFO_TYPEMASK 0xf00000 @@ -2108,9 +2300,15 @@ typedef enum { CURLINFO_LOCAL_IP = CURLINFO_STRING + 41, CURLINFO_LOCAL_PORT = CURLINFO_LONG + 42, CURLINFO_TLS_SESSION = CURLINFO_SLIST + 43, + CURLINFO_ACTIVESOCKET = CURLINFO_SOCKET + 44, + CURLINFO_TLS_SSL_PTR = CURLINFO_SLIST + 45, + CURLINFO_HTTP_VERSION = CURLINFO_LONG + 46, + CURLINFO_PROXY_SSL_VERIFYRESULT = CURLINFO_LONG + 47, + CURLINFO_PROTOCOL = CURLINFO_LONG + 48, + CURLINFO_SCHEME = CURLINFO_STRING + 49, /* Fill in new entries below here! */ - CURLINFO_LASTONE = 43 + CURLINFO_LASTONE = 49 } CURLINFO; /* CURLINFO_RESPONSE_CODE is the new name for the option previously known as @@ -2172,7 +2370,6 @@ typedef void (*curl_unlock_function)(CURL *handle, curl_lock_data data, void *userptr); -typedef void CURLSH; typedef enum { CURLSHE_OK, /* all is fine */ @@ -2265,11 +2462,14 @@ typedef struct { #define CURL_VERSION_CURLDEBUG (1<<13) /* Debug memory tracking supported */ #define CURL_VERSION_TLSAUTH_SRP (1<<14) /* TLS-SRP auth is supported */ #define CURL_VERSION_NTLM_WB (1<<15) /* NTLM delegation to winbind helper - is suported */ + is supported */ #define CURL_VERSION_HTTP2 (1<<16) /* HTTP2 support built-in */ #define CURL_VERSION_GSSAPI (1<<17) /* Built against a GSS-API library */ #define CURL_VERSION_KERBEROS5 (1<<18) /* Kerberos V5 auth is supported */ #define CURL_VERSION_UNIX_SOCKETS (1<<19) /* Unix domain sockets support */ +#define CURL_VERSION_PSL (1<<20) /* Mozilla's Public Suffix List, used + for cookie domain verification */ +#define CURL_VERSION_HTTPS_PROXY (1<<21) /* HTTPS-proxy support built-in */ /* * NAME curl_version_info() diff --git 
a/compat/curl-for-windows/curl/include/curl/curlbuild.h b/compat/includes/curl/curlbuild.h similarity index 98% rename from compat/curl-for-windows/curl/include/curl/curlbuild.h rename to compat/includes/curl/curlbuild.h index f09419a843..ae95095fa5 100644 --- a/compat/curl-for-windows/curl/include/curl/curlbuild.h +++ b/compat/includes/curl/curlbuild.h @@ -7,11 +7,11 @@ * | (__| |_| | _ <| |___ * \___|\___/|_| \_\_____| * - * Copyright (C) 1998 - 2013, Daniel Stenberg, , et al. + * Copyright (C) 1998 - 2016, Daniel Stenberg, , et al. * * This software is licensed as described in the file COPYING, which * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. + * are also available at https://curl.haxx.se/docs/copyright.html. * * You may opt to use, copy, modify, merge, publish, distribute and/or sell * copies of the Software, and permit persons to whom the Software is @@ -49,7 +49,7 @@ * * If you think that something actually needs to be changed, adjusted * or fixed in this file, then, report it on the libcurl development - * mailing list: http://cool.haxx.se/mailman/listinfo/curl-library/ + * mailing list: https://cool.haxx.se/mailman/listinfo/curl-library/ * * Try to keep one section per platform, compiler and architecture, * otherwise, if an existing section is reused for a different one and @@ -527,8 +527,9 @@ /* ===================================== */ #elif defined(__GNUC__) -# if defined(__ILP32__) || \ - defined(__i386__) || defined(__ppc__) || defined(__arm__) || defined(__sparc__) +# if !defined(__LP64__) && (defined(__ILP32__) || \ + defined(__i386__) || defined(__ppc__) || defined(__arm__) || \ + defined(__sparc__) || defined(__mips__) || defined(__sh__)) # define CURL_SIZEOF_LONG 4 # define CURL_TYPEOF_CURL_OFF_T long long # define CURL_FORMAT_CURL_OFF_T "lld" diff --git a/compat/curl-for-windows/curl/include/curl/curlrules.h b/compat/includes/curl/curlrules.h similarity 
index 91% rename from compat/curl-for-windows/curl/include/curl/curlrules.h rename to compat/includes/curl/curlrules.h index 7c2ede35b6..0abd9f71d8 100644 --- a/compat/curl-for-windows/curl/include/curl/curlrules.h +++ b/compat/includes/curl/curlrules.h @@ -7,11 +7,11 @@ * | (__| |_| | _ <| |___ * \___|\___/|_| \_\_____| * - * Copyright (C) 1998 - 2012, Daniel Stenberg, , et al. + * Copyright (C) 1998 - 2017, Daniel Stenberg, , et al. * * This software is licensed as described in the file COPYING, which * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. + * are also available at https://curl.haxx.se/docs/copyright.html. * * You may opt to use, copy, modify, merge, publish, distribute and/or sell * copies of the Software, and permit persons to whom the Software is @@ -47,7 +47,7 @@ * library is properly built and used. * * You can find further help on the libcurl development mailing list: - * http://cool.haxx.se/mailman/listinfo/curl-library/ + * https://cool.haxx.se/mailman/listinfo/curl-library/ * * NOTE 2 * ------ @@ -105,11 +105,6 @@ Error Compilation_aborted_CURL_FORMAT_CURL_OFF_TU_is_missing #endif -#ifndef CURL_FORMAT_OFF_T -# error "CURL_FORMAT_OFF_T definition is missing!" - Error Compilation_aborted_CURL_FORMAT_OFF_T_is_missing -#endif - #ifndef CURL_SIZEOF_CURL_OFF_T # error "CURL_SIZEOF_CURL_OFF_T definition is missing!" Error Compilation_aborted_CURL_SIZEOF_CURL_OFF_T_is_missing @@ -241,22 +236,4 @@ typedef char #undef CurlchkszEQ #undef CurlchkszGE -/* - * Get rid of macros not intended to exist beyond this point. 
- */ - -#undef CURL_PULL_WS2TCPIP_H -#undef CURL_PULL_SYS_TYPES_H -#undef CURL_PULL_SYS_SOCKET_H -#undef CURL_PULL_SYS_POLL_H -#undef CURL_PULL_STDINT_H -#undef CURL_PULL_INTTYPES_H - -#undef CURL_TYPEOF_CURL_SOCKLEN_T -#undef CURL_TYPEOF_CURL_OFF_T - -#ifdef CURL_NO_OLDIES -#undef CURL_FORMAT_OFF_T /* not required since 7.19.0 - obsoleted in 7.20.0 */ -#endif - #endif /* __CURL_CURLRULES_H */ diff --git a/compat/curl-for-windows/curl/include/curl/curlver.h b/compat/includes/curl/curlver.h similarity index 77% rename from compat/curl-for-windows/curl/include/curl/curlver.h rename to compat/includes/curl/curlver.h index ccdafc1de1..95a2cbbe78 100644 --- a/compat/curl-for-windows/curl/include/curl/curlver.h +++ b/compat/includes/curl/curlver.h @@ -7,11 +7,11 @@ * | (__| |_| | _ <| |___ * \___|\___/|_| \_\_____| * - * Copyright (C) 1998 - 2015, Daniel Stenberg, , et al. + * Copyright (C) 1998 - 2017, Daniel Stenberg, , et al. * * This software is licensed as described in the file COPYING, which * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. + * are also available at https://curl.haxx.se/docs/copyright.html. * * You may opt to use, copy, modify, merge, publish, distribute and/or sell * copies of the Software, and permit persons to whom the Software is @@ -26,16 +26,16 @@ a script at release-time. This was made its own header file in 7.11.2 */ /* This is the global package copyright */ -#define LIBCURL_COPYRIGHT "1996 - 2015 Daniel Stenberg, ." +#define LIBCURL_COPYRIGHT "1996 - 2017 Daniel Stenberg, ." 
/* This is the version number of the libcurl package from which this header file origins: */ -#define LIBCURL_VERSION "7.40.0" +#define LIBCURL_VERSION "7.54.0" /* The numeric version number is also available "in parts" by using these defines: */ #define LIBCURL_VERSION_MAJOR 7 -#define LIBCURL_VERSION_MINOR 40 +#define LIBCURL_VERSION_MINOR 54 #define LIBCURL_VERSION_PATCH 0 /* This is the numeric version of the libcurl version number, meant for easier @@ -52,8 +52,12 @@ This 6-digit (24 bits) hexadecimal number does not show pre-release number, and it is always a greater number in a more recent release. It makes comparisons with greater than and less than work. + + Note: This define is the full hex number and _does not_ use the + CURL_VERSION_BITS() macro since curl's own configure script greps for it + and needs it to contain the full number. */ -#define LIBCURL_VERSION_NUM 0x072800 +#define LIBCURL_VERSION_NUM 0x073600 /* * This is the date and time when the full source package was created. The @@ -64,6 +68,10 @@ * * "Mon Feb 12 11:35:33 UTC 2007" */ -#define LIBCURL_TIMESTAMP "Thu Jan 8 08:17:17 UTC 2015" +#define LIBCURL_TIMESTAMP "Wed Apr 19 05:43:55 UTC 2017" + +#define CURL_VERSION_BITS(x,y,z) ((x)<<16|(y)<<8|z) +#define CURL_AT_LEAST_VERSION(x,y,z) \ + (LIBCURL_VERSION_NUM >= CURL_VERSION_BITS(x, y, z)) #endif /* __CURL_CURLVER_H */ diff --git a/compat/curl-for-windows/curl/include/curl/easy.h b/compat/includes/curl/easy.h similarity index 94% rename from compat/curl-for-windows/curl/include/curl/easy.h rename to compat/includes/curl/easy.h index c1e3e76096..752c5049f8 100644 --- a/compat/curl-for-windows/curl/include/curl/easy.h +++ b/compat/includes/curl/easy.h @@ -7,11 +7,11 @@ * | (__| |_| | _ <| |___ * \___|\___/|_| \_\_____| * - * Copyright (C) 1998 - 2008, Daniel Stenberg, , et al. + * Copyright (C) 1998 - 2016, Daniel Stenberg, , et al. 
* * This software is licensed as described in the file COPYING, which * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. + * are also available at https://curl.haxx.se/docs/copyright.html. * * You may opt to use, copy, modify, merge, publish, distribute and/or sell * copies of the Software, and permit persons to whom the Software is @@ -58,7 +58,7 @@ CURL_EXTERN CURLcode curl_easy_getinfo(CURL *curl, CURLINFO info, ...); * curl_easy_duphandle() for each new thread to avoid a series of identical * curl_easy_setopt() invokes in every thread. */ -CURL_EXTERN CURL* curl_easy_duphandle(CURL *curl); +CURL_EXTERN CURL *curl_easy_duphandle(CURL *curl); /* * NAME curl_easy_reset() diff --git a/compat/curl-for-windows/curl/include/curl/mprintf.h b/compat/includes/curl/mprintf.h similarity index 68% rename from compat/curl-for-windows/curl/include/curl/mprintf.h rename to compat/includes/curl/mprintf.h index cc9e7f5d1f..e20f546e19 100644 --- a/compat/curl-for-windows/curl/include/curl/mprintf.h +++ b/compat/includes/curl/mprintf.h @@ -7,11 +7,11 @@ * | (__| |_| | _ <| |___ * \___|\___/|_| \_\_____| * - * Copyright (C) 1998 - 2013, Daniel Stenberg, , et al. + * Copyright (C) 1998 - 2016, Daniel Stenberg, , et al. * * This software is licensed as described in the file COPYING, which * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. + * are also available at https://curl.haxx.se/docs/copyright.html. 
* * You may opt to use, copy, modify, merge, publish, distribute and/or sell * copies of the Software, and permit persons to whom the Software is @@ -24,8 +24,7 @@ #include #include /* needed for FILE */ - -#include "curl.h" +#include "curl.h" /* for CURL_EXTERN */ #ifdef __cplusplus extern "C" { @@ -44,36 +43,6 @@ CURL_EXTERN int curl_mvsnprintf(char *buffer, size_t maxlength, CURL_EXTERN char *curl_maprintf(const char *format, ...); CURL_EXTERN char *curl_mvaprintf(const char *format, va_list args); -#ifdef _MPRINTF_REPLACE -# undef printf -# undef fprintf -# undef sprintf -# undef vsprintf -# undef snprintf -# undef vprintf -# undef vfprintf -# undef vsnprintf -# undef aprintf -# undef vaprintf -# define printf curl_mprintf -# define fprintf curl_mfprintf -#ifdef CURLDEBUG -/* When built with CURLDEBUG we define away the sprintf functions since we - don't want internal code to be using them */ -# define sprintf sprintf_was_used -# define vsprintf vsprintf_was_used -#else -# define sprintf curl_msprintf -# define vsprintf curl_mvsprintf -#endif -# define snprintf curl_msnprintf -# define vprintf curl_mvprintf -# define vfprintf curl_mvfprintf -# define vsnprintf curl_mvsnprintf -# define aprintf curl_maprintf -# define vaprintf curl_mvaprintf -#endif - #ifdef __cplusplus } #endif diff --git a/compat/curl-for-windows/curl/include/curl/multi.h b/compat/includes/curl/multi.h similarity index 90% rename from compat/curl-for-windows/curl/include/curl/multi.h rename to compat/includes/curl/multi.h index 3c4acb0f6e..f93e511be0 100644 --- a/compat/curl-for-windows/curl/include/curl/multi.h +++ b/compat/includes/curl/multi.h @@ -7,11 +7,11 @@ * | (__| |_| | _ <| |___ * \___|\___/|_| \_\_____| * - * Copyright (C) 1998 - 2013, Daniel Stenberg, , et al. + * Copyright (C) 1998 - 2016, Daniel Stenberg, , et al. * * This software is licensed as described in the file COPYING, which * you should have received as part of this distribution. 
The terms - * are also available at http://curl.haxx.se/docs/copyright.html. + * are also available at https://curl.haxx.se/docs/copyright.html. * * You may opt to use, copy, modify, merge, publish, distribute and/or sell * copies of the Software, and permit persons to whom the Software is @@ -52,7 +52,11 @@ extern "C" { #endif +#if defined(BUILDING_LIBCURL) || defined(CURL_STRICTER) +typedef struct Curl_multi CURLM; +#else typedef void CURLM; +#endif typedef enum { CURLM_CALL_MULTI_PERFORM = -1, /* please call curl_multi_perform() or @@ -74,6 +78,11 @@ typedef enum { curl_multi_perform() and CURLM_CALL_MULTI_PERFORM */ #define CURLM_CALL_MULTI_SOCKET CURLM_CALL_MULTI_PERFORM +/* bitmask bits for CURLMOPT_PIPELINING */ +#define CURLPIPE_NOTHING 0L +#define CURLPIPE_HTTP1 1L +#define CURLPIPE_MULTIPLEX 2L + typedef enum { CURLMSG_NONE, /* first, not used */ CURLMSG_DONE, /* This easy handle has completed. 'result' contains @@ -209,7 +218,7 @@ CURL_EXTERN CURLMcode curl_multi_cleanup(CURLM *multi_handle); * curl_multi_cleanup(). * * The 'CURLMsg' struct is meant to be very simple and only contain - * very basic informations. If more involved information is wanted, + * very basic information. If more involved information is wanted, * we will provide the particular "transfer handle" in that struct * and that should/could/would be used in subsequent * curl_easy_getinfo() calls (or similar). 
The point being that we @@ -365,6 +374,12 @@ typedef enum { /* maximum number of open connections in total */ CINIT(MAX_TOTAL_CONNECTIONS, LONG, 13), + /* This is the server push callback function pointer */ + CINIT(PUSHFUNCTION, FUNCTIONPOINT, 14), + + /* This is the argument passed to the server push callback */ + CINIT(PUSHDATA, OBJECTPOINT, 15), + CURLMOPT_LASTENTRY /* the last unused */ } CURLMoption; @@ -392,6 +407,31 @@ CURL_EXTERN CURLMcode curl_multi_setopt(CURLM *multi_handle, CURL_EXTERN CURLMcode curl_multi_assign(CURLM *multi_handle, curl_socket_t sockfd, void *sockp); + +/* + * Name: curl_push_callback + * + * Desc: This callback gets called when a new stream is being pushed by the + * server. It approves or denies the new stream. + * + * Returns: CURL_PUSH_OK or CURL_PUSH_DENY. + */ +#define CURL_PUSH_OK 0 +#define CURL_PUSH_DENY 1 + +struct curl_pushheaders; /* forward declaration only */ + +CURL_EXTERN char *curl_pushheader_bynum(struct curl_pushheaders *h, + size_t num); +CURL_EXTERN char *curl_pushheader_byname(struct curl_pushheaders *h, + const char *name); + +typedef int (*curl_push_callback)(CURL *parent, + CURL *easy, + size_t num_headers, + struct curl_pushheaders *headers, + void *userp); + #ifdef __cplusplus } /* end of extern "C" */ #endif diff --git a/compat/curl-for-windows/curl/include/curl/stdcheaders.h b/compat/includes/curl/stdcheaders.h similarity index 82% rename from compat/curl-for-windows/curl/include/curl/stdcheaders.h rename to compat/includes/curl/stdcheaders.h index ad82ef6335..027b6f4211 100644 --- a/compat/curl-for-windows/curl/include/curl/stdcheaders.h +++ b/compat/includes/curl/stdcheaders.h @@ -7,11 +7,11 @@ * | (__| |_| | _ <| |___ * \___|\___/|_| \_\_____| * - * Copyright (C) 1998 - 2010, Daniel Stenberg, , et al. + * Copyright (C) 1998 - 2016, Daniel Stenberg, , et al. * * This software is licensed as described in the file COPYING, which * you should have received as part of this distribution. 
The terms - * are also available at http://curl.haxx.se/docs/copyright.html. + * are also available at https://curl.haxx.se/docs/copyright.html. * * You may opt to use, copy, modify, merge, publish, distribute and/or sell * copies of the Software, and permit persons to whom the Software is @@ -24,8 +24,8 @@ #include -size_t fread (void *, size_t, size_t, FILE *); -size_t fwrite (const void *, size_t, size_t, FILE *); +size_t fread(void *, size_t, size_t, FILE *); +size_t fwrite(const void *, size_t, size_t, FILE *); int strcasecmp(const char *, const char *); int strncasecmp(const char *, const char *, size_t); diff --git a/compat/includes/curl/system.h b/compat/includes/curl/system.h new file mode 100644 index 0000000000..ed3a55c954 --- /dev/null +++ b/compat/includes/curl/system.h @@ -0,0 +1,484 @@ +#ifndef __CURL_SYSTEM_H +#define __CURL_SYSTEM_H +/*************************************************************************** + * _ _ ____ _ + * Project ___| | | | _ \| | + * / __| | | | |_) | | + * | (__| |_| | _ <| |___ + * \___|\___/|_| \_\_____| + * + * Copyright (C) 1998 - 2017, Daniel Stenberg, , et al. + * + * This software is licensed as described in the file COPYING, which + * you should have received as part of this distribution. The terms + * are also available at https://curl.haxx.se/docs/copyright.html. + * + * You may opt to use, copy, modify, merge, publish, distribute and/or sell + * copies of the Software, and permit persons to whom the Software is + * furnished to do so, under the terms of the COPYING file. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ***************************************************************************/ + +/* + * This header is supposed to eventually replace curlbuild.h. This little one + * is still learning. During the experimental phase, this header files + * defines symbols using the prefixes CURLSYS_ or curlsys_. 
When we feel + * confident enough, we replace curlbuild.h with this file and rename all + * prefixes to CURL_ and curl_. + */ + +/* + * Try to keep one section per platform, compiler and architecture, otherwise, + * if an existing section is reused for a different one and later on the + * original is adjusted, probably the piggybacking one can be adversely + * changed. + * + * In order to differentiate between platforms/compilers/architectures use + * only compiler built in predefined preprocessor symbols. + * + * curl_off_t + * ---------- + * + * For any given platform/compiler curl_off_t must be typedef'ed to a 64-bit + * wide signed integral data type. The width of this data type must remain + * constant and independent of any possible large file support settings. + * + * As an exception to the above, curl_off_t shall be typedef'ed to a 32-bit + * wide signed integral data type if there is no 64-bit type. + * + * As a general rule, curl_off_t shall not be mapped to off_t. This rule shall + * only be violated if off_t is the only 64-bit data type available and the + * size of off_t is independent of large file support settings. Keep your + * build on the safe side avoiding an off_t gating. If you have a 64-bit + * off_t then take for sure that another 64-bit data type exists, dig deeper + * and you will find it. 
+ * + */ + +#if defined(__DJGPP__) || defined(__GO32__) +# if defined(__DJGPP__) && (__DJGPP__ > 1) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long long +# define CURLSYS_FORMAT_CURL_OFF_T "lld" +# define CURLSYS_FORMAT_CURL_OFF_TU "llu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T LL +# define CURLSYS_SUFFIX_CURL_OFF_TU ULL +# else +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 4 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# endif +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T int +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 + +#elif defined(__SALFORDC__) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 4 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T int +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 + +#elif defined(__BORLANDC__) +# if (__BORLANDC__ < 0x520) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 4 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# else +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T __int64 +# define CURLSYS_FORMAT_CURL_OFF_T "I64d" +# define CURLSYS_FORMAT_CURL_OFF_TU "I64u" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T i64 +# define CURLSYS_SUFFIX_CURL_OFF_TU ui64 +# endif +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T int +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 + +#elif defined(__TURBOC__) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define 
CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 4 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T int +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 + +#elif defined(__WATCOMC__) +# if defined(__386__) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T __int64 +# define CURLSYS_FORMAT_CURL_OFF_T "I64d" +# define CURLSYS_FORMAT_CURL_OFF_TU "I64u" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T i64 +# define CURLSYS_SUFFIX_CURL_OFF_TU ui64 +# else +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 4 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# endif +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T int +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 + +#elif defined(__POCC__) +# if (__POCC__ < 280) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 4 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# elif defined(_MSC_VER) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T __int64 +# define CURLSYS_FORMAT_CURL_OFF_T "I64d" +# define CURLSYS_FORMAT_CURL_OFF_TU "I64u" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T i64 +# define CURLSYS_SUFFIX_CURL_OFF_TU ui64 +# else +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long long +# define CURLSYS_FORMAT_CURL_OFF_T "lld" +# define CURLSYS_FORMAT_CURL_OFF_TU "llu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T LL +# define CURLSYS_SUFFIX_CURL_OFF_TU ULL +# endif +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T int +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 
4 + +#elif defined(__LCC__) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 4 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T int +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 + +#elif defined(__SYMBIAN32__) +# if defined(__EABI__) /* Treat all ARM compilers equally */ +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long long +# define CURLSYS_FORMAT_CURL_OFF_T "lld" +# define CURLSYS_FORMAT_CURL_OFF_TU "llu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T LL +# define CURLSYS_SUFFIX_CURL_OFF_TU ULL +# elif defined(__CW32__) +# pragma longlong on +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long long +# define CURLSYS_FORMAT_CURL_OFF_T "lld" +# define CURLSYS_FORMAT_CURL_OFF_TU "llu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T LL +# define CURLSYS_SUFFIX_CURL_OFF_TU ULL +# elif defined(__VC32__) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T __int64 +# define CURLSYS_FORMAT_CURL_OFF_T "lld" +# define CURLSYS_FORMAT_CURL_OFF_TU "llu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T LL +# define CURLSYS_SUFFIX_CURL_OFF_TU ULL +# endif +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T unsigned int +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 + +#elif defined(__MWERKS__) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long long +# define CURLSYS_FORMAT_CURL_OFF_T "lld" +# define CURLSYS_FORMAT_CURL_OFF_TU "llu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T LL +# define CURLSYS_SUFFIX_CURL_OFF_TU ULL +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T int +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 + +#elif defined(_WIN32_WCE) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T __int64 
+# define CURLSYS_FORMAT_CURL_OFF_T "I64d" +# define CURLSYS_FORMAT_CURL_OFF_TU "I64u" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T i64 +# define CURLSYS_SUFFIX_CURL_OFF_TU ui64 +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T int +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 + +#elif defined(__MINGW32__) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long long +# define CURLSYS_FORMAT_CURL_OFF_T "I64d" +# define CURLSYS_FORMAT_CURL_OFF_TU "I64u" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T LL +# define CURLSYS_SUFFIX_CURL_OFF_TU ULL +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T socklen_t +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 +# define CURLSYS_PULL_SYS_TYPES_H 1 +# define CURLSYS_PULL_WS2TCPIP_H 1 + +#elif defined(__VMS) +# if defined(__VAX) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 4 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# else +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long long +# define CURLSYS_FORMAT_CURL_OFF_T "lld" +# define CURLSYS_FORMAT_CURL_OFF_TU "llu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T LL +# define CURLSYS_SUFFIX_CURL_OFF_TU ULL +# endif +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T unsigned int +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 + +#elif defined(__OS400__) +# if defined(__ILEC400__) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long long +# define CURLSYS_FORMAT_CURL_OFF_T "lld" +# define CURLSYS_FORMAT_CURL_OFF_TU "llu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T LL +# define CURLSYS_SUFFIX_CURL_OFF_TU ULL +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T socklen_t +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 +# define CURLSYS_PULL_SYS_TYPES_H 1 +# define CURLSYS_PULL_SYS_SOCKET_H 1 +# endif + 
+#elif defined(__MVS__) +# if defined(__IBMC__) || defined(__IBMCPP__) +# if defined(_ILP32) +# define CURLSYS_SIZEOF_LONG 4 +# elif defined(_LP64) +# define CURLSYS_SIZEOF_LONG 8 +# endif +# if defined(_LONG_LONG) +# define CURLSYS_TYPEOF_CURL_OFF_T long long +# define CURLSYS_FORMAT_CURL_OFF_T "lld" +# define CURLSYS_FORMAT_CURL_OFF_TU "llu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T LL +# define CURLSYS_SUFFIX_CURL_OFF_TU ULL +# elif defined(_LP64) +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# else +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 4 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# endif +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T socklen_t +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 +# define CURLSYS_PULL_SYS_TYPES_H 1 +# define CURLSYS_PULL_SYS_SOCKET_H 1 +# endif + +#elif defined(__370__) +# if defined(__IBMC__) || defined(__IBMCPP__) +# if defined(_ILP32) +# define CURLSYS_SIZEOF_LONG 4 +# elif defined(_LP64) +# define CURLSYS_SIZEOF_LONG 8 +# endif +# if defined(_LONG_LONG) +# define CURLSYS_TYPEOF_CURL_OFF_T long long +# define CURLSYS_FORMAT_CURL_OFF_T "lld" +# define CURLSYS_FORMAT_CURL_OFF_TU "llu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T LL +# define CURLSYS_SUFFIX_CURL_OFF_TU ULL +# elif defined(_LP64) +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# else +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define 
CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 4 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# endif +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T socklen_t +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 +# define CURLSYS_PULL_SYS_TYPES_H 1 +# define CURLSYS_PULL_SYS_SOCKET_H 1 +# endif + +#elif defined(TPF) +# define CURLSYS_SIZEOF_LONG 8 +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T int +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 + +#elif defined(__TINYC__) /* also known as tcc */ + +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long long +# define CURLSYS_FORMAT_CURL_OFF_T "lld" +# define CURLSYS_FORMAT_CURL_OFF_TU "llu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T LL +# define CURLSYS_SUFFIX_CURL_OFF_TU ULL +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T socklen_t +# define CURLSYS_PULL_SYS_TYPES_H 1 +# define CURLSYS_PULL_SYS_SOCKET_H 1 + +/* ===================================== */ +/* KEEP MSVC THE PENULTIMATE ENTRY */ +/* ===================================== */ + +#elif defined(_MSC_VER) +# if (_MSC_VER >= 900) && (_INTEGRAL_MAX_BITS >= 64) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T __int64 +# define CURLSYS_FORMAT_CURL_OFF_T "I64d" +# define CURLSYS_FORMAT_CURL_OFF_TU "I64u" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T i64 +# define CURLSYS_SUFFIX_CURL_OFF_TU ui64 +# else +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 4 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# endif +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T 
int +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 + +/* ===================================== */ +/* KEEP GENERIC GCC THE LAST ENTRY */ +/* ===================================== */ + +#elif defined(__GNUC__) +# if !defined(__LP64__) && (defined(__ILP32__) || \ + defined(__i386__) || defined(__ppc__) || defined(__arm__) || \ + defined(__sparc__) || defined(__mips__) || defined(__sh__)) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long long +# define CURLSYS_FORMAT_CURL_OFF_T "lld" +# define CURLSYS_FORMAT_CURL_OFF_TU "llu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T LL +# define CURLSYS_SUFFIX_CURL_OFF_TU ULL +# elif defined(__LP64__) || \ + defined(__x86_64__) || defined(__ppc64__) || defined(__sparc64__) +# define CURLSYS_SIZEOF_LONG 8 +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# endif +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T socklen_t +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 +# define CURLSYS_PULL_SYS_TYPES_H 1 +# define CURLSYS_PULL_SYS_SOCKET_H 1 + +#else +/* generic "safe guess" on old 32 bit style */ +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 +# define CURLSYS_SIZEOF_CURL_OFF_T 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T int +#endif + +/* CURLSYS_PULL_WS2TCPIP_H is defined above when inclusion of header file */ +/* ws2tcpip.h is required here to properly make type definitions below. 
*/ +#ifdef CURLSYS_PULL_WS2TCPIP_H +# ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +# endif +# include +# include +# include +#endif + +/* CURLSYS_PULL_SYS_TYPES_H is defined above when inclusion of header file */ +/* sys/types.h is required here to properly make type definitions below. */ +#ifdef CURLSYS_PULL_SYS_TYPES_H +# include +#endif + +/* CURLSYS_PULL_SYS_SOCKET_H is defined above when inclusion of header file */ +/* sys/socket.h is required here to properly make type definitions below. */ +#ifdef CURLSYS_PULL_SYS_SOCKET_H +# include +#endif + +/* Data type definition of curl_socklen_t. */ +#ifdef CURLSYS_TYPEOF_CURL_SOCKLEN_T + typedef CURLSYS_TYPEOF_CURL_SOCKLEN_T curlsys_socklen_t; +#endif + +/* Data type definition of curl_off_t. */ + +#ifdef CURLSYS_TYPEOF_CURL_OFF_T + typedef CURLSYS_TYPEOF_CURL_OFF_T curlsys_off_t; +#endif + +#endif /* __CURL_SYSTEM_H */ + diff --git a/compat/curl-for-windows/curl/include/curl/typecheck-gcc.h b/compat/includes/curl/typecheck-gcc.h similarity index 93% rename from compat/curl-for-windows/curl/include/curl/typecheck-gcc.h rename to compat/includes/curl/typecheck-gcc.h index 69d41a20d1..3d683152b6 100644 --- a/compat/curl-for-windows/curl/include/curl/typecheck-gcc.h +++ b/compat/includes/curl/typecheck-gcc.h @@ -7,11 +7,11 @@ * | (__| |_| | _ <| |___ * \___|\___/|_| \_\_____| * - * Copyright (C) 1998 - 2014, Daniel Stenberg, , et al. + * Copyright (C) 1998 - 2016, Daniel Stenberg, , et al. * * This software is licensed as described in the file COPYING, which * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. + * are also available at https://curl.haxx.se/docs/copyright.html. 
* * You may opt to use, copy, modify, merge, publish, distribute and/or sell * copies of the Software, and permit persons to whom the Software is @@ -40,7 +40,7 @@ */ #define curl_easy_setopt(handle, option, value) \ __extension__ ({ \ - __typeof__ (option) _curl_opt = option; \ + __typeof__(option) _curl_opt = option; \ if(__builtin_constant_p(_curl_opt)) { \ if(_curl_is_long_option(_curl_opt)) \ if(!_curl_is_long(value)) \ @@ -110,7 +110,7 @@ __extension__ ({ \ /* FIXME: don't allow const pointers */ #define curl_easy_getinfo(handle, info, arg) \ __extension__ ({ \ - __typeof__ (info) _curl_info = info; \ + __typeof__(info) _curl_info = info; \ if(__builtin_constant_p(_curl_info)) { \ if(_curl_is_string_info(_curl_info)) \ if(!_curl_is_arr((arg), char *)) \ @@ -151,7 +151,7 @@ _CURL_WARNING(_curl_easy_setopt_err_curl_off_t, "curl_easy_setopt expects a curl_off_t argument for this option") _CURL_WARNING(_curl_easy_setopt_err_string, "curl_easy_setopt expects a " - "string (char* or char[]) argument for this option" + "string ('char *' or char[]) argument for this option" ) _CURL_WARNING(_curl_easy_setopt_err_write_callback, "curl_easy_setopt expects a curl_write_callback argument for this option") @@ -182,24 +182,25 @@ _CURL_WARNING(_curl_easy_setopt_err_error_buffer, "curl_easy_setopt expects a " "char buffer of CURL_ERROR_SIZE as argument for this option") _CURL_WARNING(_curl_easy_setopt_err_FILE, - "curl_easy_setopt expects a FILE* argument for this option") + "curl_easy_setopt expects a 'FILE *' argument for this option") _CURL_WARNING(_curl_easy_setopt_err_postfields, - "curl_easy_setopt expects a void* or char* argument for this option") + "curl_easy_setopt expects a 'void *' or 'char *' argument for this option") _CURL_WARNING(_curl_easy_setopt_err_curl_httpost, - "curl_easy_setopt expects a struct curl_httppost* argument for this option") + "curl_easy_setopt expects a 'struct curl_httppost *' " + "argument for this option") 
_CURL_WARNING(_curl_easy_setopt_err_curl_slist, - "curl_easy_setopt expects a struct curl_slist* argument for this option") + "curl_easy_setopt expects a 'struct curl_slist *' argument for this option") _CURL_WARNING(_curl_easy_setopt_err_CURLSH, "curl_easy_setopt expects a CURLSH* argument for this option") _CURL_WARNING(_curl_easy_getinfo_err_string, - "curl_easy_getinfo expects a pointer to char * for this info") + "curl_easy_getinfo expects a pointer to 'char *' for this info") _CURL_WARNING(_curl_easy_getinfo_err_long, "curl_easy_getinfo expects a pointer to long for this info") _CURL_WARNING(_curl_easy_getinfo_err_double, "curl_easy_getinfo expects a pointer to double for this info") _CURL_WARNING(_curl_easy_getinfo_err_curl_slist, - "curl_easy_getinfo expects a pointer to struct curl_slist * for this info") + "curl_easy_getinfo expects a pointer to 'struct curl_slist *' for this info") /* groups of curl_easy_setops options that take the same type of argument */ @@ -218,58 +219,68 @@ _CURL_WARNING(_curl_easy_getinfo_err_curl_slist, /* evaluates to true if option takes a char* argument */ #define _curl_is_string_option(option) \ - ((option) == CURLOPT_URL || \ - (option) == CURLOPT_PROXY || \ - (option) == CURLOPT_INTERFACE || \ - (option) == CURLOPT_NETRC_FILE || \ - (option) == CURLOPT_USERPWD || \ - (option) == CURLOPT_USERNAME || \ - (option) == CURLOPT_PASSWORD || \ - (option) == CURLOPT_PROXYUSERPWD || \ - (option) == CURLOPT_PROXYUSERNAME || \ - (option) == CURLOPT_PROXYPASSWORD || \ - (option) == CURLOPT_NOPROXY || \ + ((option) == CURLOPT_ABSTRACT_UNIX_SOCKET || \ (option) == CURLOPT_ACCEPT_ENCODING || \ - (option) == CURLOPT_REFERER || \ - (option) == CURLOPT_USERAGENT || \ + (option) == CURLOPT_CAINFO || \ + (option) == CURLOPT_CAPATH || \ (option) == CURLOPT_COOKIE || \ (option) == CURLOPT_COOKIEFILE || \ (option) == CURLOPT_COOKIEJAR || \ (option) == CURLOPT_COOKIELIST || \ + (option) == CURLOPT_CRLFILE || \ + (option) == CURLOPT_CUSTOMREQUEST || 
\ + (option) == CURLOPT_DEFAULT_PROTOCOL || \ + (option) == CURLOPT_DNS_INTERFACE || \ + (option) == CURLOPT_DNS_LOCAL_IP4 || \ + (option) == CURLOPT_DNS_LOCAL_IP6 || \ + (option) == CURLOPT_DNS_SERVERS || \ + (option) == CURLOPT_EGDSOCKET || \ (option) == CURLOPT_FTPPORT || \ - (option) == CURLOPT_FTP_ALTERNATIVE_TO_USER || \ (option) == CURLOPT_FTP_ACCOUNT || \ - (option) == CURLOPT_RANGE || \ - (option) == CURLOPT_CUSTOMREQUEST || \ - (option) == CURLOPT_SSLCERT || \ - (option) == CURLOPT_SSLCERTTYPE || \ - (option) == CURLOPT_SSLKEY || \ - (option) == CURLOPT_SSLKEYTYPE || \ + (option) == CURLOPT_FTP_ALTERNATIVE_TO_USER || \ + (option) == CURLOPT_INTERFACE || \ + (option) == CURLOPT_ISSUERCERT || \ (option) == CURLOPT_KEYPASSWD || \ - (option) == CURLOPT_SSLENGINE || \ - (option) == CURLOPT_CAINFO || \ - (option) == CURLOPT_CAPATH || \ - (option) == CURLOPT_RANDOM_FILE || \ - (option) == CURLOPT_EGDSOCKET || \ - (option) == CURLOPT_SSL_CIPHER_LIST || \ (option) == CURLOPT_KRBLEVEL || \ - (option) == CURLOPT_SSH_HOST_PUBLIC_KEY_MD5 || \ - (option) == CURLOPT_SSH_PUBLIC_KEYFILE || \ - (option) == CURLOPT_SSH_PRIVATE_KEYFILE || \ - (option) == CURLOPT_CRLFILE || \ - (option) == CURLOPT_ISSUERCERT || \ - (option) == CURLOPT_SOCKS5_GSSAPI_SERVICE || \ - (option) == CURLOPT_SSH_KNOWNHOSTS || \ + (option) == CURLOPT_LOGIN_OPTIONS || \ + (option) == CURLOPT_MAIL_AUTH || \ (option) == CURLOPT_MAIL_FROM || \ + (option) == CURLOPT_NETRC_FILE || \ + (option) == CURLOPT_NOPROXY || \ + (option) == CURLOPT_PASSWORD || \ + (option) == CURLOPT_PINNEDPUBLICKEY || \ + (option) == CURLOPT_PROXY || \ + (option) == CURLOPT_PROXYPASSWORD || \ + (option) == CURLOPT_PROXYUSERNAME || \ + (option) == CURLOPT_PROXYUSERPWD || \ + (option) == CURLOPT_PROXY_SERVICE_NAME || \ + (option) == CURLOPT_RANDOM_FILE || \ + (option) == CURLOPT_RANGE || \ + (option) == CURLOPT_REFERER || \ (option) == CURLOPT_RTSP_SESSION_ID || \ (option) == CURLOPT_RTSP_STREAM_URI || \ (option) == 
CURLOPT_RTSP_TRANSPORT || \ + (option) == CURLOPT_SERVICE_NAME || \ + (option) == CURLOPT_SOCKS5_GSSAPI_SERVICE || \ + (option) == CURLOPT_SSH_HOST_PUBLIC_KEY_MD5 || \ + (option) == CURLOPT_SSH_KNOWNHOSTS || \ + (option) == CURLOPT_SSH_PRIVATE_KEYFILE || \ + (option) == CURLOPT_SSH_PUBLIC_KEYFILE || \ + (option) == CURLOPT_SSLCERT || \ + (option) == CURLOPT_SSLCERTTYPE || \ + (option) == CURLOPT_SSLENGINE || \ + (option) == CURLOPT_SSLKEY || \ + (option) == CURLOPT_SSLKEYTYPE || \ + (option) == CURLOPT_SSL_CIPHER_LIST || \ + (option) == CURLOPT_TLSAUTH_PASSWORD || \ + (option) == CURLOPT_TLSAUTH_TYPE || \ + (option) == CURLOPT_TLSAUTH_USERNAME || \ + (option) == CURLOPT_UNIX_SOCKET_PATH || \ + (option) == CURLOPT_URL || \ + (option) == CURLOPT_USERAGENT || \ + (option) == CURLOPT_USERNAME || \ + (option) == CURLOPT_USERPWD || \ (option) == CURLOPT_XOAUTH2_BEARER || \ - (option) == CURLOPT_DNS_SERVERS || \ - (option) == CURLOPT_DNS_INTERFACE || \ - (option) == CURLOPT_DNS_LOCAL_IP4 || \ - (option) == CURLOPT_DNS_LOCAL_IP6 || \ - (option) == CURLOPT_LOGIN_OPTIONS || \ 0) /* evaluates to true if option takes a curl_write_callback argument */ @@ -285,21 +296,22 @@ _CURL_WARNING(_curl_easy_getinfo_err_curl_slist, /* evaluates to true if option takes a data argument to pass to a callback */ #define _curl_is_cb_data_option(option) \ - ((option) == CURLOPT_WRITEDATA || \ - (option) == CURLOPT_READDATA || \ + ((option) == CURLOPT_CHUNK_DATA || \ + (option) == CURLOPT_CLOSESOCKETDATA || \ + (option) == CURLOPT_DEBUGDATA || \ + (option) == CURLOPT_FNMATCH_DATA || \ + (option) == CURLOPT_HEADERDATA || \ + (option) == CURLOPT_INTERLEAVEDATA || \ (option) == CURLOPT_IOCTLDATA || \ - (option) == CURLOPT_SOCKOPTDATA || \ (option) == CURLOPT_OPENSOCKETDATA || \ + (option) == CURLOPT_PRIVATE || \ (option) == CURLOPT_PROGRESSDATA || \ - (option) == CURLOPT_HEADERDATA || \ - (option) == CURLOPT_DEBUGDATA || \ - (option) == CURLOPT_SSL_CTX_DATA || \ + (option) == CURLOPT_READDATA || \ 
(option) == CURLOPT_SEEKDATA || \ - (option) == CURLOPT_PRIVATE || \ + (option) == CURLOPT_SOCKOPTDATA || \ (option) == CURLOPT_SSH_KEYDATA || \ - (option) == CURLOPT_INTERLEAVEDATA || \ - (option) == CURLOPT_CHUNK_DATA || \ - (option) == CURLOPT_FNMATCH_DATA || \ + (option) == CURLOPT_SSL_CTX_DATA || \ + (option) == CURLOPT_WRITEDATA || \ 0) /* evaluates to true if option takes a POST data argument (void* or char*) */ @@ -310,13 +322,15 @@ _CURL_WARNING(_curl_easy_getinfo_err_curl_slist, /* evaluates to true if option takes a struct curl_slist * argument */ #define _curl_is_slist_option(option) \ - ((option) == CURLOPT_HTTPHEADER || \ - (option) == CURLOPT_HTTP200ALIASES || \ - (option) == CURLOPT_QUOTE || \ + ((option) == CURLOPT_HTTP200ALIASES || \ + (option) == CURLOPT_HTTPHEADER || \ + (option) == CURLOPT_MAIL_RCPT || \ (option) == CURLOPT_POSTQUOTE || \ (option) == CURLOPT_PREQUOTE || \ + (option) == CURLOPT_PROXYHEADER || \ + (option) == CURLOPT_QUOTE || \ + (option) == CURLOPT_RESOLVE || \ (option) == CURLOPT_TELNETOPTIONS || \ - (option) == CURLOPT_MAIL_RCPT || \ 0) /* groups of curl_easy_getinfo infos that take the same type of argument */ @@ -351,7 +365,7 @@ _CURL_WARNING(_curl_easy_getinfo_err_curl_slist, /* XXX: should evaluate to true iff expr is a pointer */ #define _curl_is_any_ptr(expr) \ - (sizeof(expr) == sizeof(void*)) + (sizeof(expr) == sizeof(void *)) /* evaluates to true if expr is NULL */ /* XXX: must not evaluate expr, so this check is not accurate */ @@ -443,12 +457,12 @@ _CURL_WARNING(_curl_easy_getinfo_err_curl_slist, _curl_callback_compatible((expr), _curl_read_callback4) || \ _curl_callback_compatible((expr), _curl_read_callback5) || \ _curl_callback_compatible((expr), _curl_read_callback6)) -typedef size_t (_curl_read_callback1)(char *, size_t, size_t, void*); -typedef size_t (_curl_read_callback2)(char *, size_t, size_t, const void*); -typedef size_t (_curl_read_callback3)(char *, size_t, size_t, FILE*); -typedef size_t 
(_curl_read_callback4)(void *, size_t, size_t, void*); -typedef size_t (_curl_read_callback5)(void *, size_t, size_t, const void*); -typedef size_t (_curl_read_callback6)(void *, size_t, size_t, FILE*); +typedef size_t (_curl_read_callback1)(char *, size_t, size_t, void *); +typedef size_t (_curl_read_callback2)(char *, size_t, size_t, const void *); +typedef size_t (_curl_read_callback3)(char *, size_t, size_t, FILE *); +typedef size_t (_curl_read_callback4)(void *, size_t, size_t, void *); +typedef size_t (_curl_read_callback5)(void *, size_t, size_t, const void *); +typedef size_t (_curl_read_callback6)(void *, size_t, size_t, FILE *); /* evaluates to true if expr is of type curl_write_callback or "similar" */ #define _curl_is_write_cb(expr) \ @@ -461,14 +475,14 @@ typedef size_t (_curl_read_callback6)(void *, size_t, size_t, FILE*); _curl_callback_compatible((expr), _curl_write_callback4) || \ _curl_callback_compatible((expr), _curl_write_callback5) || \ _curl_callback_compatible((expr), _curl_write_callback6)) -typedef size_t (_curl_write_callback1)(const char *, size_t, size_t, void*); +typedef size_t (_curl_write_callback1)(const char *, size_t, size_t, void *); typedef size_t (_curl_write_callback2)(const char *, size_t, size_t, - const void*); -typedef size_t (_curl_write_callback3)(const char *, size_t, size_t, FILE*); -typedef size_t (_curl_write_callback4)(const void *, size_t, size_t, void*); + const void *); +typedef size_t (_curl_write_callback3)(const char *, size_t, size_t, FILE *); +typedef size_t (_curl_write_callback4)(const void *, size_t, size_t, void *); typedef size_t (_curl_write_callback5)(const void *, size_t, size_t, - const void*); -typedef size_t (_curl_write_callback6)(const void *, size_t, size_t, FILE*); + const void *); +typedef size_t (_curl_write_callback6)(const void *, size_t, size_t, FILE *); /* evaluates to true if expr is of type curl_ioctl_callback or "similar" */ #define _curl_is_ioctl_cb(expr) \ @@ -478,10 +492,10 @@ 
typedef size_t (_curl_write_callback6)(const void *, size_t, size_t, FILE*); _curl_callback_compatible((expr), _curl_ioctl_callback2) || \ _curl_callback_compatible((expr), _curl_ioctl_callback3) || \ _curl_callback_compatible((expr), _curl_ioctl_callback4)) -typedef curlioerr (_curl_ioctl_callback1)(CURL *, int, void*); -typedef curlioerr (_curl_ioctl_callback2)(CURL *, int, const void*); -typedef curlioerr (_curl_ioctl_callback3)(CURL *, curliocmd, void*); -typedef curlioerr (_curl_ioctl_callback4)(CURL *, curliocmd, const void*); +typedef curlioerr (_curl_ioctl_callback1)(CURL *, int, void *); +typedef curlioerr (_curl_ioctl_callback2)(CURL *, int, const void *); +typedef curlioerr (_curl_ioctl_callback3)(CURL *, curliocmd, void *); +typedef curlioerr (_curl_ioctl_callback4)(CURL *, curliocmd, const void *); /* evaluates to true if expr is of type curl_sockopt_callback or "similar" */ #define _curl_is_sockopt_cb(expr) \ diff --git a/compat/pthreads/pthread.h b/compat/includes/pthreads/pthread.h similarity index 100% rename from compat/pthreads/pthread.h rename to compat/includes/pthreads/pthread.h diff --git a/compat/pthreads/sched.h b/compat/includes/pthreads/sched.h similarity index 100% rename from compat/pthreads/sched.h rename to compat/includes/pthreads/sched.h diff --git a/compat/curl-for-windows/zlib/zconf.h b/compat/includes/zlib/zconf.h similarity index 100% rename from compat/curl-for-windows/zlib/zconf.h rename to compat/includes/zlib/zconf.h diff --git a/compat/curl-for-windows/zlib/zlib.h b/compat/includes/zlib/zlib.h similarity index 100% rename from compat/curl-for-windows/zlib/zlib.h rename to compat/includes/zlib/zlib.h diff --git a/compat/jansson/config.h b/compat/jansson/config.h deleted file mode 100644 index 43858aa61f..0000000000 --- a/compat/jansson/config.h +++ /dev/null @@ -1,73 +0,0 @@ -/* config.h. Generated from config.h.in by configure. */ -/* config.h.in. Generated from configure.ac by autoheader. 
*/ - -/* Define to 1 if you have the header file. */ -#define HAVE_DLFCN_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_INTTYPES_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_MEMORY_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STDINT_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STDLIB_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STRINGS_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STRING_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_STAT_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_TYPES_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_UNISTD_H 1 - -/* Define to the sub-directory in which libtool stores uninstalled libraries. - */ -#define LT_OBJDIR ".libs/" - -/* Name of package */ -#define PACKAGE "jansson" - -/* Define to the address where bug reports for this package should be sent. */ -#define PACKAGE_BUGREPORT "petri@digip.org" - -/* Define to the full name of this package. */ -#define PACKAGE_NAME "jansson" - -/* Define to the full name and version of this package. */ -#define PACKAGE_STRING "jansson 1.3" - -/* Define to the one symbol short name of this package. */ -#define PACKAGE_TARNAME "jansson" - -/* Define to the home page for this package. */ -#define PACKAGE_URL "" - -/* Define to the version of this package. */ -#define PACKAGE_VERSION "1.3" - -/* Define to 1 if you have the ANSI C header files. */ -#define STDC_HEADERS 1 - -/* Version number of package */ -#define VERSION "1.3" - -/* Define to `__inline__' or `__inline' if that's what the C compiler - calls it, or to nothing if 'inline' is not supported under any name. */ -#ifndef __cplusplus -/* #undef inline */ -#endif - -/* Define to the type of a signed integer type of width exactly 32 bits if - such a type exists and the standard includes do not define it. 
*/ -/* #undef int32_t */ diff --git a/compat/jansson/jansson_config.h b/compat/jansson/jansson_config.h index 90ca129281..42421e8a86 100644 --- a/compat/jansson/jansson_config.h +++ b/compat/jansson/jansson_config.h @@ -22,8 +22,10 @@ supported. */ #ifdef _MSC_VER +#ifndef __cplusplus #define inline __inline #endif +#endif #ifdef __cplusplus #define JSON_INLINE inline diff --git a/compat/libs/x64/jansson.lib b/compat/libs/x64/jansson.lib new file mode 100644 index 0000000000..925e87c850 Binary files /dev/null and b/compat/libs/x64/jansson.lib differ diff --git a/compat/libs/x64/libcrypto.lib b/compat/libs/x64/libcrypto.lib new file mode 100644 index 0000000000..a364a4d822 Binary files /dev/null and b/compat/libs/x64/libcrypto.lib differ diff --git a/compat/libs/x64/libcurl.lib b/compat/libs/x64/libcurl.lib new file mode 100644 index 0000000000..ded35f14e4 Binary files /dev/null and b/compat/libs/x64/libcurl.lib differ diff --git a/compat/pthreads/x64/pthreadVC2.lib b/compat/libs/x64/pthreadVC2.lib similarity index 100% rename from compat/pthreads/x64/pthreadVC2.lib rename to compat/libs/x64/pthreadVC2.lib diff --git a/compat/libs/x64/zlibstat.lib b/compat/libs/x64/zlibstat.lib new file mode 100644 index 0000000000..5078caf7a1 Binary files /dev/null and b/compat/libs/x64/zlibstat.lib differ diff --git a/compat/libs/x86/jansson.lib b/compat/libs/x86/jansson.lib new file mode 100644 index 0000000000..10b32d1b30 Binary files /dev/null and b/compat/libs/x86/jansson.lib differ diff --git a/compat/libs/x86/libcrypto.lib b/compat/libs/x86/libcrypto.lib new file mode 100644 index 0000000000..6a7068f058 Binary files /dev/null and b/compat/libs/x86/libcrypto.lib differ diff --git a/compat/libs/x86/libcurl.lib b/compat/libs/x86/libcurl.lib new file mode 100644 index 0000000000..e227cfdfe3 Binary files /dev/null and b/compat/libs/x86/libcurl.lib differ diff --git a/compat/pthreads/x86/pthreadVC2.lib b/compat/libs/x86/pthreadVC2.lib similarity index 100% rename from 
compat/pthreads/x86/pthreadVC2.lib rename to compat/libs/x86/pthreadVC2.lib diff --git a/compat/libs/x86/zlibstat.lib b/compat/libs/x86/zlibstat.lib new file mode 100644 index 0000000000..387e902b03 Binary files /dev/null and b/compat/libs/x86/zlibstat.lib differ diff --git a/compat/winansi.c b/compat/winansi.cpp similarity index 96% rename from compat/winansi.c rename to compat/winansi.cpp index 50e8388ac1..802f93ced9 100644 --- a/compat/winansi.c +++ b/compat/winansi.cpp @@ -1,3 +1,15 @@ +extern void proper_exit(int reason); +enum +{ + LOG_ERR, + LOG_WARNING, + LOG_NOTICE, + LOG_INFO, + LOG_DEBUG, + /* custom notices */ + LOG_BLUE = 0x10, +}; +extern void applog(int prio, const char *fmt, ...); /** * Old Git implementation of windows terminal colors (2009) * before use of a threaded wrapper. @@ -345,9 +357,12 @@ int winansi_vfprintf(FILE *stream, const char *format, va_list list) va_end(cp); if (len > sizeof(small_buf) - 1) { - buf = malloc(len + 1); - if (!buf) - goto abort; + buf = (char*)malloc(len + 1); + if(buf == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } len = vsnprintf(buf, len + 1, format, list); #ifdef WIN32 diff --git a/compile b/compile deleted file mode 100644 index b1f4749152..0000000000 --- a/compile +++ /dev/null @@ -1,310 +0,0 @@ -#! /bin/sh -# Wrapper for compilers which do not understand '-c -o'. - -scriptversion=2012-01-04.17; # UTC - -# Copyright (C) 1999, 2000, 2003, 2004, 2005, 2009, 2010, 2012 Free -# Software Foundation, Inc. -# Written by Tom Tromey . -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2, or (at your option) -# any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . - -# As a special exception to the GNU General Public License, if you -# distribute this file as part of a program that contains a -# configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. - -# This file is maintained in Automake, please report -# bugs to or send patches to -# . - -nl=' -' - -# We need space, tab and new line, in precisely that order. Quoting is -# there to prevent tools from complaining about whitespace usage. -IFS=" "" $nl" - -file_conv= - -# func_file_conv build_file lazy -# Convert a $build file to $host form and store it in $file -# Currently only supports Windows hosts. If the determined conversion -# type is listed in (the comma separated) LAZY, no conversion will -# take place. -func_file_conv () -{ - file=$1 - case $file in - / | /[!/]*) # absolute file, and not a UNC file - if test -z "$file_conv"; then - # lazily determine how to convert abs files - case `uname -s` in - MINGW*) - file_conv=mingw - ;; - CYGWIN*) - file_conv=cygwin - ;; - *) - file_conv=wine - ;; - esac - fi - case $file_conv/,$2, in - *,$file_conv,*) - ;; - mingw/*) - file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'` - ;; - cygwin/*) - file=`cygpath -m "$file" || echo "$file"` - ;; - wine/*) - file=`winepath -w "$file" || echo "$file"` - ;; - esac - ;; - esac -} - -# func_cl_wrapper cl arg... -# Adjust compile command to suit cl -func_cl_wrapper () -{ - # Assume a capable shell - lib_path= - shared=: - linker_opts= - for arg - do - if test -n "$eat"; then - eat= - else - case $1 in - -o) - # configure might choose to run compile as 'compile cc -o foo foo.c'. 
- eat=1 - case $2 in - *.o | *.[oO][bB][jJ]) - func_file_conv "$2" - set x "$@" -Fo"$file" - shift - ;; - *) - func_file_conv "$2" - set x "$@" -Fe"$file" - shift - ;; - esac - ;; - -I*) - func_file_conv "${1#-I}" mingw - set x "$@" -I"$file" - shift - ;; - -l*) - lib=${1#-l} - found=no - save_IFS=$IFS - IFS=';' - for dir in $lib_path $LIB - do - IFS=$save_IFS - if $shared && test -f "$dir/$lib.dll.lib"; then - found=yes - set x "$@" "$dir/$lib.dll.lib" - break - fi - if test -f "$dir/$lib.lib"; then - found=yes - set x "$@" "$dir/$lib.lib" - break - fi - done - IFS=$save_IFS - - test "$found" != yes && set x "$@" "$lib.lib" - shift - ;; - -L*) - func_file_conv "${1#-L}" - if test -z "$lib_path"; then - lib_path=$file - else - lib_path="$lib_path;$file" - fi - linker_opts="$linker_opts -LIBPATH:$file" - ;; - -static) - shared=false - ;; - -Wl,*) - arg=${1#-Wl,} - save_ifs="$IFS"; IFS=',' - for flag in $arg; do - IFS="$save_ifs" - linker_opts="$linker_opts $flag" - done - IFS="$save_ifs" - ;; - -Xlinker) - eat=1 - linker_opts="$linker_opts $2" - ;; - -*) - set x "$@" "$1" - shift - ;; - *.cc | *.CC | *.cxx | *.CXX | *.[cC]++) - func_file_conv "$1" - set x "$@" -Tp"$file" - shift - ;; - *.c | *.cpp | *.CPP | *.lib | *.LIB | *.Lib | *.OBJ | *.obj | *.[oO]) - func_file_conv "$1" mingw - set x "$@" "$file" - shift - ;; - *) - set x "$@" "$1" - shift - ;; - esac - fi - shift - done - if test -n "$linker_opts"; then - linker_opts="-link$linker_opts" - fi - exec "$@" $linker_opts - exit 1 -} - -eat= - -case $1 in - '') - echo "$0: No command. Try '$0 --help' for more information." 1>&2 - exit 1; - ;; - -h | --h*) - cat <<\EOF -Usage: compile [--help] [--version] PROGRAM [ARGS] - -Wrapper for compilers which do not understand '-c -o'. -Remove '-o dest.o' from ARGS, run PROGRAM with the remaining -arguments, and rename the output as expected. - -If you are trying to build a whole package this is not the -right script to run: please start by reading the file 'INSTALL'. 
- -Report bugs to . -EOF - exit $? - ;; - -v | --v*) - echo "compile $scriptversion" - exit $? - ;; - cl | *[/\\]cl | cl.exe | *[/\\]cl.exe ) - func_cl_wrapper "$@" # Doesn't return... - ;; -esac - -ofile= -cfile= - -for arg -do - if test -n "$eat"; then - eat= - else - case $1 in - -o) - # configure might choose to run compile as 'compile cc -o foo foo.c'. - # So we strip '-o arg' only if arg is an object. - eat=1 - case $2 in - *.o | *.obj) - ofile=$2 - ;; - *) - set x "$@" -o "$2" - shift - ;; - esac - ;; - *.c) - cfile=$1 - set x "$@" "$1" - shift - ;; - *) - set x "$@" "$1" - shift - ;; - esac - fi - shift -done - -if test -z "$ofile" || test -z "$cfile"; then - # If no '-o' option was seen then we might have been invoked from a - # pattern rule where we don't need one. That is ok -- this is a - # normal compilation that the losing compiler can handle. If no - # '.c' file was seen then we are probably linking. That is also - # ok. - exec "$@" -fi - -# Name of file we expect compiler to create. -cofile=`echo "$cfile" | sed 's|^.*[\\/]||; s|^[a-zA-Z]:||; s/\.c$/.o/'` - -# Create the lock directory. -# Note: use '[/\\:.-]' here to ensure that we don't use the same name -# that we are using for the .o file. Also, base the name on the expected -# object file name, since that is what matters with a parallel build. -lockdir=`echo "$cofile" | sed -e 's|[/\\:.-]|_|g'`.d -while true; do - if mkdir "$lockdir" >/dev/null 2>&1; then - break - fi - sleep 1 -done -# FIXME: race condition here if user kills between mkdir and trap. -trap "rmdir '$lockdir'; exit 1" 1 2 15 - -# Run the compile. -"$@" -ret=$? 
- -if test -f "$cofile"; then - test "$cofile" = "$ofile" || mv "$cofile" "$ofile" -elif test -f "${cofile}bj"; then - test "${cofile}bj" = "$ofile" || mv "${cofile}bj" "$ofile" -fi - -rmdir "$lockdir" -exit $ret - -# Local Variables: -# mode: shell-script -# sh-indentation: 2 -# eval: (add-hook 'write-file-hooks 'time-stamp) -# time-stamp-start: "scriptversion=" -# time-stamp-format: "%:y-%02m-%02d.%02H" -# time-stamp-time-zone: "UTC" -# time-stamp-end: "; # UTC" -# End: diff --git a/config.guess b/config.guess deleted file mode 100644 index f32079abda..0000000000 --- a/config.guess +++ /dev/null @@ -1,1526 +0,0 @@ -#! /bin/sh -# Attempt to guess a canonical system name. -# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, -# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 -# Free Software Foundation, Inc. - -timestamp='2008-01-23' - -# This file is free software; you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA -# 02110-1301, USA. -# -# As a special exception to the GNU General Public License, if you -# distribute this file as part of a program that contains a -# configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. - - -# Originally written by Per Bothner . -# Please send patches to . Submit a context -# diff and a properly formatted ChangeLog entry. 
-# -# This script attempts to guess a canonical system name similar to -# config.sub. If it succeeds, it prints the system name on stdout, and -# exits with 0. Otherwise, it exits with 1. -# -# The plan is that this can be called by configure scripts if you -# don't specify an explicit build system type. - -me=`echo "$0" | sed -e 's,.*/,,'` - -usage="\ -Usage: $0 [OPTION] - -Output the configuration name of the system \`$me' is run on. - -Operation modes: - -h, --help print this help, then exit - -t, --time-stamp print date of last modification, then exit - -v, --version print version number, then exit - -Report bugs and patches to ." - -version="\ -GNU config.guess ($timestamp) - -Originally written by Per Bothner. -Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, -2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc. - -This is free software; see the source for copying conditions. There is NO -warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." - -help=" -Try \`$me --help' for more information." - -# Parse command line -while test $# -gt 0 ; do - case $1 in - --time-stamp | --time* | -t ) - echo "$timestamp" ; exit ;; - --version | -v ) - echo "$version" ; exit ;; - --help | --h* | -h ) - echo "$usage"; exit ;; - -- ) # Stop option processing - shift; break ;; - - ) # Use stdin as input. - break ;; - -* ) - echo "$me: invalid option $1$help" >&2 - exit 1 ;; - * ) - break ;; - esac -done - -if test $# != 0; then - echo "$me: too many arguments$help" >&2 - exit 1 -fi - -trap 'exit 1' 1 2 15 - -# CC_FOR_BUILD -- compiler used by this script. Note that the use of a -# compiler to aid in system detection is discouraged as it requires -# temporary files to be created and, as you can see below, it is a -# headache to deal with in a portable fashion. - -# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still -# use `HOST_CC' if defined, but it is deprecated. 
- -# Portable tmp directory creation inspired by the Autoconf team. - -set_cc_for_build=' -trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ; -trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ; -: ${TMPDIR=/tmp} ; - { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } || - { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } || - { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } || - { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ; -dummy=$tmp/dummy ; -tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ; -case $CC_FOR_BUILD,$HOST_CC,$CC in - ,,) echo "int x;" > $dummy.c ; - for c in cc gcc c89 c99 ; do - if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then - CC_FOR_BUILD="$c"; break ; - fi ; - done ; - if test x"$CC_FOR_BUILD" = x ; then - CC_FOR_BUILD=no_compiler_found ; - fi - ;; - ,,*) CC_FOR_BUILD=$CC ;; - ,*,*) CC_FOR_BUILD=$HOST_CC ;; -esac ; set_cc_for_build= ;' - -# This is needed to find uname on a Pyramid OSx when run in the BSD universe. -# (ghazi@noc.rutgers.edu 1994-08-24) -if (test -f /.attbin/uname) >/dev/null 2>&1 ; then - PATH=$PATH:/.attbin ; export PATH -fi - -UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown -UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown -UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown -UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown - -# Note: order is significant - the case branches are not exclusive. - -case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in - *:NetBSD:*:*) - # NetBSD (nbsd) targets should (where applicable) match one or - # more of the tupples: *-*-netbsdelf*, *-*-netbsdaout*, - # *-*-netbsdecoff* and *-*-netbsd*. 
For targets that recently - # switched to ELF, *-*-netbsd* would select the old - # object file format. This provides both forward - # compatibility and a consistent mechanism for selecting the - # object file format. - # - # Note: NetBSD doesn't particularly care about the vendor - # portion of the name. We always set it to "unknown". - sysctl="sysctl -n hw.machine_arch" - UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \ - /usr/sbin/$sysctl 2>/dev/null || echo unknown)` - case "${UNAME_MACHINE_ARCH}" in - armeb) machine=armeb-unknown ;; - arm*) machine=arm-unknown ;; - sh3el) machine=shl-unknown ;; - sh3eb) machine=sh-unknown ;; - sh5el) machine=sh5le-unknown ;; - *) machine=${UNAME_MACHINE_ARCH}-unknown ;; - esac - # The Operating System including object format, if it has switched - # to ELF recently, or will in the future. - case "${UNAME_MACHINE_ARCH}" in - arm*|i386|m68k|ns32k|sh3*|sparc|vax) - eval $set_cc_for_build - if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \ - | grep __ELF__ >/dev/null - then - # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout). - # Return netbsd for either. FIX? - os=netbsd - else - os=netbsdelf - fi - ;; - *) - os=netbsd - ;; - esac - # The OS release - # Debian GNU/NetBSD machines have a different userland, and - # thus, need a distinct triplet. However, they do not need - # kernel version information, so it can be replaced with a - # suitable tag, in the style of linux-gnu. - case "${UNAME_VERSION}" in - Debian*) - release='-gnu' - ;; - *) - release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'` - ;; - esac - # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM: - # contains redundant information, the shorter form: - # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used. 
- echo "${machine}-${os}${release}" - exit ;; - *:OpenBSD:*:*) - UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'` - echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE} - exit ;; - *:ekkoBSD:*:*) - echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE} - exit ;; - *:SolidBSD:*:*) - echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE} - exit ;; - macppc:MirBSD:*:*) - echo powerpc-unknown-mirbsd${UNAME_RELEASE} - exit ;; - *:MirBSD:*:*) - echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE} - exit ;; - alpha:OSF1:*:*) - case $UNAME_RELEASE in - *4.0) - UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'` - ;; - *5.*) - UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'` - ;; - esac - # According to Compaq, /usr/sbin/psrinfo has been available on - # OSF/1 and Tru64 systems produced since 1995. I hope that - # covers most systems running today. This code pipes the CPU - # types through head -n 1, so we only detect the type of CPU 0. - ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1` - case "$ALPHA_CPU_TYPE" in - "EV4 (21064)") - UNAME_MACHINE="alpha" ;; - "EV4.5 (21064)") - UNAME_MACHINE="alpha" ;; - "LCA4 (21066/21068)") - UNAME_MACHINE="alpha" ;; - "EV5 (21164)") - UNAME_MACHINE="alphaev5" ;; - "EV5.6 (21164A)") - UNAME_MACHINE="alphaev56" ;; - "EV5.6 (21164PC)") - UNAME_MACHINE="alphapca56" ;; - "EV5.7 (21164PC)") - UNAME_MACHINE="alphapca57" ;; - "EV6 (21264)") - UNAME_MACHINE="alphaev6" ;; - "EV6.7 (21264A)") - UNAME_MACHINE="alphaev67" ;; - "EV6.8CB (21264C)") - UNAME_MACHINE="alphaev68" ;; - "EV6.8AL (21264B)") - UNAME_MACHINE="alphaev68" ;; - "EV6.8CX (21264D)") - UNAME_MACHINE="alphaev68" ;; - "EV6.9A (21264/EV69A)") - UNAME_MACHINE="alphaev69" ;; - "EV7 (21364)") - UNAME_MACHINE="alphaev7" ;; - "EV7.9 (21364A)") - UNAME_MACHINE="alphaev79" ;; - esac - # A Pn.n version is a patched version. - # A Vn.n version is a released version. - # A Tn.n version is a released field test version. 
- # A Xn.n version is an unreleased experimental baselevel. - # 1.2 uses "1.2" for uname -r. - echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` - exit ;; - Alpha\ *:Windows_NT*:*) - # How do we know it's Interix rather than the generic POSIX subsystem? - # Should we change UNAME_MACHINE based on the output of uname instead - # of the specific Alpha model? - echo alpha-pc-interix - exit ;; - 21064:Windows_NT:50:3) - echo alpha-dec-winnt3.5 - exit ;; - Amiga*:UNIX_System_V:4.0:*) - echo m68k-unknown-sysv4 - exit ;; - *:[Aa]miga[Oo][Ss]:*:*) - echo ${UNAME_MACHINE}-unknown-amigaos - exit ;; - *:[Mm]orph[Oo][Ss]:*:*) - echo ${UNAME_MACHINE}-unknown-morphos - exit ;; - *:OS/390:*:*) - echo i370-ibm-openedition - exit ;; - *:z/VM:*:*) - echo s390-ibm-zvmoe - exit ;; - *:OS400:*:*) - echo powerpc-ibm-os400 - exit ;; - arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) - echo arm-acorn-riscix${UNAME_RELEASE} - exit ;; - arm:riscos:*:*|arm:RISCOS:*:*) - echo arm-unknown-riscos - exit ;; - SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*) - echo hppa1.1-hitachi-hiuxmpp - exit ;; - Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*) - # akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE. 
- if test "`(/bin/universe) 2>/dev/null`" = att ; then - echo pyramid-pyramid-sysv3 - else - echo pyramid-pyramid-bsd - fi - exit ;; - NILE*:*:*:dcosx) - echo pyramid-pyramid-svr4 - exit ;; - DRS?6000:unix:4.0:6*) - echo sparc-icl-nx6 - exit ;; - DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*) - case `/usr/bin/uname -p` in - sparc) echo sparc-icl-nx7; exit ;; - esac ;; - sun4H:SunOS:5.*:*) - echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` - exit ;; - sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*) - echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` - exit ;; - i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*) - echo i386-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` - exit ;; - sun4*:SunOS:6*:*) - # According to config.sub, this is the proper way to canonicalize - # SunOS6. Hard to guess exactly what SunOS6 will be like, but - # it's likely to be more like Solaris than SunOS4. - echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` - exit ;; - sun4*:SunOS:*:*) - case "`/usr/bin/arch -k`" in - Series*|S4*) - UNAME_RELEASE=`uname -v` - ;; - esac - # Japanese Language versions have a version number like `4.1.3-JL'. - echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'` - exit ;; - sun3*:SunOS:*:*) - echo m68k-sun-sunos${UNAME_RELEASE} - exit ;; - sun*:*:4.2BSD:*) - UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null` - test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3 - case "`/bin/arch`" in - sun3) - echo m68k-sun-sunos${UNAME_RELEASE} - ;; - sun4) - echo sparc-sun-sunos${UNAME_RELEASE} - ;; - esac - exit ;; - aushp:SunOS:*:*) - echo sparc-auspex-sunos${UNAME_RELEASE} - exit ;; - # The situation for MiNT is a little confusing. The machine name - # can be virtually everything (everything which is not - # "atarist" or "atariste" at least should have a processor - # > m68000). The system name ranges from "MiNT" over "FreeMiNT" - # to the lowercase version "mint" (or "freemint"). 
Finally - # the system name "TOS" denotes a system which is actually not - # MiNT. But MiNT is downward compatible to TOS, so this should - # be no problem. - atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*) - echo m68k-atari-mint${UNAME_RELEASE} - exit ;; - atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*) - echo m68k-atari-mint${UNAME_RELEASE} - exit ;; - *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*) - echo m68k-atari-mint${UNAME_RELEASE} - exit ;; - milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*) - echo m68k-milan-mint${UNAME_RELEASE} - exit ;; - hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*) - echo m68k-hades-mint${UNAME_RELEASE} - exit ;; - *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*) - echo m68k-unknown-mint${UNAME_RELEASE} - exit ;; - m68k:machten:*:*) - echo m68k-apple-machten${UNAME_RELEASE} - exit ;; - powerpc:machten:*:*) - echo powerpc-apple-machten${UNAME_RELEASE} - exit ;; - RISC*:Mach:*:*) - echo mips-dec-mach_bsd4.3 - exit ;; - RISC*:ULTRIX:*:*) - echo mips-dec-ultrix${UNAME_RELEASE} - exit ;; - VAX*:ULTRIX*:*:*) - echo vax-dec-ultrix${UNAME_RELEASE} - exit ;; - 2020:CLIX:*:* | 2430:CLIX:*:*) - echo clipper-intergraph-clix${UNAME_RELEASE} - exit ;; - mips:*:*:UMIPS | mips:*:*:RISCos) - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c -#ifdef __cplusplus -#include /* for printf() prototype */ - int main (int argc, char *argv[]) { -#else - int main (argc, argv) int argc; char *argv[]; { -#endif - #if defined (host_mips) && defined (MIPSEB) - #if defined (SYSTYPE_SYSV) - printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0); - #endif - #if defined (SYSTYPE_SVR4) - printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0); - #endif - #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD) - printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0); - #endif - #endif - exit (-1); - } -EOF - $CC_FOR_BUILD -o $dummy $dummy.c && - dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` && - 
SYSTEM_NAME=`$dummy $dummyarg` && - { echo "$SYSTEM_NAME"; exit; } - echo mips-mips-riscos${UNAME_RELEASE} - exit ;; - Motorola:PowerMAX_OS:*:*) - echo powerpc-motorola-powermax - exit ;; - Motorola:*:4.3:PL8-*) - echo powerpc-harris-powermax - exit ;; - Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*) - echo powerpc-harris-powermax - exit ;; - Night_Hawk:Power_UNIX:*:*) - echo powerpc-harris-powerunix - exit ;; - m88k:CX/UX:7*:*) - echo m88k-harris-cxux7 - exit ;; - m88k:*:4*:R4*) - echo m88k-motorola-sysv4 - exit ;; - m88k:*:3*:R3*) - echo m88k-motorola-sysv3 - exit ;; - AViiON:dgux:*:*) - # DG/UX returns AViiON for all architectures - UNAME_PROCESSOR=`/usr/bin/uname -p` - if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ] - then - if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \ - [ ${TARGET_BINARY_INTERFACE}x = x ] - then - echo m88k-dg-dgux${UNAME_RELEASE} - else - echo m88k-dg-dguxbcs${UNAME_RELEASE} - fi - else - echo i586-dg-dgux${UNAME_RELEASE} - fi - exit ;; - M88*:DolphinOS:*:*) # DolphinOS (SVR3) - echo m88k-dolphin-sysv3 - exit ;; - M88*:*:R3*:*) - # Delta 88k system running SVR3 - echo m88k-motorola-sysv3 - exit ;; - XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3) - echo m88k-tektronix-sysv3 - exit ;; - Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD) - echo m68k-tektronix-bsd - exit ;; - *:IRIX*:*:*) - echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'` - exit ;; - ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX. 
- echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id - exit ;; # Note that: echo "'`uname -s`'" gives 'AIX ' - i*86:AIX:*:*) - echo i386-ibm-aix - exit ;; - ia64:AIX:*:*) - if [ -x /usr/bin/oslevel ] ; then - IBM_REV=`/usr/bin/oslevel` - else - IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} - fi - echo ${UNAME_MACHINE}-ibm-aix${IBM_REV} - exit ;; - *:AIX:2:3) - if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - #include - - main() - { - if (!__power_pc()) - exit(1); - puts("powerpc-ibm-aix3.2.5"); - exit(0); - } -EOF - if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` - then - echo "$SYSTEM_NAME" - else - echo rs6000-ibm-aix3.2.5 - fi - elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then - echo rs6000-ibm-aix3.2.4 - else - echo rs6000-ibm-aix3.2 - fi - exit ;; - *:AIX:*:[456]) - IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'` - if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then - IBM_ARCH=rs6000 - else - IBM_ARCH=powerpc - fi - if [ -x /usr/bin/oslevel ] ; then - IBM_REV=`/usr/bin/oslevel` - else - IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} - fi - echo ${IBM_ARCH}-ibm-aix${IBM_REV} - exit ;; - *:AIX:*:*) - echo rs6000-ibm-aix - exit ;; - ibmrt:4.4BSD:*|romp-ibm:BSD:*) - echo romp-ibm-bsd4.4 - exit ;; - ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and - echo romp-ibm-bsd${UNAME_RELEASE} # 4.3 with uname added to - exit ;; # report: romp-ibm BSD 4.3 - *:BOSX:*:*) - echo rs6000-bull-bosx - exit ;; - DPX/2?00:B.O.S.:*:*) - echo m68k-bull-sysv3 - exit ;; - 9000/[34]??:4.3bsd:1.*:*) - echo m68k-hp-bsd - exit ;; - hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*) - echo m68k-hp-bsd4.4 - exit ;; - 9000/[34678]??:HP-UX:*:*) - HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` - case "${UNAME_MACHINE}" in - 9000/31? ) HP_ARCH=m68000 ;; - 9000/[34]?? 
) HP_ARCH=m68k ;; - 9000/[678][0-9][0-9]) - if [ -x /usr/bin/getconf ]; then - sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null` - sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` - case "${sc_cpu_version}" in - 523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0 - 528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1 - 532) # CPU_PA_RISC2_0 - case "${sc_kernel_bits}" in - 32) HP_ARCH="hppa2.0n" ;; - 64) HP_ARCH="hppa2.0w" ;; - '') HP_ARCH="hppa2.0" ;; # HP-UX 10.20 - esac ;; - esac - fi - if [ "${HP_ARCH}" = "" ]; then - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - - #define _HPUX_SOURCE - #include - #include - - int main () - { - #if defined(_SC_KERNEL_BITS) - long bits = sysconf(_SC_KERNEL_BITS); - #endif - long cpu = sysconf (_SC_CPU_VERSION); - - switch (cpu) - { - case CPU_PA_RISC1_0: puts ("hppa1.0"); break; - case CPU_PA_RISC1_1: puts ("hppa1.1"); break; - case CPU_PA_RISC2_0: - #if defined(_SC_KERNEL_BITS) - switch (bits) - { - case 64: puts ("hppa2.0w"); break; - case 32: puts ("hppa2.0n"); break; - default: puts ("hppa2.0"); break; - } break; - #else /* !defined(_SC_KERNEL_BITS) */ - puts ("hppa2.0"); break; - #endif - default: puts ("hppa1.0"); break; - } - exit (0); - } -EOF - (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy` - test -z "$HP_ARCH" && HP_ARCH=hppa - fi ;; - esac - if [ ${HP_ARCH} = "hppa2.0w" ] - then - eval $set_cc_for_build - - # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating - # 32-bit code. hppa64-hp-hpux* has the same kernel and a compiler - # generating 64-bit code. 
GNU and HP use different nomenclature: - # - # $ CC_FOR_BUILD=cc ./config.guess - # => hppa2.0w-hp-hpux11.23 - # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess - # => hppa64-hp-hpux11.23 - - if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | - grep __LP64__ >/dev/null - then - HP_ARCH="hppa2.0w" - else - HP_ARCH="hppa64" - fi - fi - echo ${HP_ARCH}-hp-hpux${HPUX_REV} - exit ;; - ia64:HP-UX:*:*) - HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` - echo ia64-hp-hpux${HPUX_REV} - exit ;; - 3050*:HI-UX:*:*) - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - #include - int - main () - { - long cpu = sysconf (_SC_CPU_VERSION); - /* The order matters, because CPU_IS_HP_MC68K erroneously returns - true for CPU_PA_RISC1_0. CPU_IS_PA_RISC returns correct - results, however. */ - if (CPU_IS_PA_RISC (cpu)) - { - switch (cpu) - { - case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break; - case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break; - case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break; - default: puts ("hppa-hitachi-hiuxwe2"); break; - } - } - else if (CPU_IS_HP_MC68K (cpu)) - puts ("m68k-hitachi-hiuxwe2"); - else puts ("unknown-hitachi-hiuxwe2"); - exit (0); - } -EOF - $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` && - { echo "$SYSTEM_NAME"; exit; } - echo unknown-hitachi-hiuxwe2 - exit ;; - 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* ) - echo hppa1.1-hp-bsd - exit ;; - 9000/8??:4.3bsd:*:*) - echo hppa1.0-hp-bsd - exit ;; - *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*) - echo hppa1.0-hp-mpeix - exit ;; - hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* ) - echo hppa1.1-hp-osf - exit ;; - hp8??:OSF1:*:*) - echo hppa1.0-hp-osf - exit ;; - i*86:OSF1:*:*) - if [ -x /usr/sbin/sysversion ] ; then - echo ${UNAME_MACHINE}-unknown-osf1mk - else - echo ${UNAME_MACHINE}-unknown-osf1 - fi - exit ;; - parisc*:Lites*:*:*) - echo hppa1.1-hp-lites - exit ;; - C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*) - echo c1-convex-bsd - exit ;; - 
C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*) - if getsysinfo -f scalar_acc - then echo c32-convex-bsd - else echo c2-convex-bsd - fi - exit ;; - C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*) - echo c34-convex-bsd - exit ;; - C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*) - echo c38-convex-bsd - exit ;; - C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*) - echo c4-convex-bsd - exit ;; - CRAY*Y-MP:*:*:*) - echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' - exit ;; - CRAY*[A-Z]90:*:*:*) - echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \ - | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \ - -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \ - -e 's/\.[^.]*$/.X/' - exit ;; - CRAY*TS:*:*:*) - echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' - exit ;; - CRAY*T3E:*:*:*) - echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' - exit ;; - CRAY*SV1:*:*:*) - echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' - exit ;; - *:UNICOS/mp:*:*) - echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' - exit ;; - F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*) - FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` - FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` - FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'` - echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" - exit ;; - 5000:UNIX_System_V:4.*:*) - FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` - FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'` - echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" - exit ;; - i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*) - echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE} - exit ;; - sparc*:BSD/OS:*:*) - echo sparc-unknown-bsdi${UNAME_RELEASE} - exit ;; - *:BSD/OS:*:*) - echo 
${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE} - exit ;; - *:FreeBSD:*:*) - case ${UNAME_MACHINE} in - pc98) - echo i386-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; - amd64) - echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; - *) - echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; - esac - exit ;; - i*:CYGWIN*:*) - echo ${UNAME_MACHINE}-pc-cygwin - exit ;; - *:MINGW*:*) - echo ${UNAME_MACHINE}-pc-mingw32 - exit ;; - i*:windows32*:*) - # uname -m includes "-pc" on this system. - echo ${UNAME_MACHINE}-mingw32 - exit ;; - i*:PW*:*) - echo ${UNAME_MACHINE}-pc-pw32 - exit ;; - *:Interix*:[3456]*) - case ${UNAME_MACHINE} in - x86) - echo i586-pc-interix${UNAME_RELEASE} - exit ;; - EM64T | authenticamd) - echo x86_64-unknown-interix${UNAME_RELEASE} - exit ;; - IA64) - echo ia64-unknown-interix${UNAME_RELEASE} - exit ;; - esac ;; - [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*) - echo i${UNAME_MACHINE}-pc-mks - exit ;; - i*:Windows_NT*:* | Pentium*:Windows_NT*:*) - # How do we know it's Interix rather than the generic POSIX subsystem? - # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we - # UNAME_MACHINE based on the output of uname instead of i386? 
- echo i586-pc-interix - exit ;; - i*:UWIN*:*) - echo ${UNAME_MACHINE}-pc-uwin - exit ;; - amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*) - echo x86_64-unknown-cygwin - exit ;; - p*:CYGWIN*:*) - echo powerpcle-unknown-cygwin - exit ;; - prep*:SunOS:5.*:*) - echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` - exit ;; - *:GNU:*:*) - # the GNU system - echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` - exit ;; - *:GNU/*:*:*) - # other systems with GNU libc and userland - echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu - exit ;; - i*86:Minix:*:*) - echo ${UNAME_MACHINE}-pc-minix - exit ;; - arm*:Linux:*:*) - eval $set_cc_for_build - if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \ - | grep -q __ARM_EABI__ - then - echo ${UNAME_MACHINE}-unknown-linux-gnu - else - echo ${UNAME_MACHINE}-unknown-linux-gnueabi - fi - exit ;; - avr32*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - cris:Linux:*:*) - echo cris-axis-linux-gnu - exit ;; - crisv32:Linux:*:*) - echo crisv32-axis-linux-gnu - exit ;; - frv:Linux:*:*) - echo frv-unknown-linux-gnu - exit ;; - ia64:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - m32r*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - m68*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - mips:Linux:*:*) - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - #undef CPU - #undef mips - #undef mipsel - #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) - CPU=mipsel - #else - #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) - CPU=mips - #else - CPU= - #endif - #endif -EOF - eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n ' - /^CPU/{ - s: ::g - p - }'`" - test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } - ;; - mips64:Linux:*:*) - 
eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - #undef CPU - #undef mips64 - #undef mips64el - #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) - CPU=mips64el - #else - #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) - CPU=mips64 - #else - CPU= - #endif - #endif -EOF - eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n ' - /^CPU/{ - s: ::g - p - }'`" - test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } - ;; - or32:Linux:*:*) - echo or32-unknown-linux-gnu - exit ;; - ppc:Linux:*:*) - echo powerpc-unknown-linux-gnu - exit ;; - ppc64:Linux:*:*) - echo powerpc64-unknown-linux-gnu - exit ;; - alpha:Linux:*:*) - case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in - EV5) UNAME_MACHINE=alphaev5 ;; - EV56) UNAME_MACHINE=alphaev56 ;; - PCA56) UNAME_MACHINE=alphapca56 ;; - PCA57) UNAME_MACHINE=alphapca56 ;; - EV6) UNAME_MACHINE=alphaev6 ;; - EV67) UNAME_MACHINE=alphaev67 ;; - EV68*) UNAME_MACHINE=alphaev68 ;; - esac - objdump --private-headers /bin/sh | grep ld.so.1 >/dev/null - if test "$?" 
= 0 ; then LIBC="libc1" ; else LIBC="" ; fi - echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} - exit ;; - parisc:Linux:*:* | hppa:Linux:*:*) - # Look for CPU level - case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in - PA7*) echo hppa1.1-unknown-linux-gnu ;; - PA8*) echo hppa2.0-unknown-linux-gnu ;; - *) echo hppa-unknown-linux-gnu ;; - esac - exit ;; - parisc64:Linux:*:* | hppa64:Linux:*:*) - echo hppa64-unknown-linux-gnu - exit ;; - s390:Linux:*:* | s390x:Linux:*:*) - echo ${UNAME_MACHINE}-ibm-linux - exit ;; - sh64*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - sh*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - sparc:Linux:*:* | sparc64:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - vax:Linux:*:*) - echo ${UNAME_MACHINE}-dec-linux-gnu - exit ;; - x86_64:Linux:*:*) - echo x86_64-unknown-linux-gnu - exit ;; - xtensa*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - i*86:Linux:*:*) - # The BFD linker knows what the default object file format is, so - # first see if it will tell us. cd to the root directory to prevent - # problems with other programs or directories called `ld' in the path. - # Set LC_ALL=C to ensure ld outputs messages in English. - ld_supported_targets=`cd /; LC_ALL=C ld --help 2>&1 \ - | sed -ne '/supported targets:/!d - s/[ ][ ]*/ /g - s/.*supported targets: *// - s/ .*// - p'` - case "$ld_supported_targets" in - elf32-i386) - TENTATIVE="${UNAME_MACHINE}-pc-linux-gnu" - ;; - a.out-i386-linux) - echo "${UNAME_MACHINE}-pc-linux-gnuaout" - exit ;; - coff-i386) - echo "${UNAME_MACHINE}-pc-linux-gnucoff" - exit ;; - "") - # Either a pre-BFD a.out linker (linux-gnuoldld) or - # one that does not give us useful --help. 
- echo "${UNAME_MACHINE}-pc-linux-gnuoldld" - exit ;; - esac - # Determine whether the default compiler is a.out or elf - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - #include - #ifdef __ELF__ - # ifdef __GLIBC__ - # if __GLIBC__ >= 2 - LIBC=gnu - # else - LIBC=gnulibc1 - # endif - # else - LIBC=gnulibc1 - # endif - #else - #if defined(__INTEL_COMPILER) || defined(__PGI) || defined(__SUNPRO_C) || defined(__SUNPRO_CC) - LIBC=gnu - #else - LIBC=gnuaout - #endif - #endif - #ifdef __dietlibc__ - LIBC=dietlibc - #endif -EOF - eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n ' - /^LIBC/{ - s: ::g - p - }'`" - test x"${LIBC}" != x && { - echo "${UNAME_MACHINE}-pc-linux-${LIBC}" - exit - } - test x"${TENTATIVE}" != x && { echo "${TENTATIVE}"; exit; } - ;; - i*86:DYNIX/ptx:4*:*) - # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. - # earlier versions are messed up and put the nodename in both - # sysname and nodename. - echo i386-sequent-sysv4 - exit ;; - i*86:UNIX_SV:4.2MP:2.*) - # Unixware is an offshoot of SVR4, but it has its own version - # number series starting with 2... - # I am not positive that other SVR4 systems won't match this, - # I just have to hope. -- rms. - # Use sysv4.2uw... so that sysv4* matches it. - echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION} - exit ;; - i*86:OS/2:*:*) - # If we were able to find `uname', then EMX Unix compatibility - # is probably installed. 
- echo ${UNAME_MACHINE}-pc-os2-emx - exit ;; - i*86:XTS-300:*:STOP) - echo ${UNAME_MACHINE}-unknown-stop - exit ;; - i*86:atheos:*:*) - echo ${UNAME_MACHINE}-unknown-atheos - exit ;; - i*86:syllable:*:*) - echo ${UNAME_MACHINE}-pc-syllable - exit ;; - i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.0*:*) - echo i386-unknown-lynxos${UNAME_RELEASE} - exit ;; - i*86:*DOS:*:*) - echo ${UNAME_MACHINE}-pc-msdosdjgpp - exit ;; - i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*) - UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'` - if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then - echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL} - else - echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL} - fi - exit ;; - i*86:*:5:[678]*) - # UnixWare 7.x, OpenUNIX and OpenServer 6. - case `/bin/uname -X | grep "^Machine"` in - *486*) UNAME_MACHINE=i486 ;; - *Pentium) UNAME_MACHINE=i586 ;; - *Pent*|*Celeron) UNAME_MACHINE=i686 ;; - esac - echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION} - exit ;; - i*86:*:3.2:*) - if test -f /usr/options/cb.name; then - UNAME_REL=`sed -n 's/.*Version //p' /dev/null >/dev/null ; then - UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')` - (/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486 - (/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \ - && UNAME_MACHINE=i586 - (/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \ - && UNAME_MACHINE=i686 - (/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \ - && UNAME_MACHINE=i686 - echo ${UNAME_MACHINE}-pc-sco$UNAME_REL - else - echo ${UNAME_MACHINE}-pc-sysv32 - fi - exit ;; - pc:*:*:*) - # Left here for compatibility: - # uname -m prints for DJGPP always 'pc', but it prints nothing about - # the processor, so we play safe by assuming i386. 
- echo i386-pc-msdosdjgpp - exit ;; - Intel:Mach:3*:*) - echo i386-pc-mach3 - exit ;; - paragon:*:*:*) - echo i860-intel-osf1 - exit ;; - i860:*:4.*:*) # i860-SVR4 - if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then - echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4 - else # Add other i860-SVR4 vendors below as they are discovered. - echo i860-unknown-sysv${UNAME_RELEASE} # Unknown i860-SVR4 - fi - exit ;; - mini*:CTIX:SYS*5:*) - # "miniframe" - echo m68010-convergent-sysv - exit ;; - mc68k:UNIX:SYSTEM5:3.51m) - echo m68k-convergent-sysv - exit ;; - M680?0:D-NIX:5.3:*) - echo m68k-diab-dnix - exit ;; - M68*:*:R3V[5678]*:*) - test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;; - 3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0) - OS_REL='' - test -r /etc/.relid \ - && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` - /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ - && { echo i486-ncr-sysv4.3${OS_REL}; exit; } - /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ - && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; - 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*) - /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ - && { echo i486-ncr-sysv4; exit; } ;; - m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*) - echo m68k-unknown-lynxos${UNAME_RELEASE} - exit ;; - mc68030:UNIX_System_V:4.*:*) - echo m68k-atari-sysv4 - exit ;; - TSUNAMI:LynxOS:2.*:*) - echo sparc-unknown-lynxos${UNAME_RELEASE} - exit ;; - rs6000:LynxOS:2.*:*) - echo rs6000-unknown-lynxos${UNAME_RELEASE} - exit ;; - PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.0*:*) - echo powerpc-unknown-lynxos${UNAME_RELEASE} - exit ;; - SM[BE]S:UNIX_SV:*:*) - echo mips-dde-sysv${UNAME_RELEASE} - exit ;; - RM*:ReliantUNIX-*:*:*) - echo mips-sni-sysv4 - exit ;; - RM*:SINIX-*:*:*) - echo mips-sni-sysv4 - exit ;; - 
*:SINIX-*:*:*) - if uname -p 2>/dev/null >/dev/null ; then - UNAME_MACHINE=`(uname -p) 2>/dev/null` - echo ${UNAME_MACHINE}-sni-sysv4 - else - echo ns32k-sni-sysv - fi - exit ;; - PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort - # says - echo i586-unisys-sysv4 - exit ;; - *:UNIX_System_V:4*:FTX*) - # From Gerald Hewes . - # How about differentiating between stratus architectures? -djm - echo hppa1.1-stratus-sysv4 - exit ;; - *:*:*:FTX*) - # From seanf@swdc.stratus.com. - echo i860-stratus-sysv4 - exit ;; - i*86:VOS:*:*) - # From Paul.Green@stratus.com. - echo ${UNAME_MACHINE}-stratus-vos - exit ;; - *:VOS:*:*) - # From Paul.Green@stratus.com. - echo hppa1.1-stratus-vos - exit ;; - mc68*:A/UX:*:*) - echo m68k-apple-aux${UNAME_RELEASE} - exit ;; - news*:NEWS-OS:6*:*) - echo mips-sony-newsos6 - exit ;; - R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*) - if [ -d /usr/nec ]; then - echo mips-nec-sysv${UNAME_RELEASE} - else - echo mips-unknown-sysv${UNAME_RELEASE} - fi - exit ;; - BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only. - echo powerpc-be-beos - exit ;; - BeMac:BeOS:*:*) # BeOS running on Mac or Mac clone, PPC only. - echo powerpc-apple-beos - exit ;; - BePC:BeOS:*:*) # BeOS running on Intel PC compatible. 
- echo i586-pc-beos - exit ;; - SX-4:SUPER-UX:*:*) - echo sx4-nec-superux${UNAME_RELEASE} - exit ;; - SX-5:SUPER-UX:*:*) - echo sx5-nec-superux${UNAME_RELEASE} - exit ;; - SX-6:SUPER-UX:*:*) - echo sx6-nec-superux${UNAME_RELEASE} - exit ;; - SX-7:SUPER-UX:*:*) - echo sx7-nec-superux${UNAME_RELEASE} - exit ;; - SX-8:SUPER-UX:*:*) - echo sx8-nec-superux${UNAME_RELEASE} - exit ;; - SX-8R:SUPER-UX:*:*) - echo sx8r-nec-superux${UNAME_RELEASE} - exit ;; - Power*:Rhapsody:*:*) - echo powerpc-apple-rhapsody${UNAME_RELEASE} - exit ;; - *:Rhapsody:*:*) - echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE} - exit ;; - *:Darwin:*:*) - UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown - case $UNAME_PROCESSOR in - unknown) UNAME_PROCESSOR=powerpc ;; - esac - echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE} - exit ;; - *:procnto*:*:* | *:QNX:[0123456789]*:*) - UNAME_PROCESSOR=`uname -p` - if test "$UNAME_PROCESSOR" = "x86"; then - UNAME_PROCESSOR=i386 - UNAME_MACHINE=pc - fi - echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE} - exit ;; - *:QNX:*:4*) - echo i386-pc-qnx - exit ;; - NSE-?:NONSTOP_KERNEL:*:*) - echo nse-tandem-nsk${UNAME_RELEASE} - exit ;; - NSR-?:NONSTOP_KERNEL:*:*) - echo nsr-tandem-nsk${UNAME_RELEASE} - exit ;; - *:NonStop-UX:*:*) - echo mips-compaq-nonstopux - exit ;; - BS2000:POSIX*:*:*) - echo bs2000-siemens-sysv - exit ;; - DS/*:UNIX_System_V:*:*) - echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE} - exit ;; - *:Plan9:*:*) - # "uname -m" is not consistent, so use $cputype instead. 386 - # is converted to i386 for consistency with other x86 - # operating systems. 
- if test "$cputype" = "386"; then - UNAME_MACHINE=i386 - else - UNAME_MACHINE="$cputype" - fi - echo ${UNAME_MACHINE}-unknown-plan9 - exit ;; - *:TOPS-10:*:*) - echo pdp10-unknown-tops10 - exit ;; - *:TENEX:*:*) - echo pdp10-unknown-tenex - exit ;; - KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*) - echo pdp10-dec-tops20 - exit ;; - XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*) - echo pdp10-xkl-tops20 - exit ;; - *:TOPS-20:*:*) - echo pdp10-unknown-tops20 - exit ;; - *:ITS:*:*) - echo pdp10-unknown-its - exit ;; - SEI:*:*:SEIUX) - echo mips-sei-seiux${UNAME_RELEASE} - exit ;; - *:DragonFly:*:*) - echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` - exit ;; - *:*VMS:*:*) - UNAME_MACHINE=`(uname -p) 2>/dev/null` - case "${UNAME_MACHINE}" in - A*) echo alpha-dec-vms ; exit ;; - I*) echo ia64-dec-vms ; exit ;; - V*) echo vax-dec-vms ; exit ;; - esac ;; - *:XENIX:*:SysV) - echo i386-pc-xenix - exit ;; - i*86:skyos:*:*) - echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//' - exit ;; - i*86:rdos:*:*) - echo ${UNAME_MACHINE}-pc-rdos - exit ;; -esac - -#echo '(No uname command or uname output not recognized.)' 1>&2 -#echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2 - -eval $set_cc_for_build -cat >$dummy.c < -# include -#endif -main () -{ -#if defined (sony) -#if defined (MIPSEB) - /* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed, - I don't know.... 
*/ - printf ("mips-sony-bsd\n"); exit (0); -#else -#include - printf ("m68k-sony-newsos%s\n", -#ifdef NEWSOS4 - "4" -#else - "" -#endif - ); exit (0); -#endif -#endif - -#if defined (__arm) && defined (__acorn) && defined (__unix) - printf ("arm-acorn-riscix\n"); exit (0); -#endif - -#if defined (hp300) && !defined (hpux) - printf ("m68k-hp-bsd\n"); exit (0); -#endif - -#if defined (NeXT) -#if !defined (__ARCHITECTURE__) -#define __ARCHITECTURE__ "m68k" -#endif - int version; - version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`; - if (version < 4) - printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version); - else - printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version); - exit (0); -#endif - -#if defined (MULTIMAX) || defined (n16) -#if defined (UMAXV) - printf ("ns32k-encore-sysv\n"); exit (0); -#else -#if defined (CMU) - printf ("ns32k-encore-mach\n"); exit (0); -#else - printf ("ns32k-encore-bsd\n"); exit (0); -#endif -#endif -#endif - -#if defined (__386BSD__) - printf ("i386-pc-bsd\n"); exit (0); -#endif - -#if defined (sequent) -#if defined (i386) - printf ("i386-sequent-dynix\n"); exit (0); -#endif -#if defined (ns32000) - printf ("ns32k-sequent-dynix\n"); exit (0); -#endif -#endif - -#if defined (_SEQUENT_) - struct utsname un; - - uname(&un); - - if (strncmp(un.version, "V2", 2) == 0) { - printf ("i386-sequent-ptx2\n"); exit (0); - } - if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? 
*/ - printf ("i386-sequent-ptx1\n"); exit (0); - } - printf ("i386-sequent-ptx\n"); exit (0); - -#endif - -#if defined (vax) -# if !defined (ultrix) -# include -# if defined (BSD) -# if BSD == 43 - printf ("vax-dec-bsd4.3\n"); exit (0); -# else -# if BSD == 199006 - printf ("vax-dec-bsd4.3reno\n"); exit (0); -# else - printf ("vax-dec-bsd\n"); exit (0); -# endif -# endif -# else - printf ("vax-dec-bsd\n"); exit (0); -# endif -# else - printf ("vax-dec-ultrix\n"); exit (0); -# endif -#endif - -#if defined (alliant) && defined (i860) - printf ("i860-alliant-bsd\n"); exit (0); -#endif - - exit (1); -} -EOF - -$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` && - { echo "$SYSTEM_NAME"; exit; } - -# Apollos put the system type in the environment. - -test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; } - -# Convex versions that predate uname can use getsysinfo(1) - -if [ -x /usr/convex/getsysinfo ] -then - case `getsysinfo -f cpu_type` in - c1*) - echo c1-convex-bsd - exit ;; - c2*) - if getsysinfo -f scalar_acc - then echo c32-convex-bsd - else echo c2-convex-bsd - fi - exit ;; - c34*) - echo c34-convex-bsd - exit ;; - c38*) - echo c38-convex-bsd - exit ;; - c4*) - echo c4-convex-bsd - exit ;; - esac -fi - -cat >&2 < in order to provide the needed -information to handle your system. 
- -config.guess timestamp = $timestamp - -uname -m = `(uname -m) 2>/dev/null || echo unknown` -uname -r = `(uname -r) 2>/dev/null || echo unknown` -uname -s = `(uname -s) 2>/dev/null || echo unknown` -uname -v = `(uname -v) 2>/dev/null || echo unknown` - -/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null` -/bin/uname -X = `(/bin/uname -X) 2>/dev/null` - -hostinfo = `(hostinfo) 2>/dev/null` -/bin/universe = `(/bin/universe) 2>/dev/null` -/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null` -/bin/arch = `(/bin/arch) 2>/dev/null` -/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null` -/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null` - -UNAME_MACHINE = ${UNAME_MACHINE} -UNAME_RELEASE = ${UNAME_RELEASE} -UNAME_SYSTEM = ${UNAME_SYSTEM} -UNAME_VERSION = ${UNAME_VERSION} -EOF - -exit 1 - -# Local variables: -# eval: (add-hook 'write-file-hooks 'time-stamp) -# time-stamp-start: "timestamp='" -# time-stamp-format: "%:y-%02m-%02d" -# time-stamp-end: "'" -# End: diff --git a/configure.ac b/configure.ac index fc104296f0..134f4689f6 100644 --- a/configure.ac +++ b/configure.ac @@ -1,10 +1,10 @@ -AC_INIT([ccminer], [1.5.31-git(SP-MOD)]) +AC_INIT([ccminer], [8.12-KlausT]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM AC_CONFIG_SRCDIR([ccminer.cpp]) AM_INIT_AUTOMAKE([foreign subdir-objects]) -AC_CONFIG_HEADERS([cpuminer-config.h]) +AC_CONFIG_HEADERS([ccminer-config.h]) dnl Make sure anyone changing configure.ac/Makefile.am has a clue AM_MAINTAINER_MODE @@ -178,3 +178,4 @@ AC_SUBST(NVCC) AC_SUBST(OPENMP_CFLAGS) AC_OUTPUT + diff --git a/configure.sh b/configure.sh index 1084ba7441..fd5bb1d251 100755 --- a/configure.sh +++ b/configure.sh @@ -5,7 +5,6 @@ #--ptxas-options=\"-v -dlcm=cg\"" -extracflags="-march=native -D_REENTRANT -falign-functions=16 -falign-jumps=16 -falign-labels=16" - -CUDA_CFLAGS="-O3 -Xcompiler -Wall" ./configure CXXFLAGS="-O3 $extracflags" --with-cuda=/usr/local/cuda --with-nvml=libnvidia-ml.so +extracflags="-march=native -std=c++11 -D_REENTRANT 
-falign-functions=16 -falign-jumps=16 -falign-labels=16" +CUDA_CFLAGS="-O3 -std=c++11 -Xcompiler -Wall -D_FORCE_INLINES" ./configure CXXFLAGS="-O3 $extracflags" --with-cuda=/usr/local/cuda --with-nvml=libnvidia-ml.so diff --git a/cpu-miner.c b/cpu-miner.c deleted file mode 100644 index c5aee0faca..0000000000 --- a/cpu-miner.c +++ /dev/null @@ -1,2084 +0,0 @@ -/* - * Copyright 2010 Jeff Garzik - * Copyright 2012-2014 pooler - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. See COPYING for more details. - */ - -#include "cpuminer-config.h" -#define _GNU_SOURCE - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#ifdef WIN32 -#include -#include -#else -#include -#include -#if HAVE_SYS_SYSCTL_H -#include -#if HAVE_SYS_PARAM_H -#include -#endif -#include -#endif -#endif - -#include "compat.h" -#include "miner.h" - -#ifdef WIN32 -#include -#pragma comment(lib, "winmm.lib") -#include "compat/winansi.h" -BOOL WINAPI ConsoleHandler(DWORD); -#endif - -#define PROGRAM_NAME "ccminer" -#define LP_SCANTIME 60 -#define HEAVYCOIN_BLKHDR_SZ 84 -#define MNR_BLKHDR_SZ 80 - -// from heavy.cu -#ifdef __cplusplus -extern "C" -{ -#endif -int cuda_num_devices(); -void cuda_devicenames(); -void cuda_devicereset(); -int cuda_finddevice(char *name); -#ifdef __cplusplus -} -#endif - - -#ifdef __linux /* Linux specific policy and affinity management */ -#include -static inline void drop_policy(void) -{ - struct sched_param param; - param.sched_priority = 0; - -#ifdef SCHED_IDLE - if (unlikely(sched_setscheduler(0, SCHED_IDLE, ¶m) == -1)) -#endif -#ifdef SCHED_BATCH - sched_setscheduler(0, SCHED_BATCH, ¶m); -#endif -} - -static inline void affine_to_cpu(int id, int cpu) -{ - cpu_set_t set; - - 
CPU_ZERO(&set); - CPU_SET(cpu, &set); - sched_setaffinity(0, sizeof(&set), &set); -} -#elif defined(__FreeBSD__) /* FreeBSD specific policy and affinity management */ -#include -static inline void drop_policy(void) -{ -} - -static inline void affine_to_cpu(int id, int cpu) -{ - cpuset_t set; - CPU_ZERO(&set); - CPU_SET(cpu, &set); - cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(cpuset_t), &set); -} -#else -static inline void drop_policy(void) -{ -} - -static inline void affine_to_cpu(int id, int cpu) -{ -} -#endif - -enum workio_commands { - WC_GET_WORK, - WC_SUBMIT_WORK, -}; - -struct workio_cmd { - enum workio_commands cmd; - struct thr_info *thr; - union { - struct work *work; - } u; -}; - -typedef enum { - ALGO_ANIME, - ALGO_BLAKE, - ALGO_BLAKECOIN, - ALGO_DEEP, - ALGO_DOOM, - ALGO_FRESH, - ALGO_FUGUE256, /* Fugue256 */ - ALGO_GROESTL, - ALGO_HEAVY, /* Heavycoin hash */ - ALGO_KECCAK, - ALGO_JACKPOT, - ALGO_LUFFA_DOOM, - ALGO_MJOLLNIR, /* Mjollnir hash */ - ALGO_MYR_GR, - ALGO_NIST5, - ALGO_PENTABLAKE, - ALGO_QUARK, - ALGO_QUBIT, - ALGO_S3, - ALGO_WHC, - ALGO_X11, - ALGO_X13, - ALGO_X14, - ALGO_X15, - ALGO_X17, - ALGO_DMD_GR, -} sha256_algos; - -static const char *algo_names[] = { - "anime", - "blake", - "blakecoin", - "deep", - "doom", /* is luffa */ - "fresh", - "fugue256", - "groestl", - "heavy", - "keccak", - "jackpot", - "luffa", - "mjollnir", - "myr-gr", - "nist5", - "penta", - "quark", - "qubit", - "s3", - "whirl", - "x11", - "x13", - "x14", - "x15", - "x17", - "dmd-gr", -}; - -bool opt_debug = false; -bool opt_tracegpu = false; -bool opt_protocol = false; -bool opt_benchmark = false; -bool want_longpoll = true; -bool have_longpoll = false; -bool want_stratum = true; -bool have_stratum = false; -static bool submit_old = false; -bool use_syslog = false; -bool use_colors = true; -static bool opt_background = false; -bool opt_quiet = false; -static int opt_retries = -1; -static int opt_fail_pause = 30; -int opt_timeout = 270; -static int 
opt_scantime = 5; -static json_t *opt_config; -static const bool opt_time = true; -static sha256_algos opt_algo = ALGO_HEAVY; -int opt_n_threads = 0; -static double opt_difficulty = 1; // CH -bool opt_trust_pool = false; -uint16_t opt_vote = 9999; -static int num_processors; -int device_map[8] = {0,1,2,3,4,5,6,7}; // CB -char *device_name[8]; // CB -static char *rpc_url; -static char *rpc_userpass; -static char *rpc_user, *rpc_pass; -static char *short_url = NULL; -char *opt_cert; -char *opt_proxy; -long opt_proxy_type; -struct thr_info *thr_info; -static int work_thr_id; -int longpoll_thr_id = -1; -int stratum_thr_id = -1; -struct work_restart *work_restart = NULL; -static struct stratum_ctx stratum; - -pthread_mutex_t applog_lock; -static pthread_mutex_t stats_lock; -static unsigned long accepted_count = 0L; -static unsigned long rejected_count = 0L; -static double *thr_hashrates; -uint64_t global_hashrate = 0; - -#ifdef HAVE_GETOPT_LONG -#include -#else -struct option { - const char *name; - int has_arg; - int *flag; - int val; -}; -#endif - -static char const usage[] = "\ -Usage: " PROGRAM_NAME " [OPTIONS]\n\ -Options:\n\ - -a, --algo=ALGO specify the hash algorithm to use\n\ - anime Animecoin\n\ - blake Blake 256 (SFR/NEOS)\n\ - blakecoin Fast Blake 256 (8 rounds)\n\ - deep Deepcoin\n\ - dmd-gr Diamond-Groestl\n\ - fresh Freshcoin (shavite 80)\n\ - fugue256 Fuguecoin\n\ - groestl Groestlcoin\n\ - heavy Heavycoin\n\ - jackpot Jackpot\n\ - keccak Keccak-256 (Maxcoin)\n\ - luffa Doomcoin\n\ - mjollnir Mjollnircoin\n\ - myr-gr Myriad-Groestl\n\ - nist5 NIST5 (TalkCoin)\n\ - penta Pentablake hash (5x Blake 512)\n\ - quark Quark\n\ - qubit Qubit\n\ - s3 S3 (1Coin)\n\ - x11 X11 (DarkCoin)\n\ - x13 X13 (MaruCoin)\n\ - x14 X14\n\ - x15 X15\n\ - x17 X17 (peoplecurrency)\n\ - whirl Whirlcoin (old whirlpool)\n\ - -d, --devices Comma separated list of CUDA devices to use.\n\ - Device IDs start counting from 0! 
Alternatively takes\n\ - string names of your cards like gtx780ti or gt640#2\n\ - (matching 2nd gt640 in the PC)\n\ - -f, --diff Divide difficulty by this factor (std is 1) \n\ - -v, --vote=VOTE block reward vote (for HeavyCoin)\n\ - -m, --trust-pool trust the max block reward vote (maxvote) sent by the pool\n\ - -o, --url=URL URL of mining server\n\ - -O, --userpass=U:P username:password pair for mining server\n\ - -u, --user=USERNAME username for mining server\n\ - -p, --pass=PASSWORD password for mining server\n\ - --cert=FILE certificate for mining server using SSL\n\ - -x, --proxy=[PROTOCOL://]HOST[:PORT] connect through a proxy\n\ - -t, --threads=N number of miner threads (default: number of nVidia GPUs)\n\ - -r, --retries=N number of times to retry if a network call fails\n\ - (default: retry indefinitely)\n\ - -R, --retry-pause=N time to pause between retries, in seconds (default: 30)\n\ - -T, --timeout=N network timeout, in seconds (default: 270)\n\ - -s, --scantime=N upper bound on time spent scanning current work when\n\ - long polling is unavailable, in seconds (default: 5)\n\ - --no-longpoll disable X-Long-Polling support\n\ - --no-stratum disable X-Stratum support\n\ - -q, --quiet disable per-thread hashmeter output\n\ - -K, --no-color disable colored output\n\ - -D, --debug enable debug output\n\ - -P, --protocol-dump verbose dump of protocol-level activities\n" -#ifdef HAVE_SYSLOG_H -"\ - -S, --syslog use system log for output messages\n" -#endif -#ifndef WIN32 -"\ - -B, --background run the miner in the background\n" -#endif -"\ - --benchmark run in offline benchmark mode\n\ - --cputest debug hashes from cpu algorithms\n\ - -c, --config=FILE load a JSON-format configuration file\n\ - -V, --version display version information and exit\n\ - -h, --help display this help text and exit\n\ -"; - -static char const short_options[] = -#ifndef WIN32 - "B" -#endif -#ifdef HAVE_SYSLOG_H - "S" -#endif - "a:c:CKDhp:Px:qr:R:s:t:T:o:u:O:Vd:f:mv:"; - -static 
struct option const options[] = { - { "algo", 1, NULL, 'a' }, -#ifndef WIN32 - { "background", 0, NULL, 'B' }, -#endif - { "benchmark", 0, NULL, 1005 }, - { "cputest", 0, NULL, 1006 }, - { "cert", 1, NULL, 1001 }, - { "config", 1, NULL, 'c' }, - { "no-color", 0, NULL, 'K' }, - { "debug", 0, NULL, 'D' }, - { "help", 0, NULL, 'h' }, - { "no-longpoll", 0, NULL, 1003 }, - { "no-stratum", 0, NULL, 1007 }, - { "pass", 1, NULL, 'p' }, - { "protocol-dump", 0, NULL, 'P' }, - { "proxy", 1, NULL, 'x' }, - { "quiet", 0, NULL, 'q' }, - { "retries", 1, NULL, 'r' }, - { "retry-pause", 1, NULL, 'R' }, - { "scantime", 1, NULL, 's' }, -#ifdef HAVE_SYSLOG_H - { "syslog", 0, NULL, 'S' }, -#endif - { "threads", 1, NULL, 't' }, - { "vote", 1, NULL, 'v' }, - { "trust-pool", 0, NULL, 'm' }, - { "timeout", 1, NULL, 'T' }, - { "url", 1, NULL, 'o' }, - { "user", 1, NULL, 'u' }, - { "userpass", 1, NULL, 'O' }, - { "version", 0, NULL, 'V' }, - { "devices", 1, NULL, 'd' }, - { "diff", 1, NULL, 'f' }, - { 0, 0, 0, 0 } -}; - -struct work { - uint32_t data[32]; - uint32_t target[8]; - uint32_t maxvote; - - char job_id[128]; - size_t xnonce2_len; - unsigned char xnonce2[32]; - - union { - uint32_t u32[2]; - uint64_t u64[1]; - } noncerange; - - double difficulty; - - uint32_t scanned_from; - uint32_t scanned_to; -}; - -static struct work _ALIGN(64) g_work; -static time_t g_work_time; -static pthread_mutex_t g_work_lock; - - -void proper_exit(int reason) -{ - cuda_devicereset(); - hashlog_purge_all(); - exit(reason); -} - -static bool jobj_binary(const json_t *obj, const char *key, - void *buf, size_t buflen) -{ - const char *hexstr; - json_t *tmp; - - tmp = json_object_get(obj, key); - if (unlikely(!tmp)) { - applog(LOG_ERR, "JSON key '%s' not found", key); - return false; - } - hexstr = json_string_value(tmp); - if (unlikely(!hexstr)) { - applog(LOG_ERR, "JSON key '%s' is not a string", key); - return false; - } - if (!hex2bin((unsigned char*)buf, hexstr, buflen)) - return false; - - return true; 
-} - -static bool work_decode(const json_t *val, struct work *work) -{ - int i; - - if (unlikely(!jobj_binary(val, "data", work->data, sizeof(work->data)))) { - applog(LOG_ERR, "JSON inval data"); - goto err_out; - } - if (unlikely(!jobj_binary(val, "target", work->target, sizeof(work->target)))) { - applog(LOG_ERR, "JSON inval target"); - goto err_out; - } - if (opt_algo == ALGO_HEAVY) { - if (unlikely(!jobj_binary(val, "maxvote", &work->maxvote, sizeof(work->maxvote)))) { - work->maxvote = 1024; - } - } else work->maxvote = 0; - - for (i = 0; i < ARRAY_SIZE(work->data); i++) - work->data[i] = le32dec(work->data + i); - for (i = 0; i < ARRAY_SIZE(work->target); i++) - work->target[i] = le32dec(work->target + i); - - json_t *jr = json_object_get(val, "noncerange"); - if (jr) { - const char * hexstr = json_string_value(jr); - if (likely(hexstr)) { - // never seen yet... - hex2bin((unsigned char*)work->noncerange.u64, hexstr, 8); - applog(LOG_DEBUG, "received noncerange: %08x-%08x", work->noncerange.u32[0], work->noncerange.u32[1]); - } - } - - /* use work ntime as job id (solo-mining) */ - cbin2hex(work->job_id, (const char*)&work->data[17], 4); - - return true; - -err_out: - return false; -} - -/** - * Calculate the work difficulty as double - */ -static void calc_diff(struct work *work, int known) -{ - // sample for diff 32.53 : 00000007de5f0000 - const uint64_t diffone = 0xFFFF000000000000ull; - uint64_t *data64, d64; - char rtarget[32]; - - swab256(rtarget, work->target); - data64 = (uint64_t *)(rtarget + 3); /* todo: index (3) can be tuned here */ - d64 = swab64(*data64); - if (unlikely(!d64)) - d64 = 1; - work->difficulty = (double)diffone / d64; - if (opt_difficulty > 0.) { - work->difficulty /= opt_difficulty; - } -} - -static int share_result(int result, const char *reason) -{ - char s[345]; - double hashrate; - - hashrate = 0.; - pthread_mutex_lock(&stats_lock); - for (int i = 0; i < opt_n_threads; i++) - hashrate += thr_hashrates[i]; - result ? 
accepted_count++ : rejected_count++; - pthread_mutex_unlock(&stats_lock); - - global_hashrate = llround(hashrate); - - sprintf(s, hashrate >= 1e6 ? "%.0f" : "%.2f", 1e-3 * hashrate); - applog(LOG_NOTICE, "accepted: %lu/%lu (%.2f%%), %s khash/s %s", - accepted_count, - accepted_count + rejected_count, - 100. * accepted_count / (accepted_count + rejected_count), - s, - use_colors ? - (result ? CL_GRN "yay!!!" : CL_RED "booooo") - : (result ? "(yay!!!)" : "(booooo)")); - - if (reason) { - applog(LOG_WARNING, "reject reason: %s", reason); - if (strncmp(reason, "low difficulty share", 20) == 0) { - opt_difficulty = (opt_difficulty * 2.0) / 3.0; - applog(LOG_WARNING, "factor reduced to : %0.2f", opt_difficulty); - return 0; - } - } - return 1; -} - -static bool submit_upstream_work(CURL *curl, struct work *work) -{ - char *str = NULL; - json_t *val, *res, *reason; - char s[345]; - int i; - bool rc = false; - - /* pass if the previous hash is not the current previous hash */ - pthread_mutex_lock(&g_work_lock); - if (memcmp(work->data + 1, g_work.data + 1, 32)) { - pthread_mutex_unlock(&g_work_lock); - if (opt_debug) - applog(LOG_DEBUG, "DEBUG: stale work detected, discarding"); - return true; - } - calc_diff(work, 0); - pthread_mutex_unlock(&g_work_lock); - - if (have_stratum) { - uint32_t sent; - uint32_t ntime, nonce; - uint16_t nvote; - char *ntimestr, *noncestr, *xnonce2str, *nvotestr; - - le32enc(&ntime, work->data[17]); - le32enc(&nonce, work->data[19]); - be16enc(&nvote, *((uint16_t*)&work->data[20])); - - ntimestr = bin2hex((const unsigned char *)(&ntime), 4); - noncestr = bin2hex((const unsigned char *)(&nonce), 4); - xnonce2str = bin2hex(work->xnonce2, work->xnonce2_len); - nvotestr = bin2hex((const unsigned char *)(&nvote), 2); - - sent = hashlog_already_submittted(work->job_id, nonce); - if (sent > 0) { - sent = (uint32_t) time(NULL) - sent; - if (!opt_quiet) { - applog(LOG_WARNING, "skip submit, nonce %s was already sent %u seconds ago", noncestr, sent); - 
hashlog_dump_job(work->job_id); - } - rc = true; - goto out; - } - - if (opt_algo == ALGO_HEAVY) { - sprintf(s, - "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", - rpc_user, work->job_id + 8, xnonce2str, ntimestr, noncestr, nvotestr); - } else { - sprintf(s, - "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", - rpc_user, work->job_id + 8, xnonce2str, ntimestr, noncestr); - } - free(ntimestr); - free(noncestr); - free(xnonce2str); - free(nvotestr); - - if (unlikely(!stratum_send_line(&stratum, s))) { - applog(LOG_ERR, "submit_upstream_work stratum_send_line failed"); - goto out; - } - - hashlog_remember_submit(work->job_id, nonce, work->scanned_from); - - } else { - - /* build hex string */ - - if (opt_algo != ALGO_HEAVY && opt_algo != ALGO_MJOLLNIR) { - for (i = 0; i < ARRAY_SIZE(work->data); i++) - le32enc(work->data + i, work->data[i]); - } - str = bin2hex((unsigned char *)work->data, sizeof(work->data)); - if (unlikely(!str)) { - applog(LOG_ERR, "submit_upstream_work OOM"); - goto out; - } - - /* build JSON-RPC request */ - sprintf(s, - "{\"method\": \"getwork\", \"params\": [ \"%s\" ], \"id\":1}\r\n", - str); - - /* issue JSON-RPC request */ - val = json_rpc_call(curl, rpc_url, rpc_userpass, s, false, false, NULL); - if (unlikely(!val)) { - applog(LOG_ERR, "submit_upstream_work json_rpc_call failed"); - goto out; - } - - res = json_object_get(val, "result"); - reason = json_object_get(val, "reject-reason"); - if (!share_result(json_is_true(res), reason ? 
json_string_value(reason) : NULL)) - hashlog_purge_job(work->job_id); - - json_decref(val); - } - - rc = true; - -out: - free(str); - return rc; -} - -static const char *rpc_req = - "{\"method\": \"getwork\", \"params\": [], \"id\":0}\r\n"; - -static bool get_upstream_work(CURL *curl, struct work *work) -{ - json_t *val; - bool rc; - struct timeval tv_start, tv_end, diff; - - gettimeofday(&tv_start, NULL); - val = json_rpc_call(curl, rpc_url, rpc_userpass, rpc_req, - want_longpoll, false, NULL); - gettimeofday(&tv_end, NULL); - - if (have_stratum) { - if (val) - json_decref(val); - return true; - } - - if (!val) - return false; - - rc = work_decode(json_object_get(val, "result"), work); - - if (opt_protocol && rc) { - timeval_subtract(&diff, &tv_end, &tv_start); - /* show time because curl can be slower against versions/config */ - applog(LOG_DEBUG, "got new work in %.2f ms", - (1000.0 * diff.tv_sec) + (0.001 * diff.tv_usec)); - } - - json_decref(val); - - return rc; -} - -static void workio_cmd_free(struct workio_cmd *wc) -{ - if (!wc) - return; - - switch (wc->cmd) { - case WC_SUBMIT_WORK: - aligned_free(wc->u.work); - break; - default: /* do nothing */ - break; - } - - memset(wc, 0, sizeof(*wc)); /* poison */ - free(wc); -} - -static bool workio_get_work(struct workio_cmd *wc, CURL *curl) -{ - struct work *ret_work; - int failures = 0; - - ret_work = (struct work*)aligned_calloc(sizeof(*ret_work)); - if (!ret_work) - return false; - - /* obtain new work from bitcoin via JSON-RPC */ - while (!get_upstream_work(curl, ret_work)) { - if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) { - applog(LOG_ERR, "json_rpc_call failed, terminating workio thread"); - aligned_free(ret_work); - return false; - } - - /* pause, then restart work-request loop */ - applog(LOG_ERR, "json_rpc_call failed, retry after %d seconds", - opt_fail_pause); - sleep(opt_fail_pause); - } - - /* send work to requesting thread */ - if (!tq_push(wc->thr->q, ret_work)) - 
aligned_free(ret_work); - - return true; -} - -static bool workio_submit_work(struct workio_cmd *wc, CURL *curl) -{ - int failures = 0; - - /* submit solution to bitcoin via JSON-RPC */ - while (!submit_upstream_work(curl, wc->u.work)) { - if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) { - applog(LOG_ERR, "...terminating workio thread"); - return false; - } - - /* pause, then restart work-request loop */ - if (!opt_benchmark) - applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); - - sleep(opt_fail_pause); - } - - return true; -} - -static void *workio_thread(void *userdata) -{ - struct thr_info *mythr = (struct thr_info*)userdata; - CURL *curl; - bool ok = true; - - curl = curl_easy_init(); - if (unlikely(!curl)) { - applog(LOG_ERR, "CURL initialization failed"); - return NULL; - } - - while (ok) { - struct workio_cmd *wc; - - /* wait for workio_cmd sent to us, on our queue */ - wc = (struct workio_cmd *)tq_pop(mythr->q, NULL); - if (!wc) { - ok = false; - break; - } - - /* process workio_cmd */ - switch (wc->cmd) { - case WC_GET_WORK: - ok = workio_get_work(wc, curl); - break; - case WC_SUBMIT_WORK: - ok = workio_submit_work(wc, curl); - break; - - default: /* should never happen */ - ok = false; - break; - } - - workio_cmd_free(wc); - } - - tq_freeze(mythr->q); - curl_easy_cleanup(curl); - - return NULL; -} - -static bool get_work(struct thr_info *thr, struct work *work) -{ - struct workio_cmd *wc; - struct work *work_heap; - - if (opt_benchmark) { - memset(work->data, 0x55, 76); - work->data[17] = swab32((uint32_t)time(NULL)); - memset(work->data + 19, 0x00, 52); - work->data[20] = 0x80000000; - work->data[31] = 0x00000280; - memset(work->target, 0x00, sizeof(work->target)); - return true; - } - - /* fill out work request message */ - wc = (struct workio_cmd *)calloc(1, sizeof(*wc)); - if (!wc) - return false; - - wc->cmd = WC_GET_WORK; - wc->thr = thr; - - /* send work request to workio thread */ - if 
(!tq_push(thr_info[work_thr_id].q, wc)) { - workio_cmd_free(wc); - return false; - } - - /* wait for response, a unit of work */ - work_heap = (struct work *)tq_pop(thr->q, NULL); - if (!work_heap) - return false; - - /* copy returned work into storage provided by caller */ - memcpy(work, work_heap, sizeof(*work)); - free(work_heap); - - return true; -} - -static bool submit_work(struct thr_info *thr, const struct work *work_in) -{ - struct workio_cmd *wc; - /* fill out work request message */ - wc = (struct workio_cmd *)calloc(1, sizeof(*wc)); - if (!wc) - return false; - - wc->u.work = (struct work *)aligned_calloc(sizeof(*work_in)); - if (!wc->u.work) - goto err_out; - - wc->cmd = WC_SUBMIT_WORK; - wc->thr = thr; - memcpy(wc->u.work, work_in, sizeof(*work_in)); - - /* send solution to workio thread */ - if (!tq_push(thr_info[work_thr_id].q, wc)) - goto err_out; - - return true; - -err_out: - workio_cmd_free(wc); - return false; -} - -static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) -{ - unsigned char merkle_root[64]; - int i; - - if (!sctx->job.job_id) { - /* job not yet retrieved */ - return; - } - - pthread_mutex_lock(&sctx->work_lock); - - // store the job ntime as high part of jobid - snprintf(work->job_id, sizeof(work->job_id), "%07x %s", - be32dec(sctx->job.ntime) & 0xfffffff, sctx->job.job_id); - work->xnonce2_len = sctx->xnonce2_size; - memcpy(work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size); - - /* Generate merkle root */ - switch (opt_algo) { - case ALGO_HEAVY: - case ALGO_MJOLLNIR: - heavycoin_hash(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); - break; - case ALGO_FUGUE256: - case ALGO_GROESTL: - case ALGO_KECCAK: - case ALGO_BLAKECOIN: - case ALGO_WHC: - SHA256((uint8_t*)sctx->job.coinbase, sctx->job.coinbase_size, (uint8_t*)merkle_root); - break; - default: - sha256d(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); - } - - for (i = 0; i < sctx->job.merkle_count; i++) { - 
memcpy(merkle_root + 32, sctx->job.merkle[i], 32); - if (opt_algo == ALGO_HEAVY || opt_algo == ALGO_MJOLLNIR) - heavycoin_hash(merkle_root, merkle_root, 64); - else - sha256d(merkle_root, merkle_root, 64); - } - - /* Increment extranonce2 */ - for (i = 0; i < (int)sctx->xnonce2_size && !++sctx->job.xnonce2[i]; i++); - - /* Assemble block header */ - memset(work->data, 0, sizeof(work->data)); - work->data[0] = le32dec(sctx->job.version); - for (i = 0; i < 8; i++) - work->data[1 + i] = le32dec((uint32_t *)sctx->job.prevhash + i); - for (i = 0; i < 8; i++) - work->data[9 + i] = be32dec((uint32_t *)merkle_root + i); - work->data[17] = le32dec(sctx->job.ntime); - work->data[18] = le32dec(sctx->job.nbits); - if (opt_algo == ALGO_MJOLLNIR) - { - for (i = 0; i < 20; i++) - work->data[i] = be32dec((uint32_t *)&work->data[i]); - } - - work->data[20] = 0x80000000; - work->data[31] = (opt_algo == ALGO_MJOLLNIR) ? 0x000002A0 : 0x00000280; - - // HeavyCoin - if (opt_algo == ALGO_HEAVY) { - uint16_t *ext; - work->maxvote = 1024; - ext = (uint16_t*)(&work->data[20]); - ext[0] = opt_vote; - ext[1] = be16dec(sctx->job.nreward); - - for (i = 0; i < 20; i++) - work->data[i] = be32dec((uint32_t *)&work->data[i]); - } - // - - pthread_mutex_unlock(&sctx->work_lock); - - if (opt_debug) { - char *tm = atime2str(swab32(work->data[17]) - sctx->srvtime_diff); - char *xnonce2str = bin2hex(work->xnonce2, sctx->xnonce2_size); - applog(LOG_DEBUG, "DEBUG: job_id=%s xnonce2=%s time=%s", - work->job_id, xnonce2str, tm); - free(tm); - free(xnonce2str); - } - - if (opt_algo == ALGO_JACKPOT) - diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty)); - else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_DMD_GR || opt_algo == ALGO_FRESH) - diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty)); - else if (opt_algo == ALGO_KECCAK) - diff_to_target(work->target, sctx->job.diff / (128.0 * opt_difficulty)); - else - 
diff_to_target(work->target, sctx->job.diff / opt_difficulty); -} - -static void *miner_thread(void *userdata) -{ - struct thr_info *mythr = (struct thr_info *)userdata; - int thr_id = mythr->id; - struct work work; - uint32_t max_nonce; - uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - (thr_id + 1); - bool work_done = false; - bool extrajob = false; - char s[16]; - int rc = 0; - - memset(&work, 0, sizeof(work)); // prevent work from being used uninitialized - - /* Set worker threads to nice 19 and then preferentially to SCHED_IDLE - * and if that fails, then SCHED_BATCH. No need for this to be an - * error if it fails */ - if (!opt_benchmark) { - setpriority(PRIO_PROCESS, 0, 19); - drop_policy(); - } - - /* Cpu affinity only makes sense if the number of threads is a multiple - * of the number of CPUs */ - if (num_processors > 1 && opt_n_threads % num_processors == 0) { - if (!opt_quiet) - applog(LOG_DEBUG, "Binding thread %d to cpu %d", thr_id, - thr_id % num_processors); - affine_to_cpu(thr_id, thr_id % num_processors); - } - - while (1) { - unsigned long hashes_done; - uint32_t start_nonce; - struct timeval tv_start, tv_end, diff; - int64_t max64; - uint64_t umax64; - - // &work.data[19] - int wcmplen = 76; - uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen); - - if (have_stratum) { - uint32_t sleeptime = 0; - while (!work_done && time(NULL) >= (g_work_time + opt_scantime)) { - usleep(100*1000); - if (sleeptime > 4) { - extrajob = true; - break; - } - sleeptime++; - } - if (sleeptime && opt_debug && !opt_quiet) - applog(LOG_DEBUG, "sleeptime: %u ms", sleeptime*100); - nonceptr = (uint32_t*) (((char*)work.data) + wcmplen); - pthread_mutex_lock(&g_work_lock); - extrajob |= work_done; - if ((*nonceptr) >= end_nonce || extrajob) { - work_done = false; - extrajob = false; - stratum_gen_work(&stratum, &g_work); - } - } else { - int min_scantime = have_longpoll ? 
LP_SCANTIME : opt_scantime; - /* obtain new work from internal workio thread */ - pthread_mutex_lock(&g_work_lock); - if (time(NULL) - g_work_time >= min_scantime || - (*nonceptr) >= end_nonce) { - if (unlikely(!get_work(mythr, &g_work))) { - applog(LOG_ERR, "work retrieval failed, exiting " - "mining thread %d", mythr->id); - pthread_mutex_unlock(&g_work_lock); - goto out; - } - g_work_time = time(NULL); - } - } -#if 0 - if (!opt_benchmark && g_work.job_id[0] == '\0') { - applog(LOG_ERR, "work data not read yet"); - extrajob = true; - work_done = true; - sleep(1); - //continue; - } -#endif - if (rc > 1) { - /* if we found more than one on last loop */ - /* todo: handle an array to get them directly */ - pthread_mutex_unlock(&g_work_lock); - goto continue_scan; - } - - if (memcmp(work.target, g_work.target, sizeof(work.target))) { - calc_diff(&g_work, 0); - if (opt_debug) { - uint64_t target64 = g_work.target[7] * 0x100000000ULL + g_work.target[6]; - applog(LOG_DEBUG, "job %s target change: %llx (%.1f)", g_work.job_id, target64, g_work.difficulty); - } - memcpy(work.target, g_work.target, sizeof(work.target)); - work.difficulty = g_work.difficulty; - (*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr - /* on new target, ignoring nonce, clear sent data (hashlog) */ - if (memcmp(work.target, g_work.target, sizeof(work.target))) { - hashlog_purge_job(work.job_id); - } - } - if (memcmp(work.data, g_work.data, wcmplen)) { - if (opt_debug) { -#if 0 - for (int n=0; n <= (wcmplen-8); n+=8) { - if (memcmp(work.data + n, g_work.data + n, 8)) { - applog(LOG_DEBUG, "job %s work updated at offset %d:", g_work.job_id, n); - applog_hash((uint8_t*) work.data + n); - applog_compare_hash((uint8_t*) g_work.data + n, (uint8_t*) work.data + n); - } - } -#endif - } - memcpy(&work, &g_work, sizeof(struct work)); - (*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr - } else - (*nonceptr)++; //?? 
- work_restart[thr_id].restart = 0; - - if (opt_debug) - applog(LOG_DEBUG, "job %s %08x", g_work.job_id, (*nonceptr)); - pthread_mutex_unlock(&g_work_lock); - - /* adjust max_nonce to meet target scan time */ - if (have_stratum) - max64 = LP_SCANTIME; - else - max64 = g_work_time + (have_longpoll ? LP_SCANTIME : opt_scantime) - - time(NULL); - - max64 *= (int64_t)thr_hashrates[thr_id]; - - if (max64 <= 0) { - /* should not be set too high, - else you can miss multiple nounces */ - switch (opt_algo) { - case ALGO_JACKPOT: - max64 = 0x1fffLL; - break; - case ALGO_BLAKECOIN: - max64 = 0x3ffffffLL; - break; - case ALGO_BLAKE: - /* based on the 750Ti hashrate (100kH) */ - max64 = 0x1ffffffLL; - break; - default: - max64 = 0xfffffLL; - break; - } - } - - start_nonce = *nonceptr; - - /* do not recompute something already scanned */ - if (opt_algo == ALGO_BLAKE && opt_n_threads == 1) { - union { - uint64_t data; - uint32_t scanned[2]; - } range; - - range.data = hashlog_get_scan_range(work.job_id); - if (range.data) { - bool stall = false; - if (range.scanned[0] == 1 && range.scanned[1] == 0xFFFFFFFFUL) { - applog(LOG_WARNING, "detected a rescan of fully scanned job!"); - } else if (range.scanned[0] > 0 && range.scanned[1] > 0 && range.scanned[1] < 0xFFFFFFF0UL) { - /* continue scan the end */ - start_nonce = range.scanned[1] + 1; - //applog(LOG_DEBUG, "scan the next part %x + 1 (%x-%x)", range.scanned[1], range.scanned[0], range.scanned[1]); - } - - stall = (start_nonce == work.scanned_from && end_nonce == work.scanned_to); - stall |= (start_nonce == work.scanned_from && start_nonce == range.scanned[1] + 1); - stall |= (start_nonce > range.scanned[0] && start_nonce < range.scanned[1]); - - if (stall) { - if (opt_debug && !opt_quiet) - applog(LOG_DEBUG, "job done, wait for a new one..."); - work_restart[thr_id].restart = 1; - hashlog_purge_old(); - // wait a bit for a new job... 
- usleep(500*1000); - (*nonceptr) = end_nonce + 1; - work_done = true; - continue; - } - } - } - - umax64 = (uint64_t) max64; - if ((umax64 + start_nonce) >= end_nonce) - max_nonce = end_nonce; - else - max_nonce = (uint32_t) umax64 + start_nonce; - - work.scanned_from = start_nonce; - (*nonceptr) = start_nonce; - - hashes_done = 0; -continue_scan: - gettimeofday(&tv_start, NULL); - - /* scan nonces for a proof-of-work hash */ - switch (opt_algo) { - - case ALGO_HEAVY: - rc = scanhash_heavy(thr_id, work.data, work.target, - max_nonce, &hashes_done, work.maxvote, HEAVYCOIN_BLKHDR_SZ); - break; - - case ALGO_KECCAK: - rc = scanhash_keccak256(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_MJOLLNIR: - rc = scanhash_heavy(thr_id, work.data, work.target, - max_nonce, &hashes_done, 0, MNR_BLKHDR_SZ); - break; - - case ALGO_DEEP: - rc = scanhash_deep(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_DOOM: - case ALGO_LUFFA_DOOM: - rc = scanhash_doom(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_FUGUE256: - rc = scanhash_fugue256(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_GROESTL: - case ALGO_DMD_GR: - rc = scanhash_groestlcoin(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_MYR_GR: - rc = scanhash_myriad(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_JACKPOT: - rc = scanhash_jackpot(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_QUARK: - rc = scanhash_quark(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_QUBIT: - rc = scanhash_qubit(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_ANIME: - rc = scanhash_anime(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_BLAKECOIN: - rc = scanhash_blake256(thr_id, work.data, work.target, 
- max_nonce, &hashes_done, 8); - break; - - case ALGO_BLAKE: - rc = scanhash_blake256(thr_id, work.data, work.target, - max_nonce, &hashes_done, 14); - break; - - case ALGO_FRESH: - rc = scanhash_fresh(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_NIST5: - rc = scanhash_nist5(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_PENTABLAKE: - rc = scanhash_pentablake(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_S3: - rc = scanhash_s3(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_WHC: - rc = scanhash_whc(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_X11: - rc = scanhash_x11(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_X13: - rc = scanhash_x13(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_X14: - rc = scanhash_x14(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_X15: - rc = scanhash_x15(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_X17: - rc = scanhash_x17(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - default: - /* should never happen */ - goto out; - } - - /* record scanhash elapsed time */ - gettimeofday(&tv_end, NULL); - - if (rc && opt_debug) - applog(LOG_NOTICE, CL_CYN "found => %08x" CL_GRN " %08x", *nonceptr, swab32(*nonceptr)); - - timeval_subtract(&diff, &tv_end, &tv_start); - if (diff.tv_usec || diff.tv_sec) { - pthread_mutex_lock(&stats_lock); - if (diff.tv_sec + 1e-6 * diff.tv_usec > 0.0) { - thr_hashrates[thr_id] = hashes_done / (diff.tv_sec + 1e-6 * diff.tv_usec); - if (rc > 1) - thr_hashrates[thr_id] = (rc * hashes_done) / (diff.tv_sec + 1e-6 * diff.tv_usec); - } - pthread_mutex_unlock(&stats_lock); - } - if (!opt_quiet) { - sprintf(s, thr_hashrates[thr_id] >= 1e6 ? 
"%.0f" : "%.2f", - 1e-3 * thr_hashrates[thr_id]); - applog(LOG_INFO, "GPU #%d: %s, %s kH/s", - device_map[thr_id], device_name[thr_id], s); - } - if (thr_id == opt_n_threads - 1) { - double hashrate = 0.; - for (int i = 0; i < opt_n_threads && thr_hashrates[i]; i++) - hashrate += thr_hashrates[i]; - - global_hashrate = llround(hashrate); - if (opt_benchmark) { - sprintf(s, hashrate >= 1e6 ? "%.0f" : "%.2f", hashrate / 1000.); - applog(LOG_NOTICE, "Total: %s kH/s", s); - } - } - - if (rc) { - work.scanned_to = *nonceptr; - } else { - work.scanned_to = max_nonce; - } - - // could be used to store speeds too.. - hashlog_remember_scan_range(work.job_id, work.scanned_from, work.scanned_to); - - /* if nonce found, submit work */ - if (rc) { - if (!opt_benchmark && !submit_work(mythr, &work)) - break; - } - } - -out: - tq_freeze(mythr->q); - - return NULL; -} - -static void restart_threads(void) -{ - int i; - - for (i = 0; i < opt_n_threads; i++) - work_restart[i].restart = 1; -} - -static void *longpoll_thread(void *userdata) -{ - struct thr_info *mythr = (struct thr_info *)userdata; - CURL *curl = NULL; - char *copy_start, *hdr_path = NULL, *lp_url = NULL; - bool need_slash = false; - - curl = curl_easy_init(); - if (unlikely(!curl)) { - applog(LOG_ERR, "CURL initialization failed"); - goto out; - } - -start: - hdr_path = (char*)tq_pop(mythr->q, NULL); - if (!hdr_path) - goto out; - - /* full URL */ - if (strstr(hdr_path, "://")) { - lp_url = hdr_path; - hdr_path = NULL; - } - - /* absolute path, on current server */ - else { - copy_start = (*hdr_path == '/') ? (hdr_path + 1) : hdr_path; - if (rpc_url[strlen(rpc_url) - 1] != '/') - need_slash = true; - - lp_url = (char*)malloc(strlen(rpc_url) + strlen(copy_start) + 2); - if (!lp_url) - goto out; - - sprintf(lp_url, "%s%s%s", rpc_url, need_slash ? 
"/" : "", copy_start); - } - - applog(LOG_INFO, "Long-polling activated for %s", lp_url); - - while (1) { - json_t *val, *soval; - int err; - - val = json_rpc_call(curl, lp_url, rpc_userpass, rpc_req, - false, true, &err); - if (have_stratum) { - if (val) - json_decref(val); - goto out; - } - if (likely(val)) { - if (!opt_quiet) applog(LOG_INFO, "LONGPOLL detected new block"); - soval = json_object_get(json_object_get(val, "result"), "submitold"); - submit_old = soval ? json_is_true(soval) : false; - pthread_mutex_lock(&g_work_lock); - if (work_decode(json_object_get(val, "result"), &g_work)) { - if (opt_debug) - applog(LOG_BLUE, "LONGPOLL pushed new work"); - time(&g_work_time); - restart_threads(); - } - pthread_mutex_unlock(&g_work_lock); - json_decref(val); - } else { - pthread_mutex_lock(&g_work_lock); - g_work_time -= LP_SCANTIME; - pthread_mutex_unlock(&g_work_lock); - if (err == CURLE_OPERATION_TIMEDOUT) { - restart_threads(); - } else { - have_longpoll = false; - restart_threads(); - free(hdr_path); - free(lp_url); - lp_url = NULL; - sleep(opt_fail_pause); - goto start; - } - } - } - -out: - free(hdr_path); - free(lp_url); - tq_freeze(mythr->q); - if (curl) - curl_easy_cleanup(curl); - - return NULL; -} - -static bool stratum_handle_response(char *buf) -{ - json_t *val, *err_val, *res_val, *id_val; - json_error_t err; - bool ret = false; - - val = JSON_LOADS(buf, &err); - if (!val) { - applog(LOG_INFO, "JSON decode failed(%d): %s", err.line, err.text); - goto out; - } - - res_val = json_object_get(val, "result"); - err_val = json_object_get(val, "error"); - id_val = json_object_get(val, "id"); - - if (!id_val || json_is_null(id_val) || !res_val) - goto out; - - share_result(json_is_true(res_val), - err_val ? 
json_string_value(json_array_get(err_val, 1)) : NULL); - - ret = true; -out: - if (val) - json_decref(val); - - return ret; -} - -static void *stratum_thread(void *userdata) -{ - struct thr_info *mythr = (struct thr_info *)userdata; - char *s; - - stratum.url = (char*)tq_pop(mythr->q, NULL); - if (!stratum.url) - goto out; - applog(LOG_BLUE, "Starting Stratum on %s", stratum.url); - - while (1) { - int failures = 0; - - while (!stratum.curl) { - pthread_mutex_lock(&g_work_lock); - g_work_time = 0; - pthread_mutex_unlock(&g_work_lock); - restart_threads(); - - if (!stratum_connect(&stratum, stratum.url) || - !stratum_subscribe(&stratum) || - !stratum_authorize(&stratum, rpc_user, rpc_pass)) { - stratum_disconnect(&stratum); - if (opt_retries >= 0 && ++failures > opt_retries) { - applog(LOG_ERR, "...terminating workio thread"); - tq_push(thr_info[work_thr_id].q, NULL); - goto out; - } - if (!opt_benchmark) - applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); - sleep(opt_fail_pause); - } - } - - if (stratum.job.job_id && - (!g_work_time || strncmp(stratum.job.job_id, g_work.job_id + 8, 120))) { - pthread_mutex_lock(&g_work_lock); - stratum_gen_work(&stratum, &g_work); - time(&g_work_time); - if (stratum.job.clean) { - if (!opt_quiet) - applog(LOG_BLUE, "%s sent %s block %d", short_url, algo_names[opt_algo], - stratum.bloc_height); - restart_threads(); - hashlog_purge_old(); - } else if (!opt_quiet) { - applog(LOG_BLUE, "%s asks job %d for block %d", short_url, - strtoul(stratum.job.job_id, NULL, 16), stratum.bloc_height); - } - pthread_mutex_unlock(&g_work_lock); - } - - if (!stratum_socket_full(&stratum, 120)) { - applog(LOG_ERR, "Stratum connection timed out"); - s = NULL; - } else - s = stratum_recv_line(&stratum); - if (!s) { - stratum_disconnect(&stratum); - applog(LOG_ERR, "Stratum connection interrupted"); - continue; - } - if (!stratum_handle_method(&stratum, s)) - stratum_handle_response(s); - free(s); - } - -out: - return NULL; -} - -#define 
PROGRAM_VERSION "1.4.7.SP" -static void show_version_and_exit(void) -{ - printf("%s v%s\n" -#ifdef WIN32 - "pthreads static %s\n" -#endif - "%s\n", - PACKAGE_STRING, PROGRAM_VERSION, -#ifdef WIN32 - PTW32_VERSION_STRING, -#endif - curl_version()); - proper_exit(0); -} - -static void show_usage_and_exit(int status) -{ - if (status) - fprintf(stderr, "Try `" PROGRAM_NAME " --help' for more information.\n"); - else - printf(usage); - proper_exit(status); -} - -static void parse_arg (int key, char *arg) -{ - char *p; - int v, i; - double d; - - switch(key) { - case 'a': - for (i = 0; i < ARRAY_SIZE(algo_names); i++) { - if (algo_names[i] && - !strcmp(arg, algo_names[i])) { - opt_algo = (sha256_algos)i; - break; - } - } - if (i == ARRAY_SIZE(algo_names)) - show_usage_and_exit(1); - break; - case 'B': - opt_background = true; - break; - case 'c': { - json_error_t err; - if (opt_config) - json_decref(opt_config); -#if JANSSON_VERSION_HEX >= 0x020000 - opt_config = json_load_file(arg, 0, &err); -#else - opt_config = json_load_file(arg, &err); -#endif - if (!json_is_object(opt_config)) { - applog(LOG_ERR, "JSON decode of %s failed", arg); - proper_exit(1); - } - break; - } - case 'C': - /* color for compat */ - use_colors = true; - break; - case 'K': - use_colors = false; - break; - case 'D': - opt_debug = true; - break; - case 'q': - opt_quiet = true; - break; - case 'p': - free(rpc_pass); - rpc_pass = strdup(arg); - break; - case 'P': - opt_protocol = true; - break; - case 'r': - v = atoi(arg); - if (v < -1 || v > 9999) /* sanity check */ - show_usage_and_exit(1); - opt_retries = v; - break; - case 'R': - v = atoi(arg); - if (v < 1 || v > 9999) /* sanity check */ - show_usage_and_exit(1); - opt_fail_pause = v; - break; - case 's': - v = atoi(arg); - if (v < 1 || v > 9999) /* sanity check */ - show_usage_and_exit(1); - opt_scantime = v; - break; - case 'T': - v = atoi(arg); - if (v < 1 || v > 99999) /* sanity check */ - show_usage_and_exit(1); - opt_timeout = v; - break; - 
case 't': - v = atoi(arg); - if (v < 1 || v > 9999) /* sanity check */ - show_usage_and_exit(1); - opt_n_threads = v; - break; - case 'v': - v = atoi(arg); - if (v < 0 || v > 1024) /* sanity check */ - show_usage_and_exit(1); - opt_vote = (uint16_t)v; - break; - case 'm': - opt_trust_pool = true; - break; - case 'u': - free(rpc_user); - rpc_user = strdup(arg); - break; - case 'o': /* --url */ - p = strstr(arg, "://"); - if (p) { - if (strncasecmp(arg, "http://", 7) && strncasecmp(arg, "https://", 8) && - strncasecmp(arg, "stratum+tcp://", 14)) - show_usage_and_exit(1); - free(rpc_url); - rpc_url = strdup(arg); - short_url = &rpc_url[(p - arg) + 3]; - } else { - if (!strlen(arg) || *arg == '/') - show_usage_and_exit(1); - free(rpc_url); - rpc_url = (char*)malloc(strlen(arg) + 8); - sprintf(rpc_url, "http://%s", arg); - short_url = &rpc_url[7]; - } - p = strrchr(rpc_url, '@'); - if (p) { - char *sp, *ap; - *p = '\0'; - ap = strstr(rpc_url, "://") + 3; - sp = strchr(ap, ':'); - if (sp) { - free(rpc_userpass); - rpc_userpass = strdup(ap); - free(rpc_user); - rpc_user = (char*)calloc(sp - ap + 1, 1); - strncpy(rpc_user, ap, sp - ap); - free(rpc_pass); - rpc_pass = strdup(sp + 1); - } else { - free(rpc_user); - rpc_user = strdup(ap); - } - memmove(ap, p + 1, strlen(p + 1) + 1); - short_url = p + 1; - } - have_stratum = !opt_benchmark && !strncasecmp(rpc_url, "stratum", 7); - break; - case 'O': /* --userpass */ - p = strchr(arg, ':'); - if (!p) - show_usage_and_exit(1); - free(rpc_userpass); - rpc_userpass = strdup(arg); - free(rpc_user); - rpc_user = (char*)calloc(p - arg + 1, 1); - strncpy(rpc_user, arg, p - arg); - free(rpc_pass); - rpc_pass = strdup(p + 1); - break; - case 'x': /* --proxy */ - if (!strncasecmp(arg, "socks4://", 9)) - opt_proxy_type = CURLPROXY_SOCKS4; - else if (!strncasecmp(arg, "socks5://", 9)) - opt_proxy_type = CURLPROXY_SOCKS5; -#if LIBCURL_VERSION_NUM >= 0x071200 - else if (!strncasecmp(arg, "socks4a://", 10)) - opt_proxy_type = 
CURLPROXY_SOCKS4A; - else if (!strncasecmp(arg, "socks5h://", 10)) - opt_proxy_type = CURLPROXY_SOCKS5_HOSTNAME; -#endif - else - opt_proxy_type = CURLPROXY_HTTP; - free(opt_proxy); - opt_proxy = strdup(arg); - break; - case 1001: - free(opt_cert); - opt_cert = strdup(arg); - break; - case 1005: - opt_benchmark = true; - want_longpoll = false; - want_stratum = false; - have_stratum = false; - break; - case 1006: - print_hash_tests(); - proper_exit(0); - break; - case 1003: - want_longpoll = false; - break; - case 1007: - want_stratum = false; - break; - case 'S': - use_syslog = true; - break; - case 'd': // CB - { - char * pch = strtok (arg,","); - opt_n_threads = 0; - while (pch != NULL) { - if (pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0') - { - if (atoi(pch) < num_processors) - device_map[opt_n_threads++] = atoi(pch); - else { - applog(LOG_ERR, "Non-existant CUDA device #%d specified in -d option", atoi(pch)); - proper_exit(1); - } - } else { - int device = cuda_finddevice(pch); - if (device >= 0 && device < num_processors) - device_map[opt_n_threads++] = device; - else { - applog(LOG_ERR, "Non-existant CUDA device '%s' specified in -d option", pch); - proper_exit(1); - } - } - pch = strtok (NULL, ","); - } - } - break; - case 'f': // CH - Divisor for Difficulty - d = atof(arg); - if (d == 0) /* sanity check */ - show_usage_and_exit(1); - opt_difficulty = d; - break; - case 'V': - show_version_and_exit(); - case 'h': - show_usage_and_exit(0); - default: - show_usage_and_exit(1); - } - - if (use_syslog) - use_colors = false; -} - -static void parse_config(void) -{ - int i; - json_t *val; - - if (!json_is_object(opt_config)) - return; - - for (i = 0; i < ARRAY_SIZE(options); i++) { - if (!options[i].name) - break; - if (!strcmp(options[i].name, "config")) - continue; - - val = json_object_get(opt_config, options[i].name); - if (!val) - continue; - - if (options[i].has_arg && json_is_string(val)) { - char *s = strdup(json_string_value(val)); - if (!s) - break; 
- parse_arg(options[i].val, s); - free(s); - } else if (!options[i].has_arg && json_is_true(val)) - parse_arg(options[i].val, ""); - else - applog(LOG_ERR, "JSON option %s invalid", - options[i].name); - } - - if (opt_algo == ALGO_HEAVY && opt_vote == 9999) { - fprintf(stderr, "Heavycoin hash requires block reward vote parameter (see --vote)\n"); - show_usage_and_exit(1); - } -} - -static void parse_cmdline(int argc, char *argv[]) -{ - int key; - - while (1) { -#if HAVE_GETOPT_LONG - key = getopt_long(argc, argv, short_options, options, NULL); -#else - key = getopt(argc, argv, short_options); -#endif - if (key < 0) - break; - - parse_arg(key, optarg); - } - if (optind < argc) { - fprintf(stderr, "%s: unsupported non-option argument '%s'\n", - argv[0], argv[optind]); - show_usage_and_exit(1); - } - - if (opt_algo == ALGO_HEAVY && opt_vote == 9999) { - fprintf(stderr, "%s: Heavycoin hash requires block reward vote parameter (see --vote)\n", - argv[0]); - show_usage_and_exit(1); - } - - parse_config(); -} - -#ifndef WIN32 -static void signal_handler(int sig) -{ - switch (sig) { - case SIGHUP: - applog(LOG_INFO, "SIGHUP received"); - break; - case SIGINT: - signal(sig, SIG_IGN); - applog(LOG_INFO, "SIGINT received, exiting"); - proper_exit(0); - break; - case SIGTERM: - applog(LOG_INFO, "SIGTERM received, exiting"); - proper_exit(0); - break; - } -} -#else -BOOL WINAPI ConsoleHandler(DWORD dwType) -{ - switch (dwType) { - case CTRL_C_EVENT: - applog(LOG_INFO, "CTRL_C_EVENT received, exiting"); - proper_exit(0); - break; - case CTRL_BREAK_EVENT: - applog(LOG_INFO, "CTRL_BREAK_EVENT received, exiting"); - proper_exit(0); - break; - default: - return false; - } - return true; -} -#endif - -int main(int argc, char *argv[]) -{ - struct thr_info *thr; - long flags; - int i; - - printf("*** ccMiner for nVidia GPUs by Christian Buchner and Christian H. 
***\n"); - printf("\t This is the forked version "PROGRAM_VERSION" (sp-hash@github)\n"); -#ifdef WIN32 - printf("\t Built with VC++ 2013 and nVidia CUDA SDK 6.5\n\n"); -#else - printf("\t Built with the nVidia CUDA SDK 6.5\n\n"); -#endif - printf("\t based on pooler-cpuminer 2.3.2 (c) 2010 Jeff Garzik, 2012 pooler\n"); - printf("\t and HVC extension from http://hvc.1gh.com/" "\n\n"); - printf("\tCuda additions Copyright 2014 Christian Buchner, Christian H.\n\n"); - printf("\tInclude some of djm34 additions, cleaned by Tanguy Pruvot\n"); - printf("\t\t Optimized Kernals By SP^Cryptoburners.\n\n"); - - rpc_user = strdup(""); - rpc_pass = strdup(""); - - pthread_mutex_init(&applog_lock, NULL); - num_processors = cuda_num_devices(); - - /* parse command line */ - parse_cmdline(argc, argv); - - cuda_devicenames(); - - if (!opt_benchmark && !rpc_url) { - fprintf(stderr, "%s: no URL supplied\n", argv[0]); - show_usage_and_exit(1); - } - - if (!rpc_userpass) { - rpc_userpass = (char*)malloc(strlen(rpc_user) + strlen(rpc_pass) + 2); - if (!rpc_userpass) - return 1; - sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass); - } - - /* init stratum data.. */ - memset(&stratum.url, 0, sizeof(stratum)); - - pthread_mutex_init(&stats_lock, NULL); - pthread_mutex_init(&g_work_lock, NULL); - pthread_mutex_init(&stratum.sock_lock, NULL); - pthread_mutex_init(&stratum.work_lock, NULL); - - flags = !opt_benchmark && strncmp(rpc_url, "https:", 6) - ? 
(CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL) - : CURL_GLOBAL_ALL; - if (curl_global_init(flags)) { - applog(LOG_ERR, "CURL initialization failed"); - return 1; - } - -#ifndef WIN32 - if (opt_background) { - i = fork(); - if (i < 0) exit(1); - if (i > 0) exit(0); - i = setsid(); - if (i < 0) - applog(LOG_ERR, "setsid() failed (errno = %d)", errno); - i = chdir("/"); - if (i < 0) - applog(LOG_ERR, "chdir() failed (errno = %d)", errno); - signal(SIGHUP, signal_handler); - signal(SIGTERM, signal_handler); - } - /* Always catch Ctrl+C */ - signal(SIGINT, signal_handler); -#else - SetConsoleCtrlHandler((PHANDLER_ROUTINE)ConsoleHandler, TRUE); -#endif - - if (num_processors == 0) - { - applog(LOG_ERR, "No CUDA devices found! terminating."); - exit(1); - } - if (!opt_n_threads) - opt_n_threads = num_processors; - -#ifdef HAVE_SYSLOG_H - if (use_syslog) - openlog("cpuminer", LOG_PID, LOG_USER); -#endif - - work_restart = (struct work_restart *)calloc(opt_n_threads, sizeof(*work_restart)); - if (!work_restart) - return 1; - - thr_info = (struct thr_info *)calloc(opt_n_threads + 3, sizeof(*thr)); - if (!thr_info) - return 1; - - thr_hashrates = (double *) calloc(opt_n_threads, sizeof(double)); - if (!thr_hashrates) - return 1; - - /* init workio thread info */ - work_thr_id = opt_n_threads; - thr = &thr_info[work_thr_id]; - thr->id = work_thr_id; - thr->q = tq_new(); - if (!thr->q) - return 1; - - /* start work I/O thread */ - if (pthread_create(&thr->pth, NULL, workio_thread, thr)) { - applog(LOG_ERR, "workio thread create failed"); - return 1; - } - - if (want_longpoll && !have_stratum) { - /* init longpoll thread info */ - longpoll_thr_id = opt_n_threads + 1; - thr = &thr_info[longpoll_thr_id]; - thr->id = longpoll_thr_id; - thr->q = tq_new(); - if (!thr->q) - return 1; - - /* start longpoll thread */ - if (unlikely(pthread_create(&thr->pth, NULL, longpoll_thread, thr))) { - applog(LOG_ERR, "longpoll thread create failed"); - return 1; - } - } - if (want_stratum) { - /* init 
stratum thread info */ - stratum_thr_id = opt_n_threads + 2; - thr = &thr_info[stratum_thr_id]; - thr->id = stratum_thr_id; - thr->q = tq_new(); - if (!thr->q) - return 1; - - /* start stratum thread */ - if (unlikely(pthread_create(&thr->pth, NULL, stratum_thread, thr))) { - applog(LOG_ERR, "stratum thread create failed"); - return 1; - } - - if (have_stratum) - tq_push(thr_info[stratum_thr_id].q, strdup(rpc_url)); - } - - /* start mining threads */ - for (i = 0; i < opt_n_threads; i++) { - thr = &thr_info[i]; - - thr->id = i; - thr->q = tq_new(); - if (!thr->q) - return 1; - - if (unlikely(pthread_create(&thr->pth, NULL, miner_thread, thr))) { - applog(LOG_ERR, "thread %d create failed", i); - return 1; - } - } - - applog(LOG_INFO, "%d miner threads started, " - "using '%s' algorithm.", - opt_n_threads, - algo_names[opt_algo]); - -#ifdef WIN32 - timeBeginPeriod(1); // enable high timer precision (similar to Google Chrome Trick) -#endif - - /* main loop - simply wait for workio thread to exit */ - pthread_join(thr_info[work_thr_id].pth, NULL); - -#ifdef WIN32 - timeEndPeriod(1); // be nice and forego high timer precision -#endif - - applog(LOG_INFO, "workio thread dead, exiting."); - - return 0; -} diff --git a/cpuminer-config.h.in b/cpuminer-config.h.in deleted file mode 100644 index c172559ba5..0000000000 --- a/cpuminer-config.h.in +++ /dev/null @@ -1,196 +0,0 @@ -/* cpuminer-config.h.in. Generated from configure.ac by autoheader. */ - -/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP - systems. This function is required for `alloca.c' support on those systems. - */ -#undef CRAY_STACKSEG_END - -/* Define to 1 if using `alloca.c'. */ -#undef C_ALLOCA - -/* Define to 1 if you have `alloca', as a function or macro. */ -#undef HAVE_ALLOCA - -/* Define to 1 if you have and it should be used (not on Ultrix). - */ -#undef HAVE_ALLOCA_H - -/* Define to 1 if you have the declaration of `be32dec', and to 0 if you - don't. 
*/ -#undef HAVE_DECL_BE32DEC - -/* Define to 1 if you have the declaration of `be32enc', and to 0 if you - don't. */ -#undef HAVE_DECL_BE32ENC - -/* Define to 1 if you have the declaration of `le32dec', and to 0 if you - don't. */ -#undef HAVE_DECL_LE32DEC - -/* Define to 1 if you have the declaration of `le32enc', and to 0 if you - don't. */ -#undef HAVE_DECL_LE32ENC - -/* Define to 1 if you have the `getopt_long' function. */ -#undef HAVE_GETOPT_LONG - -/* Define to 1 if you have the header file. */ -#undef HAVE_INTTYPES_H - -/* Define to 1 if you have the `crypto' library (-lcrypto). */ -#undef HAVE_LIBCRYPTO - -/* Define to 1 if you have a functional curl library. */ -#undef HAVE_LIBCURL - -/* Define to 1 if you have the `ssl' library (-lssl). */ -#undef HAVE_LIBSSL - -/* Define to 1 if you have the header file. */ -#undef HAVE_MEMORY_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STDINT_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STDLIB_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STRINGS_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STRING_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYSLOG_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_ENDIAN_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_PARAM_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_STAT_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_SYSCTL_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_TYPES_H - -/* Define to 1 if you have the header file. 
*/ -#undef HAVE_UNISTD_H - -/* Defined if libcurl supports AsynchDNS */ -#undef LIBCURL_FEATURE_ASYNCHDNS - -/* Defined if libcurl supports IDN */ -#undef LIBCURL_FEATURE_IDN - -/* Defined if libcurl supports IPv6 */ -#undef LIBCURL_FEATURE_IPV6 - -/* Defined if libcurl supports KRB4 */ -#undef LIBCURL_FEATURE_KRB4 - -/* Defined if libcurl supports libz */ -#undef LIBCURL_FEATURE_LIBZ - -/* Defined if libcurl supports NTLM */ -#undef LIBCURL_FEATURE_NTLM - -/* Defined if libcurl supports SSL */ -#undef LIBCURL_FEATURE_SSL - -/* Defined if libcurl supports SSPI */ -#undef LIBCURL_FEATURE_SSPI - -/* Defined if libcurl supports DICT */ -#undef LIBCURL_PROTOCOL_DICT - -/* Defined if libcurl supports FILE */ -#undef LIBCURL_PROTOCOL_FILE - -/* Defined if libcurl supports FTP */ -#undef LIBCURL_PROTOCOL_FTP - -/* Defined if libcurl supports FTPS */ -#undef LIBCURL_PROTOCOL_FTPS - -/* Defined if libcurl supports HTTP */ -#undef LIBCURL_PROTOCOL_HTTP - -/* Defined if libcurl supports HTTPS */ -#undef LIBCURL_PROTOCOL_HTTPS - -/* Defined if libcurl supports IMAP */ -#undef LIBCURL_PROTOCOL_IMAP - -/* Defined if libcurl supports LDAP */ -#undef LIBCURL_PROTOCOL_LDAP - -/* Defined if libcurl supports POP3 */ -#undef LIBCURL_PROTOCOL_POP3 - -/* Defined if libcurl supports RTSP */ -#undef LIBCURL_PROTOCOL_RTSP - -/* Defined if libcurl supports SMTP */ -#undef LIBCURL_PROTOCOL_SMTP - -/* Defined if libcurl supports TELNET */ -#undef LIBCURL_PROTOCOL_TELNET - -/* Defined if libcurl supports TFTP */ -#undef LIBCURL_PROTOCOL_TFTP - -/* Name of package */ -#undef PACKAGE - -/* Define to the address where bug reports for this package should be sent. */ -#undef PACKAGE_BUGREPORT - -/* Define to the full name of this package. */ -#undef PACKAGE_NAME - -/* Define to the full name and version of this package. */ -#undef PACKAGE_STRING - -/* Define to the one symbol short name of this package. */ -#undef PACKAGE_TARNAME - -/* Define to the home page for this package. 
*/ -#undef PACKAGE_URL - -/* Define to the version of this package. */ -#undef PACKAGE_VERSION - -/* If using the C implementation of alloca, define if you know the - direction of stack growth for your system; otherwise it will be - automatically deduced at runtime. - STACK_DIRECTION > 0 => grows toward higher addresses - STACK_DIRECTION < 0 => grows toward lower addresses - STACK_DIRECTION = 0 => direction of growth unknown */ -#undef STACK_DIRECTION - -/* Define to 1 if you have the ANSI C header files. */ -#undef STDC_HEADERS - -/* Define to 1 if AVX assembly is available. */ -#undef USE_AVX - -/* Define to 1 if AVX2 assembly is available. */ -#undef USE_AVX2 - -/* Define to 1 if XOP assembly is available. */ -#undef USE_XOP - -/* Version number of package */ -#undef VERSION - -/* Define curl_free() as free() if our version of curl lacks curl_free. */ -#undef curl_free - -/* Define to `unsigned int' if does not define. */ -#undef size_t diff --git a/crc32.c b/crc32.c index f036bcbd3b..ad65c57120 100644 --- a/crc32.c +++ b/crc32.c @@ -40,8 +40,13 @@ * CRC32 code derived from work by Gary S. Brown. */ -#include +#ifdef __cplusplus +#include +#include +#else #include +#include +#endif static uint32_t crc32_tab[] = { 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, diff --git a/cuPrintf.cu b/cuPrintf.cu deleted file mode 100644 index f06653f2db..0000000000 --- a/cuPrintf.cu +++ /dev/null @@ -1,879 +0,0 @@ -/* - Copyright 2009 NVIDIA Corporation. All rights reserved. - - NOTICE TO LICENSEE: - - This source code and/or documentation ("Licensed Deliverables") are subject - to NVIDIA intellectual property rights under U.S. and international Copyright - laws. - - These Licensed Deliverables contained herein is PROPRIETARY and CONFIDENTIAL - to NVIDIA and is being provided under the terms and conditions of a form of - NVIDIA software license agreement by and between NVIDIA and Licensee ("License - Agreement") or electronically accepted by Licensee. 
Notwithstanding any terms - or conditions to the contrary in the License Agreement, reproduction or - disclosure of the Licensed Deliverables to any third party without the express - written consent of NVIDIA is prohibited. - - NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE LICENSE AGREEMENT, - NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THESE LICENSED - DELIVERABLES FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED - WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE - LICENSED DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. NOTWITHSTANDING ANY - TERMS OR CONDITIONS TO THE CONTRARY IN THE LICENSE AGREEMENT, IN NO EVENT SHALL - NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, - OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER - IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THESE LICENSED DELIVERABLES. - - U.S. Government End Users. These Licensed Deliverables are a "commercial item" - as that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of - "commercial computer software" and "commercial computer software documentation" - as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) and is provided to the - U.S. Government only as a commercial end item. Consistent with 48 C.F.R.12.212 - and 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all U.S. Government - End Users acquire the Licensed Deliverables with only those rights set forth - herein. - - Any use of the Licensed Deliverables in individual and commercial software must - include, in the user documentation and internal comments to the code, the above - Disclaimer and U.S. Government End Users Notice. - */ - -/* - * cuPrintf.cu - * - * This is a printf command callable from within a kernel. 
It is set - * up so that output is sent to a memory buffer, which is emptied from - * the host side - but only after a cudaThreadSynchronize() on the host. - * - * Currently, there is a limitation of around 200 characters of output - * and no more than 10 arguments to a single cuPrintf() call. Issue - * multiple calls if longer format strings are required. - * - * It requires minimal setup, and is *NOT* optimised for performance. - * For example, writes are not coalesced - this is because there is an - * assumption that people will not want to printf from every single one - * of thousands of threads, but only from individual threads at a time. - * - * Using this is simple - it requires one host-side call to initialise - * everything, and then kernels can call cuPrintf at will. Sample code - * is the easiest way to demonstrate: - * - #include "cuPrintf.cu" - - __global__ void testKernel(int val) - { - cuPrintf("Value is: %d\n", val); - } - - int main() - { - cudaPrintfInit(); - testKernel<<< 2, 3 >>>(10); - cudaPrintfDisplay(stdout, true); - cudaPrintfEnd(); - return 0; - } - * - * See the header file, "cuPrintf.cuh" for more info, especially - * arguments to cudaPrintfInit() and cudaPrintfDisplay(); - */ - -#ifndef CUPRINTF_CU -#define CUPRINTF_CU - -#include "cuPrintf.cuh" -#if __CUDA_ARCH__ > 100 // Atomics only used with > sm_10 architecture -#include -#endif - -// This is the smallest amount of memory, per-thread, which is allowed. -// It is also the largest amount of space a single printf() can take up -const static int CUPRINTF_MAX_LEN = 256; - -// This structure is used internally to track block/thread output restrictions. -typedef struct __align__(8) { - int threadid; // CUPRINTF_UNRESTRICTED for unrestricted - int blockid; // CUPRINTF_UNRESTRICTED for unrestricted -} cuPrintfRestriction; - -// The main storage is in a global print buffer, which has a known -// start/end/length. These are atomically updated so it works as a -// circular buffer. 
-// Since the only control primitive that can be used is atomicAdd(), -// we cannot wrap the pointer as such. The actual address must be -// calculated from printfBufferPtr by mod-ing with printfBufferLength. -// For sm_10 architecture, we must subdivide the buffer per-thread -// since we do not even have an atomic primitive. -__constant__ static char *globalPrintfBuffer = NULL; // Start of circular buffer (set up by host) -__constant__ static int printfBufferLength = 0; // Size of circular buffer (set up by host) -__device__ static cuPrintfRestriction restrictRules; // Output restrictions -__device__ volatile static char *printfBufferPtr = NULL; // Current atomically-incremented non-wrapped offset - -// This is the header preceeding all printf entries. -// NOTE: It *must* be size-aligned to the maximum entity size (size_t) -typedef struct __align__(8) { - unsigned short magic; // Magic number says we're valid - unsigned short fmtoffset; // Offset of fmt string into buffer - unsigned short blockid; // Block ID of author - unsigned short threadid; // Thread ID of author -} cuPrintfHeader; - -// Special header for sm_10 architecture -#define CUPRINTF_SM10_MAGIC 0xC810 // Not a valid ascii character -typedef struct __align__(16) { - unsigned short magic; // sm_10 specific magic number - unsigned short unused; - unsigned int thread_index; // thread ID for this buffer - unsigned int thread_buf_len; // per-thread buffer length - unsigned int offset; // most recent printf's offset -} cuPrintfHeaderSM10; - - -// Because we can't write an element which is not aligned to its bit-size, -// we have to align all sizes and variables on maximum-size boundaries. 
-// That means sizeof(double) in this case, but we'll use (long long) for -// better arch<1.3 support -#define CUPRINTF_ALIGN_SIZE sizeof(long long) - -// All our headers are prefixed with a magic number so we know they're ready -#define CUPRINTF_SM11_MAGIC (unsigned short)0xC811 // Not a valid ascii character - - -// -// getNextPrintfBufPtr -// -// Grabs a block of space in the general circular buffer, using an -// atomic function to ensure that it's ours. We handle wrapping -// around the circular buffer and return a pointer to a place which -// can be written to. -// -// Important notes: -// 1. We always grab CUPRINTF_MAX_LEN bytes -// 2. Because of 1, we never worry about wrapping around the end -// 3. Because of 1, printfBufferLength *must* be a factor of CUPRINTF_MAX_LEN -// -// This returns a pointer to the place where we own. -// -__device__ static char *getNextPrintfBufPtr() -{ - // Initialisation check - if(!printfBufferPtr) - return NULL; - - // Thread/block restriction check - if((restrictRules.blockid != CUPRINTF_UNRESTRICTED) && (restrictRules.blockid != (blockIdx.x + gridDim.x*blockIdx.y))) - return NULL; - if((restrictRules.threadid != CUPRINTF_UNRESTRICTED) && (restrictRules.threadid != (threadIdx.x + blockDim.x*threadIdx.y + blockDim.x*blockDim.y*threadIdx.z))) - return NULL; - - // Conditional section, dependent on architecture -#if __CUDA_ARCH__ == 100 - // For sm_10 architectures, we have no atomic add - this means we must split the - // entire available buffer into per-thread blocks. Inefficient, but what can you do. - int thread_count = (gridDim.x * gridDim.y) * (blockDim.x * blockDim.y * blockDim.z); - int thread_index = threadIdx.x + blockDim.x*threadIdx.y + blockDim.x*blockDim.y*threadIdx.z + - (blockIdx.x + gridDim.x*blockIdx.y) * (blockDim.x * blockDim.y * blockDim.z); - - // Find our own block of data and go to it. 
Make sure the per-thread length - // is a precise multiple of CUPRINTF_MAX_LEN, otherwise we risk size and - // alignment issues! We must round down, of course. - unsigned int thread_buf_len = printfBufferLength / thread_count; - thread_buf_len &= ~(CUPRINTF_MAX_LEN-1); - - // We *must* have a thread buffer length able to fit at least two printfs (one header, one real) - if(thread_buf_len < (CUPRINTF_MAX_LEN * 2)) - return NULL; - - // Now address our section of the buffer. The first item is a header. - char *myPrintfBuffer = globalPrintfBuffer + (thread_buf_len * thread_index); - cuPrintfHeaderSM10 hdr = *(cuPrintfHeaderSM10 *)(void *)myPrintfBuffer; - if(hdr.magic != CUPRINTF_SM10_MAGIC) - { - // If our header is not set up, initialise it - hdr.magic = CUPRINTF_SM10_MAGIC; - hdr.thread_index = thread_index; - hdr.thread_buf_len = thread_buf_len; - hdr.offset = 0; // Note we start at 0! We pre-increment below. - *(cuPrintfHeaderSM10 *)(void *)myPrintfBuffer = hdr; // Write back the header - - // For initial setup purposes, we might need to init thread0's header too - // (so that cudaPrintfDisplay() below will work). This is only run once. - cuPrintfHeaderSM10 *tophdr = (cuPrintfHeaderSM10 *)(void *)globalPrintfBuffer; - tophdr->thread_buf_len = thread_buf_len; - } - - // Adjust the offset by the right amount, and wrap it if need be - unsigned int offset = hdr.offset + CUPRINTF_MAX_LEN; - if(offset >= hdr.thread_buf_len) - offset = CUPRINTF_MAX_LEN; - - // Write back the new offset for next time and return a pointer to it - ((cuPrintfHeaderSM10 *)(void *)myPrintfBuffer)->offset = offset; - return myPrintfBuffer + offset; -#else - // Much easier with an atomic operation! 
- size_t offset = atomicAdd((unsigned int *)&printfBufferPtr, CUPRINTF_MAX_LEN) - (size_t)globalPrintfBuffer; - offset %= printfBufferLength; - return globalPrintfBuffer + offset; -#endif -} - - -// -// writePrintfHeader -// -// Inserts the header for containing our UID, fmt position and -// block/thread number. We generate it dynamically to avoid -// issues arising from requiring pre-initialisation. -// -__device__ static void writePrintfHeader(char *ptr, char *fmtptr) -{ - if(ptr) - { - cuPrintfHeader header; - header.magic = CUPRINTF_SM11_MAGIC; - header.fmtoffset = (unsigned short)(fmtptr - ptr); - header.blockid = blockIdx.x + gridDim.x*blockIdx.y; - header.threadid = threadIdx.x + blockDim.x*threadIdx.y + blockDim.x*blockDim.y*threadIdx.z; - *(cuPrintfHeader *)(void *)ptr = header; - } -} - - -// -// cuPrintfStrncpy -// -// This special strncpy outputs an aligned length value, followed by the -// string. It then zero-pads the rest of the string until a 64-aligned -// boundary. The length *includes* the padding. A pointer to the byte -// just after the \0 is returned. -// -// This function could overflow CUPRINTF_MAX_LEN characters in our buffer. -// To avoid it, we must count as we output and truncate where necessary. -// -__device__ static char *cuPrintfStrncpy(char *dest, const char *src, int n, char *end) -{ - // Initialisation and overflow check - if(!dest || !src || (dest >= end)) - return NULL; - - // Prepare to write the length specifier. We're guaranteed to have - // at least "CUPRINTF_ALIGN_SIZE" bytes left because we only write out in - // chunks that size, and CUPRINTF_MAX_LEN is aligned with CUPRINTF_ALIGN_SIZE. - int *lenptr = (int *)(void *)dest; - int len = 0; - dest += CUPRINTF_ALIGN_SIZE; - - // Now copy the string - while(n--) - { - if(dest >= end) // Overflow check - break; - - len++; - *dest++ = *src; - if(*src++ == '\0') - break; - } - - // Now write out the padding bytes, and we have our length. 
- while((dest < end) && (((long)dest & (CUPRINTF_ALIGN_SIZE-1)) != 0)) - { - len++; - *dest++ = 0; - } - *lenptr = len; - return (dest < end) ? dest : NULL; // Overflow means return NULL -} - - -// -// copyArg -// -// This copies a length specifier and then the argument out to the -// data buffer. Templates let the compiler figure all this out at -// compile-time, making life much simpler from the programming -// point of view. I'm assuimg all (const char *) is a string, and -// everything else is the variable it points at. I'd love to see -// a better way of doing it, but aside from parsing the format -// string I can't think of one. -// -// The length of the data type is inserted at the beginning (so that -// the display can distinguish between float and double), and the -// pointer to the end of the entry is returned. -// -__device__ static char *copyArg(char *ptr, const char *arg, char *end) -{ - // Initialisation check - if(!ptr || !arg) - return NULL; - - // strncpy does all our work. We just terminate. - if((ptr = cuPrintfStrncpy(ptr, arg, CUPRINTF_MAX_LEN, end)) != NULL) - *ptr = 0; - - return ptr; -} - -template -__device__ static char *copyArg(char *ptr, T &arg, char *end) -{ - // Initisalisation and overflow check. Alignment rules mean that - // we're at least CUPRINTF_ALIGN_SIZE away from "end", so we only need - // to check that one offset. - if(!ptr || ((ptr+CUPRINTF_ALIGN_SIZE) >= end)) - return NULL; - - // Write the length and argument - *(int *)(void *)ptr = sizeof(arg); - ptr += CUPRINTF_ALIGN_SIZE; - *(T *)(void *)ptr = arg; - ptr += CUPRINTF_ALIGN_SIZE; - *ptr = 0; - - return ptr; -} - - -// -// cuPrintf -// -// Templated printf functions to handle multiple arguments. -// Note we return the total amount of data copied, not the number -// of characters output. But then again, who ever looks at the -// return from printf() anyway? 
-// -// The format is to grab a block of circular buffer space, the -// start of which will hold a header and a pointer to the format -// string. We then write in all the arguments, and finally the -// format string itself. This is to make it easy to prevent -// overflow of our buffer (we support up to 10 arguments, each of -// which can be 12 bytes in length - that means that only the -// format string (or a %s) can actually overflow; so the overflow -// check need only be in the strcpy function. -// -// The header is written at the very last because that's what -// makes it look like we're done. -// -// Errors, which are basically lack-of-initialisation, are ignored -// in the called functions because NULL pointers are passed around -// - -// All printf variants basically do the same thing, setting up the -// buffer, writing all arguments, then finalising the header. For -// clarity, we'll pack the code into some big macros. -#define CUPRINTF_PREAMBLE \ - char *start, *end, *bufptr, *fmtstart; \ - if((start = getNextPrintfBufPtr()) == NULL) return 0; \ - end = start + CUPRINTF_MAX_LEN; \ - bufptr = start + sizeof(cuPrintfHeader); - -// Posting an argument is easy -#define CUPRINTF_ARG(argname) \ - bufptr = copyArg(bufptr, argname, end); - -// After args are done, record start-of-fmt and write the fmt and header -#define CUPRINTF_POSTAMBLE \ - fmtstart = bufptr; \ - end = cuPrintfStrncpy(bufptr, fmt, CUPRINTF_MAX_LEN, end); \ - writePrintfHeader(start, end ? fmtstart : NULL); \ - return end ? 
(int)(end - start) : 0; - -__device__ int cuPrintf(const char *fmt) -{ - CUPRINTF_PREAMBLE; - - CUPRINTF_POSTAMBLE; -} -template __device__ int cuPrintf(const char *fmt, T1 arg1) -{ - CUPRINTF_PREAMBLE; - - CUPRINTF_ARG(arg1); - - CUPRINTF_POSTAMBLE; -} -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2) -{ - CUPRINTF_PREAMBLE; - - CUPRINTF_ARG(arg1); - CUPRINTF_ARG(arg2); - - CUPRINTF_POSTAMBLE; -} -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3) -{ - CUPRINTF_PREAMBLE; - - CUPRINTF_ARG(arg1); - CUPRINTF_ARG(arg2); - CUPRINTF_ARG(arg3); - - CUPRINTF_POSTAMBLE; -} -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4) -{ - CUPRINTF_PREAMBLE; - - CUPRINTF_ARG(arg1); - CUPRINTF_ARG(arg2); - CUPRINTF_ARG(arg3); - CUPRINTF_ARG(arg4); - - CUPRINTF_POSTAMBLE; -} -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5) -{ - CUPRINTF_PREAMBLE; - - CUPRINTF_ARG(arg1); - CUPRINTF_ARG(arg2); - CUPRINTF_ARG(arg3); - CUPRINTF_ARG(arg4); - CUPRINTF_ARG(arg5); - - CUPRINTF_POSTAMBLE; -} -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6) -{ - CUPRINTF_PREAMBLE; - - CUPRINTF_ARG(arg1); - CUPRINTF_ARG(arg2); - CUPRINTF_ARG(arg3); - CUPRINTF_ARG(arg4); - CUPRINTF_ARG(arg5); - CUPRINTF_ARG(arg6); - CUPRINTF_POSTAMBLE; -} -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7) -{ - CUPRINTF_PREAMBLE; - - CUPRINTF_ARG(arg1); - CUPRINTF_ARG(arg2); - CUPRINTF_ARG(arg3); - CUPRINTF_ARG(arg4); - CUPRINTF_ARG(arg5); - CUPRINTF_ARG(arg6); - CUPRINTF_ARG(arg7); - - CUPRINTF_POSTAMBLE; -} -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7, T8 arg8) -{ - CUPRINTF_PREAMBLE; - - CUPRINTF_ARG(arg1); - CUPRINTF_ARG(arg2); - CUPRINTF_ARG(arg3); - CUPRINTF_ARG(arg4); - CUPRINTF_ARG(arg5); - CUPRINTF_ARG(arg6); - 
CUPRINTF_ARG(arg7); - CUPRINTF_ARG(arg8); - - CUPRINTF_POSTAMBLE; -} -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7, T8 arg8, T9 arg9) -{ - CUPRINTF_PREAMBLE; - - CUPRINTF_ARG(arg1); - CUPRINTF_ARG(arg2); - CUPRINTF_ARG(arg3); - CUPRINTF_ARG(arg4); - CUPRINTF_ARG(arg5); - CUPRINTF_ARG(arg6); - CUPRINTF_ARG(arg7); - CUPRINTF_ARG(arg8); - CUPRINTF_ARG(arg9); - - CUPRINTF_POSTAMBLE; -} -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7, T8 arg8, T9 arg9, T10 arg10) -{ - CUPRINTF_PREAMBLE; - - CUPRINTF_ARG(arg1); - CUPRINTF_ARG(arg2); - CUPRINTF_ARG(arg3); - CUPRINTF_ARG(arg4); - CUPRINTF_ARG(arg5); - CUPRINTF_ARG(arg6); - CUPRINTF_ARG(arg7); - CUPRINTF_ARG(arg8); - CUPRINTF_ARG(arg9); - CUPRINTF_ARG(arg10); - - CUPRINTF_POSTAMBLE; -} -#undef CUPRINTF_PREAMBLE -#undef CUPRINTF_ARG -#undef CUPRINTF_POSTAMBLE - - -// -// cuPrintfRestrict -// -// Called to restrict output to a given thread/block. -// We store the info in "restrictRules", which is set up at -// init time by the host. It's not the cleanest way to do this -// because it means restrictions will last between -// invocations, but given the output-pointer continuity, -// I feel this is reasonable. 
-// -__device__ void cuPrintfRestrict(int threadid, int blockid) -{ - int thread_count = blockDim.x * blockDim.y * blockDim.z; - if(((threadid < thread_count) && (threadid >= 0)) || (threadid == CUPRINTF_UNRESTRICTED)) - restrictRules.threadid = threadid; - - int block_count = gridDim.x * gridDim.y; - if(((blockid < block_count) && (blockid >= 0)) || (blockid == CUPRINTF_UNRESTRICTED)) - restrictRules.blockid = blockid; -} - - -/////////////////////////////////////////////////////////////////////////////// -// HOST SIDE - -#include -static FILE *printf_fp; - -static char *printfbuf_start=NULL; -static char *printfbuf_device=NULL; -static int printfbuf_len=0; - - -// -// outputPrintfData -// -// Our own internal function, which takes a pointer to a data buffer -// and passes it through libc's printf for output. -// -// We receive the formate string and a pointer to where the data is -// held. We then run through and print it out. -// -// Returns 0 on failure, 1 on success -// -static int outputPrintfData(char *fmt, char *data) -{ - // Format string is prefixed by a length that we don't need - fmt += CUPRINTF_ALIGN_SIZE; - - // Now run through it, printing everything we can. We must - // run to every % character, extract only that, and use printf - // to format it. - char *p = strchr(fmt, '%'); - while(p != NULL) - { - // Print up to the % character - *p = '\0'; - fputs(fmt, printf_fp); - *p = '%'; // Put back the % - - // Now handle the format specifier - char *format = p++; // Points to the '%' - p += strcspn(p, "%cdiouxXeEfgGaAnps"); - if(*p == '\0') // If no format specifier, print the whole thing - { - fmt = format; - break; - } - - // Cut out the format bit and use printf to print it. It's prefixed - // by its length. 
- int arglen = *(int *)data; - if(arglen > CUPRINTF_MAX_LEN) - { - fputs("Corrupt printf buffer data - aborting\n", printf_fp); - return 0; - } - - data += CUPRINTF_ALIGN_SIZE; - - char specifier = *p++; - char c = *p; // Store for later - *p = '\0'; - switch(specifier) - { - // These all take integer arguments - case 'c': - case 'd': - case 'i': - case 'o': - case 'u': - case 'x': - case 'X': - case 'p': - fprintf(printf_fp, format, *((int *)data)); - break; - - // These all take double arguments - case 'e': - case 'E': - case 'f': - case 'g': - case 'G': - case 'a': - case 'A': - if(arglen == 4) // Float vs. Double thing - fprintf(printf_fp, format, *((float *)data)); - else - fprintf(printf_fp, format, *((double *)data)); - break; - - // Strings are handled in a special way - case 's': - fprintf(printf_fp, format, (char *)data); - break; - - // % is special - case '%': - fprintf(printf_fp, "%%"); - break; - - // Everything else is just printed out as-is - default: - fprintf(printf_fp, format); - break; - } - data += CUPRINTF_ALIGN_SIZE; // Move on to next argument - *p = c; // Restore what we removed - fmt = p; // Adjust fmt string to be past the specifier - p = strchr(fmt, '%'); // and get the next specifier - } - - // Print out the last of the string - fputs(fmt, printf_fp); - return 1; -} - - -// -// doPrintfDisplay -// -// This runs through the blocks of CUPRINTF_MAX_LEN-sized data, calling the -// print function above to display them. We've got this separate from -// cudaPrintfDisplay() below so we can handle the SM_10 architecture -// partitioning. 
-// -static int doPrintfDisplay(int headings, int clear, char *bufstart, char *bufend, char *bufptr, char *endptr) -{ - // Grab, piece-by-piece, each output element until we catch - // up with the circular buffer end pointer - int printf_count=0; - char printfbuf_local[CUPRINTF_MAX_LEN+1]; - printfbuf_local[CUPRINTF_MAX_LEN] = '\0'; - - while(bufptr != endptr) - { - // Wrap ourselves at the end-of-buffer - if(bufptr == bufend) - bufptr = bufstart; - - // Adjust our start pointer to within the circular buffer and copy a block. - cudaMemcpy(printfbuf_local, bufptr, CUPRINTF_MAX_LEN, cudaMemcpyDeviceToHost); - - // If the magic number isn't valid, then this write hasn't gone through - // yet and we'll wait until it does (or we're past the end for non-async printfs). - cuPrintfHeader *hdr = (cuPrintfHeader *)printfbuf_local; - if((hdr->magic != CUPRINTF_SM11_MAGIC) || (hdr->fmtoffset >= CUPRINTF_MAX_LEN)) - { - //fprintf(printf_fp, "Bad magic number in printf header\n"); - break; - } - - // Extract all the info and get this printf done - if(headings) - fprintf(printf_fp, "[%d, %d]: ", hdr->blockid, hdr->threadid); - if(hdr->fmtoffset == 0) - fprintf(printf_fp, "printf buffer overflow\n"); - else if(!outputPrintfData(printfbuf_local+hdr->fmtoffset, printfbuf_local+sizeof(cuPrintfHeader))) - break; - printf_count++; - - // Clear if asked - if(clear) - cudaMemset(bufptr, 0, CUPRINTF_MAX_LEN); - - // Now advance our start location, because we're done, and keep copying - bufptr += CUPRINTF_MAX_LEN; - } - - return printf_count; -} - - -// -// cudaPrintfInit -// -// Takes a buffer length to allocate, creates the memory on the device and -// returns a pointer to it for when a kernel is called. It's up to the caller -// to free it. -// -extern "C" cudaError_t cudaPrintfInit(size_t bufferLen) -{ - // Fix up bufferlen to be a multiple of CUPRINTF_MAX_LEN - bufferLen = (bufferLen < CUPRINTF_MAX_LEN) ? 
CUPRINTF_MAX_LEN : bufferLen; - if((bufferLen % CUPRINTF_MAX_LEN) > 0) - bufferLen += (CUPRINTF_MAX_LEN - (bufferLen % CUPRINTF_MAX_LEN)); - printfbuf_len = (int)bufferLen; - - // Allocate a print buffer on the device and zero it - if(cudaMalloc((void **)&printfbuf_device, printfbuf_len) != cudaSuccess) - return cudaErrorInitializationError; - cudaMemset(printfbuf_device, 0, printfbuf_len); - printfbuf_start = printfbuf_device; // Where we start reading from - - // No restrictions to begin with - cuPrintfRestriction restrict; - restrict.threadid = restrict.blockid = CUPRINTF_UNRESTRICTED; - cudaMemcpyToSymbol(restrictRules, &restrict, sizeof(restrict)); - - // Initialise the buffer and the respective lengths/pointers. - cudaMemcpyToSymbol(globalPrintfBuffer, &printfbuf_device, sizeof(char *)); - cudaMemcpyToSymbol(printfBufferPtr, &printfbuf_device, sizeof(char *)); - cudaMemcpyToSymbol(printfBufferLength, &printfbuf_len, sizeof(printfbuf_len)); - - return cudaSuccess; -} - - -// -// cudaPrintfEnd -// -// Frees up the memory which we allocated -// -extern "C" void cudaPrintfEnd() -{ - if(!printfbuf_start || !printfbuf_device) - return; - - cudaFree(printfbuf_device); - printfbuf_start = printfbuf_device = NULL; -} - - -// -// cudaPrintfDisplay -// -// Each call to this function dumps the entire current contents -// of the printf buffer to the pre-specified FILE pointer. The -// circular "start" pointer is advanced so that subsequent calls -// dumps only new stuff. -// -// In the case of async memory access (via streams), call this -// repeatedly to keep trying to empty the buffer. If it's a sync -// access, then the whole buffer should empty in one go. -// -// Arguments: -// outputFP - File descriptor to output to (NULL => stdout) -// showThreadID - If true, prints [block,thread] before each line -// -extern "C" cudaError_t cudaPrintfDisplay(void *outputFP, bool showThreadID) -{ - printf_fp = (FILE *)((outputFP == NULL) ? 
stdout : outputFP); - - // For now, we force "synchronous" mode which means we're not concurrent - // with kernel execution. This also means we don't need clearOnPrint. - // If you're patching it for async operation, here's where you want it. - bool sync_printfs = true; - bool clearOnPrint = false; - - // Initialisation check - if(!printfbuf_start || !printfbuf_device || !printf_fp) - return cudaErrorMissingConfiguration; - - // To determine which architecture we're using, we read the - // first short from the buffer - it'll be the magic number - // relating to the version. - unsigned short magic; - cudaMemcpy(&magic, printfbuf_device, sizeof(unsigned short), cudaMemcpyDeviceToHost); - - // For SM_10 architecture, we've split our buffer into one-per-thread. - // That means we must do each thread block separately. It'll require - // extra reading. We also, for now, don't support async printfs because - // that requires tracking one start pointer per thread. - if(magic == CUPRINTF_SM10_MAGIC) - { - sync_printfs = true; - clearOnPrint = false; - int blocklen = 0; - char *blockptr = printfbuf_device; - while(blockptr < (printfbuf_device + printfbuf_len)) - { - cuPrintfHeaderSM10 hdr; - cudaMemcpy(&hdr, blockptr, sizeof(hdr), cudaMemcpyDeviceToHost); - - // We get our block-size-step from the very first header - if(hdr.thread_buf_len != 0) - blocklen = hdr.thread_buf_len; - - // No magic number means no printfs from this thread - if(hdr.magic != CUPRINTF_SM10_MAGIC) - { - if(blocklen == 0) - { - fprintf(printf_fp, "No printf headers found at all!\n"); - break; // No valid headers! 
- } - blockptr += blocklen; - continue; - } - - // "offset" is non-zero then we can print the block contents - if(hdr.offset > 0) - { - // For synchronous printfs, we must print from endptr->bufend, then from start->end - if(sync_printfs) - doPrintfDisplay(showThreadID, clearOnPrint, blockptr+CUPRINTF_MAX_LEN, blockptr+hdr.thread_buf_len, blockptr+hdr.offset+CUPRINTF_MAX_LEN, blockptr+hdr.thread_buf_len); - doPrintfDisplay(showThreadID, clearOnPrint, blockptr+CUPRINTF_MAX_LEN, blockptr+hdr.thread_buf_len, blockptr+CUPRINTF_MAX_LEN, blockptr+hdr.offset+CUPRINTF_MAX_LEN); - } - - // Move on to the next block and loop again - blockptr += hdr.thread_buf_len; - } - } - // For SM_11 and up, everything is a single buffer and it's simple - else if(magic == CUPRINTF_SM11_MAGIC) - { - // Grab the current "end of circular buffer" pointer. - char *printfbuf_end = NULL; - cudaMemcpyFromSymbol(&printfbuf_end, printfBufferPtr, sizeof(char *)); - - // Adjust our starting and ending pointers to within the block - char *bufptr = ((printfbuf_start - printfbuf_device) % printfbuf_len) + printfbuf_device; - char *endptr = ((printfbuf_end - printfbuf_device) % printfbuf_len) + printfbuf_device; - - // For synchronous (i.e. after-kernel-exit) printf display, we have to handle circular - // buffer wrap carefully because we could miss those past "end". - if(sync_printfs) - doPrintfDisplay(showThreadID, clearOnPrint, printfbuf_device, printfbuf_device+printfbuf_len, endptr, printfbuf_device+printfbuf_len); - doPrintfDisplay(showThreadID, clearOnPrint, printfbuf_device, printfbuf_device+printfbuf_len, bufptr, endptr); - - printfbuf_start = printfbuf_end; - } - else - ;//printf("Bad magic number in cuPrintf buffer header\n"); - - // If we were synchronous, then we must ensure that the memory is cleared on exit - // otherwise another kernel launch with a different grid size could conflict. 
- if(sync_printfs) - cudaMemset(printfbuf_device, 0, printfbuf_len); - - return cudaSuccess; -} - -// Cleanup -#undef CUPRINTF_MAX_LEN -#undef CUPRINTF_ALIGN_SIZE -#undef CUPRINTF_SM10_MAGIC -#undef CUPRINTF_SM11_MAGIC - -#endif diff --git a/cuPrintf.cuh b/cuPrintf.cuh deleted file mode 100644 index cf3fe48688..0000000000 --- a/cuPrintf.cuh +++ /dev/null @@ -1,162 +0,0 @@ -/* - Copyright 2009 NVIDIA Corporation. All rights reserved. - - NOTICE TO LICENSEE: - - This source code and/or documentation ("Licensed Deliverables") are subject - to NVIDIA intellectual property rights under U.S. and international Copyright - laws. - - These Licensed Deliverables contained herein is PROPRIETARY and CONFIDENTIAL - to NVIDIA and is being provided under the terms and conditions of a form of - NVIDIA software license agreement by and between NVIDIA and Licensee ("License - Agreement") or electronically accepted by Licensee. Notwithstanding any terms - or conditions to the contrary in the License Agreement, reproduction or - disclosure of the Licensed Deliverables to any third party without the express - written consent of NVIDIA is prohibited. - - NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE LICENSE AGREEMENT, - NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THESE LICENSED - DELIVERABLES FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED - WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE - LICENSED DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
NOTWITHSTANDING ANY - TERMS OR CONDITIONS TO THE CONTRARY IN THE LICENSE AGREEMENT, IN NO EVENT SHALL - NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, - OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER - IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THESE LICENSED DELIVERABLES. - - U.S. Government End Users. These Licensed Deliverables are a "commercial item" - as that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of - "commercial computer software" and "commercial computer software documentation" - as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) and is provided to the - U.S. Government only as a commercial end item. Consistent with 48 C.F.R.12.212 - and 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all U.S. Government - End Users acquire the Licensed Deliverables with only those rights set forth - herein. - - Any use of the Licensed Deliverables in individual and commercial software must - include, in the user documentation and internal comments to the code, the above - Disclaimer and U.S. Government End Users Notice. - */ - -#ifndef CUPRINTF_H -#define CUPRINTF_H - -/* - * This is the header file supporting cuPrintf.cu and defining both - * the host and device-side interfaces. See that file for some more - * explanation and sample use code. See also below for details of the - * host-side interfaces. 
- * - * Quick sample code: - * - #include "cuPrintf.cu" - - __global__ void testKernel(int val) - { - cuPrintf("Value is: %d\n", val); - } - - int main() - { - cudaPrintfInit(); - testKernel<<< 2, 3 >>>(10); - cudaPrintfDisplay(stdout, true); - cudaPrintfEnd(); - return 0; - } - */ - -/////////////////////////////////////////////////////////////////////////////// -// DEVICE SIDE -// External function definitions for device-side code - -// Abuse of templates to simulate varargs -__device__ int cuPrintf(const char *fmt); -template __device__ int cuPrintf(const char *fmt, T1 arg1); -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2); -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3); -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4); -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5); -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6); -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7); -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7, T8 arg8); -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7, T8 arg8, T9 arg9); -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7, T8 arg8, T9 arg9, T10 arg10); - - -// -// cuPrintfRestrict -// -// Called to restrict output to a given thread/block. Pass -// the constant CUPRINTF_UNRESTRICTED to unrestrict output -// for thread/block IDs. Note you can therefore allow -// "all printfs from block 3" or "printfs from thread 2 -// on all blocks", or "printfs only from block 1, thread 5". 
-// -// Arguments: -// threadid - Thread ID to allow printfs from -// blockid - Block ID to allow printfs from -// -// NOTE: Restrictions last between invocations of -// kernels unless cudaPrintfInit() is called again. -// -#define CUPRINTF_UNRESTRICTED -1 -__device__ void cuPrintfRestrict(int threadid, int blockid); - - - -/////////////////////////////////////////////////////////////////////////////// -// HOST SIDE -// External function definitions for host-side code - -// -// cudaPrintfInit -// -// Call this once to initialise the printf system. If the output -// file or buffer size needs to be changed, call cudaPrintfEnd() -// before re-calling cudaPrintfInit(). -// -// The default size for the buffer is 1 megabyte. For CUDA -// architecture 1.1 and above, the buffer is filled linearly and -// is completely used; however for architecture 1.0, the buffer -// is divided into as many segments are there are threads, even -// if some threads do not call cuPrintf(). -// -// Arguments: -// bufferLen - Length, in bytes, of total space to reserve -// (in device global memory) for output. -// -// Returns: -// cudaSuccess if all is well. -// -extern "C" cudaError_t cudaPrintfInit(size_t bufferLen=1048576); // 1-meg - that's enough for 4096 printfs by all threads put together - -// -// cudaPrintfEnd -// -// Cleans up all memories allocated by cudaPrintfInit(). -// Call this at exit, or before calling cudaPrintfInit() again. -// -extern "C" void cudaPrintfEnd(); - -// -// cudaPrintfDisplay -// -// Dumps the contents of the output buffer to the specified -// file pointer. If the output pointer is not specified, -// the default "stdout" is used. -// -// Arguments: -// outputFP - A file pointer to an output stream. -// showThreadID - If "true", output strings are prefixed -// by "[blockid, threadid] " at output. -// -// Returns: -// cudaSuccess if all is well. 
-// -extern "C" cudaError_t cudaPrintfDisplay(void *outputFP=NULL, bool showThreadID=false); - -#endif // CUPRINTF_H diff --git a/cuda.cpp b/cuda.cpp index 95c8221bdf..02f253c297 100644 --- a/cuda.cpp +++ b/cuda.cpp @@ -1,8 +1,8 @@ -#include +#include #include -#include +#include #include - +using namespace std; #ifndef _WIN32 #include #endif @@ -16,10 +16,15 @@ #include #endif +#include "nvml.h" #include "miner.h" #include "cuda_runtime.h" +cudaDeviceProp device_props[MAX_GPUS]; +cudaStream_t gpustream[MAX_GPUS] = { 0 }; +extern uint16_t opt_api_listen; + // CUDA Devices on the System int cuda_num_devices() { @@ -31,10 +36,9 @@ int cuda_num_devices() exit(1); } - int maj = version / 1000, min = version % 100; // same as in deviceQuery sample - if (maj < 5 || (maj == 5 && min < 5)) + if (version < CUDART_VERSION) { - applog(LOG_ERR, "Driver does not support CUDA %d.%d API! Update your nVidia driver!", 5, 5); + applog(LOG_ERR, "Driver does not support CUDA %d.%d API! Update your nVidia driver!", CUDART_VERSION / 1000, (CUDART_VERSION % 1000) / 10); exit(1); } @@ -42,37 +46,89 @@ int cuda_num_devices() err = cudaGetDeviceCount(&GPU_N); if (err != cudaSuccess) { - applog(LOG_ERR, "Unable to query number of CUDA devices! Is an nVidia driver installed?"); + if(err!=cudaErrorNoDevice) + applog(LOG_ERR, "No CUDA device found!"); + else + applog(LOG_ERR, "Unable to query number of CUDA devices!"); exit(1); } return GPU_N; } +int cuda_version() +{ + return (int)CUDART_VERSION; +} + void cuda_devicenames() { cudaError_t err; int GPU_N; err = cudaGetDeviceCount(&GPU_N); - if (err != cudaSuccess) + if(err != cudaSuccess) { applog(LOG_ERR, "Unable to query number of CUDA devices! 
Is an nVidia driver installed?"); exit(1); } - for (int i=0; i < GPU_N; i++) + if(opt_n_threads) + GPU_N = min(MAX_GPUS, opt_n_threads); + for(int i = 0; i < GPU_N; i++) { + char vendorname[32] = {0}; + int dev_id = device_map[i]; cudaDeviceProp props; - cudaGetDeviceProperties(&props, device_map[i]); + cudaGetDeviceProperties(&props, dev_id); + + device_sm[dev_id] = (props.major * 100 + props.minor * 10); - device_name[i] = strdup(props.name); - device_sm[i] = (props.major * 100 + props.minor * 10); + if(device_name[dev_id]) + { + free(device_name[dev_id]); + device_name[dev_id] = NULL; + } +#ifdef USE_WRAPNVML + if(gpu_vendor((uint8_t)props.pciBusID, vendorname) > 0 && strlen(vendorname)) + { + device_name[dev_id] = (char*)calloc(1, strlen(vendorname) + strlen(props.name) + 2); + if(device_name[dev_id] == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(1); + } + if(!strncmp(props.name, "GeForce ", 8)) + sprintf(device_name[dev_id], "%s %s", vendorname, &props.name[8]); + else + sprintf(device_name[dev_id], "%s %s", vendorname, props.name); + } + else +#endif + device_name[dev_id] = strdup(props.name); + } +} + + +void cuda_print_devices() +{ + int ngpus = cuda_num_devices(); + for (int n=0; n < ngpus; n++) { + int m = device_map[n]; + cudaDeviceProp props; + cudaGetDeviceProperties(&props, m); + if (!opt_n_threads || n < opt_n_threads) + fprintf(stderr, "GPU #%d: SM %d.%d %s\n", m, props.major, props.minor, props.name); } } // Can't be called directly in cpu-miner.c void cuda_devicereset() { - cudaDeviceReset(); + for (int i = 0; i < active_gpus; i++) + { + cudaSetDevice(device_map[i]); + cudaDeviceSynchronize(); + cudaDeviceReset(); + } } static bool substringsearch(const char *haystack, const char *needle, int &match) @@ -113,7 +169,7 @@ int cuda_finddevice(char *name) uint32_t device_intensity(int thr_id, const char *func, uint32_t defcount) { uint32_t throughput = gpus_intensity[thr_id] ? 
gpus_intensity[thr_id] : defcount; - api_set_throughput(thr_id, throughput); + if(opt_api_listen!=0) api_set_throughput(thr_id, throughput); return throughput; } diff --git a/cuda_bitcoin.cu b/cuda_bitcoin.cu index 80c6dc6965..1b6d4e0b46 100644 --- a/cuda_bitcoin.cu +++ b/cuda_bitcoin.cu @@ -1,7 +1,11 @@ // Original version written by Schleicher (KlausT @github) // Redistribution and use in source and binary forms, with or without modification, are permitted +#ifdef __cplusplus +#include +#else #include +#endif #include "miner.h" #include "cuda_helper.h" @@ -9,553 +13,548 @@ void bitcoin_cpu_init(int thr_id); void bitcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *const ms, uint32_t merkle, uint32_t time, uint32_t compacttarget, uint32_t *const h_nounce); void bitcoin_midstate(const uint32_t *data, uint32_t *midstate); + __constant__ uint32_t pTarget[8]; static uint32_t *d_result[MAX_GPUS]; #define TPB 512 -#define NONCES_PER_THREAD 2048 +#define NONCES_PER_THREAD 32 -#if __CUDA_ARCH__ < 320 -#define rrot(x, n) ((x >> n) | (x << (32 - n))) -#else -#define rrot(x, n) __funnelshift_r((x), (x), (n)) -#endif - -__global__ __launch_bounds__(TPB, 1) +__global__ __launch_bounds__(TPB, 2) void bitcoin_gpu_hash(const uint32_t threads, const uint32_t startNounce, uint32_t *const result, const uint32_t t1c, const uint32_t t2c, const uint32_t w16, const uint32_t w16rot, const uint32_t w17, const uint32_t w17rot, const uint32_t b2, const uint32_t c2, const uint32_t d2, const uint32_t f2, const uint32_t g2, const uint32_t h2, const uint32_t ms0, const uint32_t ms1, const uint32_t ms2, const uint32_t ms3, const uint32_t ms4, const uint32_t ms5, const uint32_t ms6, const uint32_t ms7, const uint32_t compacttarget) { - uint32_t threadindex = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t threadindex = (blockDim.x * blockIdx.x + threadIdx.x); if (threadindex < threads) { uint32_t t1, a, b, c, d, e, f, g, h; uint32_t w[64]; const uint32_t 
numberofthreads = blockDim.x*gridDim.x; const uint32_t maxnonce = startNounce + threadindex + numberofthreads*NONCES_PER_THREAD - 1; - const uint32_t threadindex = blockIdx.x*blockDim.x + threadIdx.x; - - for (uint32_t nonce = startNounce + threadindex; nonce <= maxnonce; nonce += numberofthreads) + + #pragma unroll + for (uint32_t nonce = startNounce + threadindex; nonce-1 < maxnonce; nonce += numberofthreads) { - w[18] = (rrot(nonce, 7) ^ rrot(nonce, 18) ^ (nonce >> 3)) + w16rot; + w[18] = (ROTR32(nonce, 7) ^ ROTR32(nonce, 18) ^ (nonce >> 3)) + w16rot; w[19] = nonce + w17rot; - w[20] = 0x80000000U + (rrot(w[18], 17) ^ rrot(w[18], 19) ^ (w[18] >> 10)); - w[21] = (rrot(w[19], 17) ^ rrot(w[19], 19) ^ (w[19] >> 10)); - w[22] = 0x280U + (rrot(w[20], 17) ^ rrot(w[20], 19) ^ (w[20] >> 10)); - w[23] = w16 + (rrot(w[21], 17) ^ rrot(w[21], 19) ^ (w[21] >> 10)); - w[24] = w17 + (rrot(w[22], 17) ^ rrot(w[22], 19) ^ (w[22] >> 10)); - w[25] = w[18] + (rrot(w[23], 17) ^ rrot(w[23], 19) ^ (w[23] >> 10)); - w[26] = w[19] + (rrot(w[24], 17) ^ rrot(w[24], 19) ^ (w[24] >> 10)); - w[27] = w[20] + (rrot(w[25], 17) ^ rrot(w[25], 19) ^ (w[25] >> 10)); - w[28] = w[21] + (rrot(w[26], 17) ^ rrot(w[26], 19) ^ (w[26] >> 10)); - w[29] = w[22] + (rrot(w[27], 17) ^ rrot(w[27], 19) ^ (w[27] >> 10)); - w[30] = w[23] + 0xa00055U + (rrot(w[28], 17) ^ rrot(w[28], 19) ^ (w[28] >> 10)); - w[31] = 0x280U + w[24] + (rrot(w16, 7) ^ rrot(w16, 18) ^ (w16 >> 3)) + (rrot(w[29], 17) ^ rrot(w[29], 19) ^ (w[29] >> 10)); - w[32] = w16 + w[25] + (rrot(w17, 7) ^ rrot(w17, 18) ^ (w17 >> 3)) + (rrot(w[30], 17) ^ rrot(w[30], 19) ^ (w[30] >> 10)); - w[33] = w17 + w[26] + (rrot(w[18], 7) ^ rrot(w[18], 18) ^ (w[18] >> 3)) + (rrot(w[31], 17) ^ rrot(w[31], 19) ^ (w[31] >> 10)); + w[20] = 0x80000000U + (ROTR32(w[18], 17) ^ ROTR32(w[18], 19) ^ (w[18] >> 10)); + w[21] = (ROTR32(w[19], 17) ^ ROTR32(w[19], 19) ^ (w[19] >> 10)); + w[22] = 0x280U + (ROTR32(w[20], 17) ^ ROTR32(w[20], 19) ^ (w[20] >> 10)); + w[23] = w16 + 
(ROTR32(w[21], 17) ^ ROTR32(w[21], 19) ^ (w[21] >> 10)); + w[24] = w17 + (ROTR32(w[22], 17) ^ ROTR32(w[22], 19) ^ (w[22] >> 10)); + w[25] = w[18] + (ROTR32(w[23], 17) ^ ROTR32(w[23], 19) ^ (w[23] >> 10)); + w[26] = w[19] + (ROTR32(w[24], 17) ^ ROTR32(w[24], 19) ^ (w[24] >> 10)); + w[27] = w[20] + (ROTR32(w[25], 17) ^ ROTR32(w[25], 19) ^ (w[25] >> 10)); + w[28] = w[21] + (ROTR32(w[26], 17) ^ ROTR32(w[26], 19) ^ (w[26] >> 10)); + w[29] = w[22] + (ROTR32(w[27], 17) ^ ROTR32(w[27], 19) ^ (w[27] >> 10)); + w[30] = w[23] + 0xa00055U + (ROTR32(w[28], 17) ^ ROTR32(w[28], 19) ^ (w[28] >> 10)); + w[31] = 0x280U + w[24] + (ROTR32(w16, 7) ^ ROTR32(w16, 18) ^ (w16 >> 3)) + (ROTR32(w[29], 17) ^ ROTR32(w[29], 19) ^ (w[29] >> 10)); + w[32] = w16 + w[25] + (ROTR32(w17, 7) ^ ROTR32(w17, 18) ^ (w17 >> 3)) + (ROTR32(w[30], 17) ^ ROTR32(w[30], 19) ^ (w[30] >> 10)); + w[33] = w17 + w[26] + (ROTR32(w[18], 7) ^ ROTR32(w[18], 18) ^ (w[18] >> 3)) + (ROTR32(w[31], 17) ^ ROTR32(w[31], 19) ^ (w[31] >> 10)); #pragma unroll for (int i = 34; i < 62; i++) - w[i] = w[i-16] + w[i-7] + (rrot(w[i-15], 7) ^ rrot(w[i-15], 18) ^ (w[i-15] >> 3)) + (rrot(w[i-2], 17) ^ rrot(w[i-2], 19) ^ (w[i-2] >> 10)); + w[i] = w[i-16] + w[i-7] + (ROTR32(w[i-15], 7) ^ ROTR32(w[i-15], 18) ^ (w[i-15] >> 3)) + (ROTR32(w[i-2], 17) ^ ROTR32(w[i-2], 19) ^ (w[i-2] >> 10)); t1 = t1c + (uint32_t)nonce; a = ms0 + t1; e = t1 + t2c; // - t1 = d2 + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c2 ^ (a & (b2 ^ c2))) + 0xb956c25bU; + t1 = d2 + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c2 ^ (a & (b2 ^ c2))) + 0xb956c25bU; h = h2 + t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g2 & f2) | (e & (g2 | f2))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g2 & f2) | (e & (g2 | f2))); // - t1 = c2 + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b2 ^ (h & (a ^ b2))) + 0x59f111f1U; + t1 = c2 + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b2 ^ (h & (a ^ b2))) + 0x59f111f1U; g = g2 + t1; - c = t1 + (rrot(d, 2) 
^ rrot(d, 13) ^ rrot(d, 22)) + ((f2 & e) | (d & (f2 | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f2 & e) | (d & (f2 | e))); // - t1 = b2 + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0x923f82a4U; + t1 = b2 + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0x923f82a4U; f = f2 + t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0xab1c5ed5U; + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0xab1c5ed5U; e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0xd807aa98U; + t1 = h + (ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0xd807aa98U; d += t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - t1 = g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0x12835b01U; + t1 = g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0x12835b01U; c += t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + ((b & a) | (h & (b | a))); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + ((b & a) | (h & (b | a))); // - t1 = f + (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0x243185beU; + t1 = f + (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0x243185beU; b += t1; - f = t1 + (rrot(g, 2) ^ rrot(g, 13) ^ rrot(g, 22)) + ((a & h) | (g & (a | h))); + f = t1 + (ROTR32(g, 2) ^ ROTR32(g, 13) ^ ROTR32(g, 22)) + ((a & h) | (g & (a | h))); // - t1 = e + (rrot(b, 
6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0x550c7dc3U; + t1 = e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0x550c7dc3U; a += t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0x72be5d74U; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0x72be5d74U; h += t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0x80deb1feU; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0x80deb1feU; g += t1; - c = t1 + (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0x9bdc06a7U; + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0x9bdc06a7U; f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0xc19bf3f4U; + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0xc19bf3f4U; e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0xe49b69c1U + w16; + t1 = h + (ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0xe49b69c1U + w16; d 
+= t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - t1 = g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0xefbe4786U + w17; + t1 = g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0xefbe4786U + w17; c += t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + ((b & a) | (h & (b | a))); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + ((b & a) | (h & (b | a))); // - t1 = f + (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0x0fc19dc6U + w[18]; + t1 = f + (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0x0fc19dc6U + w[18]; b += t1; - f = t1 + (rrot(g, 2) ^ rrot(g, 13) ^ rrot(g, 22)) + ((a & h) | (g & (a | h))); + f = t1 + (ROTR32(g, 2) ^ ROTR32(g, 13) ^ ROTR32(g, 22)) + ((a & h) | (g & (a | h))); // - t1 = e + (rrot(b, 6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0x240ca1ccU + w[19]; + t1 = e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0x240ca1ccU + w[19]; a += t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0x2de92c6fU + w[20]; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0x2de92c6fU + w[20]; h += t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0x4a7484aaU + w[21]; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0x4a7484aaU + w[21]; g += t1; - c = t1 + (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + 
(ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0x5cb0a9dcU + w[22]; + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0x5cb0a9dcU + w[22]; f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0x76f988daU + w[23]; + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0x76f988daU + w[23]; e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0x983e5152U + w[24]; + t1 = h + (ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0x983e5152U + w[24]; d += t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - t1 = g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0xa831c66dU + w[25]; + t1 = g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0xa831c66dU + w[25]; c += t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + ((b & a) | (h & (b | a))); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + ((b & a) | (h & (b | a))); // - t1 = f + (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0xb00327c8U + w[26]; + t1 = f + (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0xb00327c8U + w[26]; b += t1; - f = t1 + (rrot(g, 2) ^ rrot(g, 13) ^ rrot(g, 22)) + ((a & h) | (g & (a | h))); + f = t1 + (ROTR32(g, 2) ^ ROTR32(g, 13) ^ ROTR32(g, 22)) + ((a & h) | (g & (a | h))); // - t1 = e + (rrot(b, 
6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0xbf597fc7U + w[27]; + t1 = e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0xbf597fc7U + w[27]; a += t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0xc6e00bf3U + w[28]; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0xc6e00bf3U + w[28]; h += t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0xd5a79147U + w[29]; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0xd5a79147U + w[29]; g += t1; - c = t1 + (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0x06ca6351U + w[30]; + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0x06ca6351U + w[30]; f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0x14292967U + w[31]; + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0x14292967U + w[31]; e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0x27b70a85U + w[32]; + t1 = h + (ROTR32(e, 
6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0x27b70a85U + w[32]; d += t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - t1 = g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0x2e1b2138U + w[33]; + t1 = g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0x2e1b2138U + w[33]; c += t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + ((b & a) | (h & (b | a))); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + ((b & a) | (h & (b | a))); // - t1 = f + (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0x4d2c6dfcU + w[34]; + t1 = f + (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0x4d2c6dfcU + w[34]; b += t1; - f = t1 + (rrot(g, 2) ^ rrot(g, 13) ^ rrot(g, 22)) + ((a & h) | (g & (a | h))); + f = t1 + (ROTR32(g, 2) ^ ROTR32(g, 13) ^ ROTR32(g, 22)) + ((a & h) | (g & (a | h))); // - t1 = e + (rrot(b, 6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0x53380d13U + w[35]; + t1 = e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0x53380d13U + w[35]; a += t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0x650a7354U + w[36]; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0x650a7354U + w[36]; h += t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0x766a0abbU + w[37]; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0x766a0abbU + w[37]; g += t1; - c = t1 
+ (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0x81c2c92eU + w[38]; + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0x81c2c92eU + w[38]; f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0x92722c85U + w[39]; + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0x92722c85U + w[39]; e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0xa2bfe8a1U + w[40]; + t1 = h + (ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0xa2bfe8a1U + w[40]; d += t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - t1 = g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0xa81a664bU + w[41]; + t1 = g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0xa81a664bU + w[41]; c += t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + ((b & a) | (h & (b | a))); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + ((b & a) | (h & (b | a))); // - t1 = f + (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0xc24b8b70U + w[42]; + t1 = f + (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0xc24b8b70U + w[42]; b += t1; - f = t1 + (rrot(g, 2) ^ rrot(g, 13) ^ rrot(g, 22)) + ((a & h) | (g & (a | h))); + f = t1 + (ROTR32(g, 2) ^ 
ROTR32(g, 13) ^ ROTR32(g, 22)) + ((a & h) | (g & (a | h))); // - t1 = e + (rrot(b, 6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0xc76c51a3U + w[43]; + t1 = e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0xc76c51a3U + w[43]; a += t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0xd192e819U + w[44]; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0xd192e819U + w[44]; h += t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0xd6990624U + w[45]; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0xd6990624U + w[45]; g += t1; - c = t1 + (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0xf40e3585U + w[46]; + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0xf40e3585U + w[46]; f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0x106aa070U + w[47]; + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0x106aa070U + w[47]; e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) 
^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0x19a4c116U + w[48]; + t1 = h + (ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0x19a4c116U + w[48]; d += t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - t1 = g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0x1e376c08U + w[49]; + t1 = g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0x1e376c08U + w[49]; c += t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + ((b & a) | (h & (b | a))); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + ((b & a) | (h & (b | a))); // - t1 = f + (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0x2748774cU + w[50]; + t1 = f + (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0x2748774cU + w[50]; b += t1; - f = t1 + (rrot(g, 2) ^ rrot(g, 13) ^ rrot(g, 22)) + ((a & h) | (g & (a | h))); + f = t1 + (ROTR32(g, 2) ^ ROTR32(g, 13) ^ ROTR32(g, 22)) + ((a & h) | (g & (a | h))); // - t1 = e + (rrot(b, 6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0x34b0bcb5U + w[51]; + t1 = e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0x34b0bcb5U + w[51]; a += t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0x391c0cb3U + w[52]; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0x391c0cb3U + w[52]; h += t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0x4ed8aa4aU + w[53]; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 
11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0x4ed8aa4aU + w[53]; g += t1; - c = t1 + (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0x5b9cca4fU + w[54]; + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0x5b9cca4fU + w[54]; f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0x682e6ff3U + w[55]; + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0x682e6ff3U + w[55]; e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0x748f82eeU + w[56]; + t1 = h + (ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0x748f82eeU + w[56]; d += t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - t1 = g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0x78a5636fU + w[57]; + t1 = g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0x78a5636fU + w[57]; c += t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + ((b & a) | (h & (b | a))); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + ((b & a) | (h & (b | a))); // - t1 = f + (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0x84c87814U + w[58]; + t1 = f + (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0x84c87814U + w[58]; b += t1; - f = t1 + (rrot(g, 2) ^ 
rrot(g, 13) ^ rrot(g, 22)) + ((a & h) | (g & (a | h))); + f = t1 + (ROTR32(g, 2) ^ ROTR32(g, 13) ^ ROTR32(g, 22)) + ((a & h) | (g & (a | h))); // - t1 = e + (rrot(b, 6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0x8cc70208U + w[59]; + t1 = e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0x8cc70208U + w[59]; a += t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0x90befffaU + w[60]; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0x90befffaU + w[60]; h += t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0xa4506cebU + w[61]; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0xa4506cebU + w[61]; g += t1; - c = t1 + (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0xbef9a3f7U + w[46] + w[55] + (rrot(w[47], 7) ^ rrot(w[47], 18) ^ (w[47] >> 3)) + (rrot(w[60], 17) ^ rrot(w[60], 19) ^ (w[60] >> 10)); + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0xbef9a3f7U + w[46] + w[55] + (ROTR32(w[47], 7) ^ ROTR32(w[47], 18) ^ (w[47] >> 3)) + (ROTR32(w[60], 17) ^ ROTR32(w[60], 19) ^ (w[60] >> 10)); f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0xc67178f2U + w[47] + 
w[56] + (rrot(w[48], 7) ^ rrot(w[48], 18) ^ (w[48] >> 3)) + (rrot(w[61], 17) ^ rrot(w[61], 19) ^ (w[61] >> 10)); + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0xc67178f2U + w[47] + w[56] + (ROTR32(w[48], 7) ^ ROTR32(w[48], 18) ^ (w[48] >> 3)) + (ROTR32(w[61], 17) ^ ROTR32(w[61], 19) ^ (w[61] >> 10)); e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // w[0] = a + ms0; w[1] = b + ms1; w[2] = c + ms2; w[3] = d + ms3; w[4] = e + ms4; w[5] = f + ms5; w[6] = g + ms6; w[7] = h + ms7; // hash the hash *************************************************************** - w[16] = w[0] + (rrot(w[1], 7) ^ rrot(w[1], 18) ^ (w[1] >> 3)); - w[17] = w[1] + (rrot(w[2], 7) ^ rrot(w[2], 18) ^ (w[2] >> 3)) + (rrot(0x100, 17) ^ rrot(0x100, 19) ^ (0x100 >> 10)); - w[18] = w[2] + (rrot(w[3], 7) ^ rrot(w[3], 18) ^ (w[3] >> 3)) + (rrot(w[16], 17) ^ rrot(w[16], 19) ^ (w[16] >> 10)); - w[19] = w[3] + (rrot(w[4], 7) ^ rrot(w[4], 18) ^ (w[4] >> 3)) + (rrot(w[17], 17) ^ rrot(w[17], 19) ^ (w[17] >> 10)); - w[20] = w[4] + (rrot(w[5], 7) ^ rrot(w[5], 18) ^ (w[5] >> 3)) + (rrot(w[18], 17) ^ rrot(w[18], 19) ^ (w[18] >> 10)); - w[21] = w[5] + (rrot(w[6], 7) ^ rrot(w[6], 18) ^ (w[6] >> 3)) + (rrot(w[19], 17) ^ rrot(w[19], 19) ^ (w[19] >> 10)); - w[22] = w[6] + 0x100 + (rrot(w[7], 7) ^ rrot(w[7], 18) ^ (w[7] >> 3)) + (rrot(w[20], 17) ^ rrot(w[20], 19) ^ (w[20] >> 10)); - w[23] = w[7] + w[16] + 0x11002000U + (rrot(w[21], 17) ^ rrot(w[21], 19) ^ (w[21] >> 10)); - w[24] = 0x80000000U + w[17] + (rrot(w[22], 17) ^ rrot(w[22], 19) ^ (w[22] >> 10)); - w[25] = w[18] + (rrot(w[23], 17) ^ rrot(w[23], 19) ^ (w[23] >> 10)); - w[26] = w[19] + (rrot(w[24], 17) ^ rrot(w[24], 19) ^ (w[24] >> 10)); - w[27] = w[20] + (rrot(w[25], 17) ^ rrot(w[25], 19) ^ (w[25] >> 10)); - w[28] = w[21] + (rrot(w[26], 17) ^ rrot(w[26], 19) ^ (w[26] >> 10)); - w[29] = w[22] + 
(rrot(w[27], 17) ^ rrot(w[27], 19) ^ (w[27] >> 10)); - w[30] = w[23] + (rrot(0x100, 7) ^ rrot(0x100, 18) ^ (0x100 >> 3)) + (rrot(w[28], 17) ^ rrot(w[28], 19) ^ (w[28] >> 10)); - w[31] = 0x100 + w[24] + (rrot(w[16], 7) ^ rrot(w[16], 18) ^ (w[16] >> 3)) + (rrot(w[29], 17) ^ rrot(w[29], 19) ^ (w[29] >> 10)); + w[16] = w[0] + (ROTR32(w[1], 7) ^ ROTR32(w[1], 18) ^ (w[1] >> 3)); + w[17] = w[1] + (ROTR32(w[2], 7) ^ ROTR32(w[2], 18) ^ (w[2] >> 3)) + (ROTR32(0x100, 17) ^ ROTR32(0x100, 19) ^ (0x100 >> 10)); + w[18] = w[2] + (ROTR32(w[3], 7) ^ ROTR32(w[3], 18) ^ (w[3] >> 3)) + (ROTR32(w[16], 17) ^ ROTR32(w[16], 19) ^ (w[16] >> 10)); + w[19] = w[3] + (ROTR32(w[4], 7) ^ ROTR32(w[4], 18) ^ (w[4] >> 3)) + (ROTR32(w[17], 17) ^ ROTR32(w[17], 19) ^ (w[17] >> 10)); + w[20] = w[4] + (ROTR32(w[5], 7) ^ ROTR32(w[5], 18) ^ (w[5] >> 3)) + (ROTR32(w[18], 17) ^ ROTR32(w[18], 19) ^ (w[18] >> 10)); + w[21] = w[5] + (ROTR32(w[6], 7) ^ ROTR32(w[6], 18) ^ (w[6] >> 3)) + (ROTR32(w[19], 17) ^ ROTR32(w[19], 19) ^ (w[19] >> 10)); + w[22] = w[6] + 0x100 + (ROTR32(w[7], 7) ^ ROTR32(w[7], 18) ^ (w[7] >> 3)) + (ROTR32(w[20], 17) ^ ROTR32(w[20], 19) ^ (w[20] >> 10)); + w[23] = w[7] + w[16] + 0x11002000U + (ROTR32(w[21], 17) ^ ROTR32(w[21], 19) ^ (w[21] >> 10)); + w[24] = 0x80000000U + w[17] + (ROTR32(w[22], 17) ^ ROTR32(w[22], 19) ^ (w[22] >> 10)); + w[25] = w[18] + (ROTR32(w[23], 17) ^ ROTR32(w[23], 19) ^ (w[23] >> 10)); + w[26] = w[19] + (ROTR32(w[24], 17) ^ ROTR32(w[24], 19) ^ (w[24] >> 10)); + w[27] = w[20] + (ROTR32(w[25], 17) ^ ROTR32(w[25], 19) ^ (w[25] >> 10)); + w[28] = w[21] + (ROTR32(w[26], 17) ^ ROTR32(w[26], 19) ^ (w[26] >> 10)); + w[29] = w[22] + (ROTR32(w[27], 17) ^ ROTR32(w[27], 19) ^ (w[27] >> 10)); + w[30] = w[23] + (ROTR32(0x100, 7) ^ ROTR32(0x100, 18) ^ (0x100 >> 3)) + (ROTR32(w[28], 17) ^ ROTR32(w[28], 19) ^ (w[28] >> 10)); + w[31] = 0x100 + w[24] + (ROTR32(w[16], 7) ^ ROTR32(w[16], 18) ^ (w[16] >> 3)) + (ROTR32(w[29], 17) ^ ROTR32(w[29], 19) ^ (w[29] >> 10)); #pragma unroll for (int 
i = 32; i < 59; i++) - w[i] = w[i - 16] + w[i - 7] + (rrot(w[i - 15], 7) ^ rrot(w[i - 15], 18) ^ (w[i - 15] >> 3)) + (rrot(w[i - 2], 17) ^ rrot(w[i - 2], 19) ^ (w[i - 2] >> 10)); + w[i] = w[i - 16] + w[i - 7] + (ROTR32(w[i - 15], 7) ^ ROTR32(w[i - 15], 18) ^ (w[i - 15] >> 3)) + (ROTR32(w[i - 2], 17) ^ ROTR32(w[i - 2], 19) ^ (w[i - 2] >> 10)); d = 0x98c7e2a2U + w[0]; h = 0xfc08884dU + w[0]; // - t1 = (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (0x9b05688cU ^ (d & 0xca0b3af3)) + 0x90bb1e3cU + w[1]; + t1 = (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (0x9b05688cU ^ (d & 0xca0b3af3)) + 0x90bb1e3cU + w[1]; c = 0x3c6ef372U + t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + (0x2A01A605 | (h & 0xfb6feee7)); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + (0x2A01A605 | (h & 0xfb6feee7)); // - t1 = (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (0x510e527fU ^ (c & (d ^ 0x510e527fU))) + 0x50C6645BU + w[2]; + t1 = (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (0x510e527fU ^ (c & (d ^ 0x510e527fU))) + 0x50C6645BU + w[2]; b = 0xbb67ae85U + t1; - f = t1 + (rrot(g, 2) ^ rrot(g, 13) ^ rrot(g, 22)) + ((0x6a09e667U & h) | (g & (0x6a09e667U | h))); + f = t1 + (ROTR32(g, 2) ^ ROTR32(g, 13) ^ ROTR32(g, 22)) + ((0x6a09e667U & h) | (g & (0x6a09e667U | h))); // - t1 = (rrot(b, 6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0x3AC42E24U + w[3]; + t1 = (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0x3AC42E24U + w[3]; a = 0x6a09e667U + t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0x3956c25bU + w[4]; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0x3956c25bU + w[4]; h += t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ 
ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0x59f111f1U + w[5]; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0x59f111f1U + w[5]; g += t1; - c = t1 + (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0x923f82a4U + w[6]; + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0x923f82a4U + w[6]; f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0xab1c5ed5U + w[7]; + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0xab1c5ed5U + w[7]; e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0x5807aa98U; + t1 = h + (ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0x5807aa98U; d += t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - t1 = g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0x12835b01U; + t1 = g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0x12835b01U; c += t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + ((b & a) | (h & (b | a))); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + ((b & a) | (h & (b | a))); // - t1 = f + (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0x243185beU; + 
t1 = f + (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0x243185beU; b += t1; - f = t1 + (rrot(g, 2) ^ rrot(g, 13) ^ rrot(g, 22)) + ((a & h) | (g & (a | h))); + f = t1 + (ROTR32(g, 2) ^ ROTR32(g, 13) ^ ROTR32(g, 22)) + ((a & h) | (g & (a | h))); // - t1 = e + (rrot(b, 6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0x550c7dc3U; + t1 = e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0x550c7dc3U; a += t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0x72be5d74U; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0x72be5d74U; h += t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0x80deb1feU; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0x80deb1feU; g += t1; - c = t1 + (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0x9bdc06a7U; + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0x9bdc06a7U; f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0xc19bf274U; + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0xc19bf274U; e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | 
c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0xe49b69c1U + w[16]; + t1 = h + (ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0xe49b69c1U + w[16]; d += t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - t1 = g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0xefbe4786U + w[17]; + t1 = g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0xefbe4786U + w[17]; c += t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + ((b & a) | (h & (b | a))); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + ((b & a) | (h & (b | a))); // - t1 = f + (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0x0fc19dc6U + w[18]; + t1 = f + (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0x0fc19dc6U + w[18]; b += t1; - f = t1 + (rrot(g, 2) ^ rrot(g, 13) ^ rrot(g, 22)) + ((a & h) | (g & (a | h))); + f = t1 + (ROTR32(g, 2) ^ ROTR32(g, 13) ^ ROTR32(g, 22)) + ((a & h) | (g & (a | h))); // - t1 = e + (rrot(b, 6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0x240ca1ccU + w[19]; + t1 = e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0x240ca1ccU + w[19]; a += t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0x2de92c6fU + w[20]; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0x2de92c6fU + w[20]; h += t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - 
t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0x4a7484aaU + w[21]; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0x4a7484aaU + w[21]; g += t1; - c = t1 + (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0x5cb0a9dcU + w[22]; + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0x5cb0a9dcU + w[22]; f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0x76f988daU + w[23]; + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0x76f988daU + w[23]; e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0x983e5152U + w[24]; + t1 = h + (ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0x983e5152U + w[24]; d += t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - t1 = g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0xa831c66dU + w[25]; + t1 = g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0xa831c66dU + w[25]; c += t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + ((b & a) | (h & (b | a))); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + ((b & a) | (h & (b | a))); // - t1 = f + (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0xb00327c8U + w[26]; + t1 
= f + (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0xb00327c8U + w[26]; b += t1; - f = t1 + (rrot(g, 2) ^ rrot(g, 13) ^ rrot(g, 22)) + ((a & h) | (g & (a | h))); + f = t1 + (ROTR32(g, 2) ^ ROTR32(g, 13) ^ ROTR32(g, 22)) + ((a & h) | (g & (a | h))); // - t1 = e + (rrot(b, 6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0xbf597fc7U + w[27]; + t1 = e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0xbf597fc7U + w[27]; a += t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0xc6e00bf3U + w[28]; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0xc6e00bf3U + w[28]; h += t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0xd5a79147U + w[29]; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0xd5a79147U + w[29]; g += t1; - c = t1 + (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0x06ca6351U + w[30]; + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0x06ca6351U + w[30]; f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0x14292967U + w[31]; + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0x14292967U + w[31]; e 
+= t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0x27b70a85U + w[32]; + t1 = h + (ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0x27b70a85U + w[32]; d += t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - t1 = g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0x2e1b2138U + w[33]; + t1 = g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0x2e1b2138U + w[33]; c += t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + ((b & a) | (h & (b | a))); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + ((b & a) | (h & (b | a))); // - t1 = f + (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0x4d2c6dfcU + w[34]; + t1 = f + (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0x4d2c6dfcU + w[34]; b += t1; - f = t1 + (rrot(g, 2) ^ rrot(g, 13) ^ rrot(g, 22)) + ((a & h) | (g & (a | h))); + f = t1 + (ROTR32(g, 2) ^ ROTR32(g, 13) ^ ROTR32(g, 22)) + ((a & h) | (g & (a | h))); // - t1 = e + (rrot(b, 6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0x53380d13U + w[35]; + t1 = e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0x53380d13U + w[35]; a += t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0x650a7354U + w[36]; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0x650a7354U + w[36]; h += t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + 
(ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0x766a0abbU + w[37]; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0x766a0abbU + w[37]; g += t1; - c = t1 + (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0x81c2c92eU + w[38]; + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0x81c2c92eU + w[38]; f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0x92722c85U + w[39]; + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0x92722c85U + w[39]; e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0xa2bfe8a1U + w[40]; + t1 = h + (ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0xa2bfe8a1U + w[40]; d += t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - t1 = g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0xa81a664bU + w[41]; + t1 = g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0xa81a664bU + w[41]; c += t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + ((b & a) | (h & (b | a))); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + ((b & a) | (h & (b | a))); // - t1 = f + (rrot(c, 
6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0xc24b8b70U + w[42]; + t1 = f + (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0xc24b8b70U + w[42]; b += t1; - f = t1 + (rrot(g, 2) ^ rrot(g, 13) ^ rrot(g, 22)) + ((a & h) | (g & (a | h))); + f = t1 + (ROTR32(g, 2) ^ ROTR32(g, 13) ^ ROTR32(g, 22)) + ((a & h) | (g & (a | h))); // - t1 = e + (rrot(b, 6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0xc76c51a3U + w[43]; + t1 = e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0xc76c51a3U + w[43]; a += t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0xd192e819U + w[44]; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0xd192e819U + w[44]; h += t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0xd6990624U + w[45]; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0xd6990624U + w[45]; g += t1; - c = t1 + (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0xf40e3585U + w[46]; + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0xf40e3585U + w[46]; f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0x106aa070U + w[47]; + t1 = a + (ROTR32(f, 
6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0x106aa070U + w[47]; e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0x19a4c116U + w[48]; + t1 = h + (ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0x19a4c116U + w[48]; d += t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - t1 = g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0x1e376c08U + w[49]; + t1 = g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0x1e376c08U + w[49]; c += t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + ((b & a) | (h & (b | a))); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + ((b & a) | (h & (b | a))); // - t1 = f + (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0x2748774cU + w[50]; + t1 = f + (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0x2748774cU + w[50]; b += t1; - f = t1 + (rrot(g, 2) ^ rrot(g, 13) ^ rrot(g, 22)) + ((a & h) | (g & (a | h))); + f = t1 + (ROTR32(g, 2) ^ ROTR32(g, 13) ^ ROTR32(g, 22)) + ((a & h) | (g & (a | h))); // - t1 = e + (rrot(b, 6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0x34b0bcb5U + w[51]; + t1 = e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0x34b0bcb5U + w[51]; a += t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0x391c0cb3U + w[52]; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0x391c0cb3U + w[52]; h += t1; - d = t1 
+ (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0x4ed8aa4aU + w[53]; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0x4ed8aa4aU + w[53]; g += t1; - c = t1 + (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0x5b9cca4fU + w[54]; + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0x5b9cca4fU + w[54]; f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0x682e6ff3U + w[55]; + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0x682e6ff3U + w[55]; e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0x748f82eeU + w[56]; + t1 = h + (ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0x748f82eeU + w[56]; d += t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - c += g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0x78a5636fU + w[57]; + c += g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0x78a5636fU + w[57]; // - b += f + (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0x84c87814U + w[58]; + b += f + 
(ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0x84c87814U + w[58]; // - a += e + (rrot(b, 6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0x8cc70208U + w[43] + w[52] + (rrot(w[44], 7) ^ rrot(w[44], 18) ^ (w[44] >> 3)) + (rrot(w[57], 17) ^ rrot(w[57], 19) ^ (w[57] >> 10)); + a += e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0x8cc70208U + w[43] + w[52] + (ROTR32(w[44], 7) ^ ROTR32(w[44], 18) ^ (w[44] >> 3)) + (ROTR32(w[57], 17) ^ ROTR32(w[57], 19) ^ (w[57] >> 10)); // - h += d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0x90befffaU + w[44] + w[53] + (rrot(w[45], 7) ^ rrot(w[45], 18) ^ (w[45] >> 3)) + (rrot(w[58], 17) ^ rrot(w[58], 19) ^ (w[58] >> 10)); + h += d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0x90befffaU + w[44] + w[53] + (ROTR32(w[45], 7) ^ ROTR32(w[45], 18) ^ (w[45] >> 3)) + (ROTR32(w[58], 17) ^ ROTR32(w[58], 19) ^ (w[58] >> 10)); // if (h == 0xa41f32e7) { @@ -595,8 +594,8 @@ void bitcoin_midstate(const uint32_t *data, uint32_t *midstate) } for (i = 16; i <= 63; i++) { - s0 = rrot(w[i - 15], 7) ^ rrot(w[i - 15], 18) ^ (w[i - 15] >> 3); - s1 = rrot(w[i - 2], 17) ^ rrot(w[i - 2], 19) ^ (w[i - 2] >> 10); + s0 = ROTR32(w[i - 15], 7) ^ ROTR32(w[i - 15], 18) ^ (w[i - 15] >> 3); + s1 = ROTR32(w[i - 2], 17) ^ ROTR32(w[i - 2], 19) ^ (w[i - 2] >> 10); w[i] = w[i - 16] + s0 + w[i - 7] + s1; } a = hc[0]; @@ -609,10 +608,10 @@ void bitcoin_midstate(const uint32_t *data, uint32_t *midstate) h = hc[7]; for (i = 0; i <= 63; i++) { - s0 = rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22); + s0 = ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22); maj = (a & b) ^ (a & c) ^ (b & c); t2 = s0 + maj; - s1 = rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25); + s1 = ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25); ch = (e & f) ^ ((~e) & g); t1 = h + s1 + ch + k[i] + w[i]; h = g; @@ -639,31 +638,31 @@ void bitcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, const { uint32_t b2, 
c2, d2, f2, g2, h2, t1, w16, w17, t1c, t2c, w16rot, w17rot; - cudaMemset(d_result[thr_id], 0xff, 2 * sizeof(uint32_t)); + cudaMemsetAsync(d_result[thr_id], 0xff, 2 * sizeof(uint32_t), gpustream[thr_id]); - t1 = ms[7] + (rrot(ms[4], 6) ^ rrot(ms[4], 11) ^ rrot(ms[4], 25)) + (ms[6] ^ (ms[4] & (ms[5] ^ ms[6]))) + 0x428a2f98U + merkle; + t1 = ms[7] + (ROTR32(ms[4], 6) ^ ROTR32(ms[4], 11) ^ ROTR32(ms[4], 25)) + (ms[6] ^ (ms[4] & (ms[5] ^ ms[6]))) + 0x428a2f98U + merkle; d2 = ms[3] + t1; - h2 = t1 + (rrot(ms[0], 2) ^ rrot(ms[0], 13) ^ rrot(ms[0], 22)) + ((ms[2] & ms[1]) | (ms[0] & (ms[2] | ms[1]))); + h2 = t1 + (ROTR32(ms[0], 2) ^ ROTR32(ms[0], 13) ^ ROTR32(ms[0], 22)) + ((ms[2] & ms[1]) | (ms[0] & (ms[2] | ms[1]))); // - t1 = ms[6] + (rrot(d2, 6) ^ rrot(d2, 11) ^ rrot(d2, 25)) + (ms[5] ^ (d2 & (ms[4] ^ ms[5]))) + 0x71374491U + time; + t1 = ms[6] + (ROTR32(d2, 6) ^ ROTR32(d2, 11) ^ ROTR32(d2, 25)) + (ms[5] ^ (d2 & (ms[4] ^ ms[5]))) + 0x71374491U + time; c2 = ms[2] + t1; - g2 = t1 + (rrot(h2, 2) ^ rrot(h2, 13) ^ rrot(h2, 22)) + ((ms[1] & ms[0]) | (h2 & (ms[1] | ms[0]))); + g2 = t1 + (ROTR32(h2, 2) ^ ROTR32(h2, 13) ^ ROTR32(h2, 22)) + ((ms[1] & ms[0]) | (h2 & (ms[1] | ms[0]))); // - t1 = ms[5] + (rrot(c2, 6) ^ rrot(c2, 11) ^ rrot(c2, 25)) + (ms[4] ^ (c2 & (d2 ^ ms[4]))) + 0xb5c0fbcfU + compacttarget; + t1 = ms[5] + (ROTR32(c2, 6) ^ ROTR32(c2, 11) ^ ROTR32(c2, 25)) + (ms[4] ^ (c2 & (d2 ^ ms[4]))) + 0xb5c0fbcfU + compacttarget; b2 = ms[1] + t1; - f2 = t1 + (rrot(g2, 2) ^ rrot(g2, 13) ^ rrot(g2, 22)) + ((ms[0] & h2) | (g2 & (ms[0] | h2))); + f2 = t1 + (ROTR32(g2, 2) ^ ROTR32(g2, 13) ^ ROTR32(g2, 22)) + ((ms[0] & h2) | (g2 & (ms[0] | h2))); - w16 = merkle + (rrot(time, 7) ^ rrot(time, 18) ^ (time >> 3)); - w16rot = (rrot(w16, 17) ^ rrot(w16, 19) ^ (w16 >> 10)) + compacttarget; - w17 = time + (rrot(compacttarget, 7) ^ rrot(compacttarget, 18) ^ (compacttarget >> 3)) + 0x01100000U; - w17rot = (rrot(w17, 17) ^ rrot(w17, 19) ^ (w17 >> 10)) + 0x11002000U; - t2c = (rrot(f2, 2) ^ 
rrot(f2, 13) ^ rrot(f2, 22)) + ((h2 & g2) | (f2 & (h2 | g2))); - t1c = ms[4] + (rrot(b2, 6) ^ rrot(b2, 11) ^ rrot(b2, 25)) + (d2 ^ (b2 & (c2 ^ d2))) + 0xe9b5dba5U; + w16 = merkle + (ROTR32(time, 7) ^ ROTR32(time, 18) ^ (time >> 3)); + w16rot = (ROTR32(w16, 17) ^ ROTR32(w16, 19) ^ (w16 >> 10)) + compacttarget; + w17 = time + (ROTR32(compacttarget, 7) ^ ROTR32(compacttarget, 18) ^ (compacttarget >> 3)) + 0x01100000U; + w17rot = (ROTR32(w17, 17) ^ ROTR32(w17, 19) ^ (w17 >> 10)) + 0x11002000U; + t2c = (ROTR32(f2, 2) ^ ROTR32(f2, 13) ^ ROTR32(f2, 22)) + ((h2 & g2) | (f2 & (h2 | g2))); + t1c = ms[4] + (ROTR32(b2, 6) ^ ROTR32(b2, 11) ^ ROTR32(b2, 25)) + (d2 ^ (b2 & (c2 ^ d2))) + 0xe9b5dba5U; dim3 grid((threads + TPB*NONCES_PER_THREAD - 1) / TPB / NONCES_PER_THREAD); dim3 block(TPB); - bitcoin_gpu_hash << > > (threads, startNounce, d_result[thr_id], t1c, t2c, w16, w16rot, w17, w17rot, b2, c2, d2, f2, g2, h2, ms[0], ms[1], ms[2], ms[3], ms[4], ms[5], ms[6], ms[7], compacttarget); - CUDA_SAFE_CALL(cudaMemcpy(h_nounce, d_result[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + bitcoin_gpu_hash << >> (threads, startNounce, d_result[thr_id], t1c, t2c, w16, w16rot, w17, w17rot, b2, c2, d2, f2, g2, h2, ms[0], ms[1], ms[2], ms[3], ms[4], ms[5], ms[6], ms[7], compacttarget); + CUDA_SAFE_CALL(cudaMemcpyAsync(h_nounce, d_result[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id])); cudaStreamSynchronize(gpustream[thr_id]); } __host__ diff --git a/cuda_checkhash.cu b/cuda_checkhash.cu index 070c307b43..1fc51aa84b 100644 --- a/cuda_checkhash.cu +++ b/cuda_checkhash.cu @@ -5,7 +5,6 @@ #include #include "miner.h" - #include "cuda_helper.h" __constant__ uint32_t pTarget[8]; // 32 bytes @@ -17,15 +16,16 @@ static uint32_t* d_resNonces[MAX_GPUS]; __host__ void cuda_check_cpu_init(int thr_id, uint32_t threads) { - CUDA_CALL_OR_RET(cudaMallocHost(&h_resNonces[thr_id], 8*sizeof(uint32_t))); - CUDA_CALL_OR_RET(cudaMalloc(&d_resNonces[thr_id], 8 * 
sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMallocHost(&h_resNonces[thr_id], 8*sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMalloc(&d_resNonces[thr_id], 8 * sizeof(uint32_t))); } // Target Difficulty + __host__ -void cuda_check_cpu_setTarget(const void *ptarget) +void cuda_check_cpu_setTarget(const void *ptarget, int thr_id) { - CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); } /* --------------------------------------------------------------------------------------------- */ @@ -73,7 +73,7 @@ static bool hashbelowtarget(const uint32_t *const __restrict__ hash, const uint3 __global__ __launch_bounds__(512, 4) void cuda_checkhash_64(uint32_t threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNonces) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { // shl 4 = *16 x 4 (uint32) = 64 bytes @@ -90,16 +90,17 @@ void cuda_checkhash_64(uint32_t threads, uint32_t startNounce, uint32_t *hash, u __host__ uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash) { - cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t)); + CUDA_SAFE_CALL(cudaMemsetAsync(d_resNonces[thr_id], 0xff, sizeof(uint32_t), gpustream[thr_id])); const uint32_t threadsperblock = 512; dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - cuda_checkhash_64 <<>> (threads, startNounce, d_inputHash, d_resNonces[thr_id]); + cuda_checkhash_64 <<>> (threads, startNounce, d_inputHash, d_resNonces[thr_id]); - cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaMemcpyAsync(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); + 
cudaStreamSynchronize(gpustream[thr_id]); return h_resNonces[thr_id][0]; } @@ -109,7 +110,7 @@ uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uin __global__ __launch_bounds__(512, 4) void cuda_checkhash_64_suppl(uint32_t startNounce, uint32_t *hash, uint32_t *resNonces) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t *inpHash = &hash[thread << 4]; @@ -130,10 +131,11 @@ uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounc dim3 block(threadsperblock); // first element stores the count of found nonces - cudaMemset(d_resNonces[thr_id], 0, sizeof(uint32_t)); + cudaMemsetAsync(d_resNonces[thr_id], 0, sizeof(uint32_t), gpustream[thr_id]); - cuda_checkhash_64_suppl <<>> (startNounce, d_inputHash, d_resNonces[thr_id]); - cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost); + cuda_checkhash_64_suppl <<>> (startNounce, d_inputHash, d_resNonces[thr_id]); + cudaMemcpyAsync(h_resNonces[thr_id], d_resNonces[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); + cudaStreamSynchronize(gpustream[thr_id]); rescnt = h_resNonces[thr_id][0]; if (rescnt > 1) @@ -156,12 +158,12 @@ uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounc __global__ void cuda_check_hash_branch_64(uint32_t threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = g_nonceVector[thread]; + const uint32_t nounce = g_nonceVector[thread]; uint32_t hashPosition = (nounce - startNounce) << 4; - uint32_t *inpHash = &g_hash[hashPosition]; + const uint32_t *const inpHash = &g_hash[hashPosition]; if (hashbelowtarget(inpHash, pTarget)) { @@ -174,12 +176,12 @@ void 
cuda_check_hash_branch_64(uint32_t threads, uint32_t startNounce, uint32_t __global__ void cuda_check_quarkcoin_64(uint32_t threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = g_nonceVector[thread]; + const uint32_t nounce = g_nonceVector[thread]; uint32_t hashPosition = (nounce - startNounce) << 4; - uint32_t *inpHash = &g_hash[hashPosition]; + const uint32_t *const inpHash = &g_hash[hashPosition]; if (inpHash[7] <= pTarget[7]) { @@ -191,35 +193,54 @@ void cuda_check_quarkcoin_64(uint32_t threads, uint32_t startNounce, uint32_t *g } __host__ -uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order) +uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash) { uint32_t result = 0xffffffff; - cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t)); + cudaMemsetAsync(d_resNonces[thr_id], 0xff, sizeof(uint32_t), gpustream[thr_id]); const uint32_t threadsperblock = 256; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - cuda_check_hash_branch_64 <<>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNonces[thr_id]); + cuda_check_hash_branch_64 <<>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNonces[thr_id]); - cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaMemcpyAsync(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); + cudaStreamSynchronize(gpustream[thr_id]); result = *h_resNonces[thr_id]; return result; } __host__ -void cuda_check_quarkcoin(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t 
*d_inputHash, int order, uint32_t *resNonces) +void cuda_check_quarkcoin(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, uint32_t *resNonces) { - cudaMemset(d_resNonces[thr_id], 0xff, 2*sizeof(uint32_t)); + CUDA_SAFE_CALL(cudaMemsetAsync(d_resNonces[thr_id], 0xff, 2 * sizeof(uint32_t), gpustream[thr_id])); const uint32_t threadsperblock = 256; dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - cuda_check_quarkcoin_64 << > > (threads, startNounce, d_nonceVector, d_inputHash, d_resNonces[thr_id]); + cuda_check_quarkcoin_64 << >> (threads, startNounce, d_nonceVector, d_inputHash, d_resNonces[thr_id]); - cudaMemcpy(resNonces, d_resNonces[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost); -} \ No newline at end of file + cudaMemcpyAsync(resNonces, d_resNonces[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); + cudaStreamSynchronize(gpustream[thr_id]); +} + +int cuda_arch[MAX_GPUS]; +__global__ void get_cuda_arch_gpu(int *d_version) +{ +#ifdef __CUDA_ARCH__ + *d_version = __CUDA_ARCH__; +#endif +} + +__host__ void get_cuda_arch(int *version) +{ + int *d_version; + cudaMalloc(&d_version, sizeof(int)); + get_cuda_arch_gpu << < 1, 1 >> > (d_version); + cudaMemcpy(version, d_version, sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(d_version); +} diff --git a/cuda_groestlcoin.cu b/cuda_groestlcoin.cu index ce8ea18a5b..150ee8c18e 100644 --- a/cuda_groestlcoin.cu +++ b/cuda_groestlcoin.cu @@ -6,11 +6,11 @@ #include "cuda_helper.h" #include + // globaler Speicher für alle HeftyHashes aller Threads -__constant__ uint32_t pTarget[8]; // Single GPU -extern uint32_t *d_resultNonce[MAX_GPUS]; +static uint32_t *d_resultNonce[MAX_GPUS]; -__constant__ uint32_t groestlcoin_gpu_msg[32]; +__constant__ uint32_t groestlcoin_gpu_msg[20]; // 64 Register Variante für Compute 3.0 #include "groestl_functions_quad.cu" @@ -18,72 +18,54 @@ __constant__ uint32_t 
groestlcoin_gpu_msg[32]; #define SWAB32(x) cuda_swab32(x) -__global__ __launch_bounds__(256, 4) -void groestlcoin_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t *resNounce) +__global__ __launch_bounds__(512, 2) +void groestlcoin_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, int thr_id, uint32_t target) { // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4; - if (thread < threads) + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2; +// if (thread < threads) { // GROESTL - uint32_t paddedInput[8]; -#pragma unroll 8 - for(int k=0;k<8;k++) paddedInput[k] = groestlcoin_gpu_msg[4*k+(threadIdx.x & 3)]; - - uint32_t nounce = startNounce + thread; - if ((threadIdx.x & 3) == 3) - paddedInput[4] = SWAB32(nounce); // 4*4+3 = 19 + uint32_t paddedInput[8] = { 0 }; + const uint32_t nounce = startNounce + thread; + paddedInput[0] = groestlcoin_gpu_msg[(threadIdx.x & 3)]; + paddedInput[1] = groestlcoin_gpu_msg[4 + (threadIdx.x & 3)]; + paddedInput[2] = groestlcoin_gpu_msg[8 + (threadIdx.x & 3)]; + paddedInput[3] = groestlcoin_gpu_msg[12 + (threadIdx.x & 3)]; + paddedInput[4] = groestlcoin_gpu_msg[16 + (threadIdx.x & 3)]; + if ((threadIdx.x & 3) == 3) paddedInput[4] = SWAB32(nounce); + if ((threadIdx.x & 3) == 0) paddedInput[5] = 0x80; + if ((threadIdx.x & 3)==3) paddedInput[7] = 0x01000000; uint32_t msgBitsliced[8]; - to_bitslice_quad(paddedInput, msgBitsliced); + myr_to_bitslice_quad(paddedInput, msgBitsliced); uint32_t state[8]; - for (int round=0; round<2; round++) - { - groestl512_progressMessage_quad(state, msgBitsliced); - - if (round < 1) - { - // Verkettung zweier Runden inclusive Padding. 
- msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + ((threadIdx.x & 3)==3)*0x2000); - msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341); - msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341); - msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341); - msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341); - msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341); - msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341); - msgBitsliced[7] = __byte_perm(state[7], 0x00800100, 0x4341 + ((threadIdx.x & 3) == 0) * 0x0010); - } - } - // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash - uint32_t out_state[16]; - from_bitslice_quad(state, out_state); + groestl512_progressMessage_quad(state, msgBitsliced); + + msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + ((threadIdx.x & 3)==3)*0x2000); + msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341); + msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341); + msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341); + msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341); + msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341); + msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341); + msgBitsliced[7] = __byte_perm(state[7], 0x00800100, 0x4341 + ((threadIdx.x & 3) == 0) * 0x0010); + + groestl512_progressMessage_quad(state, msgBitsliced); + + uint32_t out_state[16]; + from_bitslice_quad_final(state, out_state); if ((threadIdx.x & 3) == 0) { - int i, position = -1; - bool rc = true; - - #pragma unroll 8 - for (i = 7; i >= 0; i--) { - if (out_state[i] > pTarget[i]) { - if(position < i) { - position = i; - rc = false; - } - } - if (out_state[i] < pTarget[i]) { - if(position < i) { - position = i; - rc = true; - } - } - } - - if(rc == true) - if(resNounce[0] > nounce) - resNounce[0] = nounce; + if (out_state[7] <= target) + { + uint32_t tmp = atomicExch(resNounce, nounce); + if (tmp != 0xffffffff) + 
resNounce[1] = tmp; + } } } } @@ -91,53 +73,31 @@ void groestlcoin_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t // Setup-Funktionen __host__ void groestlcoin_cpu_init(int thr_id, uint32_t threads) { - CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); - - // Speicher für Gewinner-Nonce belegen - cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); + cudaMalloc(&d_resultNonce[thr_id], 2 * sizeof(uint32_t)); } -__host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn) +__host__ void groestlcoin_cpu_setBlock(int thr_id, void *data ) { - // Nachricht expandieren und setzen - uint32_t msgBlock[32]; - - memset(msgBlock, 0, sizeof(uint32_t) * 32); + uint32_t msgBlock[20]; memcpy(&msgBlock[0], data, 80); + cudaMemcpyToSymbolAsync(groestlcoin_gpu_msg, msgBlock, 80, 0, cudaMemcpyHostToDevice, gpustream[thr_id]); - // Erweitere die Nachricht auf den Nachrichtenblock (padding) - // Unsere Nachricht hat 80 Byte - msgBlock[20] = 0x80; - msgBlock[31] = 0x01000000; - - // groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird - // auf der GPU ausgeführt) - - // Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch) - cudaMemcpyToSymbol( groestlcoin_gpu_msg, - msgBlock, - 128); - - cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); - cudaMemcpyToSymbol( pTarget, - pTargetIn, - sizeof(uint32_t) * 8 ); + cudaMemsetAsync(d_resultNonce[thr_id], 0xFF, 2 * sizeof(uint32_t), gpustream[thr_id]); } -__host__ void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce) +__host__ void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *nounce, uint32_t target) { - uint32_t threadsperblock = 256; + uint32_t threadsperblock = 512; // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl int factor = 4; - // berechne 
wie viele Thread Blocks wir brauchen + // berechne wie viele Thread Blocks wir brauchen dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock)); dim3 block(threadsperblock); - cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); - groestlcoin_gpu_hash_quad<<>>(threads, startNounce, d_resultNonce[thr_id]); + groestlcoin_gpu_hash_quad<<>>(threads, startNounce, d_resultNonce[thr_id], thr_id, target); - cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + CUDA_SAFE_CALL(cudaMemcpyAsync(nounce, d_resultNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id])); cudaStreamSynchronize(gpustream[thr_id]); } diff --git a/cuda_groestlcoin.h b/cuda_groestlcoin.h index 7b95b59a07..3dc121537f 100644 --- a/cuda_groestlcoin.h +++ b/cuda_groestlcoin.h @@ -2,7 +2,7 @@ #define _CUDA_GROESTLCOIN_H void groestlcoin_cpu_init(int thr_id, uint32_t threads); -void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn); -void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce); +void groestlcoin_cpu_setBlock(int thr_id, void *data); +void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *nounce, uint32_t target); #endif \ No newline at end of file diff --git a/cuda_helper.h b/cuda_helper.h index 8eb46f3ba2..497b6670e8 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -3,32 +3,45 @@ #include #include +#ifdef __cplusplus +#include +#include +using namespace std; +#else +#include +#endif #ifdef __INTELLISENSE__ +#define NOASM /* reduce vstudio warnings (__byteperm, blockIdx...) 
*/ #include #include #define __launch_bounds__(max_tpb, min_blocks) +#define __CUDA_ARCH__ 610 + uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z); uint32_t __shfl(uint32_t x, uint32_t y, uint32_t z); uint32_t atomicExch(uint32_t *x, uint32_t y); uint32_t atomicAdd(uint32_t *x, uint32_t y); void __syncthreads(void); void __threadfence(void); +#define __ldg(x) (*(x)) #endif -#include - #ifndef MAX_GPUS -#define MAX_GPUS 16 +#define MAX_GPUS 8 #endif -extern "C" int device_map[MAX_GPUS]; -extern "C" long device_sm[MAX_GPUS]; +extern int device_map[MAX_GPUS]; +extern long device_sm[MAX_GPUS]; +extern cudaStream_t gpustream[MAX_GPUS]; +extern bool stop_mining; +extern volatile bool mining_has_stopped[MAX_GPUS]; // common functions extern void cuda_check_cpu_init(int thr_id, uint32_t threads); -extern void cuda_check_cpu_setTarget(const void *ptarget); +extern void cuda_check_cpu_setTarget(const void *ptarget, int thr_id); +extern void cuda_check_cpu_setTarget_mod(const void *ptarget, const void *ptarget2); extern uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash); extern uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, uint32_t foundnonce); extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func); @@ -61,25 +74,63 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t #define SPH_T64(x) (x) // #define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) #endif + +#if defined _MSC_VER && !defined __CUDA_ARCH__ +#define ROTL32c(x, n) _rotl(x, n) +#define ROTR32c(x, n) _rotr(x, n) +#else +#define ROTL32c(x, n) ((x) << (n)) | ((x) >> (32 - (n))) +#define ROTR32c(x, n) ((x) >> (n)) | ((x) << (32 - (n))) +#endif + +#ifndef __CUDA_ARCH__ +#define ROTR32(x, n) ROTR32c(x, n) +#define ROTL32(x, n) ROTL32c(x, n) +#else #if __CUDA_ARCH__ < 320 // Kepler (Compute 3.0) -#define ROTL32(x, n) ((x) << (n)) | ((x) >> (32 - 
(n))) +__device__ __forceinline__ uint32_t ROTR32(const uint32_t x, const uint32_t n) +{ + return (x >> n) | (x << (32 - n)); +} +__device__ __forceinline__ uint32_t ROTL32(const uint32_t x, const uint32_t n) +{ + return (x << n) | (x >> (32 - n)); +} #else -// Kepler (Compute 3.5, 5.0) -#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) ) +__device__ __forceinline__ uint32_t ROTR32(const uint32_t x, const uint32_t n) +{ + return __funnelshift_r(x, x, n); +} +__device__ __forceinline__ uint32_t ROTL32(const uint32_t x, const uint32_t n) +{ + return __funnelshift_l(x, x, n); +} +#endif +#endif + +// #define NOASM here if you don't want asm +#ifndef __CUDA_ARCH__ +#define NOASM #endif +#define MAKE_ULONGLONG(lo, hi) MAKE_UINT64(lo, hi) -__device__ __forceinline__ uint64_t MAKE_ULONGLONG(uint32_t LO, uint32_t HI) +__device__ __forceinline__ uint64_t MAKE_UINT64(uint32_t LO, uint32_t HI) { +#ifndef NOASM uint64_t result; asm("mov.b64 %0,{%1,%2}; \n\t" : "=l"(result) : "r"(LO), "r"(HI)); return result; +#else + return LO + ((uint64_t)HI << 32); +#endif } __device__ __forceinline__ uint64_t REPLACE_HIWORD(const uint64_t x, const uint32_t y) { +#ifndef NOASM uint64_t result; asm( "{\n\t" @@ -89,10 +140,13 @@ __device__ __forceinline__ uint64_t REPLACE_HIWORD(const uint64_t x, const uint3 "}" : "=l"(result) : "l"(x), "r"(y) ); return result; - +#else + return (x & 0xffffffff) + ((uint64_t)y << 32); +#endif } __device__ __forceinline__ uint64_t REPLACE_LOWORD(const uint64_t x, const uint32_t y) { +#ifndef NOASM uint64_t result; asm( "{\n\t" @@ -102,25 +156,36 @@ __device__ __forceinline__ uint64_t REPLACE_LOWORD(const uint64_t x, const uint3 "}" : "=l"(result) : "l"(x), "r"(y) ); return result; +#else + return (x & 0xffffffff00000000) + y; +#endif } -// Endian Drehung für 32 Bit Typen +// endian change for 32bit #ifdef __CUDA_ARCH__ -__device__ __forceinline__ uint32_t cuda_swab32(const uint32_t x) -{ - /* device */ - return __byte_perm(x, x, 0x0123); -} + __device__ 
__forceinline__ uint32_t cuda_swab32(const uint32_t x) + { + /* device */ + return __byte_perm(x, x, 0x0123); + } #else /* host */ - #define cuda_swab32(x) \ - ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ - (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) + #ifdef __GNUC__ + #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) + #define cuda_swab32(x) __builtin_bswap32(x) + #endif + #else + #ifdef _MSC_VER + #define cuda_swab32(x) _byteswap_ulong(x) + #else + #define cuda_swab32(x) ( ((x) << 24) | (((x) << 8) & 0x00ff0000u) | (((x) >> 8) & 0x0000ff00u) | ((x) >> 24)) + #endif + #endif #endif - static __device__ uint32_t _HIWORD(const uint64_t x) { +#ifndef NOASM uint32_t result; asm( "{\n\t" @@ -129,10 +194,14 @@ static __device__ uint32_t _HIWORD(const uint64_t x) "}" : "=r"(result) : "l"(x) ); return result; +#else + return x >> 32; +#endif } static __device__ uint32_t _LOWORD(const uint64_t x) { +#ifndef NOASM uint32_t result; asm( "{\n\t" @@ -141,11 +210,13 @@ static __device__ uint32_t _LOWORD(const uint64_t x) "}" : "=r"(result) : "l"(x) ); return result; +#else + return x & 0xffffffff; +#endif } -// Input: 77665544 33221100 -// Output: 00112233 44556677 -#ifdef __CUDA_ARCH__ +// endian change for 64bit +#if (defined __CUDA_ARCH__ && !defined NOASM) __device__ __forceinline__ uint64_t cuda_swab64(uint64_t x) { uint64_t result; @@ -160,15 +231,25 @@ __device__ __forceinline__ uint64_t cuda_swab64(uint64_t x) } #else /* host */ - #define cuda_swab64(x) \ - ((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \ - (((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \ - (((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \ - (((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \ - (((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \ - (((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \ - (((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \ - (((uint64_t)(x) & 0x00000000000000ffULL) << 56))) + #ifdef __GNUC__ + #if ((__GNUC__ > 4) 
|| (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) + #define cuda_swab64(x) __builtin_bswap64(x) + #endif + #else + #ifdef _MSC_VER + #define cuda_swab64(x) _byteswap_uint64(x) + #else + #define cuda_swab64(x) \ + ((uint64_t)((((uint64_t)(x)) >> 56) | \ + (((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \ + (((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \ + (((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \ + (((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \ + (((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \ + (((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \ + (((uint64_t)(x)) << 56))) + #endif + #endif #endif /*********************************************************************/ @@ -201,7 +282,7 @@ do { \ } while (0) /*********************************************************************/ -#ifdef _WIN64 +#if (defined _WIN64 || defined NOASM) #define USE_XOR_ASM_OPTS 0 #else #define USE_XOR_ASM_OPTS 1 @@ -217,7 +298,7 @@ uint64_t xor1(const uint64_t a, const uint64_t b) return result; } #else -#define xor1(a,b) (a ^ b) +#define xor1(a,b) ((a) ^ (b)) #endif #if USE_XOR_ASM_OPTS @@ -233,7 +314,7 @@ uint64_t xor3(const uint64_t a, const uint64_t b, const uint64_t c) return result; } #else -#define xor3(a,b,c) (a ^ b ^ c) +#define xor3(a,b,c) ((a) ^ (b) ^ (c)) #endif #if USE_XOR_ASM_OPTS @@ -252,7 +333,7 @@ uint64_t xor8(const uint64_t a, const uint64_t b, const uint64_t c, const uint64 return result; } #else -#define xor8(a,b,c,d,e,f,g,h) ((a^b)^(c^d)^(e^f)^(g^h)) +#define xor8(a,b,c,d,e,f,g,h) ((a)^(b)^(c)^(d)^(e)^(f)^(g)^(h)) #endif // device asm for x17 @@ -260,6 +341,7 @@ __device__ __forceinline__ uint64_t xandx(const uint64_t a, const uint64_t b, const uint64_t c) { uint64_t result; +#ifndef NOASM asm("{\n\t" ".reg .u64 n;\n\t" "xor.b64 %0, %2, %3;\n\t" @@ -267,6 +349,9 @@ uint64_t xandx(const uint64_t a, const uint64_t b, const uint64_t c) "xor.b64 %0, n, %3;" "}\n" : "=l"(result) : "l"(a), "l"(b), "l"(c)); +#else + result = ((((b) ^ (c)) & (a)) ^ (c)); 
+#endif return result; } @@ -275,6 +360,7 @@ __device__ __forceinline__ uint64_t andor(uint64_t a, uint64_t b, uint64_t c) { uint64_t result; +#ifndef NOASM asm("{\n\t" ".reg .u64 m,n;\n\t" "and.b64 m, %1, %2;\n\t" @@ -283,6 +369,9 @@ uint64_t andor(uint64_t a, uint64_t b, uint64_t c) " or.b64 %0, %0, m ;\n\t" "}\n" : "=l"(result) : "l"(a), "l"(b), "l"(c)); +#else + result = (((a) & (b)) | (((a) | (b)) & (c))); +#endif return result; } @@ -291,8 +380,12 @@ __device__ __forceinline__ uint64_t shr_t64(uint64_t x, uint32_t n) { uint64_t result; +#ifndef NOASM asm("shr.b64 %0,%1,%2;\n\t" : "=l"(result) : "l"(x), "r"(n)); +#else + result = x >> n; +#endif return result; } @@ -301,8 +394,12 @@ __device__ __forceinline__ uint64_t shl_t64(uint64_t x, uint32_t n) { uint64_t result; +#ifndef NOASM asm("shl.b64 %0,%1,%2;\n\t" : "=l"(result) : "l"(x), "r"(n)); +#else + result = x << n; +#endif return result; } @@ -310,6 +407,10 @@ uint64_t shl_t64(uint64_t x, uint32_t n) #define USE_ROT_ASM_OPT 1 #endif +#ifdef NOASM +#undef USE_ROT_ASM_OPT +#endif + // 64-bit ROTATE RIGHT #if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1 /* complicated sm >= 3.5 one (with Funnel Shifter beschleunigt), to bench */ @@ -343,7 +444,48 @@ uint64_t ROTR64(const uint64_t x, const int offset) } #else /* host */ -#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) +#if defined _MSC_VER && !defined __CUDA_ARCH__ + #define ROTR64(x, n) _rotr64(x, n) +#else +#ifndef __CUDA_ARCH__ + #define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) +#else +#if __CUDA_ARCH__ >= 520 +__device__ __forceinline__ +uint64_t ROTR64(const uint64_t value, const int offset) +{ + uint2 result; + if(offset < 32) + { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), 
"r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + } + else + { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + } + return __double_as_longlong(__hiloint2double(result.y, result.x)); +} +#else +__device__ __forceinline__ +uint64_t ROTR64(const uint64_t x, const int offset) +{ + uint64_t result; + asm("{\n\t" + ".reg .b64 lhs;\n\t" + ".reg .u32 roff;\n\t" + "shr.b64 lhs, %1, %2;\n\t" + "sub.u32 roff, 64, %2;\n\t" + "shl.b64 %0, %1, roff;\n\t" + "add.u64 %0, %0, lhs;\n\t" + "}\n" + : "=l"(result) : "l"(x), "r"(offset)); + return result; +} +#endif +#endif +#endif #endif // 64-bit ROTATE LEFT @@ -397,13 +539,17 @@ uint64_t ROTL64(const uint64_t x, const int offset) } #else /* host */ +#if defined _MSC_VER && !defined __CUDA_ARCH__ +#define ROTL64(x, n) _rotl64(x, n) +#else #define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) #endif +#endif __device__ __forceinline__ uint64_t SWAPDWORDS(uint64_t value) { -#if __CUDA_ARCH__ >= 320 +#if __CUDA_ARCH__ >= 320 && !defined NOASM uint2 temp; asm("mov.b64 {%0, %1}, %2; ": "=r"(temp.x), "=r"(temp.y) : "l"(value)); asm("mov.b64 %0, {%1, %2}; ": "=l"(value) : "r"(temp.y), "r"(temp.x)); @@ -416,26 +562,39 @@ uint64_t SWAPDWORDS(uint64_t value) /* lyra2 - int2 operators */ __device__ __forceinline__ -void LOHI(uint32_t &lo, uint32_t &hi, uint64_t x) { +void LOHI(uint32_t &lo, uint32_t &hi, uint64_t x) +{ +#ifndef NOASM asm("mov.b64 {%0,%1},%2; \n\t" : "=r"(lo), "=r"(hi) : "l"(x)); +#else + lo = x & 0xffffffff; + hi = x >> 32; +#endif } __device__ __forceinline__ uint64_t devectorize(uint2 x) { +#ifndef NOASM uint64_t result; asm("mov.b64 %0,{%1,%2}; \n\t" : "=l"(result) : "r"(x.x), "r"(x.y)); return result; +#else + 
return x.x + ((uint64_t)x.y << 32); +#endif } - -__device__ __forceinline__ uint2 vectorize(uint64_t x) +__device__ __forceinline__ uint2 vectorize(const uint64_t x) { +#ifndef NOASM uint2 result; asm("mov.b64 {%0,%1},%2; \n\t" : "=r"(result.x), "=r"(result.y) : "l"(x)); return result; +#else + return make_uint2(x & 0xffffffff, x >> 32); +#endif } static __device__ __forceinline__ uint2 vectorizelow(uint32_t v) { @@ -444,6 +603,19 @@ static __device__ __forceinline__ uint2 vectorizelow(uint32_t v) { result.y = 0; return result; } +static __device__ __forceinline__ uint2 vectorizehigh(uint32_t v) { + uint2 result; + result.x = 0; + result.y = v; + return result; +} +static __device__ __forceinline__ uint2 eorswap32(uint2 u, uint2 v) +{ + uint2 result; + result.y = u.x ^ v.x; + result.x = u.y ^ v.y; + return result; +} static __device__ __forceinline__ uint2 operator^ (uint2 a, uint32_t b) { return make_uint2(a.x^ b, a.y); } static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) { return make_uint2(a.x ^ b.x, a.y ^ b.y); } @@ -451,8 +623,10 @@ static __device__ __forceinline__ uint2 operator& (uint2 a, uint2 b) { return ma static __device__ __forceinline__ uint2 operator| (uint2 a, uint2 b) { return make_uint2(a.x | b.x, a.y | b.y); } static __device__ __forceinline__ uint2 operator~ (uint2 a) { return make_uint2(~a.x, ~a.y); } static __device__ __forceinline__ void operator^= (uint2 &a, uint2 b) { a = a ^ b; } + static __device__ __forceinline__ uint2 operator+ (uint2 a, uint2 b) { +#ifndef NOASM uint2 result; asm("{\n\t" "add.cc.u32 %0,%2,%4; \n\t" @@ -460,11 +634,24 @@ static __device__ __forceinline__ uint2 operator+ (uint2 a, uint2 b) "}\n\t" : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); return result; +#else + return make_uint2(a.x + b.x, a.y + b.y); +#endif } - +static __device__ __forceinline__ uint2 operator+ (uint2 a, uint32_t b) +{ + uint2 result; + asm("{\n\t" + "add.cc.u32 %0,%2,%4; \n\t" + "addc.u32 %1,%3,%5; 
\n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b), "r"(0)); + return result; +} static __device__ __forceinline__ uint2 operator- (uint2 a, uint2 b) { +#ifndef NOASM uint2 result; asm("{\n\t" "sub.cc.u32 %0,%2,%4; \n\t" @@ -472,9 +659,27 @@ static __device__ __forceinline__ uint2 operator- (uint2 a, uint2 b) "}\n\t" : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); return result; +#else +return make_uint2(a.x - b.x, a.y - b.y); +#endif +} + + +static __device__ __forceinline__ uint4 operator+ (uint4 a, uint4 b) +{ + return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } +static __device__ __forceinline__ uint4 operator^ (uint4 a, uint4 b) { return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); } +static __device__ __forceinline__ uint4 operator& (uint4 a, uint4 b) { return make_uint4(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w); } +static __device__ __forceinline__ uint4 operator| (uint4 a, uint4 b) { return make_uint4(a.x | b.x, a.y | b.y, a.z | b.z, a.w | b.w); } +static __device__ __forceinline__ uint4 operator~ (uint4 a) { return make_uint4(~a.x, ~a.y, ~a.z, ~a.w); } +static __device__ __forceinline__ void operator^= (uint4 &a, uint4 b) { a = a ^ b; } static __device__ __forceinline__ void operator+= (uint2 &a, uint2 b) { a = a + b; } +static __forceinline__ __device__ uchar4 operator^ (uchar4 a, uchar4 b){return make_uchar4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w);} +static __forceinline__ __device__ uchar4 operator+ (uchar4 a, uchar4 b){return make_uchar4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);} + +static __forceinline__ __device__ void operator^= (uchar4 &a, uchar4 b) { a = a ^ b; } /** * basic multiplication between 64bit no carry outside that range (ie mul.lo.b64(a*b)) @@ -482,6 +687,7 @@ static __device__ __forceinline__ void operator+= (uint2 &a, uint2 b) { a = a + */ static __device__ __forceinline__ uint2 operator* (uint2 a, uint2 b) { +#ifndef NOASM uint2 result; 
asm("{\n\t" "mul.lo.u32 %0,%2,%4; \n\t" @@ -491,10 +697,13 @@ static __device__ __forceinline__ uint2 operator* (uint2 a, uint2 b) "}\n\t" : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); return result; +#else + return vectorize(devectorize(a)*devectorize(b)); +#endif } // uint2 method -#if __CUDA_ARCH__ >= 350 +#if __CUDA_ARCH__ >= 320 && !defined NOASM __device__ __inline__ uint2 ROR2(const uint2 a, const int offset) { uint2 result; @@ -519,14 +728,83 @@ __device__ __inline__ uint2 ROR2(const uint2 v, const int n) } else { - result.y = ((v.x >> (n - 32)) | (v.y << (32 - n))); - result.x = ((v.y >> (n - 32)) | (v.x << (32 - n))); + result.y = ((v.x >> (n - 32)) | (v.y << (64 - n))); + result.x = ((v.y >> (n - 32)) | (v.x << (64 - n))); } return result; } #endif -#if __CUDA_ARCH__ >= 350 +__device__ __inline__ uint32_t ROL8(const uint32_t x) +{ + return __byte_perm(x, x, 0x2103); +} +__device__ __inline__ uint32_t ROL16(const uint32_t x) +{ + return __byte_perm(x, x, 0x1032); +} +__device__ __inline__ uint32_t ROL24(const uint32_t x) +{ + return __byte_perm(x, x, 0x0321); +} + +__device__ __inline__ uint2 ROR8(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x0765); + result.y = __byte_perm(a.y, a.x, 0x4321); + + return result; +} + +__device__ __inline__ uint2 ROR16(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x1076); + result.y = __byte_perm(a.y, a.x, 0x5432); + + return result; +} + +__device__ __inline__ uint2 ROR24(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x2107); + result.y = __byte_perm(a.y, a.x, 0x6543); + + return result; +} + +__device__ __inline__ uint2 ROL8(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x6543); + result.y = __byte_perm(a.y, a.x, 0x2107); + + return result; +} + +__device__ __inline__ uint2 ROL16(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x5432); + result.y = __byte_perm(a.y, a.x, 0x1076); 
+ + return result; +} + +__device__ __inline__ uint2 ROL24(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x4321); + result.y = __byte_perm(a.y, a.x, 0x0765); + + return result; +} + +#if __CUDA_ARCH__ >= 320 && !defined NOASM + + __inline__ __device__ uint2 ROL2(const uint2 a, const int offset) { uint2 result; if (offset >= 32) { @@ -560,7 +838,7 @@ __inline__ __device__ uint2 ROL2(const uint2 v, const int n) __device__ __forceinline__ uint64_t ROTR16(uint64_t x) { -#if __CUDA_ARCH__ > 500 +#if __CUDA_ARCH__ > 500 && !defined NOASM short4 temp; asm("mov.b64 { %0, %1, %2, %3 }, %4; ": "=h"(temp.x), "=h"(temp.y), "=h"(temp.z), "=h"(temp.w) : "l"(x)); asm("mov.b64 %0, {%1, %2, %3 , %4}; ": "=l"(x) : "h"(temp.y), "h"(temp.z), "h"(temp.w), "h"(temp.x)); @@ -569,10 +847,11 @@ uint64_t ROTR16(uint64_t x) return ROTR64(x, 16); #endif } + __device__ __forceinline__ uint64_t ROTL16(uint64_t x) { -#if __CUDA_ARCH__ > 500 +#if __CUDA_ARCH__ > 500 && !defined NOASM short4 temp; asm("mov.b64 { %0, %1, %2, %3 }, %4; ": "=h"(temp.x), "=h"(temp.y), "=h"(temp.z), "=h"(temp.w) : "l"(x)); asm("mov.b64 %0, {%1, %2, %3 , %4}; ": "=l"(x) : "h"(temp.w), "h"(temp.x), "h"(temp.y), "h"(temp.z)); @@ -587,6 +866,7 @@ uint2 SWAPINT2(uint2 x) { return(make_uint2(x.y, x.x)); } + __device__ __forceinline__ bool cuda_hashisbelowtarget(const uint32_t *const __restrict__ hash, const uint32_t *const __restrict__ target) { if (hash[7] > target[7]) @@ -628,10 +908,10 @@ uint2 SWAPDWORDS2(uint2 value) return make_uint2(value.y, value.x); } -static __forceinline__ __device__ uint2 SHL2(uint2 a, int offset) +static __forceinline__ __device__ uint2 SHL2(const uint2 a, int offset) { -#if __CUDA_ARCH__ > 300 uint2 result; +#if __CUDA_ARCH__ > 300 && !defined NOASM if (offset<32) { asm("{\n\t" @@ -647,25 +927,25 @@ static __forceinline__ __device__ uint2 SHL2(uint2 a, int offset) "}\n\t" : "=r"(result.x), "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); } - return result; #else if 
(offset<=32) { - a.y = (a.y << offset) | (a.x >> (32 - offset)); - a.x = (a.x << offset); + result.y = (a.y << offset) | (a.x >> (32 - offset)); + result.x = (a.x << offset); } else { - a.y = (a.x << (offset-32)); - a.x = 0; + result.y = (a.x << (offset - 32)); + result.x = 0; } - return a; #endif + return result; } -static __forceinline__ __device__ uint2 SHR2(uint2 a, int offset) + +static __forceinline__ __device__ uint2 SHR2(const uint2 a, int offset) { - #if __CUDA_ARCH__ > 300 uint2 result; +#if __CUDA_ARCH__ >= 320 && !defined NOASM if (offset<32) { asm("{\n\t" "shf.r.clamp.b32 %0,%2,%3,%4; \n\t" @@ -680,24 +960,24 @@ static __forceinline__ __device__ uint2 SHR2(uint2 a, int offset) "}\n\t" : "=r"(result.x), "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); } - return result; #else if (offset<=32) { - a.x = (a.x >> offset) | (a.y << (32 - offset)); - a.y = (a.y >> offset); + result.x = (a.x >> offset) | (a.y << (32 - offset)); + result.y = (a.y >> offset); } else { - a.x = (a.y >> (offset - 32)); - a.y = 0; + result.x = (a.y >> (offset - 32)); + result.y = 0; } - return a; - #endif +#endif + return result; } -static __device__ __forceinline__ uint64_t devectorizeswap(uint2 v) { return MAKE_ULONGLONG(cuda_swab32(v.y), cuda_swab32(v.x)); } -static __device__ __forceinline__ uint2 vectorizeswap(uint64_t v) { +static __device__ __forceinline__ uint64_t devectorizeswap(uint2 v) { return MAKE_UINT64(cuda_swab32(v.y), cuda_swab32(v.x)); } +static __device__ __forceinline__ uint2 vectorizeswap(uint64_t v) +{ uint2 result; LOHI(result.y, result.x, v); result.x = cuda_swab32(result.x); @@ -705,12 +985,23 @@ static __device__ __forceinline__ uint2 vectorizeswap(uint64_t v) { return result; } +static __device__ __forceinline__ uint2 cuda_swap(uint2 v) +{ + uint32_t t = cuda_swab32(v.x); + v.x = cuda_swab32(v.y); + v.y = t; + return v; +} __device__ __forceinline__ uint32_t devectorize16(ushort2 x) { uint32_t result; +#ifndef NOASM asm("mov.b32 %0,{%1,%2}; \n\t" : 
"=r"(result) : "h"(x.x) , "h"(x.y)); +#else + result = x.x + (x.y << 16); +#endif return result; } @@ -718,11 +1009,164 @@ __device__ __forceinline__ uint32_t devectorize16(ushort2 x) __device__ __forceinline__ ushort2 vectorize16(uint32_t x) { ushort2 result; +#ifndef NOASM asm("mov.b32 {%0,%1},%2; \n\t" : "=h"(result.x), "=h"(result.y) : "r"(x)); +#else + result.x = x & 0xffff; + result.y = x >> 16; +#endif + return result; +} + +extern int cuda_arch[MAX_GPUS]; +extern void get_cuda_arch(int *); + +/* +static __device__ __forceinline__ uint4 mul4(uint4 a) +{ + uint4 result; + asm("{\n\t" + "mul.lo.u32 %0,%4,%5; \n\t" + "mul.hi.u32 %1,%4,%5; \n\t" + "mul.lo.u32 %2,%6,%7; \n\t" + "mul.hi.u32 %3,%6,%7; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w) : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w)); + return result; +} +static __device__ __forceinline__ uint4 add4(uint4 a, uint4 b) + { + uint4 result; + asm("{\n\t" + "add.cc.u32 %0,%4,%8; \n\t" + "addc.u32 %1,%5,%9; \n\t" + "add.cc.u32 %2,%6,%10; \n\t" + "addc.u32 %3,%7,%11; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w) : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w), "r"(b.x), "r"(b.y), "r"(b.z), "r"(b.w)); + return result; + } + +static __device__ __forceinline__ uint4 madd4(uint4 a, uint4 b) + { + uint4 result; + asm("{\n\t" + "mad.lo.cc.u32 %0,%4,%5,%8; \n\t" + "madc.hi.u32 %1,%4,%5,%9; \n\t" + "mad.lo.cc.u32 %2,%6,%7,%10; \n\t" + "madc.hi.u32 %3,%6,%7,%11; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w) : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w), "r"(b.x), "r"(b.y), "r"(b.z), "r"(b.w)); + return result; + } + +static __device__ __forceinline__ ulonglong2 madd4long(ulonglong2 a, ulonglong2 b) + { + ulonglong2 result; + asm("{\n\t" + ".reg .u32 a0,a1,a2,a3,b0,b1,b2,b3;\n\t" + "mov.b64 {a0,a1}, %2;\n\t" + "mov.b64 {a2,a3}, %3;\n\t" + "mov.b64 {b0,b1}, %4;\n\t" + "mov.b64 {b2,b3}, %5;\n\t" + "mad.lo.cc.u32 b0,a0,a1,b0; \n\t" + 
"madc.hi.u32 b1,a0,a1,b1; \n\t" + "mad.lo.cc.u32 b2,a2,a3,b2; \n\t" + "madc.hi.u32 b3,a2,a3,b3; \n\t" + "mov.b64 %0, {b0,b1};\n\t" + "mov.b64 %1, {b2,b3};\n\t" + "}\n\t" + : "=l"(result.x), "=l"(result.y) : "l"(a.x), "l"(a.y), "l"(b.x), "l"(b.y)); + return result; + } +*/ +static __device__ __forceinline__ void madd4long2(ulonglong2 &a, ulonglong2 b) + { +#ifndef NOASM + asm("{\n\t" + ".reg .u32 a0,a1,a2,a3,b0,b1,b2,b3;\n\t" + "mov.b64 {a0,a1}, %0;\n\t" + "mov.b64 {a2,a3}, %1;\n\t" + "mov.b64 {b0,b1}, %2;\n\t" + "mov.b64 {b2,b3}, %3;\n\t" + "mad.lo.cc.u32 b0,a0,a1,b0; \n\t" + "madc.hi.u32 b1,a0,a1,b1; \n\t" + "mad.lo.cc.u32 b2,a2,a3,b2; \n\t" + "madc.hi.u32 b3,a2,a3,b3; \n\t" + "mov.b64 %0, {b0,b1};\n\t" + "mov.b64 %1, {b2,b3};\n\t" + "}\n\t" + : "+l"(a.x), "+l"(a.y) : "l"(b.x), "l"(b.y)); +#else // ?? no idea what madd4long is supposed to do + a.x = a.x + b.x; + if(a.x < b.x) + a.y = a.y + b.y + 1; + else + a.y = a.y + b.y; +#endif +} + +__device__ __forceinline__ +uint32_t xor3b(uint32_t a, uint32_t b, uint32_t c) { + uint32_t result; +#ifndef NOASM + asm("{ .reg .u32 t1;\n\t" + "xor.b32 t1, %2, %3;\n\t" + "xor.b32 %0, %1, t1;\n\t" + "}" + : "=r"(result) : "r"(a), "r"(b), "r"(c)); +#else + result = a ^ b ^ c; +#endif return result; } +__device__ __forceinline__ +uint32_t shr_t32(uint32_t x, uint32_t n) { + uint32_t result; +#ifndef NOASM + asm("shr.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n)); +#else + result = x >> n; +#endif + return result; +} + +__device__ __forceinline__ +uint32_t shl_t32(uint32_t x, uint32_t n) { + uint32_t result; +#ifndef NOASM + asm("shl.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n)); +#else + result = x << n; +#endif + return result; +} + +// device asm 32 for pluck +__device__ __forceinline__ +uint32_t andor32(uint32_t a, uint32_t b, uint32_t c) { + uint32_t result; +#ifndef NOASM + asm("{ .reg .u32 m,n,o;\n\t" + "and.b32 m, %1, %2;\n\t" + " or.b32 n, %1, %2;\n\t" + "and.b32 o, n, %3;\n\t" + " or.b32 %0, m, o ;\n\t" + "}\n\t" + : 
"=r"(result) : "r"(a), "r"(b), "r"(c)); +#else + result = ((a | b) & c) | (a & b); +#endif + return result; +} + +#if __CUDA_ARCH__ < 350 +#ifndef __ldg +#define __ldg(x) (*(x)) +#endif +#endif #endif // #ifndef CUDA_HELPER_H diff --git a/cuda_helper.h.orig b/cuda_helper.h.orig new file mode 100644 index 0000000000..142bfa9da6 --- /dev/null +++ b/cuda_helper.h.orig @@ -0,0 +1,1115 @@ +#ifndef CUDA_HELPER_H +#define CUDA_HELPER_H + +#include +#include +#ifdef __cplusplus +#include +#include +using namespace std; +#else +#include +#endif + +#ifdef __INTELLISENSE__ +#define NOASM +/* reduce vstudio warnings (__byteperm, blockIdx...) */ +#include +#include +#define __launch_bounds__(max_tpb, min_blocks) + +uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z); +uint32_t __shfl(uint32_t x, uint32_t y, uint32_t z); +uint32_t atomicExch(uint32_t *x, uint32_t y); +uint32_t atomicAdd(uint32_t *x, uint32_t y); +void __syncthreads(void); +void __threadfence(void); +#define __ldg(x) (*(x)) +#endif + +#ifndef MAX_GPUS +#define MAX_GPUS 8 +#endif + +extern int device_map[MAX_GPUS]; +extern long device_sm[MAX_GPUS]; +extern cudaStream_t gpustream[MAX_GPUS]; +extern bool stop_mining; +extern volatile bool mining_has_stopped[MAX_GPUS]; + +// common functions +extern void cuda_check_cpu_init(int thr_id, uint32_t threads); +extern void cuda_check_cpu_setTarget(const void *ptarget, int thr_id); +extern void cuda_check_cpu_setTarget_mod(const void *ptarget, const void *ptarget2); +extern uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash); +extern uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, uint32_t foundnonce); +extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func); + +#ifndef __CUDA_ARCH__ +// define blockDim and threadIdx for host +extern const dim3 blockDim; +extern const uint3 threadIdx; +#endif + +extern cudaError_t 
MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); + + +#ifndef SPH_C32 +#define SPH_C32(x) ((x ## U)) +// #define SPH_C32(x) ((uint32_t)(x ## U)) +#endif + +#ifndef SPH_C64 +#define SPH_C64(x) ((x ## ULL)) +// #define SPH_C64(x) ((uint64_t)(x ## ULL)) +#endif + +#ifndef SPH_T32 +#define SPH_T32(x) (x) +// #define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) +#endif + +#ifndef SPH_T64 +#define SPH_T64(x) (x) +// #define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) +#endif + +#if defined _MSC_VER && !defined __CUDA_ARCH__ +#define ROTL32c(x, n) _rotl(x, n) +#define ROTR32c(x, n) _rotr(x, n) +#else +#define ROTL32c(x, n) ((x) << (n)) | ((x) >> (32 - (n))) +#define ROTR32c(x, n) ((x) >> (n)) | ((x) << (32 - (n))) +#endif + +#ifndef __CUDA_ARCH__ +#define ROTR32(x, n) ROTR32c(x, n) +#define ROTL32(x, n) ROTL32c(x, n) +#else +#if __CUDA_ARCH__ < 320 +// Kepler (Compute 3.0) +__device__ __forceinline__ uint32_t ROTR32(const uint32_t x, const uint32_t n) +{ + return (x >> n) | (x << (32 - n)); +} +__device__ __forceinline__ uint32_t ROTL32(const uint32_t x, const uint32_t n) +{ + return (x << n) | (x >> (32 - n)); +} +#else +__device__ __forceinline__ uint32_t ROTR32(const uint32_t x, const uint32_t n) +{ + return __funnelshift_r(x, x, n); +} +__device__ __forceinline__ uint32_t ROTL32(const uint32_t x, const uint32_t n) +{ + return __funnelshift_l(x, x, n); +} +#endif +#endif + +// #define NOASM here if you don't want asm +#ifndef __CUDA_ARCH__ +#define NOASM +#endif + +#define MAKE_ULONGLONG(lo, hi) MAKE_UINT64(lo, hi) + +__device__ __forceinline__ uint64_t MAKE_UINT64(uint32_t LO, uint32_t HI) +{ +#ifndef NOASM + uint64_t result; + asm("mov.b64 %0,{%1,%2}; \n\t" + : "=l"(result) : "r"(LO), "r"(HI)); + return result; +#else + return LO + (uint64_t)HI << 32; +#endif +} + +__device__ __forceinline__ uint64_t REPLACE_HIWORD(const uint64_t x, const uint32_t y) +{ +#ifndef NOASM + uint64_t result; + asm( + "{\n\t" + ".reg .u32 t,t2; \n\t" + "mov.b64 {t2,t},%1; 
\n\t" + "mov.b64 %0,{t2,%2}; \n\t" + "}" : "=l"(result) : "l"(x), "r"(y) + ); + return result; +#else + return (x & 0xffffffff) + ((uint64_t)y << 32); +#endif +} +__device__ __forceinline__ uint64_t REPLACE_LOWORD(const uint64_t x, const uint32_t y) +{ +#ifndef NOASM + uint64_t result; + asm( + "{\n\t" + ".reg .u32 t,t2; \n\t" + "mov.b64 {t2,t},%1; \n\t" + "mov.b64 %0,{%2,t}; \n\t" + "}" : "=l"(result) : "l"(x), "r"(y) + ); + return result; +#else + return (x & 0xffffffff00000000) + y; +#endif +} + +// endian change for 32bit +#ifdef __CUDA_ARCH__ +__device__ __forceinline__ uint32_t cuda_swab32(const uint32_t x) +{ + /* device */ + return __byte_perm(x, x, 0x0123); +} +#else + /* host */ + #define cuda_swab32(x) \ + ( ((x) << 24) | (((x) << 8) & 0x00ff0000u) | \ + (((x) >> 8) & 0x0000ff00u) | ((x) >> 24)) +#endif + + +static __device__ uint32_t _HIWORD(const uint64_t x) +{ +#ifndef NOASM + uint32_t result; + asm( + "{\n\t" + ".reg .u32 xl; \n\t" + "mov.b64 {xl,%0},%1; \n\t" + "}" : "=r"(result) : "l"(x) + ); + return result; +#else + return x >> 32; +#endif +} + +static __device__ uint32_t _LOWORD(const uint64_t x) +{ +#ifndef NOASM + uint32_t result; + asm( + "{\n\t" + ".reg .u32 xh; \n\t" + "mov.b64 {%0,xh},%1; \n\t" + "}" : "=r"(result) : "l"(x) + ); + return result; +#else + return x & 0xffffffff; +#endif +} + +// endian change for 64bit +#if (defined __CUDA_ARCH__ && !defined NOASM) +__device__ __forceinline__ uint64_t cuda_swab64(uint64_t x) +{ + uint64_t result; + uint2 t; + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(t.x), "=r"(t.y) : "l"(x)); + t.x=__byte_perm(t.x, 0, 0x0123); + t.y=__byte_perm(t.y, 0, 0x0123); + asm("mov.b64 %0,{%1,%2}; \n\t" + : "=l"(result) : "r"(t.y), "r"(t.x)); + return result; +} +#else + /* host */ + #define cuda_swab64(x) \ + ((uint64_t)((((uint64_t)(x)) >> 56) | \ + (((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \ + (((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \ + (((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \ + 
(((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \ + (((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \ + (((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \ + (((uint64_t)(x)) << 56))) +#endif + +/*********************************************************************/ +// Macros to catch CUDA errors in CUDA runtime calls + +#define CUDA_SAFE_CALL(call) \ +do { \ + cudaError_t err = call; \ + if (cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\n", \ + __FUNCTION__, __LINE__, cudaGetErrorString(err) ); \ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +#define CUDA_CALL_OR_RET(call) do { \ + cudaError_t err = call; \ + if (cudaSuccess != err) { \ + cudaReportHardwareFailure(thr_id, err, __FUNCTION__); \ + return; \ + } \ +} while (0) + +#define CUDA_CALL_OR_RET_X(call, ret) do { \ + cudaError_t err = call; \ + if (cudaSuccess != err) { \ + cudaReportHardwareFailure(thr_id, err, __FUNCTION__); \ + return ret; \ + } \ +} while (0) + +/*********************************************************************/ +#if (defined _WIN64 || defined NOASM) +#define USE_XOR_ASM_OPTS 0 +#else +#define USE_XOR_ASM_OPTS 1 +#endif + +#if USE_XOR_ASM_OPTS +// device asm for whirpool +__device__ __forceinline__ +uint64_t xor1(const uint64_t a, const uint64_t b) +{ + uint64_t result; + asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a), "l"(b)); + return result; +} +#else +#define xor1(a,b) ((a) ^ (b)) +#endif + +#if USE_XOR_ASM_OPTS +// device asm for whirpool +__device__ __forceinline__ +uint64_t xor3(const uint64_t a, const uint64_t b, const uint64_t c) +{ + uint64_t result; + asm("xor.b64 %0, %2, %3;\n\t" + "xor.b64 %0, %0, %1;\n\t" + /* output : input registers */ + : "=l"(result) : "l"(a), "l"(b), "l"(c)); + return result; +} +#else +#define xor3(a,b,c) ((a) ^ (b) ^ (c)) +#endif + +#if USE_XOR_ASM_OPTS +// device asm for whirpool +__device__ __forceinline__ +uint64_t xor8(const uint64_t a, const uint64_t b, const uint64_t c, const uint64_t d, const 
uint64_t e, const uint64_t f, const uint64_t g, const uint64_t h) +{ + uint64_t result; + asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(g) ,"l"(h)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(f)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(e)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(d)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(c)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(b)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a)); + return result; +} +#else +#define xor8(a,b,c,d,e,f,g,h) ((a)^(b)^(c)^(d)^(e)^(f)^(g)^(h)) +#endif + +// device asm for x17 +__device__ __forceinline__ +uint64_t xandx(const uint64_t a, const uint64_t b, const uint64_t c) +{ + uint64_t result; +#ifndef NOASM + asm("{\n\t" + ".reg .u64 n;\n\t" + "xor.b64 %0, %2, %3;\n\t" + "and.b64 n, %0, %1;\n\t" + "xor.b64 %0, n, %3;" + "}\n" + : "=l"(result) : "l"(a), "l"(b), "l"(c)); +#else + result = ((((b) ^ (c)) & (a)) ^ (c)); +#endif + return result; +} + +// device asm for x17 +__device__ __forceinline__ +uint64_t andor(uint64_t a, uint64_t b, uint64_t c) +{ + uint64_t result; +#ifndef NOASM + asm("{\n\t" + ".reg .u64 m,n;\n\t" + "and.b64 m, %1, %2;\n\t" + " or.b64 n, %1, %2;\n\t" + "and.b64 %0, n, %3;\n\t" + " or.b64 %0, %0, m ;\n\t" + "}\n" + : "=l"(result) : "l"(a), "l"(b), "l"(c)); +#else + result = (((a) & (b)) | (((a) | (b)) & (c))); +#endif + return result; +} + +// device asm for x17 +__device__ __forceinline__ +uint64_t shr_t64(uint64_t x, uint32_t n) +{ + uint64_t result; +#ifndef NOASM + asm("shr.b64 %0,%1,%2;\n\t" + : "=l"(result) : "l"(x), "r"(n)); +#else + result = x >> n; +#endif + return result; +} + +// device asm for ? 
+__device__ __forceinline__ +uint64_t shl_t64(uint64_t x, uint32_t n) +{ + uint64_t result; +#ifndef NOASM + asm("shl.b64 %0,%1,%2;\n\t" + : "=l"(result) : "l"(x), "r"(n)); +#else + result = x << n; +#endif + return result; +} + +#ifndef USE_ROT_ASM_OPT +#define USE_ROT_ASM_OPT 1 +#endif + +#ifdef NOASM +#undef USE_ROT_ASM_OPT +#endif + +// 64-bit ROTATE RIGHT +#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1 +/* complicated sm >= 3.5 one (with Funnel Shifter beschleunigt), to bench */ +__device__ __forceinline__ +uint64_t ROTR64(const uint64_t value, const int offset) { + uint2 result; + if(offset < 32) { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + } else { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + } + return __double_as_longlong(__hiloint2double(result.y, result.x)); +} +#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2 +__device__ __forceinline__ +uint64_t ROTR64(const uint64_t x, const int offset) +{ + uint64_t result; + asm("{\n\t" + ".reg .b64 lhs;\n\t" + ".reg .u32 roff;\n\t" + "shr.b64 lhs, %1, %2;\n\t" + "sub.u32 roff, 64, %2;\n\t" + "shl.b64 %0, %1, roff;\n\t" + "add.u64 %0, %0, lhs;\n\t" + "}\n" + : "=l"(result) : "l"(x), "r"(offset)); + return result; +} +#else +/* host */ +#if defined _MSC_VER && !defined __CUDA_ARCH__ +#define ROTR64(x, n) _rotr64(x, n) +#else +#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) +#endif +#endif + +// 64-bit ROTATE 
LEFT +#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1 +__device__ __forceinline__ +uint64_t ROTL64(const uint64_t value, const int offset) { + uint2 result; + if(offset >= 32) { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + } else { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + } + return __double_as_longlong(__hiloint2double(result.y, result.x)); +} +#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2 +__device__ __forceinline__ +uint64_t ROTL64(const uint64_t x, const int offset) +{ + uint64_t result; + asm("{\n\t" + ".reg .b64 lhs;\n\t" + ".reg .u32 roff;\n\t" + "shl.b64 lhs, %1, %2;\n\t" + "sub.u32 roff, 64, %2;\n\t" + "shr.b64 %0, %1, roff;\n\t" + "add.u64 %0, lhs, %0;\n\t" + "}\n" + : "=l"(result) : "l"(x), "r"(offset)); + return result; +} +#elif __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 3 +__device__ +uint64_t ROTL64(const uint64_t x, const int offset) +{ + uint64_t res; + asm("{\n\t" + ".reg .u32 tl,th,vl,vh;\n\t" + ".reg .pred p;\n\t" + "mov.b64 {tl,th}, %1;\n\t" + "shf.l.wrap.b32 vl, tl, th, %2;\n\t" + "shf.l.wrap.b32 vh, th, tl, %2;\n\t" + "setp.lt.u32 p, %2, 32;\n\t" + "@!p mov.b64 %0, {vl,vh};\n\t" + "@p mov.b64 %0, {vh,vl};\n\t" + "}" + : "=l"(res) : "l"(x) , "r"(offset) + ); + return res; +} +#else +/* host */ +#if defined _MSC_VER && !defined __CUDA_ARCH__ +#define ROTL64(x, n) _rotl64(x, n) +#else +#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) 
+#endif +#endif + +__device__ __forceinline__ +uint64_t SWAPDWORDS(uint64_t value) +{ +#if __CUDA_ARCH__ >= 320 && !defined NOASM + uint2 temp; + asm("mov.b64 {%0, %1}, %2; ": "=r"(temp.x), "=r"(temp.y) : "l"(value)); + asm("mov.b64 %0, {%1, %2}; ": "=l"(value) : "r"(temp.y), "r"(temp.x)); + return value; +#else + return ROTL64(value, 32); +#endif +} + +/* lyra2 - int2 operators */ + +__device__ __forceinline__ +void LOHI(uint32_t &lo, uint32_t &hi, uint64_t x) +{ +#ifndef NOASM + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(lo), "=r"(hi) : "l"(x)); +#else + lo = x & 0xffffffff; + hi = x >> 32; +#endif +} + +__device__ __forceinline__ uint64_t devectorize(uint2 x) +{ +#ifndef NOASM + uint64_t result; + asm("mov.b64 %0,{%1,%2}; \n\t" + : "=l"(result) : "r"(x.x), "r"(x.y)); + return result; +#else + return x.x + ((uint64_t)x.y << 32); +#endif +} + +__device__ __forceinline__ uint2 vectorize(const uint64_t x)
+{ +#ifndef NOASM + uint2 result; + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(result.x), "=r"(result.y) : "l"(x)); + return result; +#else + return make_uint2(x & 0xffffffff, x >> 32); +#endif +} + +static __device__ __forceinline__ uint2 vectorizelow(uint32_t v) { + uint2 result; + result.x = v; + result.y = 0; + return result; +} +static __device__ __forceinline__ uint2 vectorizehigh(uint32_t v) { + uint2 result; + result.x = 0; + result.y = v; + return result; +} + +static __device__ __forceinline__ uint2 operator^ (uint2 a, uint32_t b) { return make_uint2(a.x^ b, a.y); } +static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) { return make_uint2(a.x ^ b.x, a.y ^ b.y); } +static __device__ __forceinline__ uint2 operator& (uint2 a, uint2 b) { return make_uint2(a.x & b.x, a.y & b.y); } +static __device__ __forceinline__ uint2 operator| (uint2 a, uint2 b) { return make_uint2(a.x | b.x, a.y | b.y); } +static __device__ __forceinline__ uint2 operator~ (uint2 a) { return make_uint2(~a.x, ~a.y); } +static __device__ __forceinline__ void operator^= (uint2 &a, uint2 b) { a = a ^ b; } + +static __device__ __forceinline__ uint2 operator+ (uint2 a, uint2 b) +{ +#ifndef NOASM + uint2 result; + asm("{\n\t" + "add.cc.u32 %0,%2,%4; \n\t" + "addc.u32 %1,%3,%5; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + return result; +#else + return vectorize(devectorize(a) + devectorize(b)); +#endif +} + +static __device__ __forceinline__ uint2 operator+ (uint2 a, uint32_t b) +{ +#ifndef NOASM + uint2 result; + asm("{\n\t" + "add.cc.u32 %0,%2,%4; \n\t" + "addc.u32 %1,%3,%5; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b), "r"(0)); + return result; +#else + return vectorize(devectorize(a) + (uint64_t)b); +#endif +} + + +static __device__ __forceinline__ uint2 operator- (uint2 a, uint32_t b) +{ +#ifndef NOASM + uint2 result; + asm("{\n\t" + "sub.cc.u32 %0,%2,%4; \n\t" + "subc.u32 %1,%3,%5; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b),
 "r"(0)); + return result; +#else + return vectorize(devectorize(a) - (uint64_t)b); +#endif +} + + +static __device__ __forceinline__ uint2 operator- (uint2 a, uint2 b) +{ +#ifndef NOASM + uint2 result; + asm("{\n\t" + "sub.cc.u32 %0,%2,%4; \n\t" + "subc.u32 %1,%3,%5; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + return result; +#else +return vectorize(devectorize(a) - devectorize(b)); +#endif +} + + + +static __device__ __forceinline__ uint4 operator^ (uint4 a, uint4 b) { return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); } +static __device__ __forceinline__ uint4 operator& (uint4 a, uint4 b) { return make_uint4(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w); } +static __device__ __forceinline__ uint4 operator| (uint4 a, uint4 b) { return make_uint4(a.x | b.x, a.y | b.y, a.z | b.z, a.w | b.w); } +static __device__ __forceinline__ uint4 operator~ (uint4 a) { return make_uint4(~a.x, ~a.y, ~a.z, ~a.w); } +static __device__ __forceinline__ void operator^= (uint4 &a, uint4 b) { a = a ^ b; } +static __device__ __forceinline__ uint4 operator^ (uint4 a, uint2 b) { return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.x, a.w ^ b.y); } + + +static __device__ __forceinline__ void operator+= (uint2 &a, uint2 b) { a = a + b; } + +/** + * basic multiplication between 64bit no carry outside that range (ie mul.lo.b64(a*b)) + * (what does uint64 "*" operator) + */ +static __device__ __forceinline__ uint2 operator* (uint2 a, uint2 b) +{ +#ifndef NOASM + uint2 result; + asm("{\n\t" + "mul.lo.u32 %0,%2,%4; \n\t" + "mul.hi.u32 %1,%2,%4; \n\t" + "mad.lo.cc.u32 %1,%3,%4,%1; \n\t" + "madc.lo.u32 %1,%3,%5,%1; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + return result; +#else + return vectorize(devectorize(a)*devectorize(b)); +#endif +} + +// uint2 method +#if __CUDA_ARCH__ >= 320 && !defined NOASM +__device__ __inline__ uint2 ROR2(const uint2 a, const int offset) +{ + uint2 result; + if (offset < 32) { + asm("shf.r.wrap.b32 %0, 
%1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } + else { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } + return result; +} +#else +__device__ __inline__ uint2 ROR2(const uint2 v, const int n) +{ + uint2 result; + if (n <= 32) + { + result.y = ((v.y >> (n)) | (v.x << (32 - n))); + result.x = ((v.x >> (n)) | (v.y << (32 - n))); + } + else + { + result.y = ((v.x >> (n - 32)) | (v.y << (64 - n))); + result.x = ((v.y >> (n - 32)) | (v.x << (64 - n))); + } + return result; +} +#endif + +__device__ __inline__ uint32_t ROL8(const uint32_t x) +{ + return __byte_perm(x, x, 0x2103); +} +__device__ __inline__ uint32_t ROL16(const uint32_t x) +{ + return __byte_perm(x, x, 0x1032); +} +__device__ __inline__ uint32_t ROL24(const uint32_t x) +{ + return __byte_perm(x, x, 0x0321); +} + +__device__ __inline__ uint2 ROR8(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x0765); + result.y = __byte_perm(a.y, a.x, 0x4321); + + return result; +} + +__device__ __inline__ uint2 ROR16(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x1076); + result.y = __byte_perm(a.y, a.x, 0x5432); + + return result; +} + +__device__ __inline__ uint2 ROR24(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x2107); + result.y = __byte_perm(a.y, a.x, 0x6543); + + return result; +} + +__device__ __inline__ uint2 ROL8(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x6543); + result.y = __byte_perm(a.y, a.x, 0x2107); + + return result; +} + +__device__ __inline__ uint2 ROL16(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x5432); + result.y = __byte_perm(a.y, a.x, 0x1076); + + return result; +} + +__device__ __inline__ uint2 ROL24(const uint2 a) +{ + uint2 
result; + result.x = __byte_perm(a.y, a.x, 0x4321); + result.y = __byte_perm(a.y, a.x, 0x0765); + + return result; +} + +#if __CUDA_ARCH__ >= 320 && !defined NOASM + + +__inline__ __device__ uint2 ROL2(const uint2 a, const int offset) { + uint2 result; + if (offset >= 32) { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } + else { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } + return result; +} +#else +__inline__ __device__ uint2 ROL2(const uint2 v, const int n) +{ + uint2 result; + if (n <= 32) + { + result.y = ((v.y << (n)) | (v.x >> (32 - n))); + result.x = ((v.x << (n)) | (v.y >> (32 - n))); + } + else + { + result.y = ((v.x << (n - 32)) | (v.y >> (64 - n))); + result.x = ((v.y << (n - 32)) | (v.x >> (64 - n))); + } + return result; +} +#endif + +__device__ __forceinline__ +uint64_t ROTR16(uint64_t x) +{ +#if __CUDA_ARCH__ > 500 && !defined NOASM + short4 temp; + asm("mov.b64 { %0, %1, %2, %3 }, %4; ": "=h"(temp.x), "=h"(temp.y), "=h"(temp.z), "=h"(temp.w) : "l"(x)); + asm("mov.b64 %0, {%1, %2, %3 , %4}; ": "=l"(x) : "h"(temp.y), "h"(temp.z), "h"(temp.w), "h"(temp.x)); + return x; +#else + return ROTR64(x, 16); +#endif +} + +__device__ __forceinline__ +uint64_t ROTL16(uint64_t x) +{ +#if __CUDA_ARCH__ > 500 && !defined NOASM + short4 temp; + asm("mov.b64 { %0, %1, %2, %3 }, %4; ": "=h"(temp.x), "=h"(temp.y), "=h"(temp.z), "=h"(temp.w) : "l"(x)); + asm("mov.b64 %0, {%1, %2, %3 , %4}; ": "=l"(x) : "h"(temp.w), "h"(temp.x), "h"(temp.y), "h"(temp.z)); + return x; +#else + return ROTL64(x, 16); +#endif +} + +__device__ __forceinline__ +uint2 SWAPINT2(uint2 x) +{ + return(make_uint2(x.y, x.x)); +} + +__device__ __forceinline__ bool cuda_hashisbelowtarget(const uint32_t *const __restrict__ 
hash, const uint32_t *const __restrict__ target) +{ + if (hash[7] > target[7]) + return false; + if (hash[7] < target[7]) + return true; + if (hash[6] > target[6]) + return false; + if (hash[6] < target[6]) + return true; + if (hash[5] > target[5]) + return false; + if (hash[5] < target[5]) + return true; + if (hash[4] > target[4]) + return false; + if (hash[4] < target[4]) + return true; + if (hash[3] > target[3]) + return false; + if (hash[3] < target[3]) + return true; + if (hash[2] > target[2]) + return false; + if (hash[2] < target[2]) + return true; + if (hash[1] > target[1]) + return false; + if (hash[1] < target[1]) + return true; + if (hash[0] > target[0]) + return false; + return true; +} + +__device__ __forceinline__ +uint2 SWAPDWORDS2(uint2 value) +{ + return make_uint2(value.y, value.x); +} + +static __forceinline__ __device__ uint2 SHL2(const uint2 a, int offset) +{ + uint2 result; +#if __CUDA_ARCH__ > 300 && !defined NOASM + if (offset<32) + { + asm("{\n\t" + "shf.l.clamp.b32 %1,%2,%3,%4; \n\t" + "shl.b32 %0,%2,%4; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } + else { + asm("{\n\t" + "shf.l.clamp.b32 %1,%2,%3,%4; \n\t" + "shl.b32 %0,%2,%4; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } +#else + if (offset<=32) + { + result.y = (a.y << offset) | (a.x >> (32 - offset)); + result.x = (a.x << offset); + } + else + { + result.y = (a.x << (offset - 32)); + result.x = 0; + } +#endif + return result; +} + +static __forceinline__ __device__ uint2 SHR2(const uint2 a, int offset) +{ + uint2 result; +#if __CUDA_ARCH__ >= 320 && !defined NOASM + if (offset<32) { + asm("{\n\t" + "shf.r.clamp.b32 %0,%2,%3,%4; \n\t" + "shr.b32 %1,%3,%4; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } + else { + asm("{\n\t" + "shf.l.clamp.b32 %0,%2,%3,%4; \n\t" + "shl.b32 %1,%3,%4; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.y), "r"(a.x), 
"r"(offset)); + } + #else + if (offset<=32) + { + result.x = (a.x >> offset) | (a.y << (32 - offset)); + result.y = (a.y >> offset); + } + else + { + result.x = (a.y >> (offset - 32)); + result.y = 0; + } +#endif + return result; +} + +static __device__ __forceinline__ uint64_t devectorizeswap(uint2 v) { return MAKE_UINT64(cuda_swab32(v.y), cuda_swab32(v.x)); } +static __device__ __forceinline__ uint2 vectorizeswap(uint64_t v) +{ + uint2 result; + LOHI(result.y, result.x, v); + result.x = cuda_swab32(result.x); + result.y = cuda_swab32(result.y); + return result; +} + + +__device__ __forceinline__ uint32_t devectorize16(ushort2 x) +{ + uint32_t result; +#ifndef NOASM + asm("mov.b32 %0,{%1,%2}; \n\t" + : "=r"(result) : "h"(x.x) , "h"(x.y)); +#else + result = x.x + (x.y << 16); +#endif + return result; +} + + +__device__ __forceinline__ ushort2 vectorize16(uint32_t x) +{ + ushort2 result; +#ifndef NOASM + asm("mov.b32 {%0,%1},%2; \n\t" + : "=h"(result.x), "=h"(result.y) : "r"(x)); +#else + result.x = x & 0xffff; + result.y = x >> 16; +#endif + return result; +} + +extern int cuda_arch[MAX_GPUS]; +extern void get_cuda_arch(int *); + +/* +static __device__ __forceinline__ uint4 mul4(uint4 a) +{ + uint4 result; + asm("{\n\t" + "mul.lo.u32 %0,%4,%5; \n\t" + "mul.hi.u32 %1,%4,%5; \n\t" + "mul.lo.u32 %2,%6,%7; \n\t" + "mul.hi.u32 %3,%6,%7; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w) : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w)); + return result; +} +static __device__ __forceinline__ uint4 add4(uint4 a, uint4 b) + { + uint4 result; + asm("{\n\t" + "add.cc.u32 %0,%4,%8; \n\t" + "addc.u32 %1,%5,%9; \n\t" + "add.cc.u32 %2,%6,%10; \n\t" + "addc.u32 %3,%7,%11; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w) : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w), "r"(b.x), "r"(b.y), "r"(b.z), "r"(b.w)); + return result; + } + +static __device__ __forceinline__ uint4 madd4(uint4 a, uint4 b) + { + uint4 result; + asm("{\n\t" + 
"mad.lo.cc.u32 %0,%4,%5,%8; \n\t" + "madc.hi.u32 %1,%4,%5,%9; \n\t" + "mad.lo.cc.u32 %2,%6,%7,%10; \n\t" + "madc.hi.u32 %3,%6,%7,%11; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w) : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w), "r"(b.x), "r"(b.y), "r"(b.z), "r"(b.w)); + return result; + } + +static __device__ __forceinline__ ulonglong2 madd4long(ulonglong2 a, ulonglong2 b) + { + ulonglong2 result; + asm("{\n\t" + ".reg .u32 a0,a1,a2,a3,b0,b1,b2,b3;\n\t" + "mov.b64 {a0,a1}, %2;\n\t" + "mov.b64 {a2,a3}, %3;\n\t" + "mov.b64 {b0,b1}, %4;\n\t" + "mov.b64 {b2,b3}, %5;\n\t" + "mad.lo.cc.u32 b0,a0,a1,b0; \n\t" + "madc.hi.u32 b1,a0,a1,b1; \n\t" + "mad.lo.cc.u32 b2,a2,a3,b2; \n\t" + "madc.hi.u32 b3,a2,a3,b3; \n\t" + "mov.b64 %0, {b0,b1};\n\t" + "mov.b64 %1, {b2,b3};\n\t" + "}\n\t" + : "=l"(result.x), "=l"(result.y) : "l"(a.x), "l"(a.y), "l"(b.x), "l"(b.y)); + return result; + } +*/ +static __device__ __forceinline__ void madd4long2(ulonglong2 &a, ulonglong2 b) + { +#ifndef NOASM + asm("{\n\t" + ".reg .u32 a0,a1,a2,a3,b0,b1,b2,b3;\n\t" + "mov.b64 {a0,a1}, %0;\n\t" + "mov.b64 {a2,a3}, %1;\n\t" + "mov.b64 {b0,b1}, %2;\n\t" + "mov.b64 {b2,b3}, %3;\n\t" + "mad.lo.cc.u32 b0,a0,a1,b0; \n\t" + "madc.hi.u32 b1,a0,a1,b1; \n\t" + "mad.lo.cc.u32 b2,a2,a3,b2; \n\t" + "madc.hi.u32 b3,a2,a3,b3; \n\t" + "mov.b64 %0, {b0,b1};\n\t" + "mov.b64 %1, {b2,b3};\n\t" + "}\n\t" + : "+l"(a.x), "+l"(a.y) : "l"(b.x), "l"(b.y)); +#else // ?? 
no idea what madd4long is supposed to do + a.x = a.x + b.x; + if(a.x < b.x) + a.y = a.y + b.y + 1; + else + a.y = a.y + b.y; +#endif +} + +__device__ __forceinline__ +uint32_t xor3b(uint32_t a, uint32_t b, uint32_t c) { + uint32_t result; +#ifndef NOASM + asm("{ .reg .u32 t1;\n\t" + "xor.b32 t1, %2, %3;\n\t" + "xor.b32 %0, %1, t1;\n\t" + "}" + : "=r"(result) : "r"(a), "r"(b), "r"(c)); +#else + result = a ^ b ^ c; +#endif + return result; +} + +__device__ __forceinline__ +uint32_t shr_t32(uint32_t x, uint32_t n) { + uint32_t result; +#ifndef NOASM + asm("shr.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n)); +#else + result = x >> n; +#endif + return result; +} + +__device__ __forceinline__ +uint32_t shl_t32(uint32_t x, uint32_t n) { + uint32_t result; +#ifndef NOASM + asm("shl.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n)); +#else + result = x << n; +#endif + return result; +} + +// device asm 32 for pluck +__device__ __forceinline__ +uint32_t andor32(uint32_t a, uint32_t b, uint32_t c) { + uint32_t result; +#ifndef NOASM + asm("{ .reg .u32 m,n,o;\n\t" + "and.b32 m, %1, %2;\n\t" + " or.b32 n, %1, %2;\n\t" + "and.b32 o, n, %3;\n\t" + " or.b32 %0, m, o ;\n\t" + "}\n\t" + : "=r"(result) : "r"(a), "r"(b), "r"(c)); +#else + result = ((a | b) & c) | (a & b); +#endif + return result; +} + +#endif // #ifndef CUDA_HELPER_H + + diff --git a/cuda_myriadgroestl.cu b/cuda_myriadgroestl.cu index d404afe7b4..6316309fbb 100644 --- a/cuda_myriadgroestl.cu +++ b/cuda_myriadgroestl.cu @@ -2,15 +2,16 @@ #include #include - +#include "miner.h" #include "cuda_helper.h" + // globaler Speicher für alle HeftyHashes aller Threads __constant__ uint32_t pTarget[8]; // Single GPU -uint32_t *d_outputHashes[MAX_GPUS]; +static uint32_t *d_outputHashes[MAX_GPUS]; static uint32_t *d_resultNonce[MAX_GPUS]; -__constant__ uint32_t myriadgroestl_gpu_msg[32]; +__constant__ uint32_t myriadgroestl_gpu_msg[20]; // muss expandiert werden __constant__ uint32_t myr_sha256_gpu_constantTable[64] = { @@ -23,7 
+24,17 @@ __constant__ uint32_t myr_sha256_gpu_constantTable[64] = { 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, }; -// __constant__ uint32_t myr_sha256_gpu_constantTable2[64]; +__constant__ uint32_t myr_sha256_gpu_constantTable2[64] = { + 0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374, + 0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254, 0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa, + 0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7, 0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0, + 0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd, 0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16, + 0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537, 0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37, + 0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7, 0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890, + 0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c, 0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76 +}; + __constant__ uint32_t myr_sha256_gpu_hashTable[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; @@ -41,15 +52,6 @@ __constant__ uint32_t myr_sha256_gpu_w2Table[64] = { #include "groestl_functions_quad.cu" #include "bitslice_transformations_quad.cu" -#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) - -#if __CUDA_ARCH__ < 350 - // Kepler (Compute 3.0) - #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) -#else - // Kepler (Compute 3.5) - #define ROTR32(x, n) __funnelshift_r( (x), (x), (n) ) -#endif #define R(x, n) ((x) >> (n)) #define Ch(x, y, z) ((x & (y ^ z)) ^ z) #define Maj(x, y, z) ((x & (y | z)) | (y & z)) @@ -181,49 +183,56 @@ __device__ void 
myriadgroestl_gpu_sha256(uint32_t *message) for(int k=0;k<8;k++) regs[k] = hash[k]; -// to do: precalculate constants - uint32_t myr_sha256_gpu_constantTable2[64]; -#pragma unroll 64 - for (int i = 0; i < 64; i++) - myr_sha256_gpu_constantTable2[i] = myr_sha256_gpu_constantTable[i] + myr_sha256_gpu_w2Table[i]; - // Progress W1 -#pragma unroll 64 - for(int j=0;j<61;j++) +#pragma unroll + for(int j=0;j<57;j++) { uint32_t T1, T2; T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable2[j]; T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); -#pragma unroll 7 +#pragma unroll for (int k = 6; k >= 0; k--) regs[k + 1] = regs[k]; regs[0] = T1 + T2; regs[4] += T1; } - //// FERTIG + regs[3] += regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable2[57]; + regs[2] += regs[6] + S1(regs[3]) + Ch(regs[3], regs[4], regs[5]) + myr_sha256_gpu_constantTable2[58]; + regs[1] += regs[5] + S1(regs[2]) + Ch(regs[2], regs[3], regs[4]) + myr_sha256_gpu_constantTable2[59]; + regs[0] += regs[4] + S1(regs[1]) + Ch(regs[1], regs[2], regs[3]) + myr_sha256_gpu_constantTable2[60]; - message[7] = cuda_swab32(hash[7] + regs[4]); + message[7] = cuda_swab32(hash[7] + regs[0]); } -__global__ void __launch_bounds__(256, 4) +__global__ void __launch_bounds__(512, 2) myriadgroestl_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t *hashBuffer) { // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4; + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2; if (thread < threads) { // GROESTL - uint32_t paddedInput[8]; -#pragma unroll 8 - for (int k = 0; k<8; k++) paddedInput[k] = myriadgroestl_gpu_msg[4 * k + (threadIdx.x & 3)]; - - uint32_t nounce = startNounce + thread; - if ((threadIdx.x & 3) == 3) - paddedInput[4] = cuda_swab32(nounce); // 4*4+3 = 19 + uint32_t paddedInput[8]; + paddedInput[0] = myriadgroestl_gpu_msg[4 * 0 + 
(threadIdx.x & 3)]; + paddedInput[1] = myriadgroestl_gpu_msg[4 * 1 + (threadIdx.x & 3)]; + paddedInput[2] = myriadgroestl_gpu_msg[4 * 2 + (threadIdx.x & 3)]; + paddedInput[3] = myriadgroestl_gpu_msg[4 * 3 + (threadIdx.x & 3)]; + paddedInput[4] = myriadgroestl_gpu_msg[4 * 4 + (threadIdx.x & 3)]; + paddedInput[5] = 0; + paddedInput[6] = 0; + paddedInput[7] = 0; + + if((threadIdx.x & 3) == 0) + paddedInput[5] = 0x80; + if((threadIdx.x & 3) == 3) + { + paddedInput[4] = cuda_swab32(startNounce + thread); + paddedInput[7] = 0x01000000; + } uint32_t msgBitsliced[8]; - to_bitslice_quad(paddedInput, msgBitsliced); + myr_to_bitslice_quad(paddedInput, msgBitsliced); uint32_t state[8]; @@ -234,23 +243,25 @@ __global__ void __launch_bounds__(256, 4) if ((threadIdx.x & 0x03) == 0) { - uint32_t *outpHash = &hashBuffer[16 * thread]; -#pragma unroll 16 - for(int k=0;k<16;k++) outpHash[k] = out_state[k]; - } + uint4 *outpHash = (uint4*)&hashBuffer[16 * thread]; + uint4 *phash = (uint4*)out_state; + uint4 *outpt = outpHash; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + outpt[2] = phash[2]; + outpt[3] = phash[3]; + } } } -__global__ void __launch_bounds__(256, 3) - myriadgroestl_gpu_hash_quad2(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, uint32_t *hashBuffer) +__global__ void __launch_bounds__(512, 1) + myriadgroestl_gpu_hash_quad2(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ resNounce, const uint32_t *const __restrict__ hashBuffer) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = startNounce + thread; - uint32_t out_state[16]; - uint32_t *inpHash = &hashBuffer[16 * thread]; + const uint32_t *inpHash = &hashBuffer[16 * thread]; #pragma unroll 16 for (int i=0; i < 16; i++) out_state[i] = inpHash[i]; @@ -259,72 +270,54 @@ __global__ void __launch_bounds__(256, 3) if (out_state[7] <= pTarget[7]) { - uint32_t tmp = 
atomicExch(resNounce, nounce); + uint32_t tmp = atomicExch(resNounce, startNounce + thread); if (tmp != 0xffffffff) resNounce[1] = tmp; } } } +static THREAD cudaStream_t stream[3]; // Setup-Funktionen __host__ void myriadgroestl_cpu_init(int thr_id, uint32_t threads) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); - - // Speicher für Gewinner-Nonce belegen - cudaMalloc(&d_resultNonce[thr_id], 4*sizeof(uint32_t)); + CUDA_SAFE_CALL(cudaStreamCreate(&stream[0])); + CUDA_SAFE_CALL(cudaStreamCreate(&stream[1])); + CUDA_SAFE_CALL(cudaStreamCreate(&stream[2])); + cudaMalloc(&d_resultNonce[thr_id], 4 * sizeof(uint32_t)); // Speicher für temporäreHashes - cudaMalloc(&d_outputHashes[thr_id], 16*sizeof(uint32_t)*threads); + CUDA_SAFE_CALL(cudaMalloc(&d_outputHashes[thr_id], 16 * sizeof(uint32_t)*threads)); } __host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn) { - // Nachricht expandieren und setzen - uint32_t msgBlock[32]; - - memset(msgBlock, 0, sizeof(uint32_t) * 32); - memcpy(&msgBlock[0], data, 80); + cudaMemcpyToSymbolAsync(myriadgroestl_gpu_msg, data, 80, 0, cudaMemcpyHostToDevice, stream[0]); - // Erweitere die Nachricht auf den Nachrichtenblock (padding) - // Unsere Nachricht hat 80 Byte - msgBlock[20] = 0x80; - msgBlock[31] = 0x01000000; - - // groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird - // auf der GPU ausgeführt) - - // Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch) - cudaMemcpyToSymbol( myriadgroestl_gpu_msg, - msgBlock, - 128); - - cudaMemset(d_resultNonce[thr_id], 0xFF, 4*sizeof(uint32_t)); - cudaMemcpyToSymbol( pTarget, - pTargetIn, - sizeof(uint32_t) * 8 ); + cudaMemsetAsync(d_resultNonce[thr_id], 0xFF, 4 * sizeof(uint32_t), stream[1]); + cudaMemcpyToSymbolAsync(pTarget, pTargetIn, sizeof(uint32_t) * 8, 0, 
cudaMemcpyHostToDevice, stream[2]); } __host__ void myriadgroestl_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *nounce) { - uint32_t threadsperblock = 256; - + const uint32_t threadsperblock = 512; + const uint32_t threadsperblock2 = 512; // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl const int factor=4; - cudaMemset(d_resultNonce[thr_id], 0xFF, 4*sizeof(uint32_t)); // berechne wie viele Thread Blocks wir brauchen dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock)); dim3 block(threadsperblock); + CUDA_SAFE_CALL(cudaDeviceSynchronize()); myriadgroestl_gpu_hash_quad<<>>(threads, startNounce, d_outputHashes[thr_id]); - dim3 grid2((threads + threadsperblock-1)/threadsperblock); + dim3 grid2((threads + threadsperblock2-1)/threadsperblock2); myriadgroestl_gpu_hash_quad2<<>>(threads, startNounce, d_resultNonce[thr_id], d_outputHashes[thr_id]); - cudaMemcpy(nounce, d_resultNonce[thr_id], 4*sizeof(uint32_t), cudaMemcpyDeviceToHost); + CUDA_SAFE_CALL(cudaMemcpy(nounce, d_resultNonce[thr_id], 4*sizeof(uint32_t), cudaMemcpyDeviceToHost)); } diff --git a/cuda_nist5.cu b/cuda_nist5.cu index 437889b648..a4270cde30 100644 --- a/cuda_nist5.cu +++ b/cuda_nist5.cu @@ -11,26 +11,27 @@ extern "C" #include "cuda_helper.h" -static uint32_t *d_hash[MAX_GPUS]; - -extern void quark_blake512_cpu_setBlock_80(void *pdata); -extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); +extern void quark_blake512_cpu_init(int thr_id); +extern void quark_blake512_cpu_setBlock_80(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_setBlock_80_multi(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void quark_blake512_cpu_hash_80_multi(int thr_id, uint32_t threads, 
uint32_t startNounce, uint32_t *d_hash); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); -extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_keccak512_cpu_init(int thr_id, uint32_t threads); -extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_skein512_cpu_init(int thr_id); -extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_skein512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, uint32_t *h_found, uint32_t target, int order); +extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); +extern void quark_skein512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, uint32_t *h_found, uint32_t target); // Original nist5hash Funktion aus einem miner Quelltext -extern "C" void nist5hash(void *state, const void *input) +void nist5hash(void *state, const void *input) { sph_blake512_context ctx_blake; sph_groestl512_context ctx_groestl; @@ -63,91 +64,125 @@ extern "C" void 
nist5hash(void *state, const void *input) memcpy(state, hash, 32); } -static bool init[MAX_GPUS] = { 0 }; -static uint32_t *h_found[MAX_GPUS]; - -extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_nist5(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { + static THREAD uint32_t *d_hash = nullptr; + static THREAD uint32_t *h_found = nullptr; + static THREAD uint32_t oldthroughput; + const uint32_t first_nonce = pdata[19]; - uint32_t throughput = device_intensity(thr_id, __func__, 1 << 20); // 256*256*16 - throughput = min(throughput, (max_nonce - first_nonce)); + uint32_t throughput = device_intensity(device_map[thr_id], __func__, 1 << 19); // 256*256*16 + throughput = min(throughput, (max_nonce - first_nonce)) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0Fu; + ptarget[7] = 0x0Fu; - if (!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + oldthroughput = throughput; + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + get_cuda_arch(&cuda_arch[thr_id]); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughput > 0x7fffffffULL / (16 * sizeof(uint32_t))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif // Konstanten kopieren, Speicher belegen quark_groestl512_cpu_init(thr_id, throughput); quark_skein512_cpu_init(thr_id); - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput)); - CUDA_CALL_OR_RET_X(cudaMallocHost(&(h_found[thr_id]), 2 * sizeof(uint32_t)), 0); + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * 
sizeof(uint32_t) * throughput)); + CUDA_SAFE_CALL(cudaMallocHost(&(h_found), 2 * sizeof(uint32_t))); - cuda_check_cpu_init(thr_id, throughput); - init[thr_id] = true; +// cuda_check_cpu_init(thr_id, throughput); + mining_has_stopped[thr_id] = false; + init = true; + } + if(throughput > oldthroughput) + { + oldthroughput = throughput; + CUDA_SAFE_CALL(cudaFree(d_hash)); + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * sizeof(uint32_t) * throughput)); } uint32_t endiandata[20]; for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); - quark_blake512_cpu_setBlock_80((void*)endiandata); - cuda_check_cpu_setTarget(ptarget); + quark_blake512_cpu_setBlock_80(thr_id, (uint64_t *)endiandata); +// cuda_check_cpu_setTarget(ptarget, thr_id); do { - int order = 0; // Hash with CUDA - quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_skein512_cpu_hash_64_final(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], h_found[thr_id], ptarget[7], order++); - - if (h_found[thr_id][0] != 0xffffffff) + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_skein512_cpu_hash_64_final(thr_id, throughput, pdata[19], NULL, d_hash, h_found, ptarget[7]); + + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(h_found[0] != 0xffffffff) { const uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; - be32enc(&endiandata[19], 
h_found[thr_id][0]); + uint32_t vhash64[8]={0}; + if(opt_verify){ be32enc(&endiandata[19], h_found[0]); nist5hash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { int res = 1; - // check if there was some other ones... *hashes_done = pdata[19] - first_nonce + throughput; - if (h_found[thr_id][1] != 0xffffffff) + if (h_found[1] != 0xffffffff) { - pdata[21] = h_found[thr_id][1]; - res++; - if (opt_benchmark) - applog(LOG_INFO, "GPU #%d Found second nounce %08x", thr_id, h_found[thr_id][1]); + if(opt_verify){ be32enc(&endiandata[19], h_found[1]); + nist5hash(vhash64, endiandata); + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + + pdata[21] = h_found[1]; + res++; + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d Found second nounce %08x", device_map[thr_id], h_found[1]); + } + else + { + if (vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[1]); + } + } + } - pdata[19] = h_found[thr_id][0]; + pdata[19] = h_found[0]; if (opt_benchmark) - applog(LOG_INFO, "GPU #%d Found nounce %08x", thr_id, h_found[thr_id][0]); + applog(LOG_INFO, "GPU #%d Found nounce %08x", device_map[thr_id], h_found[0]); return res; } else { if (vhash64[7] != Htarg) { - applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, h_found[thr_id][0]); + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[0]); } } } - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/cuda_vector.h b/cuda_vector.h new file mode 100644 index 0000000000..0644c902d6 --- /dev/null +++ b/cuda_vector.h @@ -0,0 
+1,1385 @@ +#ifndef CUDA_VECTOR_H +#define CUDA_VECTOR_H + + +/////////////////////////////////////////////////////////////////////////////////// +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif + +#include "cuda_helper.h" + +//typedef __device_builtin__ struct ulong16 ulong16; + + +typedef struct __align__(32) uint8 +{ + unsigned int s0, s1, s2, s3, s4, s5, s6, s7; +} uint8; + +typedef struct __align__(64) uint2_8 +{ + uint2 s0, s1, s2, s3, s4, s5, s6, s7; +} uint2_8; + + +typedef struct __align__(64) ulonglong2to8 +{ +ulonglong2 l0,l1,l2,l3; +} ulonglong2to8; + +typedef struct __align__(128) ulonglong8to16 +{ + ulonglong2to8 lo, hi; +} ulonglong8to16; + +typedef struct __align__(256) ulonglong16to32 +{ + ulonglong8to16 lo, hi; +} ulonglong16to32; + +typedef struct __align__(512) ulonglong32to64 +{ + ulonglong16to32 lo, hi; +} ulonglong32to64; + + + +typedef struct __align__(128) ulonglonglong +{ + ulonglong2 s0,s1,s2,s3,s4,s5,s6,s7; +} ulonglonglong; + + + + +typedef struct __align__(64) uint16 +{ + union { + struct {unsigned int s0, s1, s2, s3, s4, s5, s6, s7;}; + uint8 lo; + }; + union { + struct {unsigned int s8, s9, sa, sb, sc, sd, se, sf;}; + uint8 hi; + }; +} uint16; + +typedef struct __align__(128) uint2_16 +{ + union { + struct { uint2 s0, s1, s2, s3, s4, s5, s6, s7; }; + uint2_8 lo; + }; + union { + struct { uint2 s8, s9, sa, sb, sc, sd, se, sf; }; + uint2_8 hi; + }; +} uint2_16; + + + + +typedef struct __align__(128) uint32 +{ + + uint16 lo,hi; +} uint32; + + + +struct __align__(128) ulong8 +{ + ulonglong4 s0, s1, s2, s3; +}; +typedef __device_builtin__ struct ulong8 ulong8; + +/* +typedef struct __align__(256) ulonglong16 +{ + ulonglong2 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sa, sb, sc, sd, se, sf; +} ulonglong16; +*/ +typedef struct __align__(256) ulonglong16 +{ + ulonglong4 s0, s1, s2, s3, s4, s5, s6, s7; +} ulonglong16; + + + +//typedef struct __align__(32) uint48 +//{ 
+// uint4 s0, s1; +// +//} uint48; + +typedef struct __align__(16) uint28 +{ + uint2 x, y, z, w; + +} uint28; + +/* +typedef struct __builtin_align__(32) uint48 +{ + union { + uint4 s0; + struct { uint2 x, y;}; + }; + union { + uint4 s1; + struct { uint2 z, w; }; + + }; +} uint48; +*/ + +typedef struct __builtin_align__(32) uint48 +{ + uint4 s0,s1; +} uint48; + +typedef struct __align__(64) uint816 +{ + uint48 s0, s1; + +} uint816; + +typedef struct __align__(128) uint1632 +{ + uint816 s0, s1; + +} uint1632; + +typedef struct __align__(256) uintx64 +{ + uint1632 s0, s1; + +} uintx64; + +typedef struct __builtin_align__(256) uintx64bis +{ + uint28 s0, s1, s2, s3, s4, s5, s6, s7; + +} uintx64bis; + + + +typedef struct __align__(512) uintx128 +{ + uintx64 s0, s1; + +} uintx128; + +typedef struct __align__(1024) uintx256 +{ + uintx128 s0, s1; + +} uintx256; + + + +typedef struct __align__(256) uint4x16 +{ + uint4 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; +} uint4x16; + +static __inline__ __device__ ulonglong2to8 make_ulonglong2to8(ulonglong2 s0, ulonglong2 s1, ulonglong2 s2, ulonglong2 s3) +{ +ulonglong2to8 t; t.l0=s0; t.l1=s1; t.l2=s2; t.l3=s3; +return t; +} + +static __inline__ __device__ ulonglong8to16 make_ulonglong8to16(const ulonglong2to8 &s0, const ulonglong2to8 &s1) +{ + ulonglong8to16 t; t.lo = s0; t.hi = s1; + return t; +} + +static __inline__ __device__ ulonglong16to32 make_ulonglong16to32(const ulonglong8to16 &s0, const ulonglong8to16 &s1) +{ + ulonglong16to32 t; t.lo = s0; t.hi = s1; + return t; +} + +static __inline__ __device__ ulonglong32to64 make_ulonglong32to64(const ulonglong16to32 &s0, const ulonglong16to32 &s1) +{ + ulonglong32to64 t; t.lo = s0; t.hi = s1; + return t; +} + + +static __inline__ __host__ __device__ ulonglonglong make_ulonglonglong( + const ulonglong2 &s0, const ulonglong2 &s1, const ulonglong2 &s2, const ulonglong2 &s3, + const ulonglong2 &s4, const ulonglong2 &s5) +{ + ulonglonglong t; t.s0 = s0; t.s1 = 
s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; + return t; +} + + +static __inline__ __device__ uint48 make_uint48(uint4 s0, uint4 s1) +{ + uint48 t; t.s0 = s0; t.s1 = s1; + return t; +} +/* +static __inline__ __device__ uint48 make_uint48(uint2 s0, uint2 s1, uint2 s2, uint2 s3) +{ + uint48 t; t.x = s0; t.y = s1; t.z = s2; t.w = s3; + return t; +} + +static __inline__ __device__ uint48 make_uint48(const uint28 &s0) +{ + uint48 t; t.x = s0.x; t.y = s0.y; t.z = s0.z; t.w = s0.w; + return t; +} +*/ +static __inline__ __device__ uint28 make_uint28(uint2 s0, uint2 s1, uint2 s2, uint2 s3) +{ + uint28 t; t.x = s0; t.y = s1; t.z = s2; t.w = s3; + return t; +} + + +static __inline__ __device__ uint816 make_uint816(const uint48 &s0, const uint48 &s1) +{ + uint816 t; t.s0 = s0; t.s1 = s1; + return t; +} + + + + + +static __inline__ __device__ uint1632 make_uint1632(const uint816 &s0, const uint816 &s1) +{ + uint1632 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uintx64 make_uintx64(const uint1632 &s0, const uint1632 &s1) +{ + uintx64 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uintx64bis make_uintx64bis( + const uint28 &s0, const uint28 &s1, const uint28 &s2, const uint28 &s3, + const uint28 &s4, const uint28 &s5, const uint28 &s6, const uint28 &s7 +) +{ + uintx64bis t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + + +static __inline__ __device__ uintx128 make_uintx128(const uintx64 &s0, const uintx64 &s1) +{ + uintx128 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uintx256 make_uintx256(const uintx128 &s0, const uintx128 &s1) +{ + uintx256 t; t.s0 = s0; t.s1 = s1; + return t; +} + + +static __inline__ __device__ uintx256 make_uintx64(const uintx128 &s0, const uintx128 &s1) +{ + uintx256 t; t.s0 = s0; t.s1 = s1; + return t; +} + + +static __inline__ __host__ __device__ uint4x16 make_uint4x16( + uint4 s0, uint4 s1, uint4 s2, uint4 s3, 
uint4 s4, uint4 s5, uint4 s6, uint4 s7, + uint4 s8, uint4 s9, uint4 sa, uint4 sb, uint4 sc, uint4 sd, uint4 se, uint4 sf) +{ + uint4x16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.s10 = sa; t.s11 = sb; t.s12 = sc; t.s13 = sd; t.s14 = se; t.s15 = sf; + return t; +} + + +static __inline__ __device__ uint2_16 make_uint2_16( + uint2 s0, uint2 s1, uint2 s2, uint2 s3, uint2 s4, uint2 s5, uint2 s6, uint2 s7, + uint2 s8, uint2 s9, uint2 sa, uint2 sb, uint2 sc, uint2 sd, uint2 se, uint2 sf) +{ + uint2_16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf; + return t; +} + + +static __inline__ __host__ __device__ uint16 make_uint16( + unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7, + unsigned int s8, unsigned int s9, unsigned int sa, unsigned int sb, unsigned int sc, unsigned int sd, unsigned int se, unsigned int sf) +{ + uint16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf; + return t; +} + +static __inline__ __host__ __device__ uint16 make_uint16(const uint8 &a, const uint8 &b) +{ + uint16 t; t.lo=a; t.hi=b; return t; +} + +static __inline__ __host__ __device__ uint32 make_uint32(const uint16 &a, const uint16 &b) +{ + uint32 t; t.lo = a; t.hi = b; return t; +} + + +static __inline__ __host__ __device__ uint8 make_uint8( + unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7) +{ + uint8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + +static __inline__ __host__ __device__ uint2_8 make_uint2_8( + uint2 s0, uint2 s1, 
uint2 s2, uint2 s3, uint2 s4, uint2 s5, uint2 s6, uint2 s7) +{ + uint2_8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + + +static __inline__ __host__ __device__ ulonglong16 make_ulonglong16(const ulonglong4 &s0, const ulonglong4 &s1, + const ulonglong4 &s2, const ulonglong4 &s3, const ulonglong4 &s4, const ulonglong4 &s5, const ulonglong4 &s6, const ulonglong4 &s7) +{ + ulonglong16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + + + + +static __inline__ __host__ __device__ ulong8 make_ulong8( + ulonglong4 s0, ulonglong4 s1, ulonglong4 s2, ulonglong4 s3) +{ + ulong8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3;// t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + +static __forceinline__ __device__ ulonglong4 operator^ (ulonglong4 a, ulonglong4 b) { return make_ulonglong4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); } +static __forceinline__ __device__ ulonglong4 operator+ (ulonglong4 a, ulonglong4 b) { return make_ulonglong4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } +static __forceinline__ __device__ ulonglong2 operator^ (ulonglong2 a, ulonglong2 b) { return make_ulonglong2(a.x ^ b.x, a.y ^ b.y); } +static __forceinline__ __device__ ulonglong2 operator+ (ulonglong2 a, ulonglong2 b) { return make_ulonglong2(a.x + b.x, a.y + b.y); } + +static __forceinline__ __device__ ulong8 operator^ (const ulong8 &a, const ulong8 &b) { + return make_ulong8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3); +} //, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); } + +static __forceinline__ __device__ ulong8 operator+ (const ulong8 &a, const ulong8 &b) { + return make_ulong8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3); +} //, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); } + + +static __forceinline__ __device__ __host__ uint8 operator^ (const uint8 &a, const uint8 &b) { return make_uint8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, 
a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); } + +static __forceinline__ __device__ __host__ uint8 operator+ (const uint8 &a, const uint8 &b) { return make_uint8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); } + +static __forceinline__ __device__ uint2_8 operator^ (const uint2_8 &a, const uint2_8 &b) { return make_uint2_8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); } + +static __forceinline__ __device__ uint2_8 operator+ (const uint2_8 &a, const uint2_8 &b) { return make_uint2_8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); } + + +////////////// mess++ ////// + +static __forceinline__ __device__ uint48 operator^ (const uint48 &a, const uint48 &b) { + return make_uint48(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +static __forceinline__ __device__ uint28 operator^ (const uint28 &a, const uint28 &b) { + return make_uint28(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); +} + +static __forceinline__ __device__ uint28 operator+ (const uint28 &a, const uint28 &b) { + return make_uint28(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} + +static __forceinline__ __device__ uint48 operator+ (const uint48 &a, const uint48 &b) { + return make_uint48(a.s0 + b.s0, a.s1 + b.s1); +} +/* +static __forceinline__ __device__ uint48 operator+ (const uint48 &a, const uint48 &b) { + return make_uint48(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +*/ + +static __forceinline__ __device__ uint816 operator^ (const uint816 &a, const uint816 &b) { + return make_uint816(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +static __forceinline__ __device__ uint816 operator+ (const uint816 &a, const uint816 &b) { + return make_uint816(a.s0 + b.s0, a.s1 + b.s1); +} + + +static __forceinline__ __device__ uint1632 operator^ (const uint1632 &a, const uint1632 &b) { + return make_uint1632(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + + +static 
__forceinline__ __device__ uintx64 operator^ (const uintx64 &a, const uintx64 &b) { + return make_uintx64(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +static __forceinline__ __device__ uintx128 operator^ (const uintx128 &a, const uintx128 &b) { + return make_uintx128(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +static __forceinline__ __device__ uintx256 operator^ (const uintx256 &a, const uintx256 &b) { + return make_uintx256(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +///////////////////////// + +static __forceinline__ __device__ __host__ uint16 operator^ (const uint16 &a, const uint16 &b) { + return make_uint16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7, + a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf); +} + +static __forceinline__ __device__ __host__ uint16 operator+ (const uint16 &a, const uint16 &b) { + return make_uint16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7, + a.s8 + b.s8, a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf); +} + +static __forceinline__ __device__ uint2_16 operator^ (const uint2_16 &a, const uint2_16 &b) { + return make_uint2_16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7, + a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf); +} + +static __forceinline__ __device__ uint2_16 operator+ (const uint2_16 &a, const uint2_16 &b) { + return make_uint2_16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7, + a.s8 + b.s8, a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf); +} + +static __forceinline__ __device__ uintx64bis operator^ (const uintx64bis &a, const uintx64bis &b) { + return make_uintx64bis(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, 
a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); +} + +static __forceinline__ __device__ uintx64bis operator+ (const uintx64bis &a, const uintx64bis &b) { + return make_uintx64bis(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); +} + + +static __forceinline__ __device__ uint32 operator^ (const uint32 &a, const uint32 &b) { + return make_uint32(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ uint32 operator+ (const uint32 &a, const uint32 &b) { + return make_uint32(a.lo + b.lo, a.hi + b.hi); +} + + +static __forceinline__ __device__ ulonglong16 operator^ (const ulonglong16 &a, const ulonglong16 &b) { + return make_ulonglong16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); +} + +static __forceinline__ __device__ ulonglong16 operator+ (const ulonglong16 &a, const ulonglong16 &b) { + return make_ulonglong16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); +} + +static __forceinline__ __device__ void operator^= (ulong8 &a, const ulong8 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator^= (uintx64 &a, const uintx64 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator^= (uintx64bis &a, const uintx64bis &b) { a = a ^ b; } + + +static __forceinline__ __device__ void operator^= (uintx128 &a, const uintx128 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator^= (uintx256 &a, const uintx256 &b) { a = a ^ b; } + + +static __forceinline__ __device__ void operator^= (uint816 &a, const uint816 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (uint816 &a, const uint816 &b) { a = a + b; } + + +static __forceinline__ __device__ void operator^= (uint48 &a, const uint48 &b) { a = a ^ b; } + +//static __forceinline__ __device__ void operator+= (uint48 &a, const uint48 &b) { a = a + b; } + + +static __forceinline__ __device__ void 
operator^= (uint28 &a, const uint28 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (uint28 &a, const uint28 &b) { a = a + b; } + +static __forceinline__ __device__ void operator^= (uint2_8 &a, const uint2_8 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (uint2_8 &a, const uint2_8 &b) { a = a + b; } + + + +static __forceinline__ __device__ void operator^= (uint32 &a, const uint32 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (uint32 &a, const uint32 &b) { a = a + b; } + + +//static __forceinline__ __device__ void operator^= (uint4 &a, uint4 b) { a = a ^ b; } +static __forceinline__ __device__ __host__ void operator^= (uint8 &a, const uint8 &b) { a = a ^ b; } +static __forceinline__ __device__ __host__ void operator^= (uint16 &a, const uint16 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator^= (ulonglong16 &a, const ulonglong16 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator^= (ulonglong4 &a, const ulonglong4 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator+= (ulonglong4 &a, const ulonglong4 &b) { a = a + b; } + +static __forceinline__ __device__ void operator^= (ulonglong2 &a, const ulonglong2 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator+= (ulonglong2 &a, const ulonglong2 &b) { a = a + b; } + +static __forceinline__ __device__ +ulonglong2to8 operator^ (const ulonglong2to8 &a, const ulonglong2to8 &b) +{ + return make_ulonglong2to8(a.l0 ^ b.l0, a.l1 ^ b.l1, a.l2 ^ b.l2, a.l3 ^ b.l3); +} +static __forceinline__ __device__ +ulonglong2to8 operator+ (const ulonglong2to8 &a, const ulonglong2to8 &b) +{ + return make_ulonglong2to8(a.l0 + b.l0, a.l1 + b.l1, a.l2 + b.l2, a.l3 + b.l3); +} + + +static __forceinline__ __device__ +ulonglong8to16 operator^ (const ulonglong8to16 &a, const ulonglong8to16 &b) +{ + return make_ulonglong8to16(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ +ulonglong8to16 
operator+ (const ulonglong8to16 &a, const ulonglong8to16 &b) +{ + return make_ulonglong8to16(a.lo + b.lo, a.hi + b.hi); +} + +static __forceinline__ __device__ +ulonglong16to32 operator^ (const ulonglong16to32 &a, const ulonglong16to32 &b) +{ + return make_ulonglong16to32(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ +ulonglong16to32 operator+ (const ulonglong16to32 &a, const ulonglong16to32 &b) +{ + return make_ulonglong16to32(a.lo + b.lo, a.hi + b.hi); +} + +static __forceinline__ __device__ +ulonglong32to64 operator^ (const ulonglong32to64 &a, const ulonglong32to64 &b) +{ + return make_ulonglong32to64(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ +ulonglong32to64 operator+ (const ulonglong32to64 &a, const ulonglong32to64 &b) +{ + return make_ulonglong32to64(a.lo + b.lo, a.hi + b.hi); +} + + +static __forceinline__ __device__ ulonglonglong operator^ (const ulonglonglong &a, const ulonglonglong &b) { + return make_ulonglonglong(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5); +} + +static __forceinline__ __device__ ulonglonglong operator+ (const ulonglonglong &a, const ulonglonglong &b) { + return make_ulonglonglong(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5); +} + + +static __forceinline__ __device__ void operator^= (ulonglong2to8 &a, const ulonglong2to8 &b) { a = a ^ b; } + + +static __forceinline__ __device__ void operator+= (uint4 &a, uint4 b) { a = a + b; } +static __forceinline__ __device__ void operator+= (uchar4 &a, uchar4 b) { a = a + b; } +static __forceinline__ __device__ __host__ void operator+= (uint8 &a, const uint8 &b) { a = a + b; } +static __forceinline__ __device__ __host__ void operator+= (uint16 &a, const uint16 &b) { a = a + b; } +static __forceinline__ __device__ void operator+= (uint2_16 &a, const uint2_16 &b) { a = a + b; } +static __forceinline__ __device__ void operator^= (uint2_16 &a, const uint2_16 &b) { a = a + b; } + +static 
__forceinline__ __device__ void operator+= (ulong8 &a, const ulong8 &b) { a = a + b; } +static __forceinline__ __device__ void operator+= (ulonglong16 &a, const ulonglong16 &b) { a = a + b; } +static __forceinline__ __device__ void operator+= (ulonglong8to16 &a, const ulonglong8to16 &b) { a = a + b; } +static __forceinline__ __device__ void operator^= (ulonglong8to16 &a, const ulonglong8to16 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (ulonglong16to32 &a, const ulonglong16to32 &b) { a = a + b; } +static __forceinline__ __device__ void operator^= (ulonglong16to32 &a, const ulonglong16to32 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (ulonglong32to64 &a, const ulonglong32to64 &b) { a = a + b; } +static __forceinline__ __device__ void operator^= (ulonglong32to64 &a, const ulonglong32to64 &b) { a = a ^ b; } + + +static __forceinline__ __device__ void operator+= (ulonglonglong &a, const ulonglonglong &b) { a = a + b; } +static __forceinline__ __device__ void operator^= (ulonglonglong &a, const ulonglonglong &b) { a = a ^ b; } + + +#define rotate ROTL32 +#define rotateR ROTR32 + +#if __CUDA_ARCH__ >= 320 + +static __forceinline__ __device__ uint4 rotate4(uint4 vec4, uint32_t shift) +{ + uint4 ret; + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.x) : "r"(vec4.x), "r"(vec4.x), "r"(shift)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.y) : "r"(vec4.y), "r"(vec4.y), "r"(shift)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.z) : "r"(vec4.z), "r"(vec4.z), "r"(shift)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.w) : "r"(vec4.w), "r"(vec4.w), "r"(shift)); + return ret; +} + +static __forceinline__ __device__ uint4 rotate4R(uint4 vec4, uint32_t shift) +{ + uint4 ret; + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.x) : "r"(vec4.x), "r"(vec4.x), "r"(shift)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.y) : "r"(vec4.y), "r"(vec4.y), "r"(shift)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.z) : 
"r"(vec4.z), "r"(vec4.z), "r"(shift)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.w) : "r"(vec4.w), "r"(vec4.w), "r"(shift)); + return ret; +} + +static __device__ __inline__ uint8 __ldg8(const uint8_t *ptr) +{ + uint8 test; + asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(test.s0), "=r"(test.s1), "=r"(test.s2), "=r"(test.s3) : __LDG_PTR(ptr)); + asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4+16];" : "=r"(test.s4), "=r"(test.s5), "=r"(test.s6), "=r"(test.s7) : __LDG_PTR(ptr)); + return (test); +} + + +static __device__ __inline__ uint32_t __ldgtoint(const uint8_t *ptr) +{ + uint32_t test; + asm volatile ("ld.global.nc.u32 {%0},[%1];" : "=r"(test) : __LDG_PTR(ptr)); + return (test); +} + +static __device__ __inline__ uint32_t __ldgtoint64(const uint8_t *ptr) +{ + uint64_t test; + asm volatile ("ld.global.nc.u64 {%0},[%1];" : "=l"(test) : __LDG_PTR(ptr)); + return (test); +} + + +static __device__ __inline__ uint32_t __ldgtoint_unaligned(const uint8_t *ptr) +{ + uint32_t test; + asm volatile ("{\n\t" + ".reg .u8 a,b,c,d; \n\t" + "ld.global.nc.u8 a,[%1]; \n\t" + "ld.global.nc.u8 b,[%1+1]; \n\t" + "ld.global.nc.u8 c,[%1+2]; \n\t" + "ld.global.nc.u8 d,[%1+3]; \n\t" + "mov.b32 %0,{a,b,c,d}; }\n\t" + : "=r"(test) : __LDG_PTR(ptr)); + return (test); +} + +static __device__ __inline__ uint64_t __ldgtoint64_unaligned(const uint8_t *ptr) +{ + uint64_t test; + asm volatile ("{\n\t" + ".reg .u8 a,b,c,d,e,f,g,h; \n\t" + ".reg .u32 i,j; \n\t" + "ld.global.nc.u8 a,[%1]; \n\t" + "ld.global.nc.u8 b,[%1+1]; \n\t" + "ld.global.nc.u8 c,[%1+2]; \n\t" + "ld.global.nc.u8 d,[%1+3]; \n\t" + "ld.global.nc.u8 e,[%1+4]; \n\t" + "ld.global.nc.u8 f,[%1+5]; \n\t" + "ld.global.nc.u8 g,[%1+6]; \n\t" + "ld.global.nc.u8 h,[%1+7]; \n\t" + "mov.b32 i,{a,b,c,d}; \n\t" + "mov.b32 j,{e,f,g,h}; \n\t" + "mov.b64 %0,{i,j}; }\n\t" + : "=l"(test) : __LDG_PTR(ptr)); + return (test); +} + + +static __device__ __inline__ uint64_t __ldgtoint64_trunc(const uint8_t *ptr) +{ + uint32_t 
zero = 0; + uint64_t test; + asm volatile ("{\n\t" + ".reg .u8 a,b,c,d; \n\t" + ".reg .u32 i; \n\t" + "ld.global.nc.u8 a,[%1]; \n\t" + "ld.global.nc.u8 b,[%1+1]; \n\t" + "ld.global.nc.u8 c,[%1+2]; \n\t" + "ld.global.nc.u8 d,[%1+3]; \n\t" + "mov.b32 i,{a,b,c,d}; \n\t" + "mov.b64 %0,{i,%1}; }\n\t" + : "=l"(test) : __LDG_PTR(ptr), "r"(zero)); + return (test); +} + + + +static __device__ __inline__ uint32_t __ldgtoint_unaligned2(const uint8_t *ptr) +{ + uint32_t test; + asm("{\n\t" + ".reg .u8 e,b,c,d; \n\t" + "ld.global.nc.u8 e,[%1]; \n\t" + "ld.global.nc.u8 b,[%1+1]; \n\t" + "ld.global.nc.u8 c,[%1+2]; \n\t" + "ld.global.nc.u8 d,[%1+3]; \n\t" + "mov.b32 %0,{e,b,c,d}; }\n\t" + : "=r"(test) : __LDG_PTR(ptr)); + return (test); +} + +#endif + +static __forceinline__ __device__ void shift256R2(uint32_t * ret, const uint8 &vec4, const uint32_t shift) +{ + uint32_t truc = 0, truc2 = cuda_swab32(vec4.s7), truc3 = 0; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[8] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s6); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[7] = cuda_swab32(truc); + truc2 = cuda_swab32(vec4.s5); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[6] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s4); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[5] = cuda_swab32(truc); + truc2 = cuda_swab32(vec4.s3); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[4] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s2); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[3] = cuda_swab32(truc); + truc2 = cuda_swab32(vec4.s1); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[2] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s0); + 
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[1] = cuda_swab32(truc); + asm("shr.b32 %0, %1, %2;" : "=r"(truc) : "r"(truc3), "r"(shift)); + ret[0] = cuda_swab32(truc); + +} + +#define shift256R3(ret,vec4, shift) \ +{ \ + \ +uint32_t truc=0,truc2=cuda_swab32(vec4.s7),truc3=0; \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[8] = cuda_swab32(truc); \ +truc2=cuda_swab32(vec4.s6);truc3=cuda_swab32(vec4.s7); \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[7] = cuda_swab32(truc); \ +truc2=cuda_swab32(vec4.s5);truc3=cuda_swab32(vec4.s6); \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[6] = cuda_swab32(truc); \ +truc2 = cuda_swab32(vec4.s4); truc3 = cuda_swab32(vec4.s5); \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[5] = cuda_swab32(truc); \ +truc2 = cuda_swab32(vec4.s3); truc3 = cuda_swab32(vec4.s4); \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[4] = cuda_swab32(truc); \ +truc2 = cuda_swab32(vec4.s2); truc3 = cuda_swab32(vec4.s3); \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[3] = cuda_swab32(truc); \ +truc2 = cuda_swab32(vec4.s1); truc3 = cuda_swab32(vec4.s2); \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[2] = cuda_swab32(truc); \ +truc2 = cuda_swab32(vec4.s0); truc3 = cuda_swab32(vec4.s1); \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[1] = cuda_swab32(truc); \ +truc3 = cuda_swab32(vec4.s0); \ + asm volatile ("shr.b32 %0, %1, %2;" : "=r"(truc) : "r"(truc3), "r"(shift)); \ + ret[0] = cuda_swab32(truc); \ + \ + \ 
+} + +#if __CUDA_ARCH__ >= 320 && !defined NOASM +static __device__ __inline__ uint32 __ldg32b(const uint32 *ptr) +{ + uint32 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.lo.s0), "=r"(ret.lo.s1), "=r"(ret.lo.s2), "=r"(ret.lo.s3) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.lo.s4), "=r"(ret.lo.s5), "=r"(ret.lo.s6), "=r"(ret.lo.s7) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.lo.s8), "=r"(ret.lo.s9), "=r"(ret.lo.sa), "=r"(ret.lo.sb) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.lo.sc), "=r"(ret.lo.sd), "=r"(ret.lo.se), "=r"(ret.lo.sf) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.hi.s0), "=r"(ret.hi.s1), "=r"(ret.hi.s2), "=r"(ret.hi.s3) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.hi.s4), "=r"(ret.hi.s5), "=r"(ret.hi.s6), "=r"(ret.hi.s7) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.hi.s8), "=r"(ret.hi.s9), "=r"(ret.hi.sa), "=r"(ret.hi.sb) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.hi.sc), "=r"(ret.hi.sd), "=r"(ret.hi.se), "=r"(ret.hi.sf) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ uint16 __ldg16b(const uint16 *ptr) +{ + uint16 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0), "=r"(ret.s1), "=r"(ret.s2), "=r"(ret.s3) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s4), "=r"(ret.s5), "=r"(ret.s6), "=r"(ret.s7) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s8), "=r"(ret.s9), "=r"(ret.sa), "=r"(ret.sb) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.sc), "=r"(ret.sd), "=r"(ret.se), "=r"(ret.sf) : __LDG_PTR(ptr)); + return ret; +} + + +static __device__ __inline__ uintx64 __ldg32(const uint4 *ptr) +{ + uintx64 ret; + asm("ld.global.nc.v4.u32 
{%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.s0.s0.s0.x), "=r"(ret.s0.s0.s0.s0.y), "=r"(ret.s0.s0.s0.s0.z), "=r"(ret.s0.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s0.s0.s0.s1.x), "=r"(ret.s0.s0.s0.s1.y), "=r"(ret.s0.s0.s0.s1.z), "=r"(ret.s0.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s0.s0.s1.s0.x), "=r"(ret.s0.s0.s1.s0.y), "=r"(ret.s0.s0.s1.s0.z), "=r"(ret.s0.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.s0.s0.s1.s1.x), "=r"(ret.s0.s0.s1.s1.y), "=r"(ret.s0.s0.s1.s1.z), "=r"(ret.s0.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.s0.s1.s0.s0.x), "=r"(ret.s0.s1.s0.s0.y), "=r"(ret.s0.s1.s0.s0.z), "=r"(ret.s0.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.s0.s1.s0.s1.x), "=r"(ret.s0.s1.s0.s1.y), "=r"(ret.s0.s1.s0.s1.z), "=r"(ret.s0.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.s0.s1.s1.s0.x), "=r"(ret.s0.s1.s1.s0.y), "=r"(ret.s0.s1.s1.s0.z), "=r"(ret.s0.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.s0.s1.s1.s1.x), "=r"(ret.s0.s1.s1.s1.y), "=r"(ret.s0.s1.s1.s1.z), "=r"(ret.s0.s1.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+128];" : "=r"(ret.s1.s0.s0.s0.x), "=r"(ret.s1.s0.s0.s0.y), "=r"(ret.s1.s0.s0.s0.z), "=r"(ret.s1.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+144];" : "=r"(ret.s1.s0.s0.s1.x), "=r"(ret.s1.s0.s0.s1.y), "=r"(ret.s1.s0.s0.s1.z), "=r"(ret.s1.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+160];" : "=r"(ret.s1.s0.s1.s0.x), "=r"(ret.s1.s0.s1.s0.y), "=r"(ret.s1.s0.s1.s0.z), "=r"(ret.s1.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+176];" : "=r"(ret.s1.s0.s1.s1.x), "=r"(ret.s1.s0.s1.s1.y), "=r"(ret.s1.s0.s1.s1.z), 
"=r"(ret.s1.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+192];" : "=r"(ret.s1.s1.s0.s0.x), "=r"(ret.s1.s1.s0.s0.y), "=r"(ret.s1.s1.s0.s0.z), "=r"(ret.s1.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+208];" : "=r"(ret.s1.s1.s0.s1.x), "=r"(ret.s1.s1.s0.s1.y), "=r"(ret.s1.s1.s0.s1.z), "=r"(ret.s1.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+224];" : "=r"(ret.s1.s1.s1.s0.x), "=r"(ret.s1.s1.s1.s0.y), "=r"(ret.s1.s1.s1.s0.z), "=r"(ret.s1.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s1.s1.s1.s1.x), "=r"(ret.s1.s1.s1.s1.y), "=r"(ret.s1.s1.s1.s1.z), "=r"(ret.s1.s1.s1.s1.w) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ uintx64 __ldg32c(const uintx64 *ptr) +{ + uintx64 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.s0.s0.s0.x), "=r"(ret.s0.s0.s0.s0.y), "=r"(ret.s0.s0.s0.s0.z), "=r"(ret.s0.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s0.s0.s0.s1.x), "=r"(ret.s0.s0.s0.s1.y), "=r"(ret.s0.s0.s0.s1.z), "=r"(ret.s0.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s0.s0.s1.s0.x), "=r"(ret.s0.s0.s1.s0.y), "=r"(ret.s0.s0.s1.s0.z), "=r"(ret.s0.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.s0.s0.s1.s1.x), "=r"(ret.s0.s0.s1.s1.y), "=r"(ret.s0.s0.s1.s1.z), "=r"(ret.s0.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.s0.s1.s0.s0.x), "=r"(ret.s0.s1.s0.s0.y), "=r"(ret.s0.s1.s0.s0.z), "=r"(ret.s0.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.s0.s1.s0.s1.x), "=r"(ret.s0.s1.s0.s1.y), "=r"(ret.s0.s1.s0.s1.z), "=r"(ret.s0.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.s0.s1.s1.s0.x), "=r"(ret.s0.s1.s1.s0.y), "=r"(ret.s0.s1.s1.s0.z), 
"=r"(ret.s0.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.s0.s1.s1.s1.x), "=r"(ret.s0.s1.s1.s1.y), "=r"(ret.s0.s1.s1.s1.z), "=r"(ret.s0.s1.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+128];" : "=r"(ret.s1.s0.s0.s0.x), "=r"(ret.s1.s0.s0.s0.y), "=r"(ret.s1.s0.s0.s0.z), "=r"(ret.s1.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+144];" : "=r"(ret.s1.s0.s0.s1.x), "=r"(ret.s1.s0.s0.s1.y), "=r"(ret.s1.s0.s0.s1.z), "=r"(ret.s1.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+160];" : "=r"(ret.s1.s0.s1.s0.x), "=r"(ret.s1.s0.s1.s0.y), "=r"(ret.s1.s0.s1.s0.z), "=r"(ret.s1.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+176];" : "=r"(ret.s1.s0.s1.s1.x), "=r"(ret.s1.s0.s1.s1.y), "=r"(ret.s1.s0.s1.s1.z), "=r"(ret.s1.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+192];" : "=r"(ret.s1.s1.s0.s0.x), "=r"(ret.s1.s1.s0.s0.y), "=r"(ret.s1.s1.s0.s0.z), "=r"(ret.s1.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+208];" : "=r"(ret.s1.s1.s0.s1.x), "=r"(ret.s1.s1.s0.s1.y), "=r"(ret.s1.s1.s0.s1.z), "=r"(ret.s1.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+224];" : "=r"(ret.s1.s1.s1.s0.x), "=r"(ret.s1.s1.s1.s0.y), "=r"(ret.s1.s1.s1.s0.z), "=r"(ret.s1.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s1.s1.s1.s1.x), "=r"(ret.s1.s1.s1.s1.y), "=r"(ret.s1.s1.s1.s1.z), "=r"(ret.s1.s1.s1.s1.w) : __LDG_PTR(ptr)); + + return ret; +} + +static __device__ __inline__ uintx128 __ldg128(const uintx128 *ptr) +{ + uintx128 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.s0.s0.s0.s0.x), "=r"(ret.s0.s0.s0.s0.s0.y), "=r"(ret.s0.s0.s0.s0.s0.z), "=r"(ret.s0.s0.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s0.s0.s0.s0.s1.x), "=r"(ret.s0.s0.s0.s0.s1.y), 
"=r"(ret.s0.s0.s0.s0.s1.z), "=r"(ret.s0.s0.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s0.s0.s0.s1.s0.x), "=r"(ret.s0.s0.s0.s1.s0.y), "=r"(ret.s0.s0.s0.s1.s0.z), "=r"(ret.s0.s0.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.s0.s0.s0.s1.s1.x), "=r"(ret.s0.s0.s0.s1.s1.y), "=r"(ret.s0.s0.s0.s1.s1.z), "=r"(ret.s0.s0.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.s0.s0.s1.s0.s0.x), "=r"(ret.s0.s0.s1.s0.s0.y), "=r"(ret.s0.s0.s1.s0.s0.z), "=r"(ret.s0.s0.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.s0.s0.s1.s0.s1.x), "=r"(ret.s0.s0.s1.s0.s1.y), "=r"(ret.s0.s0.s1.s0.s1.z), "=r"(ret.s0.s0.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.s0.s0.s1.s1.s0.x), "=r"(ret.s0.s0.s1.s1.s0.y), "=r"(ret.s0.s0.s1.s1.s0.z), "=r"(ret.s0.s0.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.s0.s0.s1.s1.s1.x), "=r"(ret.s0.s0.s1.s1.s1.y), "=r"(ret.s0.s0.s1.s1.s1.z), "=r"(ret.s0.s0.s1.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+128];" : "=r"(ret.s0.s1.s0.s0.s0.x), "=r"(ret.s0.s1.s0.s0.s0.y), "=r"(ret.s0.s1.s0.s0.s0.z), "=r"(ret.s0.s1.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+144];" : "=r"(ret.s0.s1.s0.s0.s1.x), "=r"(ret.s0.s1.s0.s0.s1.y), "=r"(ret.s0.s1.s0.s0.s1.z), "=r"(ret.s0.s1.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+160];" : "=r"(ret.s0.s1.s0.s1.s0.x), "=r"(ret.s0.s1.s0.s1.s0.y), "=r"(ret.s0.s1.s0.s1.s0.z), "=r"(ret.s0.s1.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+176];" : "=r"(ret.s0.s1.s0.s1.s1.x), "=r"(ret.s0.s1.s0.s1.s1.y), "=r"(ret.s0.s1.s0.s1.s1.z), "=r"(ret.s0.s1.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+192];" : 
"=r"(ret.s0.s1.s1.s0.s0.x), "=r"(ret.s0.s1.s1.s0.s0.y), "=r"(ret.s0.s1.s1.s0.s0.z), "=r"(ret.s0.s1.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+208];" : "=r"(ret.s0.s1.s1.s0.s1.x), "=r"(ret.s0.s1.s1.s0.s1.y), "=r"(ret.s0.s1.s1.s0.s1.z), "=r"(ret.s0.s1.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+224];" : "=r"(ret.s0.s1.s1.s1.s0.x), "=r"(ret.s0.s1.s1.s1.s0.y), "=r"(ret.s0.s1.s1.s1.s0.z), "=r"(ret.s0.s1.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s0.s1.s1.s1.s1.x), "=r"(ret.s0.s1.s1.s1.s1.y), "=r"(ret.s0.s1.s1.s1.s1.z), "=r"(ret.s0.s1.s1.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+256];" : "=r"(ret.s1.s0.s0.s0.s0.x), "=r"(ret.s1.s0.s0.s0.s0.y), "=r"(ret.s1.s0.s0.s0.s0.z), "=r"(ret.s1.s0.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+272];" : "=r"(ret.s1.s0.s0.s0.s1.x), "=r"(ret.s1.s0.s0.s0.s1.y), "=r"(ret.s1.s0.s0.s0.s1.z), "=r"(ret.s1.s0.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+288];" : "=r"(ret.s1.s0.s0.s1.s0.x), "=r"(ret.s1.s0.s0.s1.s0.y), "=r"(ret.s1.s0.s0.s1.s0.z), "=r"(ret.s1.s0.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+304];" : "=r"(ret.s1.s0.s0.s1.s1.x), "=r"(ret.s1.s0.s0.s1.s1.y), "=r"(ret.s1.s0.s0.s1.s1.z), "=r"(ret.s1.s0.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+320];" : "=r"(ret.s1.s0.s1.s0.s0.x), "=r"(ret.s1.s0.s1.s0.s0.y), "=r"(ret.s1.s0.s1.s0.s0.z), "=r"(ret.s1.s0.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+336];" : "=r"(ret.s1.s0.s1.s0.s1.x), "=r"(ret.s1.s0.s1.s0.s1.y), "=r"(ret.s1.s0.s1.s0.s1.z), "=r"(ret.s1.s0.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+352];" : "=r"(ret.s1.s0.s1.s1.s0.x), "=r"(ret.s1.s0.s1.s1.s0.y), "=r"(ret.s1.s0.s1.s1.s0.z), "=r"(ret.s1.s0.s1.s1.s0.w) : __LDG_PTR(ptr)); + 
asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+368];" : "=r"(ret.s1.s0.s1.s1.s1.x), "=r"(ret.s1.s0.s1.s1.s1.y), "=r"(ret.s1.s0.s1.s1.s1.z), "=r"(ret.s1.s0.s1.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+384];" : "=r"(ret.s1.s1.s0.s0.s0.x), "=r"(ret.s1.s1.s0.s0.s0.y), "=r"(ret.s1.s1.s0.s0.s0.z), "=r"(ret.s1.s1.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+400];" : "=r"(ret.s1.s1.s0.s0.s1.x), "=r"(ret.s1.s1.s0.s0.s1.y), "=r"(ret.s1.s1.s0.s0.s1.z), "=r"(ret.s1.s1.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+416];" : "=r"(ret.s1.s1.s0.s1.s0.x), "=r"(ret.s1.s1.s0.s1.s0.y), "=r"(ret.s1.s1.s0.s1.s0.z), "=r"(ret.s1.s1.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+432];" : "=r"(ret.s1.s1.s0.s1.s1.x), "=r"(ret.s1.s1.s0.s1.s1.y), "=r"(ret.s1.s1.s0.s1.s1.z), "=r"(ret.s1.s1.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+448];" : "=r"(ret.s1.s1.s1.s0.s0.x), "=r"(ret.s1.s1.s1.s0.s0.y), "=r"(ret.s1.s1.s1.s0.s0.z), "=r"(ret.s1.s1.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+464];" : "=r"(ret.s1.s1.s1.s0.s1.x), "=r"(ret.s1.s1.s1.s0.s1.y), "=r"(ret.s1.s1.s1.s0.s1.z), "=r"(ret.s1.s1.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+480];" : "=r"(ret.s1.s1.s1.s1.s0.x), "=r"(ret.s1.s1.s1.s1.s0.y), "=r"(ret.s1.s1.s1.s1.s0.z), "=r"(ret.s1.s1.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+496];" : "=r"(ret.s1.s1.s1.s1.s1.x), "=r"(ret.s1.s1.s1.s1.s1.y), "=r"(ret.s1.s1.s1.s1.s1.z), "=r"(ret.s1.s1.s1.s1.s1.w) : __LDG_PTR(ptr)); + + return ret; +} + +static __device__ __inline__ ulonglong2 __ldg2(const ulonglong2 *ptr) +{ + ulonglong2 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR(ptr)); +return ret; +} + +static __device__ __inline__ ulonglong4 __ldg4(const ulonglong4 *ptr) +{ + ulonglong4 ret; + 
asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.z), "=l"(ret.w) : __LDG_PTR(ptr)); + return ret; +} +static __device__ __inline__ void ldg4(const ulonglong4 *ptr,ulonglong4 *ret) +{ + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret[0].x), "=l"(ret[0].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret[0].z), "=l"(ret[0].w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret[1].x), "=l"(ret[1].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret[1].z), "=l"(ret[1].w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret[2].x), "=l"(ret[2].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret[2].z), "=l"(ret[2].w) : __LDG_PTR(ptr)); +} +static __device__ __inline__ void ldg4xor(const ulonglong4 *ptr, ulonglong4 *ret, ulonglong4 *state) +{ + + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret[0].x), "=l"(ret[0].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret[0].z), "=l"(ret[0].w) : __LDG_PTR(ptr)); + state[0] ^= ret[0]; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret[1].x), "=l"(ret[1].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret[1].z), "=l"(ret[1].w) : __LDG_PTR(ptr)); + state[1] ^= ret[1]; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret[2].x), "=l"(ret[2].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret[2].z), "=l"(ret[2].w) : __LDG_PTR(ptr)); + state[2] ^= ret[2]; +} + + +static __device__ __inline__ uint28 __ldg4(const uint28 *ptr) +{ + uint28 ret; +asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x.x), "=r"(ret.x.y), "=r"(ret.y.x), "=r"(ret.y.y) : __LDG_PTR(ptr)); +asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.z.x), "=r"(ret.z.y), "=r"(ret.w.x), "=r"(ret.w.y) : __LDG_PTR(ptr)); + + return ret; +} + +static 
__device__ __inline__ uint48 __ldg4(const uint48 *ptr) +{ + uint48 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.x), "=r"(ret.s0.y), "=r"(ret.s0.z), "=r"(ret.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s1.x), "=r"(ret.s1.y), "=r"(ret.s1.z), "=r"(ret.s1.w) : __LDG_PTR(ptr)); + return ret; +} + + +static __device__ __inline__ void ldg4(const uint28 *ptr, uint28 *ret) +{ + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret[0].x.x), "=r"(ret[0].x.y), "=r"(ret[0].y.x), "=r"(ret[0].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret[0].z.x), "=r"(ret[0].z.y), "=r"(ret[0].w.x), "=r"(ret[0].w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret[1].x.x), "=r"(ret[1].x.y), "=r"(ret[1].y.x), "=r"(ret[1].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret[1].z.x), "=r"(ret[1].z.y), "=r"(ret[1].w.x), "=r"(ret[1].w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret[2].x.x), "=r"(ret[2].x.y), "=r"(ret[2].y.x), "=r"(ret[2].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret[2].z.x), "=r"(ret[2].z.y), "=r"(ret[2].w.x), "=r"(ret[2].w.y) : __LDG_PTR(ptr)); +} +static __device__ __inline__ void ldg4xor(const uint28 *ptr, uint28 *ret,uint28* state) +{ + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret[0].x.x), "=r"(ret[0].x.y), "=r"(ret[0].y.x), "=r"(ret[0].y.y) : __LDG_PTR(ptr)); + state[0].x ^= ret[0].x; state[0].y ^= ret[0].y; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret[0].z.x), "=r"(ret[0].z.y), "=r"(ret[0].w.x), "=r"(ret[0].w.y) : __LDG_PTR(ptr)); + state[0].z ^= ret[0].z; state[0].w ^= ret[0].w; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret[1].x.x), "=r"(ret[1].x.y), "=r"(ret[1].y.x), "=r"(ret[1].y.y) : __LDG_PTR(ptr)); + state[1].x ^= ret[1].x; state[1].y ^= ret[1].y; + asm("ld.global.nc.v4.u32 
{%0,%1,%2,%3}, [%4+48];" : "=r"(ret[1].z.x), "=r"(ret[1].z.y), "=r"(ret[1].w.x), "=r"(ret[1].w.y) : __LDG_PTR(ptr)); + state[1].z ^= ret[1].z; state[1].w ^= ret[1].w; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret[2].x.x), "=r"(ret[2].x.y), "=r"(ret[2].y.x), "=r"(ret[2].y.y) : __LDG_PTR(ptr)); + state[2].x ^= ret[2].x; state[2].y ^= ret[2].y; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret[2].z.x), "=r"(ret[2].z.y), "=r"(ret[2].w.x), "=r"(ret[2].w.y) : __LDG_PTR(ptr)); + state[2].z ^= ret[2].z; state[2].w ^= ret[2].w; + + +} + + +static __device__ __inline__ ulonglong2to8 __ldg2to8(const ulonglong2to8 *ptr) +{ + ulonglong2to8 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.l0.x), "=l"(ret.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.l1.x), "=l"(ret.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.l2.x), "=l"(ret.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.l3.x), "=l"(ret.l3.y) : __LDG_PTR(ptr)); + return ret; +} +static __device__ __inline__ ulonglong8to16 __ldg8to16(const ulonglong8to16 *ptr) +{ + ulonglong8to16 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.lo.l0.x), "=l"(ret.lo.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.lo.l1.x), "=l"(ret.lo.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.lo.l2.x), "=l"(ret.lo.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.lo.l3.x), "=l"(ret.lo.l3.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret.hi.l0.x), "=l"(ret.hi.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret.hi.l1.x), "=l"(ret.hi.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+96];" : "=l"(ret.hi.l2.x), "=l"(ret.hi.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+112];" : "=l"(ret.hi.l3.x), "=l"(ret.hi.l3.y) : 
__LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ ulonglonglong __ldgxtralong(const ulonglonglong *ptr) +{ + ulonglonglong ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.s0.x), "=l"(ret.s0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.s1.x), "=l"(ret.s1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.s2.x), "=l"(ret.s2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.s3.x), "=l"(ret.s3.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret.s4.x), "=l"(ret.s4.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret.s5.x), "=l"(ret.s5.y) : __LDG_PTR(ptr)); + return ret; +} +static __device__ __inline__ uint8 ldg8bis(const uint8 *ptr) +{ + uint8 test; + asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(test.s0), "=r"(test.s1), "=r"(test.s2), "=r"(test.s3) : __LDG_PTR(ptr)); + asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4+16];" : "=r"(test.s4), "=r"(test.s5), "=r"(test.s6), "=r"(test.s7) : __LDG_PTR(ptr)); + return (test); +} + + +static __device__ __inline__ ulonglong16 __ldg32(const ulonglong4 *ptr) +{ + ulonglong16 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.s0.x), "=l"(ret.s0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.s0.z), "=l"(ret.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.s1.x), "=l"(ret.s1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.s1.z), "=l"(ret.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret.s2.x), "=l"(ret.s2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret.s2.z), "=l"(ret.s2.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+96];" : "=l"(ret.s3.x), "=l"(ret.s3.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+112];" : "=l"(ret.s3.z), "=l"(ret.s3.w) : 
__LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+128];" : "=l"(ret.s4.x), "=l"(ret.s4.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+144];" : "=l"(ret.s4.z), "=l"(ret.s4.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+160];" : "=l"(ret.s5.x), "=l"(ret.s5.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+176];" : "=l"(ret.s5.z), "=l"(ret.s5.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+192];" : "=l"(ret.s6.x), "=l"(ret.s6.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+208];" : "=l"(ret.s6.z), "=l"(ret.s6.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+224];" : "=l"(ret.s7.x), "=l"(ret.s7.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+240];" : "=l"(ret.s7.z), "=l"(ret.s7.w) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ uintx64bis __ldg32(const uint28 *ptr) +{ + uintx64bis ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.x.x), "=r"(ret.s0.x.y), "=r"(ret.s0.y.x), "=r"(ret.s0.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s0.z.x), "=r"(ret.s0.z.y), "=r"(ret.s0.w.x), "=r"(ret.s0.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s1.x.x), "=r"(ret.s1.x.y), "=r"(ret.s1.y.x), "=r"(ret.s1.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.s1.z.x), "=r"(ret.s1.z.y), "=r"(ret.s1.w.x), "=r"(ret.s1.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.s2.x.x), "=r"(ret.s2.x.y), "=r"(ret.s2.y.x), "=r"(ret.s2.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.s2.z.x), "=r"(ret.s2.z.y), "=r"(ret.s2.w.x), "=r"(ret.s2.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.s3.x.x), "=r"(ret.s3.x.y), "=r"(ret.s3.y.x), "=r"(ret.s3.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.s3.z.x), 
"=r"(ret.s3.z.y), "=r"(ret.s3.w.x), "=r"(ret.s3.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+128];" : "=r"(ret.s4.x.x), "=r"(ret.s4.x.y), "=r"(ret.s4.y.x), "=r"(ret.s4.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+144];" : "=r"(ret.s4.z.x), "=r"(ret.s4.z.y), "=r"(ret.s4.w.x), "=r"(ret.s4.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+160];" : "=r"(ret.s5.x.x), "=r"(ret.s5.x.y), "=r"(ret.s5.y.x), "=r"(ret.s5.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+176];" : "=r"(ret.s5.z.x), "=r"(ret.s5.z.y), "=r"(ret.s5.w.x), "=r"(ret.s5.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+192];" : "=r"(ret.s6.x.x), "=r"(ret.s6.x.y), "=r"(ret.s6.y.x), "=r"(ret.s6.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+208];" : "=r"(ret.s6.z.x), "=r"(ret.s6.z.y), "=r"(ret.s6.w.x), "=r"(ret.s6.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+224];" : "=r"(ret.s7.x.x), "=r"(ret.s7.x.y), "=r"(ret.s7.y.x), "=r"(ret.s7.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s7.z.x), "=r"(ret.s7.z.y), "=r"(ret.s7.w.x), "=r"(ret.s7.w.y) : __LDG_PTR(ptr)); + return ret; +} +#else //not implemented yet +static __device__ __inline__ uint32 __ldg32b(const uint32 *ptr) +{ + return *ptr; +} + +static __device__ __inline__ uint16 __ldg16b(const uint16 *ptr) +{ + return *ptr; +} + + +static __device__ __inline__ uintx64 __ldg32(const uint4 *ptr) +{ + return *((uintx64*)ptr); +} + +static __device__ __inline__ uintx64 __ldg32c(const uintx64 *ptr) +{ + return *ptr; +} + +static __device__ __inline__ uintx128 __ldg128(const uintx128 *ptr) +{ + return *ptr; +} + +static __device__ __inline__ ulonglong2 __ldg2(const ulonglong2 *ptr) +{ + return *ptr; +} + +static __device__ __inline__ ulonglong4 __ldg4(const ulonglong4 *ptr) +{ + return *ptr; +} +static __device__ __inline__ void ldg4(const ulonglong4 *ptr, ulonglong4 
*ret) +{ + *ret = *ptr; +} +static __device__ __inline__ void ldg4xor(const ulonglong4 *ptr, ulonglong4 *ret, ulonglong4 *state) +{ + ret[0] = ptr[0]; + ret[1] = ptr[1]; + ret[2] = ptr[2]; + state[0] ^= ret[0]; + state[1] ^= ret[1]; + state[2] ^= ret[2]; +} + + +static __device__ __inline__ uint28 __ldg4(const uint28 *ptr) +{ + uint28 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x.x), "=r"(ret.x.y), "=r"(ret.y.x), "=r"(ret.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.z.x), "=r"(ret.z.y), "=r"(ret.w.x), "=r"(ret.w.y) : __LDG_PTR(ptr)); + + return ret; +} + +static __device__ __inline__ uint48 __ldg4(const uint48 *ptr) +{ + uint48 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.x), "=r"(ret.s0.y), "=r"(ret.s0.z), "=r"(ret.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s1.x), "=r"(ret.s1.y), "=r"(ret.s1.z), "=r"(ret.s1.w) : __LDG_PTR(ptr)); + return ret; +} + + +static __device__ __inline__ void ldg4(const uint28 *ptr, uint28 *ret) +{ + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret[0].x.x), "=r"(ret[0].x.y), "=r"(ret[0].y.x), "=r"(ret[0].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret[0].z.x), "=r"(ret[0].z.y), "=r"(ret[0].w.x), "=r"(ret[0].w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret[1].x.x), "=r"(ret[1].x.y), "=r"(ret[1].y.x), "=r"(ret[1].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret[1].z.x), "=r"(ret[1].z.y), "=r"(ret[1].w.x), "=r"(ret[1].w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret[2].x.x), "=r"(ret[2].x.y), "=r"(ret[2].y.x), "=r"(ret[2].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret[2].z.x), "=r"(ret[2].z.y), "=r"(ret[2].w.x), "=r"(ret[2].w.y) : __LDG_PTR(ptr)); +} +static __device__ __inline__ void ldg4xor(const uint28 *ptr, 
uint28 *ret, uint28* state) +{ + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret[0].x.x), "=r"(ret[0].x.y), "=r"(ret[0].y.x), "=r"(ret[0].y.y) : __LDG_PTR(ptr)); + state[0].x ^= ret[0].x; state[0].y ^= ret[0].y; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret[0].z.x), "=r"(ret[0].z.y), "=r"(ret[0].w.x), "=r"(ret[0].w.y) : __LDG_PTR(ptr)); + state[0].z ^= ret[0].z; state[0].w ^= ret[0].w; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret[1].x.x), "=r"(ret[1].x.y), "=r"(ret[1].y.x), "=r"(ret[1].y.y) : __LDG_PTR(ptr)); + state[1].x ^= ret[1].x; state[1].y ^= ret[1].y; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret[1].z.x), "=r"(ret[1].z.y), "=r"(ret[1].w.x), "=r"(ret[1].w.y) : __LDG_PTR(ptr)); + state[1].z ^= ret[1].z; state[1].w ^= ret[1].w; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret[2].x.x), "=r"(ret[2].x.y), "=r"(ret[2].y.x), "=r"(ret[2].y.y) : __LDG_PTR(ptr)); + state[2].x ^= ret[2].x; state[2].y ^= ret[2].y; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret[2].z.x), "=r"(ret[2].z.y), "=r"(ret[2].w.x), "=r"(ret[2].w.y) : __LDG_PTR(ptr)); + state[2].z ^= ret[2].z; state[2].w ^= ret[2].w; + + +} + + +static __device__ __inline__ ulonglong2to8 __ldg2to8(const ulonglong2to8 *ptr) +{ + ulonglong2to8 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.l0.x), "=l"(ret.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.l1.x), "=l"(ret.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.l2.x), "=l"(ret.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.l3.x), "=l"(ret.l3.y) : __LDG_PTR(ptr)); + return ret; +} +static __device__ __inline__ ulonglong8to16 __ldg8to16(const ulonglong8to16 *ptr) +{ + ulonglong8to16 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.lo.l0.x), "=l"(ret.lo.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.lo.l1.x), 
"=l"(ret.lo.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.lo.l2.x), "=l"(ret.lo.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.lo.l3.x), "=l"(ret.lo.l3.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret.hi.l0.x), "=l"(ret.hi.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret.hi.l1.x), "=l"(ret.hi.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+96];" : "=l"(ret.hi.l2.x), "=l"(ret.hi.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+112];" : "=l"(ret.hi.l3.x), "=l"(ret.hi.l3.y) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ ulonglonglong __ldgxtralong(const ulonglonglong *ptr) +{ + ulonglonglong ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.s0.x), "=l"(ret.s0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.s1.x), "=l"(ret.s1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.s2.x), "=l"(ret.s2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.s3.x), "=l"(ret.s3.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret.s4.x), "=l"(ret.s4.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret.s5.x), "=l"(ret.s5.y) : __LDG_PTR(ptr)); + return ret; +} +static __device__ __inline__ uint8 ldg8bis(const uint8 *ptr) +{ + uint8 test; + asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(test.s0), "=r"(test.s1), "=r"(test.s2), "=r"(test.s3) : __LDG_PTR(ptr)); + asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4+16];" : "=r"(test.s4), "=r"(test.s5), "=r"(test.s6), "=r"(test.s7) : __LDG_PTR(ptr)); + return (test); +} + + +static __device__ __inline__ ulonglong16 __ldg32(const ulonglong4 *ptr) +{ + ulonglong16 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.s0.x), "=l"(ret.s0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, 
[%2+16];" : "=l"(ret.s0.z), "=l"(ret.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.s1.x), "=l"(ret.s1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.s1.z), "=l"(ret.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret.s2.x), "=l"(ret.s2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret.s2.z), "=l"(ret.s2.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+96];" : "=l"(ret.s3.x), "=l"(ret.s3.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+112];" : "=l"(ret.s3.z), "=l"(ret.s3.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+128];" : "=l"(ret.s4.x), "=l"(ret.s4.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+144];" : "=l"(ret.s4.z), "=l"(ret.s4.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+160];" : "=l"(ret.s5.x), "=l"(ret.s5.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+176];" : "=l"(ret.s5.z), "=l"(ret.s5.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+192];" : "=l"(ret.s6.x), "=l"(ret.s6.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+208];" : "=l"(ret.s6.z), "=l"(ret.s6.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+224];" : "=l"(ret.s7.x), "=l"(ret.s7.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+240];" : "=l"(ret.s7.z), "=l"(ret.s7.w) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ uintx64bis __ldg32(const uint28 *ptr) +{ + uintx64bis ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.x.x), "=r"(ret.s0.x.y), "=r"(ret.s0.y.x), "=r"(ret.s0.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s0.z.x), "=r"(ret.s0.z.y), "=r"(ret.s0.w.x), "=r"(ret.s0.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s1.x.x), "=r"(ret.s1.x.y), "=r"(ret.s1.y.x), "=r"(ret.s1.y.y) : __LDG_PTR(ptr)); + 
asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.s1.z.x), "=r"(ret.s1.z.y), "=r"(ret.s1.w.x), "=r"(ret.s1.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.s2.x.x), "=r"(ret.s2.x.y), "=r"(ret.s2.y.x), "=r"(ret.s2.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.s2.z.x), "=r"(ret.s2.z.y), "=r"(ret.s2.w.x), "=r"(ret.s2.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.s3.x.x), "=r"(ret.s3.x.y), "=r"(ret.s3.y.x), "=r"(ret.s3.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.s3.z.x), "=r"(ret.s3.z.y), "=r"(ret.s3.w.x), "=r"(ret.s3.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+128];" : "=r"(ret.s4.x.x), "=r"(ret.s4.x.y), "=r"(ret.s4.y.x), "=r"(ret.s4.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+144];" : "=r"(ret.s4.z.x), "=r"(ret.s4.z.y), "=r"(ret.s4.w.x), "=r"(ret.s4.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+160];" : "=r"(ret.s5.x.x), "=r"(ret.s5.x.y), "=r"(ret.s5.y.x), "=r"(ret.s5.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+176];" : "=r"(ret.s5.z.x), "=r"(ret.s5.z.y), "=r"(ret.s5.w.x), "=r"(ret.s5.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+192];" : "=r"(ret.s6.x.x), "=r"(ret.s6.x.y), "=r"(ret.s6.y.x), "=r"(ret.s6.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+208];" : "=r"(ret.s6.z.x), "=r"(ret.s6.z.y), "=r"(ret.s6.w.x), "=r"(ret.s6.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+224];" : "=r"(ret.s7.x.x), "=r"(ret.s7.x.y), "=r"(ret.s7.y.x), "=r"(ret.s7.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s7.z.x), "=r"(ret.s7.z.y), "=r"(ret.s7.w.x), "=r"(ret.s7.w.y) : __LDG_PTR(ptr)); + return ret; +} + +#endif + +static __forceinline__ __device__ uint8 swapvec(const uint8 &buf) +{ + uint8 vec; + vec.s0 
= cuda_swab32(buf.s0); + vec.s1 = cuda_swab32(buf.s1); + vec.s2 = cuda_swab32(buf.s2); + vec.s3 = cuda_swab32(buf.s3); + vec.s4 = cuda_swab32(buf.s4); + vec.s5 = cuda_swab32(buf.s5); + vec.s6 = cuda_swab32(buf.s6); + vec.s7 = cuda_swab32(buf.s7); + return vec; +} + + +static __forceinline__ __device__ uint8 swapvec(const uint8 *buf) +{ + uint8 vec; + vec.s0 = cuda_swab32(buf[0].s0); + vec.s1 = cuda_swab32(buf[0].s1); + vec.s2 = cuda_swab32(buf[0].s2); + vec.s3 = cuda_swab32(buf[0].s3); + vec.s4 = cuda_swab32(buf[0].s4); + vec.s5 = cuda_swab32(buf[0].s5); + vec.s6 = cuda_swab32(buf[0].s6); + vec.s7 = cuda_swab32(buf[0].s7); + return vec; +} + +static __forceinline__ __device__ uint16 swapvec(const uint16 *buf) +{ + uint16 vec; + vec.s0 = cuda_swab32(buf[0].s0); + vec.s1 = cuda_swab32(buf[0].s1); + vec.s2 = cuda_swab32(buf[0].s2); + vec.s3 = cuda_swab32(buf[0].s3); + vec.s4 = cuda_swab32(buf[0].s4); + vec.s5 = cuda_swab32(buf[0].s5); + vec.s6 = cuda_swab32(buf[0].s6); + vec.s7 = cuda_swab32(buf[0].s7); + vec.s8 = cuda_swab32(buf[0].s8); + vec.s9 = cuda_swab32(buf[0].s9); + vec.sa = cuda_swab32(buf[0].sa); + vec.sb = cuda_swab32(buf[0].sb); + vec.sc = cuda_swab32(buf[0].sc); + vec.sd = cuda_swab32(buf[0].sd); + vec.se = cuda_swab32(buf[0].se); + vec.sf = cuda_swab32(buf[0].sf); + return vec; +} + +static __forceinline__ __device__ uint16 swapvec(const uint16 &buf) +{ + uint16 vec; + vec.s0 = cuda_swab32(buf.s0); + vec.s1 = cuda_swab32(buf.s1); + vec.s2 = cuda_swab32(buf.s2); + vec.s3 = cuda_swab32(buf.s3); + vec.s4 = cuda_swab32(buf.s4); + vec.s5 = cuda_swab32(buf.s5); + vec.s6 = cuda_swab32(buf.s6); + vec.s7 = cuda_swab32(buf.s7); + vec.s8 = cuda_swab32(buf.s8); + vec.s9 = cuda_swab32(buf.s9); + vec.sa = cuda_swab32(buf.sa); + vec.sb = cuda_swab32(buf.sb); + vec.sc = cuda_swab32(buf.sc); + vec.sd = cuda_swab32(buf.sd); + vec.se = cuda_swab32(buf.se); + vec.sf = cuda_swab32(buf.sf); + return vec; +} + +static __device__ __forceinline__ uint28 shuffle4(const uint28 
&var, int lane) +{ +uint28 res; +res.x.x = __shfl(var.x.x, lane); +res.x.y = __shfl(var.x.y, lane); +res.y.x = __shfl(var.y.x, lane); +res.y.y = __shfl(var.y.y, lane); +res.z.x = __shfl(var.z.x, lane); +res.z.y = __shfl(var.z.y, lane); +res.w.x = __shfl(var.w.x, lane); +res.w.y = __shfl(var.w.y, lane); +return res; +} + + +static __device__ __forceinline__ ulonglong4 shuffle4(ulonglong4 var, int lane) +{ + ulonglong4 res; + uint2 temp; + temp = vectorize(var.x); + temp.x = __shfl(temp.x, lane); + temp.y = __shfl(temp.y, lane); + res.x = devectorize(temp); + temp = vectorize(var.y); + temp.x = __shfl(temp.x, lane); + temp.y = __shfl(temp.y, lane); + res.y = devectorize(temp); + temp = vectorize(var.z); + temp.x = __shfl(temp.x, lane); + temp.y = __shfl(temp.y, lane); + res.z = devectorize(temp); + temp = vectorize(var.w); + temp.x = __shfl(temp.x, lane); + temp.y = __shfl(temp.y, lane); + res.w = devectorize(temp); + return res; +} + + +#endif // #ifndef CUDA_VECTOR_H diff --git a/cuda_x11_aes_noasm.cu b/cuda_x11_aes_noasm.cu new file mode 100644 index 0000000000..1f595c6192 --- /dev/null +++ b/cuda_x11_aes_noasm.cu @@ -0,0 +1,347 @@ +#include "cuda_helper.h" + +/* AES Helper for inline-usage from SPH */ +#define AESx(x) SPH_C32(x) + +__constant__ __align__(64) uint32_t d_AES0[256] = { + AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6), + AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591), + AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56), + AESx(0x19FEFEE7), AESx(0x62D7D7B5), AESx(0xE6ABAB4D), AESx(0x9A7676EC), + AESx(0x45CACA8F), AESx(0x9D82821F), AESx(0x40C9C989), AESx(0x877D7DFA), + AESx(0x15FAFAEF), AESx(0xEB5959B2), AESx(0xC947478E), AESx(0x0BF0F0FB), + AESx(0xECADAD41), AESx(0x67D4D4B3), AESx(0xFDA2A25F), AESx(0xEAAFAF45), + AESx(0xBF9C9C23), AESx(0xF7A4A453), AESx(0x967272E4), AESx(0x5BC0C09B), + AESx(0xC2B7B775), AESx(0x1CFDFDE1), AESx(0xAE93933D), AESx(0x6A26264C), + AESx(0x5A36366C), 
AESx(0x413F3F7E), AESx(0x02F7F7F5), AESx(0x4FCCCC83), + AESx(0x5C343468), AESx(0xF4A5A551), AESx(0x34E5E5D1), AESx(0x08F1F1F9), + AESx(0x937171E2), AESx(0x73D8D8AB), AESx(0x53313162), AESx(0x3F15152A), + AESx(0x0C040408), AESx(0x52C7C795), AESx(0x65232346), AESx(0x5EC3C39D), + AESx(0x28181830), AESx(0xA1969637), AESx(0x0F05050A), AESx(0xB59A9A2F), + AESx(0x0907070E), AESx(0x36121224), AESx(0x9B80801B), AESx(0x3DE2E2DF), + AESx(0x26EBEBCD), AESx(0x6927274E), AESx(0xCDB2B27F), AESx(0x9F7575EA), + AESx(0x1B090912), AESx(0x9E83831D), AESx(0x742C2C58), AESx(0x2E1A1A34), + AESx(0x2D1B1B36), AESx(0xB26E6EDC), AESx(0xEE5A5AB4), AESx(0xFBA0A05B), + AESx(0xF65252A4), AESx(0x4D3B3B76), AESx(0x61D6D6B7), AESx(0xCEB3B37D), + AESx(0x7B292952), AESx(0x3EE3E3DD), AESx(0x712F2F5E), AESx(0x97848413), + AESx(0xF55353A6), AESx(0x68D1D1B9), AESx(0x00000000), AESx(0x2CEDEDC1), + AESx(0x60202040), AESx(0x1FFCFCE3), AESx(0xC8B1B179), AESx(0xED5B5BB6), + AESx(0xBE6A6AD4), AESx(0x46CBCB8D), AESx(0xD9BEBE67), AESx(0x4B393972), + AESx(0xDE4A4A94), AESx(0xD44C4C98), AESx(0xE85858B0), AESx(0x4ACFCF85), + AESx(0x6BD0D0BB), AESx(0x2AEFEFC5), AESx(0xE5AAAA4F), AESx(0x16FBFBED), + AESx(0xC5434386), AESx(0xD74D4D9A), AESx(0x55333366), AESx(0x94858511), + AESx(0xCF45458A), AESx(0x10F9F9E9), AESx(0x06020204), AESx(0x817F7FFE), + AESx(0xF05050A0), AESx(0x443C3C78), AESx(0xBA9F9F25), AESx(0xE3A8A84B), + AESx(0xF35151A2), AESx(0xFEA3A35D), AESx(0xC0404080), AESx(0x8A8F8F05), + AESx(0xAD92923F), AESx(0xBC9D9D21), AESx(0x48383870), AESx(0x04F5F5F1), + AESx(0xDFBCBC63), AESx(0xC1B6B677), AESx(0x75DADAAF), AESx(0x63212142), + AESx(0x30101020), AESx(0x1AFFFFE5), AESx(0x0EF3F3FD), AESx(0x6DD2D2BF), + AESx(0x4CCDCD81), AESx(0x140C0C18), AESx(0x35131326), AESx(0x2FECECC3), + AESx(0xE15F5FBE), AESx(0xA2979735), AESx(0xCC444488), AESx(0x3917172E), + AESx(0x57C4C493), AESx(0xF2A7A755), AESx(0x827E7EFC), AESx(0x473D3D7A), + AESx(0xAC6464C8), AESx(0xE75D5DBA), AESx(0x2B191932), AESx(0x957373E6), + AESx(0xA06060C0), 
AESx(0x98818119), AESx(0xD14F4F9E), AESx(0x7FDCDCA3), + AESx(0x66222244), AESx(0x7E2A2A54), AESx(0xAB90903B), AESx(0x8388880B), + AESx(0xCA46468C), AESx(0x29EEEEC7), AESx(0xD3B8B86B), AESx(0x3C141428), + AESx(0x79DEDEA7), AESx(0xE25E5EBC), AESx(0x1D0B0B16), AESx(0x76DBDBAD), + AESx(0x3BE0E0DB), AESx(0x56323264), AESx(0x4E3A3A74), AESx(0x1E0A0A14), + AESx(0xDB494992), AESx(0x0A06060C), AESx(0x6C242448), AESx(0xE45C5CB8), + AESx(0x5DC2C29F), AESx(0x6ED3D3BD), AESx(0xEFACAC43), AESx(0xA66262C4), + AESx(0xA8919139), AESx(0xA4959531), AESx(0x37E4E4D3), AESx(0x8B7979F2), + AESx(0x32E7E7D5), AESx(0x43C8C88B), AESx(0x5937376E), AESx(0xB76D6DDA), + AESx(0x8C8D8D01), AESx(0x64D5D5B1), AESx(0xD24E4E9C), AESx(0xE0A9A949), + AESx(0xB46C6CD8), AESx(0xFA5656AC), AESx(0x07F4F4F3), AESx(0x25EAEACF), + AESx(0xAF6565CA), AESx(0x8E7A7AF4), AESx(0xE9AEAE47), AESx(0x18080810), + AESx(0xD5BABA6F), AESx(0x887878F0), AESx(0x6F25254A), AESx(0x722E2E5C), + AESx(0x241C1C38), AESx(0xF1A6A657), AESx(0xC7B4B473), AESx(0x51C6C697), + AESx(0x23E8E8CB), AESx(0x7CDDDDA1), AESx(0x9C7474E8), AESx(0x211F1F3E), + AESx(0xDD4B4B96), AESx(0xDCBDBD61), AESx(0x868B8B0D), AESx(0x858A8A0F), + AESx(0x907070E0), AESx(0x423E3E7C), AESx(0xC4B5B571), AESx(0xAA6666CC), + AESx(0xD8484890), AESx(0x05030306), AESx(0x01F6F6F7), AESx(0x120E0E1C), + AESx(0xA36161C2), AESx(0x5F35356A), AESx(0xF95757AE), AESx(0xD0B9B969), + AESx(0x91868617), AESx(0x58C1C199), AESx(0x271D1D3A), AESx(0xB99E9E27), + AESx(0x38E1E1D9), AESx(0x13F8F8EB), AESx(0xB398982B), AESx(0x33111122), + AESx(0xBB6969D2), AESx(0x70D9D9A9), AESx(0x898E8E07), AESx(0xA7949433), + AESx(0xB69B9B2D), AESx(0x221E1E3C), AESx(0x92878715), AESx(0x20E9E9C9), + AESx(0x49CECE87), AESx(0xFF5555AA), AESx(0x78282850), AESx(0x7ADFDFA5), + AESx(0x8F8C8C03), AESx(0xF8A1A159), AESx(0x80898909), AESx(0x170D0D1A), + AESx(0xDABFBF65), AESx(0x31E6E6D7), AESx(0xC6424284), AESx(0xB86868D0), + AESx(0xC3414182), AESx(0xB0999929), AESx(0x772D2D5A), AESx(0x110F0F1E), + AESx(0xCBB0B07B), 
AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C) +}; + +__constant__ __align__(64) uint32_t d_AES1[256] = { + AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D), + AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154), + AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D), + AESx(0xFEFEE719), AESx(0xD7D7B562), AESx(0xABAB4DE6), AESx(0x7676EC9A), + AESx(0xCACA8F45), AESx(0x82821F9D), AESx(0xC9C98940), AESx(0x7D7DFA87), + AESx(0xFAFAEF15), AESx(0x5959B2EB), AESx(0x47478EC9), AESx(0xF0F0FB0B), + AESx(0xADAD41EC), AESx(0xD4D4B367), AESx(0xA2A25FFD), AESx(0xAFAF45EA), + AESx(0x9C9C23BF), AESx(0xA4A453F7), AESx(0x7272E496), AESx(0xC0C09B5B), + AESx(0xB7B775C2), AESx(0xFDFDE11C), AESx(0x93933DAE), AESx(0x26264C6A), + AESx(0x36366C5A), AESx(0x3F3F7E41), AESx(0xF7F7F502), AESx(0xCCCC834F), + AESx(0x3434685C), AESx(0xA5A551F4), AESx(0xE5E5D134), AESx(0xF1F1F908), + AESx(0x7171E293), AESx(0xD8D8AB73), AESx(0x31316253), AESx(0x15152A3F), + AESx(0x0404080C), AESx(0xC7C79552), AESx(0x23234665), AESx(0xC3C39D5E), + AESx(0x18183028), AESx(0x969637A1), AESx(0x05050A0F), AESx(0x9A9A2FB5), + AESx(0x07070E09), AESx(0x12122436), AESx(0x80801B9B), AESx(0xE2E2DF3D), + AESx(0xEBEBCD26), AESx(0x27274E69), AESx(0xB2B27FCD), AESx(0x7575EA9F), + AESx(0x0909121B), AESx(0x83831D9E), AESx(0x2C2C5874), AESx(0x1A1A342E), + AESx(0x1B1B362D), AESx(0x6E6EDCB2), AESx(0x5A5AB4EE), AESx(0xA0A05BFB), + AESx(0x5252A4F6), AESx(0x3B3B764D), AESx(0xD6D6B761), AESx(0xB3B37DCE), + AESx(0x2929527B), AESx(0xE3E3DD3E), AESx(0x2F2F5E71), AESx(0x84841397), + AESx(0x5353A6F5), AESx(0xD1D1B968), AESx(0x00000000), AESx(0xEDEDC12C), + AESx(0x20204060), AESx(0xFCFCE31F), AESx(0xB1B179C8), AESx(0x5B5BB6ED), + AESx(0x6A6AD4BE), AESx(0xCBCB8D46), AESx(0xBEBE67D9), AESx(0x3939724B), + AESx(0x4A4A94DE), AESx(0x4C4C98D4), AESx(0x5858B0E8), AESx(0xCFCF854A), + AESx(0xD0D0BB6B), AESx(0xEFEFC52A), AESx(0xAAAA4FE5), AESx(0xFBFBED16), + AESx(0x434386C5), AESx(0x4D4D9AD7), 
AESx(0x33336655), AESx(0x85851194), + AESx(0x45458ACF), AESx(0xF9F9E910), AESx(0x02020406), AESx(0x7F7FFE81), + AESx(0x5050A0F0), AESx(0x3C3C7844), AESx(0x9F9F25BA), AESx(0xA8A84BE3), + AESx(0x5151A2F3), AESx(0xA3A35DFE), AESx(0x404080C0), AESx(0x8F8F058A), + AESx(0x92923FAD), AESx(0x9D9D21BC), AESx(0x38387048), AESx(0xF5F5F104), + AESx(0xBCBC63DF), AESx(0xB6B677C1), AESx(0xDADAAF75), AESx(0x21214263), + AESx(0x10102030), AESx(0xFFFFE51A), AESx(0xF3F3FD0E), AESx(0xD2D2BF6D), + AESx(0xCDCD814C), AESx(0x0C0C1814), AESx(0x13132635), AESx(0xECECC32F), + AESx(0x5F5FBEE1), AESx(0x979735A2), AESx(0x444488CC), AESx(0x17172E39), + AESx(0xC4C49357), AESx(0xA7A755F2), AESx(0x7E7EFC82), AESx(0x3D3D7A47), + AESx(0x6464C8AC), AESx(0x5D5DBAE7), AESx(0x1919322B), AESx(0x7373E695), + AESx(0x6060C0A0), AESx(0x81811998), AESx(0x4F4F9ED1), AESx(0xDCDCA37F), + AESx(0x22224466), AESx(0x2A2A547E), AESx(0x90903BAB), AESx(0x88880B83), + AESx(0x46468CCA), AESx(0xEEEEC729), AESx(0xB8B86BD3), AESx(0x1414283C), + AESx(0xDEDEA779), AESx(0x5E5EBCE2), AESx(0x0B0B161D), AESx(0xDBDBAD76), + AESx(0xE0E0DB3B), AESx(0x32326456), AESx(0x3A3A744E), AESx(0x0A0A141E), + AESx(0x494992DB), AESx(0x06060C0A), AESx(0x2424486C), AESx(0x5C5CB8E4), + AESx(0xC2C29F5D), AESx(0xD3D3BD6E), AESx(0xACAC43EF), AESx(0x6262C4A6), + AESx(0x919139A8), AESx(0x959531A4), AESx(0xE4E4D337), AESx(0x7979F28B), + AESx(0xE7E7D532), AESx(0xC8C88B43), AESx(0x37376E59), AESx(0x6D6DDAB7), + AESx(0x8D8D018C), AESx(0xD5D5B164), AESx(0x4E4E9CD2), AESx(0xA9A949E0), + AESx(0x6C6CD8B4), AESx(0x5656ACFA), AESx(0xF4F4F307), AESx(0xEAEACF25), + AESx(0x6565CAAF), AESx(0x7A7AF48E), AESx(0xAEAE47E9), AESx(0x08081018), + AESx(0xBABA6FD5), AESx(0x7878F088), AESx(0x25254A6F), AESx(0x2E2E5C72), + AESx(0x1C1C3824), AESx(0xA6A657F1), AESx(0xB4B473C7), AESx(0xC6C69751), + AESx(0xE8E8CB23), AESx(0xDDDDA17C), AESx(0x7474E89C), AESx(0x1F1F3E21), + AESx(0x4B4B96DD), AESx(0xBDBD61DC), AESx(0x8B8B0D86), AESx(0x8A8A0F85), + AESx(0x7070E090), AESx(0x3E3E7C42), 
AESx(0xB5B571C4), AESx(0x6666CCAA), + AESx(0x484890D8), AESx(0x03030605), AESx(0xF6F6F701), AESx(0x0E0E1C12), + AESx(0x6161C2A3), AESx(0x35356A5F), AESx(0x5757AEF9), AESx(0xB9B969D0), + AESx(0x86861791), AESx(0xC1C19958), AESx(0x1D1D3A27), AESx(0x9E9E27B9), + AESx(0xE1E1D938), AESx(0xF8F8EB13), AESx(0x98982BB3), AESx(0x11112233), + AESx(0x6969D2BB), AESx(0xD9D9A970), AESx(0x8E8E0789), AESx(0x949433A7), + AESx(0x9B9B2DB6), AESx(0x1E1E3C22), AESx(0x87871592), AESx(0xE9E9C920), + AESx(0xCECE8749), AESx(0x5555AAFF), AESx(0x28285078), AESx(0xDFDFA57A), + AESx(0x8C8C038F), AESx(0xA1A159F8), AESx(0x89890980), AESx(0x0D0D1A17), + AESx(0xBFBF65DA), AESx(0xE6E6D731), AESx(0x424284C6), AESx(0x6868D0B8), + AESx(0x414182C3), AESx(0x999929B0), AESx(0x2D2D5A77), AESx(0x0F0F1E11), + AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A) +}; + +__constant__ __align__(64) uint32_t d_AES2[256] = { + AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B), + AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5), + AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B), + AESx(0xFEE719FE), AESx(0xD7B562D7), AESx(0xAB4DE6AB), AESx(0x76EC9A76), + AESx(0xCA8F45CA), AESx(0x821F9D82), AESx(0xC98940C9), AESx(0x7DFA877D), + AESx(0xFAEF15FA), AESx(0x59B2EB59), AESx(0x478EC947), AESx(0xF0FB0BF0), + AESx(0xAD41ECAD), AESx(0xD4B367D4), AESx(0xA25FFDA2), AESx(0xAF45EAAF), + AESx(0x9C23BF9C), AESx(0xA453F7A4), AESx(0x72E49672), AESx(0xC09B5BC0), + AESx(0xB775C2B7), AESx(0xFDE11CFD), AESx(0x933DAE93), AESx(0x264C6A26), + AESx(0x366C5A36), AESx(0x3F7E413F), AESx(0xF7F502F7), AESx(0xCC834FCC), + AESx(0x34685C34), AESx(0xA551F4A5), AESx(0xE5D134E5), AESx(0xF1F908F1), + AESx(0x71E29371), AESx(0xD8AB73D8), AESx(0x31625331), AESx(0x152A3F15), + AESx(0x04080C04), AESx(0xC79552C7), AESx(0x23466523), AESx(0xC39D5EC3), + AESx(0x18302818), AESx(0x9637A196), AESx(0x050A0F05), AESx(0x9A2FB59A), + AESx(0x070E0907), AESx(0x12243612), AESx(0x801B9B80), 
AESx(0xE2DF3DE2), + AESx(0xEBCD26EB), AESx(0x274E6927), AESx(0xB27FCDB2), AESx(0x75EA9F75), + AESx(0x09121B09), AESx(0x831D9E83), AESx(0x2C58742C), AESx(0x1A342E1A), + AESx(0x1B362D1B), AESx(0x6EDCB26E), AESx(0x5AB4EE5A), AESx(0xA05BFBA0), + AESx(0x52A4F652), AESx(0x3B764D3B), AESx(0xD6B761D6), AESx(0xB37DCEB3), + AESx(0x29527B29), AESx(0xE3DD3EE3), AESx(0x2F5E712F), AESx(0x84139784), + AESx(0x53A6F553), AESx(0xD1B968D1), AESx(0x00000000), AESx(0xEDC12CED), + AESx(0x20406020), AESx(0xFCE31FFC), AESx(0xB179C8B1), AESx(0x5BB6ED5B), + AESx(0x6AD4BE6A), AESx(0xCB8D46CB), AESx(0xBE67D9BE), AESx(0x39724B39), + AESx(0x4A94DE4A), AESx(0x4C98D44C), AESx(0x58B0E858), AESx(0xCF854ACF), + AESx(0xD0BB6BD0), AESx(0xEFC52AEF), AESx(0xAA4FE5AA), AESx(0xFBED16FB), + AESx(0x4386C543), AESx(0x4D9AD74D), AESx(0x33665533), AESx(0x85119485), + AESx(0x458ACF45), AESx(0xF9E910F9), AESx(0x02040602), AESx(0x7FFE817F), + AESx(0x50A0F050), AESx(0x3C78443C), AESx(0x9F25BA9F), AESx(0xA84BE3A8), + AESx(0x51A2F351), AESx(0xA35DFEA3), AESx(0x4080C040), AESx(0x8F058A8F), + AESx(0x923FAD92), AESx(0x9D21BC9D), AESx(0x38704838), AESx(0xF5F104F5), + AESx(0xBC63DFBC), AESx(0xB677C1B6), AESx(0xDAAF75DA), AESx(0x21426321), + AESx(0x10203010), AESx(0xFFE51AFF), AESx(0xF3FD0EF3), AESx(0xD2BF6DD2), + AESx(0xCD814CCD), AESx(0x0C18140C), AESx(0x13263513), AESx(0xECC32FEC), + AESx(0x5FBEE15F), AESx(0x9735A297), AESx(0x4488CC44), AESx(0x172E3917), + AESx(0xC49357C4), AESx(0xA755F2A7), AESx(0x7EFC827E), AESx(0x3D7A473D), + AESx(0x64C8AC64), AESx(0x5DBAE75D), AESx(0x19322B19), AESx(0x73E69573), + AESx(0x60C0A060), AESx(0x81199881), AESx(0x4F9ED14F), AESx(0xDCA37FDC), + AESx(0x22446622), AESx(0x2A547E2A), AESx(0x903BAB90), AESx(0x880B8388), + AESx(0x468CCA46), AESx(0xEEC729EE), AESx(0xB86BD3B8), AESx(0x14283C14), + AESx(0xDEA779DE), AESx(0x5EBCE25E), AESx(0x0B161D0B), AESx(0xDBAD76DB), + AESx(0xE0DB3BE0), AESx(0x32645632), AESx(0x3A744E3A), AESx(0x0A141E0A), + AESx(0x4992DB49), AESx(0x060C0A06), AESx(0x24486C24), 
AESx(0x5CB8E45C), + AESx(0xC29F5DC2), AESx(0xD3BD6ED3), AESx(0xAC43EFAC), AESx(0x62C4A662), + AESx(0x9139A891), AESx(0x9531A495), AESx(0xE4D337E4), AESx(0x79F28B79), + AESx(0xE7D532E7), AESx(0xC88B43C8), AESx(0x376E5937), AESx(0x6DDAB76D), + AESx(0x8D018C8D), AESx(0xD5B164D5), AESx(0x4E9CD24E), AESx(0xA949E0A9), + AESx(0x6CD8B46C), AESx(0x56ACFA56), AESx(0xF4F307F4), AESx(0xEACF25EA), + AESx(0x65CAAF65), AESx(0x7AF48E7A), AESx(0xAE47E9AE), AESx(0x08101808), + AESx(0xBA6FD5BA), AESx(0x78F08878), AESx(0x254A6F25), AESx(0x2E5C722E), + AESx(0x1C38241C), AESx(0xA657F1A6), AESx(0xB473C7B4), AESx(0xC69751C6), + AESx(0xE8CB23E8), AESx(0xDDA17CDD), AESx(0x74E89C74), AESx(0x1F3E211F), + AESx(0x4B96DD4B), AESx(0xBD61DCBD), AESx(0x8B0D868B), AESx(0x8A0F858A), + AESx(0x70E09070), AESx(0x3E7C423E), AESx(0xB571C4B5), AESx(0x66CCAA66), + AESx(0x4890D848), AESx(0x03060503), AESx(0xF6F701F6), AESx(0x0E1C120E), + AESx(0x61C2A361), AESx(0x356A5F35), AESx(0x57AEF957), AESx(0xB969D0B9), + AESx(0x86179186), AESx(0xC19958C1), AESx(0x1D3A271D), AESx(0x9E27B99E), + AESx(0xE1D938E1), AESx(0xF8EB13F8), AESx(0x982BB398), AESx(0x11223311), + AESx(0x69D2BB69), AESx(0xD9A970D9), AESx(0x8E07898E), AESx(0x9433A794), + AESx(0x9B2DB69B), AESx(0x1E3C221E), AESx(0x87159287), AESx(0xE9C920E9), + AESx(0xCE8749CE), AESx(0x55AAFF55), AESx(0x28507828), AESx(0xDFA57ADF), + AESx(0x8C038F8C), AESx(0xA159F8A1), AESx(0x89098089), AESx(0x0D1A170D), + AESx(0xBF65DABF), AESx(0xE6D731E6), AESx(0x4284C642), AESx(0x68D0B868), + AESx(0x4182C341), AESx(0x9929B099), AESx(0x2D5A772D), AESx(0x0F1E110F), + AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16) +}; + +__constant__ __align__(64) uint32_t d_AES3[256] = { + AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B), + AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5), + AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B), + AESx(0xE719FEFE), AESx(0xB562D7D7), AESx(0x4DE6ABAB), AESx(0xEC9A7676), 
+ AESx(0x8F45CACA), AESx(0x1F9D8282), AESx(0x8940C9C9), AESx(0xFA877D7D), + AESx(0xEF15FAFA), AESx(0xB2EB5959), AESx(0x8EC94747), AESx(0xFB0BF0F0), + AESx(0x41ECADAD), AESx(0xB367D4D4), AESx(0x5FFDA2A2), AESx(0x45EAAFAF), + AESx(0x23BF9C9C), AESx(0x53F7A4A4), AESx(0xE4967272), AESx(0x9B5BC0C0), + AESx(0x75C2B7B7), AESx(0xE11CFDFD), AESx(0x3DAE9393), AESx(0x4C6A2626), + AESx(0x6C5A3636), AESx(0x7E413F3F), AESx(0xF502F7F7), AESx(0x834FCCCC), + AESx(0x685C3434), AESx(0x51F4A5A5), AESx(0xD134E5E5), AESx(0xF908F1F1), + AESx(0xE2937171), AESx(0xAB73D8D8), AESx(0x62533131), AESx(0x2A3F1515), + AESx(0x080C0404), AESx(0x9552C7C7), AESx(0x46652323), AESx(0x9D5EC3C3), + AESx(0x30281818), AESx(0x37A19696), AESx(0x0A0F0505), AESx(0x2FB59A9A), + AESx(0x0E090707), AESx(0x24361212), AESx(0x1B9B8080), AESx(0xDF3DE2E2), + AESx(0xCD26EBEB), AESx(0x4E692727), AESx(0x7FCDB2B2), AESx(0xEA9F7575), + AESx(0x121B0909), AESx(0x1D9E8383), AESx(0x58742C2C), AESx(0x342E1A1A), + AESx(0x362D1B1B), AESx(0xDCB26E6E), AESx(0xB4EE5A5A), AESx(0x5BFBA0A0), + AESx(0xA4F65252), AESx(0x764D3B3B), AESx(0xB761D6D6), AESx(0x7DCEB3B3), + AESx(0x527B2929), AESx(0xDD3EE3E3), AESx(0x5E712F2F), AESx(0x13978484), + AESx(0xA6F55353), AESx(0xB968D1D1), AESx(0x00000000), AESx(0xC12CEDED), + AESx(0x40602020), AESx(0xE31FFCFC), AESx(0x79C8B1B1), AESx(0xB6ED5B5B), + AESx(0xD4BE6A6A), AESx(0x8D46CBCB), AESx(0x67D9BEBE), AESx(0x724B3939), + AESx(0x94DE4A4A), AESx(0x98D44C4C), AESx(0xB0E85858), AESx(0x854ACFCF), + AESx(0xBB6BD0D0), AESx(0xC52AEFEF), AESx(0x4FE5AAAA), AESx(0xED16FBFB), + AESx(0x86C54343), AESx(0x9AD74D4D), AESx(0x66553333), AESx(0x11948585), + AESx(0x8ACF4545), AESx(0xE910F9F9), AESx(0x04060202), AESx(0xFE817F7F), + AESx(0xA0F05050), AESx(0x78443C3C), AESx(0x25BA9F9F), AESx(0x4BE3A8A8), + AESx(0xA2F35151), AESx(0x5DFEA3A3), AESx(0x80C04040), AESx(0x058A8F8F), + AESx(0x3FAD9292), AESx(0x21BC9D9D), AESx(0x70483838), AESx(0xF104F5F5), + AESx(0x63DFBCBC), AESx(0x77C1B6B6), AESx(0xAF75DADA), AESx(0x42632121), + 
AESx(0x20301010), AESx(0xE51AFFFF), AESx(0xFD0EF3F3), AESx(0xBF6DD2D2), + AESx(0x814CCDCD), AESx(0x18140C0C), AESx(0x26351313), AESx(0xC32FECEC), + AESx(0xBEE15F5F), AESx(0x35A29797), AESx(0x88CC4444), AESx(0x2E391717), + AESx(0x9357C4C4), AESx(0x55F2A7A7), AESx(0xFC827E7E), AESx(0x7A473D3D), + AESx(0xC8AC6464), AESx(0xBAE75D5D), AESx(0x322B1919), AESx(0xE6957373), + AESx(0xC0A06060), AESx(0x19988181), AESx(0x9ED14F4F), AESx(0xA37FDCDC), + AESx(0x44662222), AESx(0x547E2A2A), AESx(0x3BAB9090), AESx(0x0B838888), + AESx(0x8CCA4646), AESx(0xC729EEEE), AESx(0x6BD3B8B8), AESx(0x283C1414), + AESx(0xA779DEDE), AESx(0xBCE25E5E), AESx(0x161D0B0B), AESx(0xAD76DBDB), + AESx(0xDB3BE0E0), AESx(0x64563232), AESx(0x744E3A3A), AESx(0x141E0A0A), + AESx(0x92DB4949), AESx(0x0C0A0606), AESx(0x486C2424), AESx(0xB8E45C5C), + AESx(0x9F5DC2C2), AESx(0xBD6ED3D3), AESx(0x43EFACAC), AESx(0xC4A66262), + AESx(0x39A89191), AESx(0x31A49595), AESx(0xD337E4E4), AESx(0xF28B7979), + AESx(0xD532E7E7), AESx(0x8B43C8C8), AESx(0x6E593737), AESx(0xDAB76D6D), + AESx(0x018C8D8D), AESx(0xB164D5D5), AESx(0x9CD24E4E), AESx(0x49E0A9A9), + AESx(0xD8B46C6C), AESx(0xACFA5656), AESx(0xF307F4F4), AESx(0xCF25EAEA), + AESx(0xCAAF6565), AESx(0xF48E7A7A), AESx(0x47E9AEAE), AESx(0x10180808), + AESx(0x6FD5BABA), AESx(0xF0887878), AESx(0x4A6F2525), AESx(0x5C722E2E), + AESx(0x38241C1C), AESx(0x57F1A6A6), AESx(0x73C7B4B4), AESx(0x9751C6C6), + AESx(0xCB23E8E8), AESx(0xA17CDDDD), AESx(0xE89C7474), AESx(0x3E211F1F), + AESx(0x96DD4B4B), AESx(0x61DCBDBD), AESx(0x0D868B8B), AESx(0x0F858A8A), + AESx(0xE0907070), AESx(0x7C423E3E), AESx(0x71C4B5B5), AESx(0xCCAA6666), + AESx(0x90D84848), AESx(0x06050303), AESx(0xF701F6F6), AESx(0x1C120E0E), + AESx(0xC2A36161), AESx(0x6A5F3535), AESx(0xAEF95757), AESx(0x69D0B9B9), + AESx(0x17918686), AESx(0x9958C1C1), AESx(0x3A271D1D), AESx(0x27B99E9E), + AESx(0xD938E1E1), AESx(0xEB13F8F8), AESx(0x2BB39898), AESx(0x22331111), + AESx(0xD2BB6969), AESx(0xA970D9D9), AESx(0x07898E8E), AESx(0x33A79494), + 
AESx(0x2DB69B9B), AESx(0x3C221E1E), AESx(0x15928787), AESx(0xC920E9E9), + AESx(0x8749CECE), AESx(0xAAFF5555), AESx(0x50782828), AESx(0xA57ADFDF), + AESx(0x038F8C8C), AESx(0x59F8A1A1), AESx(0x09808989), AESx(0x1A170D0D), + AESx(0x65DABFBF), AESx(0xD731E6E6), AESx(0x84C64242), AESx(0xD0B86868), + AESx(0x82C34141), AESx(0x29B09999), AESx(0x5A772D2D), AESx(0x1E110F0F), + AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616) +}; + + +__device__ __forceinline__ +void aes_gpu_init(uint32_t *const sharedMemory) +{ + /* each thread startup will fill a uint32 */ + if(threadIdx.x < 256) + { + sharedMemory[threadIdx.x] = d_AES0[threadIdx.x]; + sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x]; + sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x]; + sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x]; + } +} + +#define xor4_32(a,b,c,d) (a) ^ (b) ^ (c) ^ (d) + +// with k0 +__device__ +static void aes_round(const uint32_t *const __restrict__ sharedMemory, const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + + y0 = xor4_32( + sharedMemory[__byte_perm(x0, 0, 0x4440)], + sharedMemory[__byte_perm(x1, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x2, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]) ^ k0; + + y1 = xor4_32( + sharedMemory[__byte_perm(x1, 0, 0x4440)], + sharedMemory[__byte_perm(x2, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x3, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]); + + y2 = xor4_32( + sharedMemory[__byte_perm(x2, 0, 0x4440)], + sharedMemory[__byte_perm(x3, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x0, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]); + + y3 = xor4_32( + sharedMemory[__byte_perm(x3, 0, 0x4440)], + sharedMemory[__byte_perm(x0, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x1, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); +} + 
+//without k0 +__device__ +static void aes_round(const uint32_t *const __restrict__ sharedMemory, const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + y0 = xor4_32( + sharedMemory[__byte_perm(x0, 0, 0x4440)], + sharedMemory[__byte_perm(x1, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x2, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]); + + y1 = xor4_32( + sharedMemory[__byte_perm(x1, 0, 0x4440)], + sharedMemory[__byte_perm(x2, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x3, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]); + + y2 = xor4_32( + sharedMemory[__byte_perm(x2, 0, 0x4440)], + sharedMemory[__byte_perm(x3, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x0, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]); + + y3 = xor4_32( + sharedMemory[__byte_perm(x3, 0, 0x4440)], + sharedMemory[__byte_perm(x0, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x1, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); +} diff --git a/fuguecoin.cpp b/fuguecoin.cpp index 4db4d54fa9..1d3234ace7 100644 --- a/fuguecoin.cpp +++ b/fuguecoin.cpp @@ -1,12 +1,17 @@ #include +#ifdef __cplusplus +#include +#else #include +#endif -#include "uint256.h" #include "sph/sph_fugue.h" #include "miner.h" - #include "cuda_fugue256.h" +#include +extern bool stop_mining; +extern volatile bool mining_has_stopped[MAX_GPUS]; extern "C" void my_fugue256_init(void *cc); extern "C" void my_fugue256(void *cc, const void *data, size_t len); @@ -16,28 +21,35 @@ extern "C" void my_fugue256_addbits_and_close(void *cc, unsigned ub, unsigned n, // vorbereitete Kontexte nach den ersten 80 Bytes // sph_fugue256_context ctx_fugue_const[MAX_GPUS]; -#define SWAP32(x) \ - ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ - (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) - -static bool init[MAX_GPUS] = { 0 }; +#define SWAP32(x) swab32(x) 
-extern "C" int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) +extern int scanhash_fugue256(int thr_id, uint32_t *pdata, uint32_t *ptarget, + uint32_t max_nonce, uint32_t *hashes_done) { - uint32_t start_nonce = pdata[19]++; + uint32_t start_nonce = pdata[19]; unsigned int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 19; - uint32_t throughput = device_intensity(thr_id, __func__, 1 << intensity); // 256*256*8 - throughput = min(throughput, max_nonce - start_nonce); + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, 1 << intensity); // 256*256*8 + uint32_t throughput = min(throughputmax, max_nonce - start_nonce) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0xf; + ptarget[7] = 0xf; // init - if(!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { - fugue256_cpu_init(thr_id, throughput); - init[thr_id] = true; +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (8 * sizeof(uint32_t))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + proper_exit(2); + } +#endif + fugue256_cpu_init(thr_id, throughputmax); + mining_has_stopped[thr_id] = false; + init = true; } // Endian Drehung ist notwendig @@ -53,6 +65,7 @@ extern "C" int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *pt uint32_t foundNounce = 0xFFFFFFFF; fugue256_cpu_hash(thr_id, throughput, pdata[19], NULL, &foundNounce); + if(stop_mining) {mining_has_stopped[thr_id] = true; pthread_exit(nullptr);} if(foundNounce < 0xffffffff) { uint32_t hash[8]; @@ -66,18 +79,24 @@ extern "C" int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *pt if (hash[7] <= Htarg && fulltest(hash, ptarget)) { + *hashes_done = pdata[19] - start_nonce + throughput; pdata[19] = foundNounce; - *hashes_done = foundNounce - start_nonce + 1; return 1; } else { - applog(LOG_INFO, "GPU #%d: 
result for nonce $%08X does not validate on CPU!", thr_id, foundNounce); + applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", device_map[thr_id], foundNounce); } } pdata[19] += throughput; + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + applog(LOG_ERR, "GPU #%d: %s", device_map[thr_id], cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - start_nonce + 1; + *hashes_done = pdata[19] - start_nonce; return 0; } diff --git a/groestl_functions_quad.cu b/groestl_functions_quad.cu index 3753866124..8fa044ab2a 100644 --- a/groestl_functions_quad.cu +++ b/groestl_functions_quad.cu @@ -1,18 +1,5 @@ #include "cuda_helper.h" -__device__ __forceinline__ void G256_Mul2(uint32_t *const regs) -{ - uint32_t tmp = regs[7]; - regs[7] = regs[6]; - regs[6] = regs[5]; - regs[5] = regs[4]; - regs[4] = regs[3] ^ tmp; - regs[3] = regs[2] ^ tmp; - regs[2] = regs[1]; - regs[1] = regs[0] ^ tmp; - regs[0] = tmp; -} - __device__ __forceinline__ void G256_AddRoundConstantQ_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, const int round) { x0 = ~x0; @@ -24,7 +11,7 @@ __device__ __forceinline__ void G256_AddRoundConstantQ_quad(uint32_t &x7, uint32 x6 = ~x6; x7 = ~x7; - uint32_t andmask1 = ((-((threadIdx.x & 0x03) == 3)) & 0xffff0000); + const uint32_t andmask1 = ((-((threadIdx.x & 0x03) == 3)) & 0xffff0000); x0 ^= ((-(round & 0x01)) & andmask1); x1 ^= ((-(round & 0x02)) & andmask1); @@ -38,7 +25,7 @@ __device__ __forceinline__ void G256_AddRoundConstantQ_quad(uint32_t &x7, uint32 __device__ __forceinline__ void G256_AddRoundConstantP_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, const int round) { - uint32_t andmask1 = ((threadIdx.x & 0x03) - 1) >> 16; + const uint32_t 
andmask1 = ((threadIdx.x & 0x03) - 1) >> 16; x4 ^= (0xAAAA & andmask1); x5 ^= (0xCCCC & andmask1); @@ -240,15 +227,6 @@ __device__ __forceinline__ void G256_ShiftBytesQ_quad(uint32_t &x7, uint32_t &x6 x7 = __byte_perm(t0, t1, 0x5410); } -#if __CUDA_ARCH__ < 300 -/** - * __shfl() returns the value of var held by the thread whose ID is given by srcLane. - * If srcLane is outside the range 0..width-1, the thread’s own value of var is returned. - */ -#undef __shfl -#define __shfl(var, srcLane, width) (uint32_t)(var) -#endif - __device__ __forceinline__ void G256_MixFunction_quad(uint32_t *r) { #define SHIFT64_16(hi, lo) __byte_perm(lo, hi, 0x5432) @@ -262,19 +240,34 @@ __device__ __forceinline__ void G256_MixFunction_quad(uint32_t *r) #define SINGLE_EVEN(i, bc) ( A(i, (bc)) ) uint32_t b[8]; -#pragma unroll 8 - for(int i=0;i<8;i++) - b[i] = DOUBLE_ODD(i, 1) ^ DOUBLE_EVEN(i, 3); - - G256_Mul2(b); -#pragma unroll 8 - for(int i=0;i<8;i++) - b[i] = b[i] ^ DOUBLE_ODD(i, 3) ^ DOUBLE_ODD(i, 4) ^ SINGLE_ODD(i, 6); - - G256_Mul2(b); -#pragma unroll 8 - for(int i=0;i<8;i++) - r[i] = b[i] ^ DOUBLE_EVEN(i, 2) ^ DOUBLE_EVEN(i, 3) ^ SINGLE_EVEN(i, 5); + b[0] = (S(0, (1)) ^ A(0, (1) + 1)) ^ DOUBLE_EVEN(0, 3); + b[1] = (S(1, (1)) ^ A(1, (1) + 1)) ^ DOUBLE_EVEN(1, 3); + b[2] = (S(2, (1)) ^ A(2, (1) + 1)) ^ DOUBLE_EVEN(2, 3); + b[3] = (S(3, (1)) ^ A(3, (1) + 1)) ^ DOUBLE_EVEN(3, 3); + b[4] = (S(4, (1)) ^ A(4, (1) + 1)) ^ DOUBLE_EVEN(4, 3); + b[5] = (S(5, (1)) ^ A(5, (1) + 1)) ^ DOUBLE_EVEN(5, 3); + b[6] = (S(6, (1)) ^ A(6, (1) + 1)) ^ DOUBLE_EVEN(6, 3); + b[7] = (S(7, (1)) ^ A(7, (1) + 1)) ^ DOUBLE_EVEN(7, 3); + + uint32_t tmp = b[7]; + b[7] = b[6] ^ (S(7, (3)) ^ A(7, (3) + 1)) ^ DOUBLE_ODD(7, 4) ^ SINGLE_ODD(7, 6); + b[6] = b[5] ^ (S(6, (3)) ^ A(6, (3) + 1)) ^ DOUBLE_ODD(6, 4) ^ SINGLE_ODD(6, 6); + b[5] = b[4] ^ (S(5, (3)) ^ A(5, (3) + 1)) ^ DOUBLE_ODD(5, 4) ^ SINGLE_ODD(5, 6); + b[4] = b[3] ^ (S(4, (3)) ^ A(4, (3) + 1)) ^ DOUBLE_ODD(4, 4) ^ SINGLE_ODD(4, 6) ^ tmp; + b[3] = b[2] ^ (S(3, 
(3)) ^ A(3, (3) + 1)) ^ DOUBLE_ODD(3, 4) ^ SINGLE_ODD(3, 6) ^ tmp; + b[2] = b[1] ^ (S(2, (3)) ^ A(2, (3) + 1)) ^ DOUBLE_ODD(2, 4) ^ SINGLE_ODD(2, 6); + b[1] = b[0] ^ (S(1, (3)) ^ A(1, (3) + 1)) ^ DOUBLE_ODD(1, 4) ^ SINGLE_ODD(1, 6) ^ tmp; + b[0] = tmp ^ (S(0, (3)) ^ A(0, (3) + 1)) ^ DOUBLE_ODD(0, 4) ^ SINGLE_ODD(0, 6); + + tmp = b[7]; + r[7] = b[6] ^ DOUBLE_EVEN(7, 2) ^ DOUBLE_EVEN(7, 3) ^ SINGLE_EVEN(7, 5); + r[6] = b[5] ^ DOUBLE_EVEN(6, 2) ^ DOUBLE_EVEN(6, 3) ^ SINGLE_EVEN(6, 5); + r[5] = b[4] ^ DOUBLE_EVEN(5, 2) ^ DOUBLE_EVEN(5, 3) ^ SINGLE_EVEN(5, 5); + r[4] = b[3] ^ DOUBLE_EVEN(4, 2) ^ DOUBLE_EVEN(4, 3) ^ SINGLE_EVEN(4, 5) ^ tmp; + r[3] = b[2] ^ DOUBLE_EVEN(3, 2) ^ DOUBLE_EVEN(3, 3) ^ SINGLE_EVEN(3, 5) ^ tmp; + r[2] = b[1] ^ DOUBLE_EVEN(2, 2) ^ DOUBLE_EVEN(2, 3) ^ SINGLE_EVEN(2, 5); + r[1] = b[0] ^ DOUBLE_EVEN(1, 2) ^ DOUBLE_EVEN(1, 3) ^ SINGLE_EVEN(1, 5)^tmp; + r[0] = tmp ^ DOUBLE_EVEN(0, 2) ^ DOUBLE_EVEN(0, 3) ^ SINGLE_EVEN(0, 5); #undef S #undef A @@ -285,19 +278,174 @@ __device__ __forceinline__ void G256_MixFunction_quad(uint32_t *r) __device__ __forceinline__ void groestl512_perm_P_quad(uint32_t *const r) { +#if __CUDA_ARCH__ > 500 + const uint32_t andmask1 = ((threadIdx.x & 0x03) - 1) >> 16; + + r[4] ^= (0xAAAA & andmask1); + r[5] ^= (0xCCCC & andmask1); + r[6] ^= (0xF0F0 & andmask1); + r[7] ^= (0xFF00 & andmask1); + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + + r[4] ^= (0xAAAA & andmask1); + r[5] ^= (0xCCCC & andmask1); + r[6] ^= (0xF0F0 & andmask1); + r[7] ^= (0xFF00 & andmask1); + r[0] ^= andmask1; + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + + r[4] ^= (0xAAAA & andmask1); + r[5] ^= (0xCCCC & andmask1); + r[6] ^= (0xF0F0 & andmask1); + r[7] ^= (0xFF00 & andmask1); + r[1] ^= andmask1; + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + + for 
(int round = 3; round<14; round++) + { + G256_AddRoundConstantP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], round); + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + } + +#else + for (int round = 0; round<14; round++) + { + G256_AddRoundConstantP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], round); + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + } +#endif - for(int round=0;round<14;round++) - { - G256_AddRoundConstantP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], round); - sbox_quad(r); - G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); - G256_MixFunction_quad(r); - } +/* + + + +r[4] ^= (0xAAAA & andmask1); +r[5] ^= (0xCCCC & andmask1); +r[6] ^= (0xF0F0 & andmask1); +r[7] ^= (0xFF00 & andmask1); +r[0] ^= andmask1; +r[1] ^= andmask1; +sbox_quad(r); +G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); +G256_MixFunction_quad(r); + +r[4] ^= (0xAAAA & andmask1); +r[5] ^= (0xCCCC & andmask1); +r[6] ^= (0xF0F0 & andmask1); +r[7] ^= (0xFF00 & andmask1); +r[2] ^= andmask1; +sbox_quad(r); +G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + + + r[4] ^= (0xAAAA & andmask1); + r[5] ^= (0xCCCC & andmask1); + r[6] ^= (0xF0F0 & andmask1); + r[7] ^= (0xFF00 & andmask1); + r[2] ^= andmask1; + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + r[4] ^= (0xAAAA & andmask1); + r[5] ^= (0xCCCC & andmask1); + r[6] ^= (0xF0F0 & andmask1); + r[7] ^= (0xFF00 & andmask1); + r[0] ^= andmask1; + r[2] ^= andmask1; + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + r[4] ^= (0xAAAA & andmask1); + r[5] ^= (0xCCCC & andmask1); + r[6] ^= (0xF0F0 & andmask1); + r[7] ^= (0xFF00 & andmask1); + r[1] ^= andmask1; + r[2] ^= 
andmask1; + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + r[4] ^= (0xAAAA & andmask1); + r[5] ^= (0xCCCC & andmask1); + r[6] ^= (0xF0F0 & andmask1); + r[7] ^= (0xFF00 & andmask1); + r[0] ^= andmask1; + r[1] ^= andmask1; + r[2] ^= andmask1; + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + r[4] ^= (0xAAAA & andmask1); + r[5] ^= (0xCCCC & andmask1); + r[6] ^= (0xF0F0 & andmask1); + r[7] ^= (0xFF00 & andmask1); + r[3] ^= andmask1; + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + r[4] ^= (0xAAAA & andmask1); + r[5] ^= (0xCCCC & andmask1); + r[6] ^= (0xF0F0 & andmask1); + r[7] ^= (0xFF00 & andmask1); + r[0] ^= andmask1; + r[3] ^= andmask1; + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + r[4] ^= (0xAAAA & andmask1); + r[5] ^= (0xCCCC & andmask1); + r[6] ^= (0xF0F0 & andmask1); + r[7] ^= (0xFF00 & andmask1); + r[1] ^= andmask1; + r[3] ^= andmask1; + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + r[4] ^= (0xAAAA & andmask1); + r[5] ^= (0xCCCC & andmask1); + r[6] ^= (0xF0F0 & andmask1); + r[7] ^= (0xFF00 & andmask1); + r[0] ^= andmask1; + r[1] ^= andmask1; + r[3] ^= andmask1; + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + r[4] ^= (0xAAAA & andmask1); + r[5] ^= (0xCCCC & andmask1); + r[6] ^= (0xF0F0 & andmask1); + r[7] ^= (0xFF00 & andmask1); + r[2] ^= andmask1; + r[3] ^= andmask1; + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + r[4] ^= (0xAAAA & andmask1); + r[5] ^= (0xCCCC & andmask1); + r[6] ^= (0xF0F0 & andmask1); + r[7] ^= (0xFF00 & andmask1); + r[0] ^= andmask1; + r[2] ^= andmask1; + 
r[3] ^= andmask1; + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + */ } __device__ __forceinline__ void groestl512_perm_Q_quad(uint32_t *const r) { - for(int round=0;round<14;round++) + for (int round = 0; round<14; round++) { G256_AddRoundConstantQ_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], round); sbox_quad(r); @@ -308,18 +456,46 @@ __device__ __forceinline__ void groestl512_perm_Q_quad(uint32_t *const r) __device__ __forceinline__ void groestl512_progressMessage_quad(uint32_t *const __restrict__ state, uint32_t *const __restrict__ message) { -#pragma unroll 8 - for(int u=0;u<8;u++) state[u] = message[u]; + state[0] = message[0]; + state[1] = message[1]; + state[2] = message[2]; + state[3] = message[3]; + state[4] = message[4]; + state[5] = message[5]; + state[6] = message[6]; + state[7] = message[7]; if ((threadIdx.x & 0x03) == 3) state[ 1] ^= 0x00008000; groestl512_perm_P_quad(state); if ((threadIdx.x & 0x03) == 3) state[ 1] ^= 0x00008000; groestl512_perm_Q_quad(message); -#pragma unroll 8 - for(int u=0;u<8;u++) state[u] ^= message[u]; -#pragma unroll 8 - for(int u=0;u<8;u++) message[u] = state[u]; - groestl512_perm_P_quad(message); -#pragma unroll 8 - for(int u=0;u<8;u++) state[u] ^= message[u]; + + state[0] ^= message[0]; + state[1] ^= message[1]; + state[2] ^= message[2]; + state[3] ^= message[3]; + state[4] ^= message[4]; + state[5] ^= message[5]; + state[6] ^= message[6]; + state[7] ^= message[7]; + + message[0] = state[0]; + message[1] = state[1]; + message[2] = state[2]; + message[3] = state[3]; + message[4] = state[4]; + message[5] = state[5]; + message[6] = state[6]; + message[7] = state[7]; + + groestl512_perm_P_quad(message); + + state[0] ^= message[0]; + state[1] ^= message[1]; + state[2] ^= message[2]; + state[3] ^= message[3]; + state[4] ^= message[4]; + state[5] ^= message[5]; + state[6] ^= message[6]; + state[7] ^= message[7]; } diff --git a/groestlcoin.cpp 
b/groestlcoin.cpp index f732be93c2..9897a66888 100644 --- a/groestlcoin.cpp +++ b/groestlcoin.cpp @@ -1,19 +1,35 @@ + #include +#ifdef __cplusplus +#include +#else #include +#endif #include -#include "uint256.h" #include "sph/sph_groestl.h" #include "cuda_groestlcoin.h" #include "miner.h" - -#define SWAP32(x) \ - ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ - (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) +#include +#include +extern bool stop_mining; +extern volatile bool mining_has_stopped[MAX_GPUS]; + +#define CUDA_SAFE_CALL(call) \ +do { \ + cudaError_t err = call; \ + if (cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\n", \ + __FUNCTION__, __LINE__, cudaGetErrorString(err) ); \ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +#define SWAP32(x) swab32(x) // CPU-groestl -extern "C" void groestlhash(void *state, const void *input) +void groestlhash(void *state, const void *input) { sph_groestl512_context ctx_groestl; @@ -31,25 +47,34 @@ extern "C" void groestlhash(void *state, const void *input) memcpy(state, hashB, 32); } -static bool init[MAX_GPUS] = { 0 }; +extern cudaStream_t gpustream[MAX_GPUS]; -extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) +extern int scanhash_groestlcoin(int thr_id, uint32_t *pdata, uint32_t *ptarget, + uint32_t max_nonce, uint32_t *hashes_done) { - uint32_t start_nonce = pdata[19]++; - uint32_t throughput = device_intensity(thr_id, __func__, 1 << 19); // 256*256*8 - throughput = min(throughput, max_nonce - start_nonce); + static THREAD uint32_t *foundNounce = nullptr; - uint32_t *outputHash = (uint32_t*)malloc(throughput * 16 * sizeof(uint32_t)); + uint32_t start_nonce = pdata[19]; + unsigned int intensity = (device_sm[device_map[thr_id]] > 500) ? 
24 : 23; + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, 1U << intensity); + uint32_t throughput = min(throughputmax, max_nonce - start_nonce) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x000000ff; + ptarget[7] = 0x0000000f; // init - if(!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { - groestlcoin_cpu_init(thr_id, throughput); - init[thr_id] = true; + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + + groestlcoin_cpu_init(thr_id, throughputmax); + CUDA_SAFE_CALL(cudaMallocHost(&foundNounce, 2 * 4)); + mining_has_stopped[thr_id] = false; + init = true; } // Endian Drehung ist notwendig @@ -58,38 +83,69 @@ extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t be32enc(&endiandata[kk], pdata[kk]); // Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt) - groestlcoin_cpu_setBlock(thr_id, endiandata, (void*)ptarget); - - do { - // GPU - uint32_t foundNounce = 0xFFFFFFFF; - const uint32_t Htarg = ptarget[7]; - - groestlcoin_cpu_hash(thr_id, throughput, pdata[19], outputHash, &foundNounce); - - if(foundNounce < 0xffffffff) - { - uint32_t tmpHash[8]; - endiandata[19] = SWAP32(foundNounce); - groestlhash(tmpHash, endiandata); - - if (tmpHash[7] <= Htarg && fulltest(tmpHash, ptarget)) { - pdata[19] = foundNounce; - *hashes_done = foundNounce - start_nonce + 1; - free(outputHash); - return true; - } else { - applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNounce); - } - - foundNounce = 0xffffffff; - } + groestlcoin_cpu_setBlock(thr_id, endiandata); + + do + { + // GPU + const uint32_t Htarg = ptarget[7]; + + groestlcoin_cpu_hash(thr_id, throughput, pdata[19], foundNounce, ptarget[7]); + + if(stop_mining) {mining_has_stopped[thr_id] = true; 
cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(foundNounce[0] < 0xffffffff) + { + uint32_t tmpHash[8]; + endiandata[19] = SWAP32(foundNounce[0]); + groestlhash(tmpHash, endiandata); + + if(tmpHash[7] <= Htarg && fulltest(tmpHash, ptarget)) + { + int res = 1; + if(opt_benchmark) + applog(LOG_INFO, "GPU #%d Found nounce %08x", device_map[thr_id], foundNounce[0]); + *hashes_done = pdata[19] - start_nonce + throughput; + if(foundNounce[1] != 0xffffffff) + { + endiandata[19] = SWAP32(foundNounce[1]); + groestlhash(tmpHash, endiandata); + if(tmpHash[7] <= Htarg && fulltest(tmpHash, ptarget)) + { + pdata[21] = foundNounce[1]; + res++; + if(opt_benchmark) + applog(LOG_INFO, "GPU #%d Found second nounce %08x", device_map[thr_id], foundNounce[1]); + } + else + { + if(tmpHash[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNounce[1]); + } + } + } + pdata[19] = foundNounce[0]; + return res; + } + else + { + if(tmpHash[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNounce[0]); + } + } + } pdata[19] += throughput; - } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - - *hashes_done = pdata[19] - start_nonce + 1; - free(outputHash); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) + { + applog(LOG_ERR, "GPU #%d: %s", device_map[thr_id], cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } + } while(!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); + + *hashes_done = pdata[19] - start_nonce; return 0; } diff --git a/heavy/cuda_blake512.cu b/heavy/cuda_blake512.cu deleted file mode 100644 index 75e7c13b1f..0000000000 --- a/heavy/cuda_blake512.cu +++ /dev/null @@ -1,253 +0,0 @@ -#include -#include - -#include "cuda_helper.h" - -// globaler Speicher für alle HeftyHashes aller Threads -extern 
uint32_t *heavy_heftyHashes[MAX_GPUS]; -extern uint32_t *heavy_nonceVector[MAX_GPUS]; - -// globaler Speicher für unsere Ergebnisse -uint32_t *d_hash5output[MAX_GPUS]; - -// die Message (112 bzw. 116 Bytes) mit Padding zur Berechnung auf der GPU -__constant__ uint64_t c_PaddedMessage[16]; // padded message (80/84+32 bytes + padding) - - -// ---------------------------- BEGIN CUDA blake512 functions ------------------------------------ - -__constant__ uint8_t c_sigma[16][16]; - -const uint8_t host_sigma[16][16] = -{ - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 }, - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } -}; - -/* in cuda_helper */ -#define SWAP32(x) cuda_swab32(x) -#define SWAP64(x) cuda_swab64(x) - -__constant__ uint64_t c_SecondRound[15]; - -const uint64_t host_SecondRound[15] = -{ - 0,0,0,0,0,0,0,0,0,0,0,0,0,SWAP64(1),0 -}; - -__constant__ uint64_t c_u512[16]; - -const uint64_t host_u512[16] = -{ - 0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, - 0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL, - 0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, - 0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL, - 0x9216d5d98979fb1bULL, 
0xd1310ba698dfb5acULL, - 0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL, - 0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, - 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL -}; - - -#define G(a,b,c,d,e) \ - v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\ - v[d] = SWAPDWORDS( v[d] ^ v[a]); \ - v[c] += v[d]; \ - v[b] = ROTR64( v[b] ^ v[c],25); \ - v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b]; \ - v[d] = ROTR64( v[d] ^ v[a],16); \ - v[c] += v[d]; \ - v[b] = ROTR64( v[b] ^ v[c],11); - -template __device__ void blake512_compress( uint64_t *h, const uint64_t *block, int nullt, const uint8_t ((*sigma)[16]), const uint64_t *u512 ) -{ - uint64_t v[16], m[16], i; - -#pragma unroll 16 - for( i = 0; i < 16; ++i ) m[i] = cuda_swab64(block[i]); - -#pragma unroll 8 - for( i = 0; i < 8; ++i ) v[i] = h[i]; - - v[ 8] = u512[0]; - v[ 9] = u512[1]; - v[10] = u512[2]; - v[11] = u512[3]; - v[12] = u512[4]; - v[13] = u512[5]; - v[14] = u512[6]; - v[15] = u512[7]; - - /* don't xor t when the block is only padding */ - if ( !nullt ) { - v[12] ^= 8*(BLOCKSIZE+32); - v[13] ^= 8*(BLOCKSIZE+32); - } - -//#pragma unroll 16 - for( i = 0; i < 16; ++i ) - { - /* column step */ - G( 0, 4, 8, 12, 0 ); - G( 1, 5, 9, 13, 2 ); - G( 2, 6, 10, 14, 4 ); - G( 3, 7, 11, 15, 6 ); - /* diagonal step */ - G( 0, 5, 10, 15, 8 ); - G( 1, 6, 11, 12, 10 ); - G( 2, 7, 8, 13, 12 ); - G( 3, 4, 9, 14, 14 ); - } - -#pragma unroll 16 - for( i = 0; i < 16; ++i ) h[i & 7] ^= v[i]; -} - -template __global__ void blake512_gpu_hash(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - // bestimme den aktuellen Zähler - //uint32_t nounce = startNounce + thread; - uint32_t nounce = nonceVector[thread]; - - // Index-Position des Hashes in den Hash Puffern bestimmen (Hefty1 und outputHash) - uint32_t hashPosition = nounce - startNounce; - - // State vorbereiten - uint64_t h[8]; - 
h[0] = 0x6a09e667f3bcc908ULL; - h[1] = 0xbb67ae8584caa73bULL; - h[2] = 0x3c6ef372fe94f82bULL; - h[3] = 0xa54ff53a5f1d36f1ULL; - h[4] = 0x510e527fade682d1ULL; - h[5] = 0x9b05688c2b3e6c1fULL; - h[6] = 0x1f83d9abfb41bd6bULL; - h[7] = 0x5be0cd19137e2179ULL; - - // 128 Byte für die Message - uint64_t buf[16]; - - // Message für die erste Runde in Register holen -#pragma unroll 16 - for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage[i]; - - // die Nounce durch die thread-spezifische ersetzen - buf[9] = REPLACE_HIWORD(buf[9], nounce); - - uint32_t *hefty = heftyHashes + 8 * hashPosition; - if (BLOCKSIZE == 84) { - // den thread-spezifischen Hefty1 hash einsetzen - // aufwändig, weil das nicht mit uint64_t Wörtern aligned ist. - buf[10] = REPLACE_HIWORD(buf[10], hefty[0]); - buf[11] = REPLACE_LOWORD(buf[11], hefty[1]); - buf[11] = REPLACE_HIWORD(buf[11], hefty[2]); - buf[12] = REPLACE_LOWORD(buf[12], hefty[3]); - buf[12] = REPLACE_HIWORD(buf[12], hefty[4]); - buf[13] = REPLACE_LOWORD(buf[13], hefty[5]); - buf[13] = REPLACE_HIWORD(buf[13], hefty[6]); - buf[14] = REPLACE_LOWORD(buf[14], hefty[7]); - } - else if (BLOCKSIZE == 80) { - buf[10] = MAKE_ULONGLONG(hefty[0], hefty[1]); - buf[11] = MAKE_ULONGLONG(hefty[2], hefty[3]); - buf[12] = MAKE_ULONGLONG(hefty[4], hefty[5]); - buf[13] = MAKE_ULONGLONG(hefty[6], hefty[7]); - } - - // erste Runde - blake512_compress( h, buf, 0, c_sigma, c_u512 ); - - - // zweite Runde -#pragma unroll 15 - for (int i=0; i < 15; ++i) buf[i] = c_SecondRound[i]; - buf[15] = SWAP64(8*(BLOCKSIZE+32)); // Blocksize in Bits einsetzen - blake512_compress( h, buf, 1, c_sigma, c_u512 ); - - // Hash rauslassen - uint64_t *outHash = (uint64_t *)outputHash + 8 * hashPosition; -#pragma unroll 8 - for (int i=0; i < 8; ++i) outHash[i] = cuda_swab64( h[i] ); - } -} - - -// ---------------------------- END CUDA blake512 functions ------------------------------------ - -// Setup-Funktionen -__host__ void blake512_cpu_init(int thr_id, uint32_t threads) -{ - // 
Kopiere die Hash-Tabellen in den GPU-Speicher - cudaMemcpyToSymbol( c_sigma, - host_sigma, - sizeof(host_sigma), - 0, cudaMemcpyHostToDevice); - - cudaMemcpyToSymbol( c_u512, - host_u512, - sizeof(host_u512), - 0, cudaMemcpyHostToDevice); - - cudaMemcpyToSymbol( c_SecondRound, - host_SecondRound, - sizeof(host_SecondRound), - 0, cudaMemcpyHostToDevice); - - // Speicher für alle Ergebnisse belegen - CUDA_SAFE_CALL(cudaMalloc(&d_hash5output[thr_id], 16 * sizeof(uint32_t) * threads)); -} - -static int BLOCKSIZE = 84; - -__host__ void blake512_cpu_setBlock(void *pdata, int len) - // data muss 84-Byte haben! - // heftyHash hat 32-Byte -{ - unsigned char PaddedMessage[128]; - if (len == 84) { - // Message mit Padding für erste Runde bereitstellen - memcpy(PaddedMessage, pdata, 84); - memset(PaddedMessage+84, 0, 32); // leeres Hefty Hash einfüllen - memset(PaddedMessage+116, 0, 12); - PaddedMessage[116] = 0x80; - } else if (len == 80) { - memcpy(PaddedMessage, pdata, 80); - memset(PaddedMessage+80, 0, 32); // leeres Hefty Hash einfüllen - memset(PaddedMessage+112, 0, 16); - PaddedMessage[112] = 0x80; - } - // die Message (116 Bytes) ohne Padding zur Berechnung auf der GPU - cudaMemcpyToSymbol( c_PaddedMessage, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); - BLOCKSIZE = len; -} - -__host__ void blake512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce) -{ - const uint32_t threadsperblock = 256; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - if (BLOCKSIZE == 80) - blake512_gpu_hash<80><<>>(threads, startNounce, d_hash5output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]); - else if (BLOCKSIZE == 84) - blake512_gpu_hash<84><<>>(threads, startNounce, d_hash5output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]); -} diff --git a/heavy/cuda_blake512.h b/heavy/cuda_blake512.h deleted file mode 100644 index 
7e24973348..0000000000 --- a/heavy/cuda_blake512.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef _CUDA_BLAKE512_H -#define _CUDA_BLAKE512_H - -void blake512_cpu_init(int thr_id, int threads); -void blake512_cpu_setBlock(void *pdata, int len); -void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce); -#endif diff --git a/heavy/cuda_combine.cu b/heavy/cuda_combine.cu deleted file mode 100644 index 3365cf18c7..0000000000 --- a/heavy/cuda_combine.cu +++ /dev/null @@ -1,145 +0,0 @@ -#include - -#include "cuda_helper.h" - -// globaler Speicher für unsere Ergebnisse -static uint32_t *d_hashoutput[MAX_GPUS]; -extern uint32_t *d_hash2output[MAX_GPUS]; -extern uint32_t *d_hash3output[MAX_GPUS]; -extern uint32_t *d_hash4output[MAX_GPUS]; -extern uint32_t *d_hash5output[MAX_GPUS]; - -extern uint32_t *heavy_nonceVector[MAX_GPUS]; - -/* Combines top 64-bits from each hash into a single hash */ -__device__ -static void combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4) -{ - uint32_t lout[8]; // Combining in Registern machen - -#pragma unroll 8 - for (int i=0; i < 8; ++i) - lout[i] = 0; - - // das Makro setzt jeweils 4 Bits aus vier verschiedenen Hashes zu einem Nibble zusammen -#define MIX(bits, mask, i) \ - lout[(255 - (bits+3))/32] <<= 4; \ - if ((hash1[i] & mask) != 0) lout[(255 - (bits+0))/32] |= 8; \ - if ((hash2[i] & mask) != 0) lout[(255 - (bits+1))/32] |= 4; \ - if ((hash3[i] & mask) != 0) lout[(255 - (bits+2))/32] |= 2; \ - if ((hash4[i] & mask) != 0) lout[(255 - (bits+3))/32] |= 1; \ - - /* Transpose first 64 bits of each hash into out */ - MIX( 0, 0x80000000, 7); - MIX( 4, 0x40000000, 7); - MIX( 8, 0x20000000, 7); - MIX( 12, 0x10000000, 7); - MIX( 16, 0x08000000, 7); - MIX( 20, 0x04000000, 7); - MIX( 24, 0x02000000, 7); - MIX( 28, 0x01000000, 7); - MIX( 32, 0x00800000, 7); - MIX( 36, 0x00400000, 7); - MIX( 40, 0x00200000, 7); - MIX( 44, 0x00100000, 7); - MIX( 48, 0x00080000, 7); - MIX( 52, 0x00040000, 7); - MIX( 56, 
0x00020000, 7); - MIX( 60, 0x00010000, 7); - MIX( 64, 0x00008000, 7); - MIX( 68, 0x00004000, 7); - MIX( 72, 0x00002000, 7); - MIX( 76, 0x00001000, 7); - MIX( 80, 0x00000800, 7); - MIX( 84, 0x00000400, 7); - MIX( 88, 0x00000200, 7); - MIX( 92, 0x00000100, 7); - MIX( 96, 0x00000080, 7); - MIX(100, 0x00000040, 7); - MIX(104, 0x00000020, 7); - MIX(108, 0x00000010, 7); - MIX(112, 0x00000008, 7); - MIX(116, 0x00000004, 7); - MIX(120, 0x00000002, 7); - MIX(124, 0x00000001, 7); - - MIX(128, 0x80000000, 6); - MIX(132, 0x40000000, 6); - MIX(136, 0x20000000, 6); - MIX(140, 0x10000000, 6); - MIX(144, 0x08000000, 6); - MIX(148, 0x04000000, 6); - MIX(152, 0x02000000, 6); - MIX(156, 0x01000000, 6); - MIX(160, 0x00800000, 6); - MIX(164, 0x00400000, 6); - MIX(168, 0x00200000, 6); - MIX(172, 0x00100000, 6); - MIX(176, 0x00080000, 6); - MIX(180, 0x00040000, 6); - MIX(184, 0x00020000, 6); - MIX(188, 0x00010000, 6); - MIX(192, 0x00008000, 6); - MIX(196, 0x00004000, 6); - MIX(200, 0x00002000, 6); - MIX(204, 0x00001000, 6); - MIX(208, 0x00000800, 6); - MIX(212, 0x00000400, 6); - MIX(216, 0x00000200, 6); - MIX(220, 0x00000100, 6); - MIX(224, 0x00000080, 6); - MIX(228, 0x00000040, 6); - MIX(232, 0x00000020, 6); - MIX(236, 0x00000010, 6); - MIX(240, 0x00000008, 6); - MIX(244, 0x00000004, 6); - MIX(248, 0x00000002, 6); - MIX(252, 0x00000001, 6); - -#pragma unroll 8 - for (int i=0; i < 8; ++i) - out[i] = lout[i]; -} - -__global__ -void combine_gpu_hash(uint32_t threads, uint32_t startNounce, uint32_t *out, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4, uint32_t *hash5, uint32_t *nonceVector) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = nonceVector[thread]; - uint32_t hashPosition = nounce - startNounce; - // Die Aufgabe der combine-funktion besteht aus zwei Teilen. 
- // 1) Komprimiere die hashes zu einem kleinen Array - // 2) Errechne dort den combines-value - - // Die Kompression wird dadurch verwirklicht, dass im out-array weiterhin mit "thread" indiziert - // wird. Die anderen Werte werden mit der nonce indiziert - - combine_hashes(&out[8 * thread], &hash2[8 * hashPosition], &hash3[16 * hashPosition], &hash4[16 * hashPosition], &hash5[16 * hashPosition]); - } -} - -__host__ -void combine_cpu_init(int thr_id, uint32_t threads) -{ - // Speicher für alle Ergebnisse belegen - CUDA_SAFE_CALL(cudaMalloc(&d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads)); -} - -__host__ -void combine_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *hash) -{ - // diese Kopien sind optional, da die Hashes jetzt bereits auf der GPU liegen sollten - - const uint32_t threadsperblock = 128; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - combine_gpu_hash <<>> (threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], heavy_nonceVector[thr_id]); - - // da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden - CUDA_SAFE_CALL(cudaMemcpy(hash, d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost)); -} diff --git a/heavy/cuda_combine.h b/heavy/cuda_combine.h deleted file mode 100644 index 5bb5832d19..0000000000 --- a/heavy/cuda_combine.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef _CUDA_COMBINE_H -#define _CUDA_COMBINE_H - -void combine_cpu_init(int thr_id, int threads); -void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash); - -#endif diff --git a/heavy/cuda_groestl512.cu b/heavy/cuda_groestl512.cu deleted file mode 100644 index eac60fdaad..0000000000 --- a/heavy/cuda_groestl512.cu +++ /dev/null @@ -1,816 +0,0 @@ -#include -#include - -#define 
SPH_C32(x) ((uint32_t)(x ## U)) -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) - -#include "cuda_helper.h" - -// globaler Speicher für alle HeftyHashes aller Threads -extern uint32_t *heavy_heftyHashes[MAX_GPUS]; -extern uint32_t *heavy_nonceVector[MAX_GPUS]; - -// globaler Speicher für unsere Ergebnisse -uint32_t *d_hash4output[MAX_GPUS]; - -__constant__ uint32_t groestl_gpu_state[32]; -__constant__ uint32_t groestl_gpu_msg[32]; - -#define PC32up(j, r) ((uint32_t)((j) + (r))) -#define PC32dn(j, r) 0 -#define QC32up(j, r) 0xFFFFFFFF -#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ (~((uint32_t)(j) << 24))) - -#define B32_0(x) ((x) & 0xFF) -#define B32_1(x) (((x) >> 8) & 0xFF) -#define B32_2(x) (((x) >> 16) & 0xFF) -#define B32_3(x) ((x) >> 24) - -#define C32e(x) ((SPH_C32(x) >> 24) \ - | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ - | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ - | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) - -#define T0up(x) tex1Dfetch(t0up, x) -#define T0dn(x) tex1Dfetch(t0dn, x) -#define T1up(x) tex1Dfetch(t1up, x) -#define T1dn(x) tex1Dfetch(t1dn, x) -#define T2up(x) tex1Dfetch(t2up, x) -#define T2dn(x) tex1Dfetch(t2dn, x) -#define T3up(x) tex1Dfetch(t3up, x) -#define T3dn(x) tex1Dfetch(t3dn, x) - -texture t0up; -texture t0dn; -texture t1up; -texture t1dn; -texture t2up; -texture t2dn; -texture t3up; -texture t3dn; - -uint32_t T0up_cpu[] = { - C32e(0xc632f4a5), C32e(0xf86f9784), C32e(0xee5eb099), C32e(0xf67a8c8d), - C32e(0xffe8170d), C32e(0xd60adcbd), C32e(0xde16c8b1), C32e(0x916dfc54), - C32e(0x6090f050), C32e(0x02070503), C32e(0xce2ee0a9), C32e(0x56d1877d), - C32e(0xe7cc2b19), C32e(0xb513a662), C32e(0x4d7c31e6), C32e(0xec59b59a), - C32e(0x8f40cf45), C32e(0x1fa3bc9d), C32e(0x8949c040), C32e(0xfa689287), - C32e(0xefd03f15), C32e(0xb29426eb), C32e(0x8ece40c9), C32e(0xfbe61d0b), - C32e(0x416e2fec), C32e(0xb31aa967), C32e(0x5f431cfd), C32e(0x456025ea), - C32e(0x23f9dabf), C32e(0x535102f7), C32e(0xe445a196), C32e(0x9b76ed5b), - C32e(0x75285dc2), 
C32e(0xe1c5241c), C32e(0x3dd4e9ae), C32e(0x4cf2be6a), - C32e(0x6c82ee5a), C32e(0x7ebdc341), C32e(0xf5f30602), C32e(0x8352d14f), - C32e(0x688ce45c), C32e(0x515607f4), C32e(0xd18d5c34), C32e(0xf9e11808), - C32e(0xe24cae93), C32e(0xab3e9573), C32e(0x6297f553), C32e(0x2a6b413f), - C32e(0x081c140c), C32e(0x9563f652), C32e(0x46e9af65), C32e(0x9d7fe25e), - C32e(0x30487828), C32e(0x37cff8a1), C32e(0x0a1b110f), C32e(0x2febc4b5), - C32e(0x0e151b09), C32e(0x247e5a36), C32e(0x1badb69b), C32e(0xdf98473d), - C32e(0xcda76a26), C32e(0x4ef5bb69), C32e(0x7f334ccd), C32e(0xea50ba9f), - C32e(0x123f2d1b), C32e(0x1da4b99e), C32e(0x58c49c74), C32e(0x3446722e), - C32e(0x3641772d), C32e(0xdc11cdb2), C32e(0xb49d29ee), C32e(0x5b4d16fb), - C32e(0xa4a501f6), C32e(0x76a1d74d), C32e(0xb714a361), C32e(0x7d3449ce), - C32e(0x52df8d7b), C32e(0xdd9f423e), C32e(0x5ecd9371), C32e(0x13b1a297), - C32e(0xa6a204f5), C32e(0xb901b868), C32e(0x00000000), C32e(0xc1b5742c), - C32e(0x40e0a060), C32e(0xe3c2211f), C32e(0x793a43c8), C32e(0xb69a2ced), - C32e(0xd40dd9be), C32e(0x8d47ca46), C32e(0x671770d9), C32e(0x72afdd4b), - C32e(0x94ed79de), C32e(0x98ff67d4), C32e(0xb09323e8), C32e(0x855bde4a), - C32e(0xbb06bd6b), C32e(0xc5bb7e2a), C32e(0x4f7b34e5), C32e(0xedd73a16), - C32e(0x86d254c5), C32e(0x9af862d7), C32e(0x6699ff55), C32e(0x11b6a794), - C32e(0x8ac04acf), C32e(0xe9d93010), C32e(0x040e0a06), C32e(0xfe669881), - C32e(0xa0ab0bf0), C32e(0x78b4cc44), C32e(0x25f0d5ba), C32e(0x4b753ee3), - C32e(0xa2ac0ef3), C32e(0x5d4419fe), C32e(0x80db5bc0), C32e(0x0580858a), - C32e(0x3fd3ecad), C32e(0x21fedfbc), C32e(0x70a8d848), C32e(0xf1fd0c04), - C32e(0x63197adf), C32e(0x772f58c1), C32e(0xaf309f75), C32e(0x42e7a563), - C32e(0x20705030), C32e(0xe5cb2e1a), C32e(0xfdef120e), C32e(0xbf08b76d), - C32e(0x8155d44c), C32e(0x18243c14), C32e(0x26795f35), C32e(0xc3b2712f), - C32e(0xbe8638e1), C32e(0x35c8fda2), C32e(0x88c74fcc), C32e(0x2e654b39), - C32e(0x936af957), C32e(0x55580df2), C32e(0xfc619d82), C32e(0x7ab3c947), - C32e(0xc827efac), 
C32e(0xba8832e7), C32e(0x324f7d2b), C32e(0xe642a495), - C32e(0xc03bfba0), C32e(0x19aab398), C32e(0x9ef668d1), C32e(0xa322817f), - C32e(0x44eeaa66), C32e(0x54d6827e), C32e(0x3bdde6ab), C32e(0x0b959e83), - C32e(0x8cc945ca), C32e(0xc7bc7b29), C32e(0x6b056ed3), C32e(0x286c443c), - C32e(0xa72c8b79), C32e(0xbc813de2), C32e(0x1631271d), C32e(0xad379a76), - C32e(0xdb964d3b), C32e(0x649efa56), C32e(0x74a6d24e), C32e(0x1436221e), - C32e(0x92e476db), C32e(0x0c121e0a), C32e(0x48fcb46c), C32e(0xb88f37e4), - C32e(0x9f78e75d), C32e(0xbd0fb26e), C32e(0x43692aef), C32e(0xc435f1a6), - C32e(0x39dae3a8), C32e(0x31c6f7a4), C32e(0xd38a5937), C32e(0xf274868b), - C32e(0xd5835632), C32e(0x8b4ec543), C32e(0x6e85eb59), C32e(0xda18c2b7), - C32e(0x018e8f8c), C32e(0xb11dac64), C32e(0x9cf16dd2), C32e(0x49723be0), - C32e(0xd81fc7b4), C32e(0xacb915fa), C32e(0xf3fa0907), C32e(0xcfa06f25), - C32e(0xca20eaaf), C32e(0xf47d898e), C32e(0x476720e9), C32e(0x10382818), - C32e(0x6f0b64d5), C32e(0xf0738388), C32e(0x4afbb16f), C32e(0x5cca9672), - C32e(0x38546c24), C32e(0x575f08f1), C32e(0x732152c7), C32e(0x9764f351), - C32e(0xcbae6523), C32e(0xa125847c), C32e(0xe857bf9c), C32e(0x3e5d6321), - C32e(0x96ea7cdd), C32e(0x611e7fdc), C32e(0x0d9c9186), C32e(0x0f9b9485), - C32e(0xe04bab90), C32e(0x7cbac642), C32e(0x712657c4), C32e(0xcc29e5aa), - C32e(0x90e373d8), C32e(0x06090f05), C32e(0xf7f40301), C32e(0x1c2a3612), - C32e(0xc23cfea3), C32e(0x6a8be15f), C32e(0xaebe10f9), C32e(0x69026bd0), - C32e(0x17bfa891), C32e(0x9971e858), C32e(0x3a536927), C32e(0x27f7d0b9), - C32e(0xd9914838), C32e(0xebde3513), C32e(0x2be5ceb3), C32e(0x22775533), - C32e(0xd204d6bb), C32e(0xa9399070), C32e(0x07878089), C32e(0x33c1f2a7), - C32e(0x2decc1b6), C32e(0x3c5a6622), C32e(0x15b8ad92), C32e(0xc9a96020), - C32e(0x875cdb49), C32e(0xaab01aff), C32e(0x50d88878), C32e(0xa52b8e7a), - C32e(0x03898a8f), C32e(0x594a13f8), C32e(0x09929b80), C32e(0x1a233917), - C32e(0x651075da), C32e(0xd7845331), C32e(0x84d551c6), C32e(0xd003d3b8), - C32e(0x82dc5ec3), 
C32e(0x29e2cbb0), C32e(0x5ac39977), C32e(0x1e2d3311), - C32e(0x7b3d46cb), C32e(0xa8b71ffc), C32e(0x6d0c61d6), C32e(0x2c624e3a) -}; - -uint32_t T0dn_cpu[] = { - C32e(0xf497a5c6), C32e(0x97eb84f8), C32e(0xb0c799ee), C32e(0x8cf78df6), - C32e(0x17e50dff), C32e(0xdcb7bdd6), C32e(0xc8a7b1de), C32e(0xfc395491), - C32e(0xf0c05060), C32e(0x05040302), C32e(0xe087a9ce), C32e(0x87ac7d56), - C32e(0x2bd519e7), C32e(0xa67162b5), C32e(0x319ae64d), C32e(0xb5c39aec), - C32e(0xcf05458f), C32e(0xbc3e9d1f), C32e(0xc0094089), C32e(0x92ef87fa), - C32e(0x3fc515ef), C32e(0x267febb2), C32e(0x4007c98e), C32e(0x1ded0bfb), - C32e(0x2f82ec41), C32e(0xa97d67b3), C32e(0x1cbefd5f), C32e(0x258aea45), - C32e(0xda46bf23), C32e(0x02a6f753), C32e(0xa1d396e4), C32e(0xed2d5b9b), - C32e(0x5deac275), C32e(0x24d91ce1), C32e(0xe97aae3d), C32e(0xbe986a4c), - C32e(0xeed85a6c), C32e(0xc3fc417e), C32e(0x06f102f5), C32e(0xd11d4f83), - C32e(0xe4d05c68), C32e(0x07a2f451), C32e(0x5cb934d1), C32e(0x18e908f9), - C32e(0xaedf93e2), C32e(0x954d73ab), C32e(0xf5c45362), C32e(0x41543f2a), - C32e(0x14100c08), C32e(0xf6315295), C32e(0xaf8c6546), C32e(0xe2215e9d), - C32e(0x78602830), C32e(0xf86ea137), C32e(0x11140f0a), C32e(0xc45eb52f), - C32e(0x1b1c090e), C32e(0x5a483624), C32e(0xb6369b1b), C32e(0x47a53ddf), - C32e(0x6a8126cd), C32e(0xbb9c694e), C32e(0x4cfecd7f), C32e(0xbacf9fea), - C32e(0x2d241b12), C32e(0xb93a9e1d), C32e(0x9cb07458), C32e(0x72682e34), - C32e(0x776c2d36), C32e(0xcda3b2dc), C32e(0x2973eeb4), C32e(0x16b6fb5b), - C32e(0x0153f6a4), C32e(0xd7ec4d76), C32e(0xa37561b7), C32e(0x49face7d), - C32e(0x8da47b52), C32e(0x42a13edd), C32e(0x93bc715e), C32e(0xa2269713), - C32e(0x0457f5a6), C32e(0xb86968b9), C32e(0x00000000), C32e(0x74992cc1), - C32e(0xa0806040), C32e(0x21dd1fe3), C32e(0x43f2c879), C32e(0x2c77edb6), - C32e(0xd9b3bed4), C32e(0xca01468d), C32e(0x70ced967), C32e(0xdde44b72), - C32e(0x7933de94), C32e(0x672bd498), C32e(0x237be8b0), C32e(0xde114a85), - C32e(0xbd6d6bbb), C32e(0x7e912ac5), C32e(0x349ee54f), 
C32e(0x3ac116ed), - C32e(0x5417c586), C32e(0x622fd79a), C32e(0xffcc5566), C32e(0xa7229411), - C32e(0x4a0fcf8a), C32e(0x30c910e9), C32e(0x0a080604), C32e(0x98e781fe), - C32e(0x0b5bf0a0), C32e(0xccf04478), C32e(0xd54aba25), C32e(0x3e96e34b), - C32e(0x0e5ff3a2), C32e(0x19bafe5d), C32e(0x5b1bc080), C32e(0x850a8a05), - C32e(0xec7ead3f), C32e(0xdf42bc21), C32e(0xd8e04870), C32e(0x0cf904f1), - C32e(0x7ac6df63), C32e(0x58eec177), C32e(0x9f4575af), C32e(0xa5846342), - C32e(0x50403020), C32e(0x2ed11ae5), C32e(0x12e10efd), C32e(0xb7656dbf), - C32e(0xd4194c81), C32e(0x3c301418), C32e(0x5f4c3526), C32e(0x719d2fc3), - C32e(0x3867e1be), C32e(0xfd6aa235), C32e(0x4f0bcc88), C32e(0x4b5c392e), - C32e(0xf93d5793), C32e(0x0daaf255), C32e(0x9de382fc), C32e(0xc9f4477a), - C32e(0xef8bacc8), C32e(0x326fe7ba), C32e(0x7d642b32), C32e(0xa4d795e6), - C32e(0xfb9ba0c0), C32e(0xb3329819), C32e(0x6827d19e), C32e(0x815d7fa3), - C32e(0xaa886644), C32e(0x82a87e54), C32e(0xe676ab3b), C32e(0x9e16830b), - C32e(0x4503ca8c), C32e(0x7b9529c7), C32e(0x6ed6d36b), C32e(0x44503c28), - C32e(0x8b5579a7), C32e(0x3d63e2bc), C32e(0x272c1d16), C32e(0x9a4176ad), - C32e(0x4dad3bdb), C32e(0xfac85664), C32e(0xd2e84e74), C32e(0x22281e14), - C32e(0x763fdb92), C32e(0x1e180a0c), C32e(0xb4906c48), C32e(0x376be4b8), - C32e(0xe7255d9f), C32e(0xb2616ebd), C32e(0x2a86ef43), C32e(0xf193a6c4), - C32e(0xe372a839), C32e(0xf762a431), C32e(0x59bd37d3), C32e(0x86ff8bf2), - C32e(0x56b132d5), C32e(0xc50d438b), C32e(0xebdc596e), C32e(0xc2afb7da), - C32e(0x8f028c01), C32e(0xac7964b1), C32e(0x6d23d29c), C32e(0x3b92e049), - C32e(0xc7abb4d8), C32e(0x1543faac), C32e(0x09fd07f3), C32e(0x6f8525cf), - C32e(0xea8fafca), C32e(0x89f38ef4), C32e(0x208ee947), C32e(0x28201810), - C32e(0x64ded56f), C32e(0x83fb88f0), C32e(0xb1946f4a), C32e(0x96b8725c), - C32e(0x6c702438), C32e(0x08aef157), C32e(0x52e6c773), C32e(0xf3355197), - C32e(0x658d23cb), C32e(0x84597ca1), C32e(0xbfcb9ce8), C32e(0x637c213e), - C32e(0x7c37dd96), C32e(0x7fc2dc61), C32e(0x911a860d), 
C32e(0x941e850f), - C32e(0xabdb90e0), C32e(0xc6f8427c), C32e(0x57e2c471), C32e(0xe583aacc), - C32e(0x733bd890), C32e(0x0f0c0506), C32e(0x03f501f7), C32e(0x3638121c), - C32e(0xfe9fa3c2), C32e(0xe1d45f6a), C32e(0x1047f9ae), C32e(0x6bd2d069), - C32e(0xa82e9117), C32e(0xe8295899), C32e(0x6974273a), C32e(0xd04eb927), - C32e(0x48a938d9), C32e(0x35cd13eb), C32e(0xce56b32b), C32e(0x55443322), - C32e(0xd6bfbbd2), C32e(0x904970a9), C32e(0x800e8907), C32e(0xf266a733), - C32e(0xc15ab62d), C32e(0x6678223c), C32e(0xad2a9215), C32e(0x608920c9), - C32e(0xdb154987), C32e(0x1a4fffaa), C32e(0x88a07850), C32e(0x8e517aa5), - C32e(0x8a068f03), C32e(0x13b2f859), C32e(0x9b128009), C32e(0x3934171a), - C32e(0x75cada65), C32e(0x53b531d7), C32e(0x5113c684), C32e(0xd3bbb8d0), - C32e(0x5e1fc382), C32e(0xcb52b029), C32e(0x99b4775a), C32e(0x333c111e), - C32e(0x46f6cb7b), C32e(0x1f4bfca8), C32e(0x61dad66d), C32e(0x4e583a2c) -}; - -uint32_t T1up_cpu[] = { - C32e(0xc6c632f4), C32e(0xf8f86f97), C32e(0xeeee5eb0), C32e(0xf6f67a8c), - C32e(0xffffe817), C32e(0xd6d60adc), C32e(0xdede16c8), C32e(0x91916dfc), - C32e(0x606090f0), C32e(0x02020705), C32e(0xcece2ee0), C32e(0x5656d187), - C32e(0xe7e7cc2b), C32e(0xb5b513a6), C32e(0x4d4d7c31), C32e(0xecec59b5), - C32e(0x8f8f40cf), C32e(0x1f1fa3bc), C32e(0x898949c0), C32e(0xfafa6892), - C32e(0xefefd03f), C32e(0xb2b29426), C32e(0x8e8ece40), C32e(0xfbfbe61d), - C32e(0x41416e2f), C32e(0xb3b31aa9), C32e(0x5f5f431c), C32e(0x45456025), - C32e(0x2323f9da), C32e(0x53535102), C32e(0xe4e445a1), C32e(0x9b9b76ed), - C32e(0x7575285d), C32e(0xe1e1c524), C32e(0x3d3dd4e9), C32e(0x4c4cf2be), - C32e(0x6c6c82ee), C32e(0x7e7ebdc3), C32e(0xf5f5f306), C32e(0x838352d1), - C32e(0x68688ce4), C32e(0x51515607), C32e(0xd1d18d5c), C32e(0xf9f9e118), - C32e(0xe2e24cae), C32e(0xabab3e95), C32e(0x626297f5), C32e(0x2a2a6b41), - C32e(0x08081c14), C32e(0x959563f6), C32e(0x4646e9af), C32e(0x9d9d7fe2), - C32e(0x30304878), C32e(0x3737cff8), C32e(0x0a0a1b11), C32e(0x2f2febc4), - C32e(0x0e0e151b), 
C32e(0x24247e5a), C32e(0x1b1badb6), C32e(0xdfdf9847), - C32e(0xcdcda76a), C32e(0x4e4ef5bb), C32e(0x7f7f334c), C32e(0xeaea50ba), - C32e(0x12123f2d), C32e(0x1d1da4b9), C32e(0x5858c49c), C32e(0x34344672), - C32e(0x36364177), C32e(0xdcdc11cd), C32e(0xb4b49d29), C32e(0x5b5b4d16), - C32e(0xa4a4a501), C32e(0x7676a1d7), C32e(0xb7b714a3), C32e(0x7d7d3449), - C32e(0x5252df8d), C32e(0xdddd9f42), C32e(0x5e5ecd93), C32e(0x1313b1a2), - C32e(0xa6a6a204), C32e(0xb9b901b8), C32e(0x00000000), C32e(0xc1c1b574), - C32e(0x4040e0a0), C32e(0xe3e3c221), C32e(0x79793a43), C32e(0xb6b69a2c), - C32e(0xd4d40dd9), C32e(0x8d8d47ca), C32e(0x67671770), C32e(0x7272afdd), - C32e(0x9494ed79), C32e(0x9898ff67), C32e(0xb0b09323), C32e(0x85855bde), - C32e(0xbbbb06bd), C32e(0xc5c5bb7e), C32e(0x4f4f7b34), C32e(0xededd73a), - C32e(0x8686d254), C32e(0x9a9af862), C32e(0x666699ff), C32e(0x1111b6a7), - C32e(0x8a8ac04a), C32e(0xe9e9d930), C32e(0x04040e0a), C32e(0xfefe6698), - C32e(0xa0a0ab0b), C32e(0x7878b4cc), C32e(0x2525f0d5), C32e(0x4b4b753e), - C32e(0xa2a2ac0e), C32e(0x5d5d4419), C32e(0x8080db5b), C32e(0x05058085), - C32e(0x3f3fd3ec), C32e(0x2121fedf), C32e(0x7070a8d8), C32e(0xf1f1fd0c), - C32e(0x6363197a), C32e(0x77772f58), C32e(0xafaf309f), C32e(0x4242e7a5), - C32e(0x20207050), C32e(0xe5e5cb2e), C32e(0xfdfdef12), C32e(0xbfbf08b7), - C32e(0x818155d4), C32e(0x1818243c), C32e(0x2626795f), C32e(0xc3c3b271), - C32e(0xbebe8638), C32e(0x3535c8fd), C32e(0x8888c74f), C32e(0x2e2e654b), - C32e(0x93936af9), C32e(0x5555580d), C32e(0xfcfc619d), C32e(0x7a7ab3c9), - C32e(0xc8c827ef), C32e(0xbaba8832), C32e(0x32324f7d), C32e(0xe6e642a4), - C32e(0xc0c03bfb), C32e(0x1919aab3), C32e(0x9e9ef668), C32e(0xa3a32281), - C32e(0x4444eeaa), C32e(0x5454d682), C32e(0x3b3bdde6), C32e(0x0b0b959e), - C32e(0x8c8cc945), C32e(0xc7c7bc7b), C32e(0x6b6b056e), C32e(0x28286c44), - C32e(0xa7a72c8b), C32e(0xbcbc813d), C32e(0x16163127), C32e(0xadad379a), - C32e(0xdbdb964d), C32e(0x64649efa), C32e(0x7474a6d2), C32e(0x14143622), - C32e(0x9292e476), 
C32e(0x0c0c121e), C32e(0x4848fcb4), C32e(0xb8b88f37), - C32e(0x9f9f78e7), C32e(0xbdbd0fb2), C32e(0x4343692a), C32e(0xc4c435f1), - C32e(0x3939dae3), C32e(0x3131c6f7), C32e(0xd3d38a59), C32e(0xf2f27486), - C32e(0xd5d58356), C32e(0x8b8b4ec5), C32e(0x6e6e85eb), C32e(0xdada18c2), - C32e(0x01018e8f), C32e(0xb1b11dac), C32e(0x9c9cf16d), C32e(0x4949723b), - C32e(0xd8d81fc7), C32e(0xacacb915), C32e(0xf3f3fa09), C32e(0xcfcfa06f), - C32e(0xcaca20ea), C32e(0xf4f47d89), C32e(0x47476720), C32e(0x10103828), - C32e(0x6f6f0b64), C32e(0xf0f07383), C32e(0x4a4afbb1), C32e(0x5c5cca96), - C32e(0x3838546c), C32e(0x57575f08), C32e(0x73732152), C32e(0x979764f3), - C32e(0xcbcbae65), C32e(0xa1a12584), C32e(0xe8e857bf), C32e(0x3e3e5d63), - C32e(0x9696ea7c), C32e(0x61611e7f), C32e(0x0d0d9c91), C32e(0x0f0f9b94), - C32e(0xe0e04bab), C32e(0x7c7cbac6), C32e(0x71712657), C32e(0xcccc29e5), - C32e(0x9090e373), C32e(0x0606090f), C32e(0xf7f7f403), C32e(0x1c1c2a36), - C32e(0xc2c23cfe), C32e(0x6a6a8be1), C32e(0xaeaebe10), C32e(0x6969026b), - C32e(0x1717bfa8), C32e(0x999971e8), C32e(0x3a3a5369), C32e(0x2727f7d0), - C32e(0xd9d99148), C32e(0xebebde35), C32e(0x2b2be5ce), C32e(0x22227755), - C32e(0xd2d204d6), C32e(0xa9a93990), C32e(0x07078780), C32e(0x3333c1f2), - C32e(0x2d2decc1), C32e(0x3c3c5a66), C32e(0x1515b8ad), C32e(0xc9c9a960), - C32e(0x87875cdb), C32e(0xaaaab01a), C32e(0x5050d888), C32e(0xa5a52b8e), - C32e(0x0303898a), C32e(0x59594a13), C32e(0x0909929b), C32e(0x1a1a2339), - C32e(0x65651075), C32e(0xd7d78453), C32e(0x8484d551), C32e(0xd0d003d3), - C32e(0x8282dc5e), C32e(0x2929e2cb), C32e(0x5a5ac399), C32e(0x1e1e2d33), - C32e(0x7b7b3d46), C32e(0xa8a8b71f), C32e(0x6d6d0c61), C32e(0x2c2c624e) -}; - -uint32_t T1dn_cpu[] = { - C32e(0xa5f497a5), C32e(0x8497eb84), C32e(0x99b0c799), C32e(0x8d8cf78d), - C32e(0x0d17e50d), C32e(0xbddcb7bd), C32e(0xb1c8a7b1), C32e(0x54fc3954), - C32e(0x50f0c050), C32e(0x03050403), C32e(0xa9e087a9), C32e(0x7d87ac7d), - C32e(0x192bd519), C32e(0x62a67162), C32e(0xe6319ae6), 
C32e(0x9ab5c39a), - C32e(0x45cf0545), C32e(0x9dbc3e9d), C32e(0x40c00940), C32e(0x8792ef87), - C32e(0x153fc515), C32e(0xeb267feb), C32e(0xc94007c9), C32e(0x0b1ded0b), - C32e(0xec2f82ec), C32e(0x67a97d67), C32e(0xfd1cbefd), C32e(0xea258aea), - C32e(0xbfda46bf), C32e(0xf702a6f7), C32e(0x96a1d396), C32e(0x5bed2d5b), - C32e(0xc25deac2), C32e(0x1c24d91c), C32e(0xaee97aae), C32e(0x6abe986a), - C32e(0x5aeed85a), C32e(0x41c3fc41), C32e(0x0206f102), C32e(0x4fd11d4f), - C32e(0x5ce4d05c), C32e(0xf407a2f4), C32e(0x345cb934), C32e(0x0818e908), - C32e(0x93aedf93), C32e(0x73954d73), C32e(0x53f5c453), C32e(0x3f41543f), - C32e(0x0c14100c), C32e(0x52f63152), C32e(0x65af8c65), C32e(0x5ee2215e), - C32e(0x28786028), C32e(0xa1f86ea1), C32e(0x0f11140f), C32e(0xb5c45eb5), - C32e(0x091b1c09), C32e(0x365a4836), C32e(0x9bb6369b), C32e(0x3d47a53d), - C32e(0x266a8126), C32e(0x69bb9c69), C32e(0xcd4cfecd), C32e(0x9fbacf9f), - C32e(0x1b2d241b), C32e(0x9eb93a9e), C32e(0x749cb074), C32e(0x2e72682e), - C32e(0x2d776c2d), C32e(0xb2cda3b2), C32e(0xee2973ee), C32e(0xfb16b6fb), - C32e(0xf60153f6), C32e(0x4dd7ec4d), C32e(0x61a37561), C32e(0xce49face), - C32e(0x7b8da47b), C32e(0x3e42a13e), C32e(0x7193bc71), C32e(0x97a22697), - C32e(0xf50457f5), C32e(0x68b86968), C32e(0x00000000), C32e(0x2c74992c), - C32e(0x60a08060), C32e(0x1f21dd1f), C32e(0xc843f2c8), C32e(0xed2c77ed), - C32e(0xbed9b3be), C32e(0x46ca0146), C32e(0xd970ced9), C32e(0x4bdde44b), - C32e(0xde7933de), C32e(0xd4672bd4), C32e(0xe8237be8), C32e(0x4ade114a), - C32e(0x6bbd6d6b), C32e(0x2a7e912a), C32e(0xe5349ee5), C32e(0x163ac116), - C32e(0xc55417c5), C32e(0xd7622fd7), C32e(0x55ffcc55), C32e(0x94a72294), - C32e(0xcf4a0fcf), C32e(0x1030c910), C32e(0x060a0806), C32e(0x8198e781), - C32e(0xf00b5bf0), C32e(0x44ccf044), C32e(0xbad54aba), C32e(0xe33e96e3), - C32e(0xf30e5ff3), C32e(0xfe19bafe), C32e(0xc05b1bc0), C32e(0x8a850a8a), - C32e(0xadec7ead), C32e(0xbcdf42bc), C32e(0x48d8e048), C32e(0x040cf904), - C32e(0xdf7ac6df), C32e(0xc158eec1), C32e(0x759f4575), 
C32e(0x63a58463), - C32e(0x30504030), C32e(0x1a2ed11a), C32e(0x0e12e10e), C32e(0x6db7656d), - C32e(0x4cd4194c), C32e(0x143c3014), C32e(0x355f4c35), C32e(0x2f719d2f), - C32e(0xe13867e1), C32e(0xa2fd6aa2), C32e(0xcc4f0bcc), C32e(0x394b5c39), - C32e(0x57f93d57), C32e(0xf20daaf2), C32e(0x829de382), C32e(0x47c9f447), - C32e(0xacef8bac), C32e(0xe7326fe7), C32e(0x2b7d642b), C32e(0x95a4d795), - C32e(0xa0fb9ba0), C32e(0x98b33298), C32e(0xd16827d1), C32e(0x7f815d7f), - C32e(0x66aa8866), C32e(0x7e82a87e), C32e(0xabe676ab), C32e(0x839e1683), - C32e(0xca4503ca), C32e(0x297b9529), C32e(0xd36ed6d3), C32e(0x3c44503c), - C32e(0x798b5579), C32e(0xe23d63e2), C32e(0x1d272c1d), C32e(0x769a4176), - C32e(0x3b4dad3b), C32e(0x56fac856), C32e(0x4ed2e84e), C32e(0x1e22281e), - C32e(0xdb763fdb), C32e(0x0a1e180a), C32e(0x6cb4906c), C32e(0xe4376be4), - C32e(0x5de7255d), C32e(0x6eb2616e), C32e(0xef2a86ef), C32e(0xa6f193a6), - C32e(0xa8e372a8), C32e(0xa4f762a4), C32e(0x3759bd37), C32e(0x8b86ff8b), - C32e(0x3256b132), C32e(0x43c50d43), C32e(0x59ebdc59), C32e(0xb7c2afb7), - C32e(0x8c8f028c), C32e(0x64ac7964), C32e(0xd26d23d2), C32e(0xe03b92e0), - C32e(0xb4c7abb4), C32e(0xfa1543fa), C32e(0x0709fd07), C32e(0x256f8525), - C32e(0xafea8faf), C32e(0x8e89f38e), C32e(0xe9208ee9), C32e(0x18282018), - C32e(0xd564ded5), C32e(0x8883fb88), C32e(0x6fb1946f), C32e(0x7296b872), - C32e(0x246c7024), C32e(0xf108aef1), C32e(0xc752e6c7), C32e(0x51f33551), - C32e(0x23658d23), C32e(0x7c84597c), C32e(0x9cbfcb9c), C32e(0x21637c21), - C32e(0xdd7c37dd), C32e(0xdc7fc2dc), C32e(0x86911a86), C32e(0x85941e85), - C32e(0x90abdb90), C32e(0x42c6f842), C32e(0xc457e2c4), C32e(0xaae583aa), - C32e(0xd8733bd8), C32e(0x050f0c05), C32e(0x0103f501), C32e(0x12363812), - C32e(0xa3fe9fa3), C32e(0x5fe1d45f), C32e(0xf91047f9), C32e(0xd06bd2d0), - C32e(0x91a82e91), C32e(0x58e82958), C32e(0x27697427), C32e(0xb9d04eb9), - C32e(0x3848a938), C32e(0x1335cd13), C32e(0xb3ce56b3), C32e(0x33554433), - C32e(0xbbd6bfbb), C32e(0x70904970), C32e(0x89800e89), 
C32e(0xa7f266a7), - C32e(0xb6c15ab6), C32e(0x22667822), C32e(0x92ad2a92), C32e(0x20608920), - C32e(0x49db1549), C32e(0xff1a4fff), C32e(0x7888a078), C32e(0x7a8e517a), - C32e(0x8f8a068f), C32e(0xf813b2f8), C32e(0x809b1280), C32e(0x17393417), - C32e(0xda75cada), C32e(0x3153b531), C32e(0xc65113c6), C32e(0xb8d3bbb8), - C32e(0xc35e1fc3), C32e(0xb0cb52b0), C32e(0x7799b477), C32e(0x11333c11), - C32e(0xcb46f6cb), C32e(0xfc1f4bfc), C32e(0xd661dad6), C32e(0x3a4e583a) -}; - -uint32_t T2up_cpu[] = { - C32e(0xa5c6c632), C32e(0x84f8f86f), C32e(0x99eeee5e), C32e(0x8df6f67a), - C32e(0x0dffffe8), C32e(0xbdd6d60a), C32e(0xb1dede16), C32e(0x5491916d), - C32e(0x50606090), C32e(0x03020207), C32e(0xa9cece2e), C32e(0x7d5656d1), - C32e(0x19e7e7cc), C32e(0x62b5b513), C32e(0xe64d4d7c), C32e(0x9aecec59), - C32e(0x458f8f40), C32e(0x9d1f1fa3), C32e(0x40898949), C32e(0x87fafa68), - C32e(0x15efefd0), C32e(0xebb2b294), C32e(0xc98e8ece), C32e(0x0bfbfbe6), - C32e(0xec41416e), C32e(0x67b3b31a), C32e(0xfd5f5f43), C32e(0xea454560), - C32e(0xbf2323f9), C32e(0xf7535351), C32e(0x96e4e445), C32e(0x5b9b9b76), - C32e(0xc2757528), C32e(0x1ce1e1c5), C32e(0xae3d3dd4), C32e(0x6a4c4cf2), - C32e(0x5a6c6c82), C32e(0x417e7ebd), C32e(0x02f5f5f3), C32e(0x4f838352), - C32e(0x5c68688c), C32e(0xf4515156), C32e(0x34d1d18d), C32e(0x08f9f9e1), - C32e(0x93e2e24c), C32e(0x73abab3e), C32e(0x53626297), C32e(0x3f2a2a6b), - C32e(0x0c08081c), C32e(0x52959563), C32e(0x654646e9), C32e(0x5e9d9d7f), - C32e(0x28303048), C32e(0xa13737cf), C32e(0x0f0a0a1b), C32e(0xb52f2feb), - C32e(0x090e0e15), C32e(0x3624247e), C32e(0x9b1b1bad), C32e(0x3ddfdf98), - C32e(0x26cdcda7), C32e(0x694e4ef5), C32e(0xcd7f7f33), C32e(0x9feaea50), - C32e(0x1b12123f), C32e(0x9e1d1da4), C32e(0x745858c4), C32e(0x2e343446), - C32e(0x2d363641), C32e(0xb2dcdc11), C32e(0xeeb4b49d), C32e(0xfb5b5b4d), - C32e(0xf6a4a4a5), C32e(0x4d7676a1), C32e(0x61b7b714), C32e(0xce7d7d34), - C32e(0x7b5252df), C32e(0x3edddd9f), C32e(0x715e5ecd), C32e(0x971313b1), - C32e(0xf5a6a6a2), 
C32e(0x68b9b901), C32e(0x00000000), C32e(0x2cc1c1b5), - C32e(0x604040e0), C32e(0x1fe3e3c2), C32e(0xc879793a), C32e(0xedb6b69a), - C32e(0xbed4d40d), C32e(0x468d8d47), C32e(0xd9676717), C32e(0x4b7272af), - C32e(0xde9494ed), C32e(0xd49898ff), C32e(0xe8b0b093), C32e(0x4a85855b), - C32e(0x6bbbbb06), C32e(0x2ac5c5bb), C32e(0xe54f4f7b), C32e(0x16ededd7), - C32e(0xc58686d2), C32e(0xd79a9af8), C32e(0x55666699), C32e(0x941111b6), - C32e(0xcf8a8ac0), C32e(0x10e9e9d9), C32e(0x0604040e), C32e(0x81fefe66), - C32e(0xf0a0a0ab), C32e(0x447878b4), C32e(0xba2525f0), C32e(0xe34b4b75), - C32e(0xf3a2a2ac), C32e(0xfe5d5d44), C32e(0xc08080db), C32e(0x8a050580), - C32e(0xad3f3fd3), C32e(0xbc2121fe), C32e(0x487070a8), C32e(0x04f1f1fd), - C32e(0xdf636319), C32e(0xc177772f), C32e(0x75afaf30), C32e(0x634242e7), - C32e(0x30202070), C32e(0x1ae5e5cb), C32e(0x0efdfdef), C32e(0x6dbfbf08), - C32e(0x4c818155), C32e(0x14181824), C32e(0x35262679), C32e(0x2fc3c3b2), - C32e(0xe1bebe86), C32e(0xa23535c8), C32e(0xcc8888c7), C32e(0x392e2e65), - C32e(0x5793936a), C32e(0xf2555558), C32e(0x82fcfc61), C32e(0x477a7ab3), - C32e(0xacc8c827), C32e(0xe7baba88), C32e(0x2b32324f), C32e(0x95e6e642), - C32e(0xa0c0c03b), C32e(0x981919aa), C32e(0xd19e9ef6), C32e(0x7fa3a322), - C32e(0x664444ee), C32e(0x7e5454d6), C32e(0xab3b3bdd), C32e(0x830b0b95), - C32e(0xca8c8cc9), C32e(0x29c7c7bc), C32e(0xd36b6b05), C32e(0x3c28286c), - C32e(0x79a7a72c), C32e(0xe2bcbc81), C32e(0x1d161631), C32e(0x76adad37), - C32e(0x3bdbdb96), C32e(0x5664649e), C32e(0x4e7474a6), C32e(0x1e141436), - C32e(0xdb9292e4), C32e(0x0a0c0c12), C32e(0x6c4848fc), C32e(0xe4b8b88f), - C32e(0x5d9f9f78), C32e(0x6ebdbd0f), C32e(0xef434369), C32e(0xa6c4c435), - C32e(0xa83939da), C32e(0xa43131c6), C32e(0x37d3d38a), C32e(0x8bf2f274), - C32e(0x32d5d583), C32e(0x438b8b4e), C32e(0x596e6e85), C32e(0xb7dada18), - C32e(0x8c01018e), C32e(0x64b1b11d), C32e(0xd29c9cf1), C32e(0xe0494972), - C32e(0xb4d8d81f), C32e(0xfaacacb9), C32e(0x07f3f3fa), C32e(0x25cfcfa0), - C32e(0xafcaca20), 
C32e(0x8ef4f47d), C32e(0xe9474767), C32e(0x18101038), - C32e(0xd56f6f0b), C32e(0x88f0f073), C32e(0x6f4a4afb), C32e(0x725c5cca), - C32e(0x24383854), C32e(0xf157575f), C32e(0xc7737321), C32e(0x51979764), - C32e(0x23cbcbae), C32e(0x7ca1a125), C32e(0x9ce8e857), C32e(0x213e3e5d), - C32e(0xdd9696ea), C32e(0xdc61611e), C32e(0x860d0d9c), C32e(0x850f0f9b), - C32e(0x90e0e04b), C32e(0x427c7cba), C32e(0xc4717126), C32e(0xaacccc29), - C32e(0xd89090e3), C32e(0x05060609), C32e(0x01f7f7f4), C32e(0x121c1c2a), - C32e(0xa3c2c23c), C32e(0x5f6a6a8b), C32e(0xf9aeaebe), C32e(0xd0696902), - C32e(0x911717bf), C32e(0x58999971), C32e(0x273a3a53), C32e(0xb92727f7), - C32e(0x38d9d991), C32e(0x13ebebde), C32e(0xb32b2be5), C32e(0x33222277), - C32e(0xbbd2d204), C32e(0x70a9a939), C32e(0x89070787), C32e(0xa73333c1), - C32e(0xb62d2dec), C32e(0x223c3c5a), C32e(0x921515b8), C32e(0x20c9c9a9), - C32e(0x4987875c), C32e(0xffaaaab0), C32e(0x785050d8), C32e(0x7aa5a52b), - C32e(0x8f030389), C32e(0xf859594a), C32e(0x80090992), C32e(0x171a1a23), - C32e(0xda656510), C32e(0x31d7d784), C32e(0xc68484d5), C32e(0xb8d0d003), - C32e(0xc38282dc), C32e(0xb02929e2), C32e(0x775a5ac3), C32e(0x111e1e2d), - C32e(0xcb7b7b3d), C32e(0xfca8a8b7), C32e(0xd66d6d0c), C32e(0x3a2c2c62) -}; - -uint32_t T2dn_cpu[] = { - C32e(0xf4a5f497), C32e(0x978497eb), C32e(0xb099b0c7), C32e(0x8c8d8cf7), - C32e(0x170d17e5), C32e(0xdcbddcb7), C32e(0xc8b1c8a7), C32e(0xfc54fc39), - C32e(0xf050f0c0), C32e(0x05030504), C32e(0xe0a9e087), C32e(0x877d87ac), - C32e(0x2b192bd5), C32e(0xa662a671), C32e(0x31e6319a), C32e(0xb59ab5c3), - C32e(0xcf45cf05), C32e(0xbc9dbc3e), C32e(0xc040c009), C32e(0x928792ef), - C32e(0x3f153fc5), C32e(0x26eb267f), C32e(0x40c94007), C32e(0x1d0b1ded), - C32e(0x2fec2f82), C32e(0xa967a97d), C32e(0x1cfd1cbe), C32e(0x25ea258a), - C32e(0xdabfda46), C32e(0x02f702a6), C32e(0xa196a1d3), C32e(0xed5bed2d), - C32e(0x5dc25dea), C32e(0x241c24d9), C32e(0xe9aee97a), C32e(0xbe6abe98), - C32e(0xee5aeed8), C32e(0xc341c3fc), C32e(0x060206f1), 
C32e(0xd14fd11d), - C32e(0xe45ce4d0), C32e(0x07f407a2), C32e(0x5c345cb9), C32e(0x180818e9), - C32e(0xae93aedf), C32e(0x9573954d), C32e(0xf553f5c4), C32e(0x413f4154), - C32e(0x140c1410), C32e(0xf652f631), C32e(0xaf65af8c), C32e(0xe25ee221), - C32e(0x78287860), C32e(0xf8a1f86e), C32e(0x110f1114), C32e(0xc4b5c45e), - C32e(0x1b091b1c), C32e(0x5a365a48), C32e(0xb69bb636), C32e(0x473d47a5), - C32e(0x6a266a81), C32e(0xbb69bb9c), C32e(0x4ccd4cfe), C32e(0xba9fbacf), - C32e(0x2d1b2d24), C32e(0xb99eb93a), C32e(0x9c749cb0), C32e(0x722e7268), - C32e(0x772d776c), C32e(0xcdb2cda3), C32e(0x29ee2973), C32e(0x16fb16b6), - C32e(0x01f60153), C32e(0xd74dd7ec), C32e(0xa361a375), C32e(0x49ce49fa), - C32e(0x8d7b8da4), C32e(0x423e42a1), C32e(0x937193bc), C32e(0xa297a226), - C32e(0x04f50457), C32e(0xb868b869), C32e(0x00000000), C32e(0x742c7499), - C32e(0xa060a080), C32e(0x211f21dd), C32e(0x43c843f2), C32e(0x2ced2c77), - C32e(0xd9bed9b3), C32e(0xca46ca01), C32e(0x70d970ce), C32e(0xdd4bdde4), - C32e(0x79de7933), C32e(0x67d4672b), C32e(0x23e8237b), C32e(0xde4ade11), - C32e(0xbd6bbd6d), C32e(0x7e2a7e91), C32e(0x34e5349e), C32e(0x3a163ac1), - C32e(0x54c55417), C32e(0x62d7622f), C32e(0xff55ffcc), C32e(0xa794a722), - C32e(0x4acf4a0f), C32e(0x301030c9), C32e(0x0a060a08), C32e(0x988198e7), - C32e(0x0bf00b5b), C32e(0xcc44ccf0), C32e(0xd5bad54a), C32e(0x3ee33e96), - C32e(0x0ef30e5f), C32e(0x19fe19ba), C32e(0x5bc05b1b), C32e(0x858a850a), - C32e(0xecadec7e), C32e(0xdfbcdf42), C32e(0xd848d8e0), C32e(0x0c040cf9), - C32e(0x7adf7ac6), C32e(0x58c158ee), C32e(0x9f759f45), C32e(0xa563a584), - C32e(0x50305040), C32e(0x2e1a2ed1), C32e(0x120e12e1), C32e(0xb76db765), - C32e(0xd44cd419), C32e(0x3c143c30), C32e(0x5f355f4c), C32e(0x712f719d), - C32e(0x38e13867), C32e(0xfda2fd6a), C32e(0x4fcc4f0b), C32e(0x4b394b5c), - C32e(0xf957f93d), C32e(0x0df20daa), C32e(0x9d829de3), C32e(0xc947c9f4), - C32e(0xefacef8b), C32e(0x32e7326f), C32e(0x7d2b7d64), C32e(0xa495a4d7), - C32e(0xfba0fb9b), C32e(0xb398b332), C32e(0x68d16827), 
C32e(0x817f815d), - C32e(0xaa66aa88), C32e(0x827e82a8), C32e(0xe6abe676), C32e(0x9e839e16), - C32e(0x45ca4503), C32e(0x7b297b95), C32e(0x6ed36ed6), C32e(0x443c4450), - C32e(0x8b798b55), C32e(0x3de23d63), C32e(0x271d272c), C32e(0x9a769a41), - C32e(0x4d3b4dad), C32e(0xfa56fac8), C32e(0xd24ed2e8), C32e(0x221e2228), - C32e(0x76db763f), C32e(0x1e0a1e18), C32e(0xb46cb490), C32e(0x37e4376b), - C32e(0xe75de725), C32e(0xb26eb261), C32e(0x2aef2a86), C32e(0xf1a6f193), - C32e(0xe3a8e372), C32e(0xf7a4f762), C32e(0x593759bd), C32e(0x868b86ff), - C32e(0x563256b1), C32e(0xc543c50d), C32e(0xeb59ebdc), C32e(0xc2b7c2af), - C32e(0x8f8c8f02), C32e(0xac64ac79), C32e(0x6dd26d23), C32e(0x3be03b92), - C32e(0xc7b4c7ab), C32e(0x15fa1543), C32e(0x090709fd), C32e(0x6f256f85), - C32e(0xeaafea8f), C32e(0x898e89f3), C32e(0x20e9208e), C32e(0x28182820), - C32e(0x64d564de), C32e(0x838883fb), C32e(0xb16fb194), C32e(0x967296b8), - C32e(0x6c246c70), C32e(0x08f108ae), C32e(0x52c752e6), C32e(0xf351f335), - C32e(0x6523658d), C32e(0x847c8459), C32e(0xbf9cbfcb), C32e(0x6321637c), - C32e(0x7cdd7c37), C32e(0x7fdc7fc2), C32e(0x9186911a), C32e(0x9485941e), - C32e(0xab90abdb), C32e(0xc642c6f8), C32e(0x57c457e2), C32e(0xe5aae583), - C32e(0x73d8733b), C32e(0x0f050f0c), C32e(0x030103f5), C32e(0x36123638), - C32e(0xfea3fe9f), C32e(0xe15fe1d4), C32e(0x10f91047), C32e(0x6bd06bd2), - C32e(0xa891a82e), C32e(0xe858e829), C32e(0x69276974), C32e(0xd0b9d04e), - C32e(0x483848a9), C32e(0x351335cd), C32e(0xceb3ce56), C32e(0x55335544), - C32e(0xd6bbd6bf), C32e(0x90709049), C32e(0x8089800e), C32e(0xf2a7f266), - C32e(0xc1b6c15a), C32e(0x66226678), C32e(0xad92ad2a), C32e(0x60206089), - C32e(0xdb49db15), C32e(0x1aff1a4f), C32e(0x887888a0), C32e(0x8e7a8e51), - C32e(0x8a8f8a06), C32e(0x13f813b2), C32e(0x9b809b12), C32e(0x39173934), - C32e(0x75da75ca), C32e(0x533153b5), C32e(0x51c65113), C32e(0xd3b8d3bb), - C32e(0x5ec35e1f), C32e(0xcbb0cb52), C32e(0x997799b4), C32e(0x3311333c), - C32e(0x46cb46f6), C32e(0x1ffc1f4b), C32e(0x61d661da), 
C32e(0x4e3a4e58) -}; - -uint32_t T3up_cpu[] = { - C32e(0x97a5c6c6), C32e(0xeb84f8f8), C32e(0xc799eeee), C32e(0xf78df6f6), - C32e(0xe50dffff), C32e(0xb7bdd6d6), C32e(0xa7b1dede), C32e(0x39549191), - C32e(0xc0506060), C32e(0x04030202), C32e(0x87a9cece), C32e(0xac7d5656), - C32e(0xd519e7e7), C32e(0x7162b5b5), C32e(0x9ae64d4d), C32e(0xc39aecec), - C32e(0x05458f8f), C32e(0x3e9d1f1f), C32e(0x09408989), C32e(0xef87fafa), - C32e(0xc515efef), C32e(0x7febb2b2), C32e(0x07c98e8e), C32e(0xed0bfbfb), - C32e(0x82ec4141), C32e(0x7d67b3b3), C32e(0xbefd5f5f), C32e(0x8aea4545), - C32e(0x46bf2323), C32e(0xa6f75353), C32e(0xd396e4e4), C32e(0x2d5b9b9b), - C32e(0xeac27575), C32e(0xd91ce1e1), C32e(0x7aae3d3d), C32e(0x986a4c4c), - C32e(0xd85a6c6c), C32e(0xfc417e7e), C32e(0xf102f5f5), C32e(0x1d4f8383), - C32e(0xd05c6868), C32e(0xa2f45151), C32e(0xb934d1d1), C32e(0xe908f9f9), - C32e(0xdf93e2e2), C32e(0x4d73abab), C32e(0xc4536262), C32e(0x543f2a2a), - C32e(0x100c0808), C32e(0x31529595), C32e(0x8c654646), C32e(0x215e9d9d), - C32e(0x60283030), C32e(0x6ea13737), C32e(0x140f0a0a), C32e(0x5eb52f2f), - C32e(0x1c090e0e), C32e(0x48362424), C32e(0x369b1b1b), C32e(0xa53ddfdf), - C32e(0x8126cdcd), C32e(0x9c694e4e), C32e(0xfecd7f7f), C32e(0xcf9feaea), - C32e(0x241b1212), C32e(0x3a9e1d1d), C32e(0xb0745858), C32e(0x682e3434), - C32e(0x6c2d3636), C32e(0xa3b2dcdc), C32e(0x73eeb4b4), C32e(0xb6fb5b5b), - C32e(0x53f6a4a4), C32e(0xec4d7676), C32e(0x7561b7b7), C32e(0xface7d7d), - C32e(0xa47b5252), C32e(0xa13edddd), C32e(0xbc715e5e), C32e(0x26971313), - C32e(0x57f5a6a6), C32e(0x6968b9b9), C32e(0x00000000), C32e(0x992cc1c1), - C32e(0x80604040), C32e(0xdd1fe3e3), C32e(0xf2c87979), C32e(0x77edb6b6), - C32e(0xb3bed4d4), C32e(0x01468d8d), C32e(0xced96767), C32e(0xe44b7272), - C32e(0x33de9494), C32e(0x2bd49898), C32e(0x7be8b0b0), C32e(0x114a8585), - C32e(0x6d6bbbbb), C32e(0x912ac5c5), C32e(0x9ee54f4f), C32e(0xc116eded), - C32e(0x17c58686), C32e(0x2fd79a9a), C32e(0xcc556666), C32e(0x22941111), - C32e(0x0fcf8a8a), 
C32e(0xc910e9e9), C32e(0x08060404), C32e(0xe781fefe), - C32e(0x5bf0a0a0), C32e(0xf0447878), C32e(0x4aba2525), C32e(0x96e34b4b), - C32e(0x5ff3a2a2), C32e(0xbafe5d5d), C32e(0x1bc08080), C32e(0x0a8a0505), - C32e(0x7ead3f3f), C32e(0x42bc2121), C32e(0xe0487070), C32e(0xf904f1f1), - C32e(0xc6df6363), C32e(0xeec17777), C32e(0x4575afaf), C32e(0x84634242), - C32e(0x40302020), C32e(0xd11ae5e5), C32e(0xe10efdfd), C32e(0x656dbfbf), - C32e(0x194c8181), C32e(0x30141818), C32e(0x4c352626), C32e(0x9d2fc3c3), - C32e(0x67e1bebe), C32e(0x6aa23535), C32e(0x0bcc8888), C32e(0x5c392e2e), - C32e(0x3d579393), C32e(0xaaf25555), C32e(0xe382fcfc), C32e(0xf4477a7a), - C32e(0x8bacc8c8), C32e(0x6fe7baba), C32e(0x642b3232), C32e(0xd795e6e6), - C32e(0x9ba0c0c0), C32e(0x32981919), C32e(0x27d19e9e), C32e(0x5d7fa3a3), - C32e(0x88664444), C32e(0xa87e5454), C32e(0x76ab3b3b), C32e(0x16830b0b), - C32e(0x03ca8c8c), C32e(0x9529c7c7), C32e(0xd6d36b6b), C32e(0x503c2828), - C32e(0x5579a7a7), C32e(0x63e2bcbc), C32e(0x2c1d1616), C32e(0x4176adad), - C32e(0xad3bdbdb), C32e(0xc8566464), C32e(0xe84e7474), C32e(0x281e1414), - C32e(0x3fdb9292), C32e(0x180a0c0c), C32e(0x906c4848), C32e(0x6be4b8b8), - C32e(0x255d9f9f), C32e(0x616ebdbd), C32e(0x86ef4343), C32e(0x93a6c4c4), - C32e(0x72a83939), C32e(0x62a43131), C32e(0xbd37d3d3), C32e(0xff8bf2f2), - C32e(0xb132d5d5), C32e(0x0d438b8b), C32e(0xdc596e6e), C32e(0xafb7dada), - C32e(0x028c0101), C32e(0x7964b1b1), C32e(0x23d29c9c), C32e(0x92e04949), - C32e(0xabb4d8d8), C32e(0x43faacac), C32e(0xfd07f3f3), C32e(0x8525cfcf), - C32e(0x8fafcaca), C32e(0xf38ef4f4), C32e(0x8ee94747), C32e(0x20181010), - C32e(0xded56f6f), C32e(0xfb88f0f0), C32e(0x946f4a4a), C32e(0xb8725c5c), - C32e(0x70243838), C32e(0xaef15757), C32e(0xe6c77373), C32e(0x35519797), - C32e(0x8d23cbcb), C32e(0x597ca1a1), C32e(0xcb9ce8e8), C32e(0x7c213e3e), - C32e(0x37dd9696), C32e(0xc2dc6161), C32e(0x1a860d0d), C32e(0x1e850f0f), - C32e(0xdb90e0e0), C32e(0xf8427c7c), C32e(0xe2c47171), C32e(0x83aacccc), - C32e(0x3bd89090), 
C32e(0x0c050606), C32e(0xf501f7f7), C32e(0x38121c1c), - C32e(0x9fa3c2c2), C32e(0xd45f6a6a), C32e(0x47f9aeae), C32e(0xd2d06969), - C32e(0x2e911717), C32e(0x29589999), C32e(0x74273a3a), C32e(0x4eb92727), - C32e(0xa938d9d9), C32e(0xcd13ebeb), C32e(0x56b32b2b), C32e(0x44332222), - C32e(0xbfbbd2d2), C32e(0x4970a9a9), C32e(0x0e890707), C32e(0x66a73333), - C32e(0x5ab62d2d), C32e(0x78223c3c), C32e(0x2a921515), C32e(0x8920c9c9), - C32e(0x15498787), C32e(0x4fffaaaa), C32e(0xa0785050), C32e(0x517aa5a5), - C32e(0x068f0303), C32e(0xb2f85959), C32e(0x12800909), C32e(0x34171a1a), - C32e(0xcada6565), C32e(0xb531d7d7), C32e(0x13c68484), C32e(0xbbb8d0d0), - C32e(0x1fc38282), C32e(0x52b02929), C32e(0xb4775a5a), C32e(0x3c111e1e), - C32e(0xf6cb7b7b), C32e(0x4bfca8a8), C32e(0xdad66d6d), C32e(0x583a2c2c) -}; - -uint32_t T3dn_cpu[] = { - C32e(0x32f4a5f4), C32e(0x6f978497), C32e(0x5eb099b0), C32e(0x7a8c8d8c), - C32e(0xe8170d17), C32e(0x0adcbddc), C32e(0x16c8b1c8), C32e(0x6dfc54fc), - C32e(0x90f050f0), C32e(0x07050305), C32e(0x2ee0a9e0), C32e(0xd1877d87), - C32e(0xcc2b192b), C32e(0x13a662a6), C32e(0x7c31e631), C32e(0x59b59ab5), - C32e(0x40cf45cf), C32e(0xa3bc9dbc), C32e(0x49c040c0), C32e(0x68928792), - C32e(0xd03f153f), C32e(0x9426eb26), C32e(0xce40c940), C32e(0xe61d0b1d), - C32e(0x6e2fec2f), C32e(0x1aa967a9), C32e(0x431cfd1c), C32e(0x6025ea25), - C32e(0xf9dabfda), C32e(0x5102f702), C32e(0x45a196a1), C32e(0x76ed5bed), - C32e(0x285dc25d), C32e(0xc5241c24), C32e(0xd4e9aee9), C32e(0xf2be6abe), - C32e(0x82ee5aee), C32e(0xbdc341c3), C32e(0xf3060206), C32e(0x52d14fd1), - C32e(0x8ce45ce4), C32e(0x5607f407), C32e(0x8d5c345c), C32e(0xe1180818), - C32e(0x4cae93ae), C32e(0x3e957395), C32e(0x97f553f5), C32e(0x6b413f41), - C32e(0x1c140c14), C32e(0x63f652f6), C32e(0xe9af65af), C32e(0x7fe25ee2), - C32e(0x48782878), C32e(0xcff8a1f8), C32e(0x1b110f11), C32e(0xebc4b5c4), - C32e(0x151b091b), C32e(0x7e5a365a), C32e(0xadb69bb6), C32e(0x98473d47), - C32e(0xa76a266a), C32e(0xf5bb69bb), C32e(0x334ccd4c), 
C32e(0x50ba9fba), - C32e(0x3f2d1b2d), C32e(0xa4b99eb9), C32e(0xc49c749c), C32e(0x46722e72), - C32e(0x41772d77), C32e(0x11cdb2cd), C32e(0x9d29ee29), C32e(0x4d16fb16), - C32e(0xa501f601), C32e(0xa1d74dd7), C32e(0x14a361a3), C32e(0x3449ce49), - C32e(0xdf8d7b8d), C32e(0x9f423e42), C32e(0xcd937193), C32e(0xb1a297a2), - C32e(0xa204f504), C32e(0x01b868b8), C32e(0x00000000), C32e(0xb5742c74), - C32e(0xe0a060a0), C32e(0xc2211f21), C32e(0x3a43c843), C32e(0x9a2ced2c), - C32e(0x0dd9bed9), C32e(0x47ca46ca), C32e(0x1770d970), C32e(0xafdd4bdd), - C32e(0xed79de79), C32e(0xff67d467), C32e(0x9323e823), C32e(0x5bde4ade), - C32e(0x06bd6bbd), C32e(0xbb7e2a7e), C32e(0x7b34e534), C32e(0xd73a163a), - C32e(0xd254c554), C32e(0xf862d762), C32e(0x99ff55ff), C32e(0xb6a794a7), - C32e(0xc04acf4a), C32e(0xd9301030), C32e(0x0e0a060a), C32e(0x66988198), - C32e(0xab0bf00b), C32e(0xb4cc44cc), C32e(0xf0d5bad5), C32e(0x753ee33e), - C32e(0xac0ef30e), C32e(0x4419fe19), C32e(0xdb5bc05b), C32e(0x80858a85), - C32e(0xd3ecadec), C32e(0xfedfbcdf), C32e(0xa8d848d8), C32e(0xfd0c040c), - C32e(0x197adf7a), C32e(0x2f58c158), C32e(0x309f759f), C32e(0xe7a563a5), - C32e(0x70503050), C32e(0xcb2e1a2e), C32e(0xef120e12), C32e(0x08b76db7), - C32e(0x55d44cd4), C32e(0x243c143c), C32e(0x795f355f), C32e(0xb2712f71), - C32e(0x8638e138), C32e(0xc8fda2fd), C32e(0xc74fcc4f), C32e(0x654b394b), - C32e(0x6af957f9), C32e(0x580df20d), C32e(0x619d829d), C32e(0xb3c947c9), - C32e(0x27efacef), C32e(0x8832e732), C32e(0x4f7d2b7d), C32e(0x42a495a4), - C32e(0x3bfba0fb), C32e(0xaab398b3), C32e(0xf668d168), C32e(0x22817f81), - C32e(0xeeaa66aa), C32e(0xd6827e82), C32e(0xdde6abe6), C32e(0x959e839e), - C32e(0xc945ca45), C32e(0xbc7b297b), C32e(0x056ed36e), C32e(0x6c443c44), - C32e(0x2c8b798b), C32e(0x813de23d), C32e(0x31271d27), C32e(0x379a769a), - C32e(0x964d3b4d), C32e(0x9efa56fa), C32e(0xa6d24ed2), C32e(0x36221e22), - C32e(0xe476db76), C32e(0x121e0a1e), C32e(0xfcb46cb4), C32e(0x8f37e437), - C32e(0x78e75de7), C32e(0x0fb26eb2), C32e(0x692aef2a), 
C32e(0x35f1a6f1), - C32e(0xdae3a8e3), C32e(0xc6f7a4f7), C32e(0x8a593759), C32e(0x74868b86), - C32e(0x83563256), C32e(0x4ec543c5), C32e(0x85eb59eb), C32e(0x18c2b7c2), - C32e(0x8e8f8c8f), C32e(0x1dac64ac), C32e(0xf16dd26d), C32e(0x723be03b), - C32e(0x1fc7b4c7), C32e(0xb915fa15), C32e(0xfa090709), C32e(0xa06f256f), - C32e(0x20eaafea), C32e(0x7d898e89), C32e(0x6720e920), C32e(0x38281828), - C32e(0x0b64d564), C32e(0x73838883), C32e(0xfbb16fb1), C32e(0xca967296), - C32e(0x546c246c), C32e(0x5f08f108), C32e(0x2152c752), C32e(0x64f351f3), - C32e(0xae652365), C32e(0x25847c84), C32e(0x57bf9cbf), C32e(0x5d632163), - C32e(0xea7cdd7c), C32e(0x1e7fdc7f), C32e(0x9c918691), C32e(0x9b948594), - C32e(0x4bab90ab), C32e(0xbac642c6), C32e(0x2657c457), C32e(0x29e5aae5), - C32e(0xe373d873), C32e(0x090f050f), C32e(0xf4030103), C32e(0x2a361236), - C32e(0x3cfea3fe), C32e(0x8be15fe1), C32e(0xbe10f910), C32e(0x026bd06b), - C32e(0xbfa891a8), C32e(0x71e858e8), C32e(0x53692769), C32e(0xf7d0b9d0), - C32e(0x91483848), C32e(0xde351335), C32e(0xe5ceb3ce), C32e(0x77553355), - C32e(0x04d6bbd6), C32e(0x39907090), C32e(0x87808980), C32e(0xc1f2a7f2), - C32e(0xecc1b6c1), C32e(0x5a662266), C32e(0xb8ad92ad), C32e(0xa9602060), - C32e(0x5cdb49db), C32e(0xb01aff1a), C32e(0xd8887888), C32e(0x2b8e7a8e), - C32e(0x898a8f8a), C32e(0x4a13f813), C32e(0x929b809b), C32e(0x23391739), - C32e(0x1075da75), C32e(0x84533153), C32e(0xd551c651), C32e(0x03d3b8d3), - C32e(0xdc5ec35e), C32e(0xe2cbb0cb), C32e(0xc3997799), C32e(0x2d331133), - C32e(0x3d46cb46), C32e(0xb71ffc1f), C32e(0x0c61d661), C32e(0x624e3a4e) -}; - -__device__ void groestl512_perm_P(uint32_t *a) -{ - uint32_t t[32]; - -//#pragma unroll 14 - for(int r=0;r<14;r++) - { -#pragma unroll 16 - for(int k=0;k<16;k++) - { - a[(k*2)+0] ^= PC32up(k * 0x10, r); - //a[(k<<1)+1] ^= PC32dn(k * 0x10, r); - } - - // RBTT -#pragma unroll 16 - for(int k=0;k<32;k+=2) - { - t[k + 0] = T0up( B32_0(a[k & 0x1f]) ) ^ - T1up( B32_1(a[(k + 2) & 0x1f]) ) ^ - T2up( B32_2(a[(k + 4) & 0x1f]) ) 
^ - T3up( B32_3(a[(k + 6) & 0x1f]) ) ^ - T0dn( B32_0(a[(k + 9) & 0x1f]) ) ^ - T1dn( B32_1(a[(k + 11) & 0x1f]) ) ^ - T2dn( B32_2(a[(k + 13) & 0x1f]) ) ^ - T3dn( B32_3(a[(k + 23) & 0x1f]) ); - - t[k + 1] = T0dn( B32_0(a[k & 0x1f]) ) ^ - T1dn( B32_1(a[(k + 2) & 0x1f]) ) ^ - T2dn( B32_2(a[(k + 4) & 0x1f]) ) ^ - T3dn( B32_3(a[(k + 6) & 0x1f]) ) ^ - T0up( B32_0(a[(k + 9) & 0x1f]) ) ^ - T1up( B32_1(a[(k + 11) & 0x1f]) ) ^ - T2up( B32_2(a[(k + 13) & 0x1f]) ) ^ - T3up( B32_3(a[(k + 23) & 0x1f]) ); - } -#pragma unroll 32 - for(int k=0;k<32;k++) - a[k] = t[k]; - } -} - -__device__ void groestl512_perm_Q(uint32_t *a) -{ -//#pragma unroll 14 - for(int r=0;r<14;r++) - { - uint32_t t[32]; - -#pragma unroll 16 - for(int k=0;k<16;k++) - { - a[(k*2)+0] ^= QC32up(k * 0x10, r); - a[(k*2)+1] ^= QC32dn(k * 0x10, r); - } - - // RBTT -#pragma unroll 16 - for(int k=0;k<32;k+=2) - { - t[k + 0] = T0up( B32_0(a[(k + 2) & 0x1f]) ) ^ - T1up( B32_1(a[(k + 6) & 0x1f]) ) ^ - T2up( B32_2(a[(k + 10) & 0x1f]) ) ^ - T3up( B32_3(a[(k + 22) & 0x1f]) ) ^ - T0dn( B32_0(a[(k + 1) & 0x1f]) ) ^ - T1dn( B32_1(a[(k + 5) & 0x1f]) ) ^ - T2dn( B32_2(a[(k + 9) & 0x1f]) ) ^ - T3dn( B32_3(a[(k + 13) & 0x1f]) ); - - t[k + 1] = T0dn( B32_0(a[(k + 2) & 0x1f]) ) ^ - T1dn( B32_1(a[(k + 6) & 0x1f]) ) ^ - T2dn( B32_2(a[(k + 10) & 0x1f]) ) ^ - T3dn( B32_3(a[(k + 22) & 0x1f]) ) ^ - T0up( B32_0(a[(k + 1) & 0x1f]) ) ^ - T1up( B32_1(a[(k + 5) & 0x1f]) ) ^ - T2up( B32_2(a[(k + 9) & 0x1f]) ) ^ - T3up( B32_3(a[(k + 13) & 0x1f]) ); - } -#pragma unroll 32 - for(int k=0;k<32;k++) - a[k] = t[k]; - } -} - -template __global__ void groestl512_gpu_hash(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t message[32]; - uint32_t state[32]; - - // lese message ein & verknüpfe diese mit dem hash1 von hefty1 - // lese den state ein - -#pragma unroll 32 - for(int k=0;k<32;k++) - { - state[k] 
= groestl_gpu_state[k]; - message[k] = groestl_gpu_msg[k]; - } - - uint32_t nounce = nonceVector[thread]; - // nounce setzen - //message[19] = startNounce + thread; - message[19] = nounce; - - uint32_t hashPosition = nounce - startNounce; - - // den richtigen Hefty1 Hash holen -// memcpy(&message[21], &heftyHashes[8 * hashPosition], sizeof(uint32_t) * 8); - uint32_t *heftyHash = &heftyHashes[8 * hashPosition]; -#pragma unroll 8 - for (int k=0; k<8; ++k) - message[BLOCKSIZE/4+k] = heftyHash[k]; - - uint32_t g[32]; -#pragma unroll 32 - for(int u=0;u<32;u++) - g[u] = message[u] ^ state[u]; - - // Perm - groestl512_perm_P(g); - groestl512_perm_Q(message); - -#pragma unroll 32 - for(int u=0;u<32;u++) - { - state[u] ^= g[u] ^ message[u]; - g[u] = state[u]; - } - - groestl512_perm_P(g); - -#pragma unroll 32 - for(int u=0;u<32;u++) - state[u] ^= g[u]; - - // kopiere Ergebnis -#pragma unroll 16 - for(int k=0;k<16;k++) - ((uint32_t*)outputHash)[16*hashPosition+k] = state[k + 16]; - } -} - -#define texDef(texname, texmem, texsource, texsize) \ - unsigned int *texmem; \ - cudaMalloc(&texmem, texsize); \ - cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ - texname.normalized = 0; \ - texname.filterMode = cudaFilterModePoint; \ - texname.addressMode[0] = cudaAddressModeClamp; \ - { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ - cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \ - -// Setup-Funktionen -__host__ void groestl512_cpu_init(int thr_id, uint32_t threads) -{ - // Texturen mit obigem Makro initialisieren - texDef(t0up, d_T0up, T0up_cpu, sizeof(uint32_t)*256); - texDef(t0dn, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256); - texDef(t1up, d_T1up, T1up_cpu, sizeof(uint32_t)*256); - texDef(t1dn, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256); - texDef(t2up, d_T2up, T2up_cpu, sizeof(uint32_t)*256); - texDef(t2dn, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256); - texDef(t3up, d_T3up, T3up_cpu, sizeof(uint32_t)*256); - texDef(t3dn, d_T3dn, 
T3dn_cpu, sizeof(uint32_t)*256); - - // Speicher für alle Ergebnisse belegen - cudaMalloc(&d_hash4output[thr_id], 16 * sizeof(uint32_t) * threads); -} - -static int BLOCKSIZE = 84; - -__host__ void groestl512_cpu_setBlock(void *data, int len) - // data muss 80/84-Byte haben! - // heftyHash hat 32-Byte -{ - // Nachricht expandieren und setzen - uint32_t msgBlock[32]; - - memset(msgBlock, 0, sizeof(uint32_t) * 32); - memcpy(&msgBlock[0], data, len); - - // Erweitere die Nachricht auf den Nachrichtenblock (padding) - // Unsere Nachricht hat 112/116 Byte - if (len == 84) { - msgBlock[29] = 0x80; - msgBlock[31] = 0x01000000; - } else if (len == 80) { - msgBlock[28] = 0x80; - msgBlock[31] = 0x01000000; - } - // groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird - // auf der GPU ausgeführt) - - // setze register - uint32_t groestl_state_init[32]; - memset(groestl_state_init, 0, sizeof(uint32_t) * 32); - groestl_state_init[31] = 0x20000; - - // state speichern - cudaMemcpyToSymbol(groestl_gpu_state, groestl_state_init, 128); - - // Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch) - cudaMemcpyToSymbol(groestl_gpu_msg, msgBlock, 128); - BLOCKSIZE = len; -} - -__host__ void groestl512_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy) -{ - // Hefty1 Hashes kopieren (eigentlich nur zum debuggen) - if (copy) - CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice)); -} - -__host__ void groestl512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce) -{ - const uint32_t threadsperblock = 128; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - if (BLOCKSIZE == 84) - groestl512_gpu_hash<84><<>>(threads, startNounce, d_hash4output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]); - else if (BLOCKSIZE == 80) - 
groestl512_gpu_hash<80><<>>(threads, startNounce, d_hash4output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]); -} diff --git a/heavy/cuda_groestl512.h b/heavy/cuda_groestl512.h deleted file mode 100644 index 0cdc13b809..0000000000 --- a/heavy/cuda_groestl512.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef _CUDA_GROESTL512_H -#define _CUDA_GROESTL512_H - -void groestl512_cpu_init(int thr_id, int threads); -void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy); -void groestl512_cpu_setBlock(void *data, int len); -void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce); - -#endif \ No newline at end of file diff --git a/heavy/cuda_hefty1.cu b/heavy/cuda_hefty1.cu deleted file mode 100644 index 7bc4411a29..0000000000 --- a/heavy/cuda_hefty1.cu +++ /dev/null @@ -1,410 +0,0 @@ -#include -#include - -#include "miner.h" - -#include "cuda_helper.h" - -#define USE_SHARED 1 - -// globaler Speicher für alle HeftyHashes aller Threads -uint32_t *heavy_heftyHashes[MAX_GPUS]; - -/* Hash-Tabellen */ -__constant__ uint32_t hefty_gpu_constantTable[64]; -#if USE_SHARED -#define heftyLookUp(x) (*((uint32_t*)heftytab + (x))) -#else -#define heftyLookUp(x) hefty_gpu_constantTable[x] -#endif - -// muss expandiert werden -__constant__ uint32_t hefty_gpu_blockHeader[16]; // 2x512 Bit Message -__constant__ uint32_t hefty_gpu_register[8]; -__constant__ uint32_t hefty_gpu_sponge[4]; - -uint32_t hefty_cpu_hashTable[] = { - 0x6a09e667UL, - 0xbb67ae85UL, - 0x3c6ef372UL, - 0xa54ff53aUL, - 0x510e527fUL, - 0x9b05688cUL, - 0x1f83d9abUL, - 0x5be0cd19UL }; - -uint32_t hefty_cpu_constantTable[] = { - 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, - 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, - 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL, - 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL, - 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL, - 0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, - 
0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, - 0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL, - 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL, - 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL, - 0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, - 0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, - 0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL, - 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL, - 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL, - 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL -}; - -#if 0 -#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) -#else -__host__ __device__ -static uint32_t S(uint32_t x, int n) -{ - return (((x) >> (n)) | ((x) << (32 - (n)))); -} -#endif - -#define R(x, n) ((x) >> (n)) -#define Ch(x, y, z) ((x & (y ^ z)) ^ z) -#define Maj(x, y, z) ((x & (y | z)) | (y & z)) -#define S0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22)) -#define S1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25)) -#define s0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3)) -#define s1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10)) - -#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) - -// uint8_t -#define smoosh4(x) ( ((x)>>4) ^ ((x) & 0x0F) ) - -__host__ __forceinline__ __device__ -uint8_t smoosh2(uint32_t x) -{ - uint16_t w = (x >> 16) ^ (x & 0xffff); - uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) ); - return 24 - (((n >> 2) ^ (n & 0x03)) << 3); -} -// 4 auf einmal -#define smoosh4Quad(x) ( (((x)>>4) ^ (x)) & 0x0F0F0F0F ) -#define getByte(x,y) ( ((x) >> (y)) & 0xFF ) - -__host__ __forceinline__ __device__ -void Mangle(uint32_t *inp) -{ - uint32_t r = smoosh4Quad(inp[0]); - uint32_t inp0org; - uint32_t tmp0Mask, tmp1Mask; - uint32_t in1, in2, isAddition; - int32_t tmp; - uint8_t b; - - inp[1] = inp[1] ^ S(inp[0], getByte(r, 24)); - - r += 0x01010101; - tmp = smoosh2(inp[1]); - b = getByte(r,tmp); - inp0org = S(inp[0], b); - tmp0Mask = (uint32_t) 
-((tmp >> 3) & 1); // Bit 3 an Position 0 - tmp1Mask = (uint32_t) -((tmp >> 4) & 1); // Bit 4 an Position 0 - - in1 = (inp[2] & ~inp0org) | - (tmp1Mask & ~inp[2] & inp0org) | - (~tmp0Mask & ~inp[2] & inp0org); - in2 = inp[2] += ~inp0org; - isAddition = ~tmp0Mask & tmp1Mask; - inp[2] = isAddition ? in2 : in1; - - r += 0x01010101; - tmp = smoosh2(inp[1] ^ inp[2]); - b = getByte(r,tmp); - inp0org = S(inp[0], b); - tmp0Mask = (uint32_t) -((tmp >> 3) & 1); // Bit 3 an Position 0 - tmp1Mask = (uint32_t) -((tmp >> 4) & 1); // Bit 4 an Position 0 - - in1 = (inp[3] & ~inp0org) | - (tmp1Mask & ~inp[3] & inp0org) | - (~tmp0Mask & ~inp[3] & inp0org); - in2 = inp[3] += ~inp0org; - isAddition = ~tmp0Mask & tmp1Mask; - inp[3] = isAddition ? in2 : in1; - - inp[0] ^= (inp[1] ^ inp[2]) + inp[3]; -} - -__host__ __forceinline__ __device__ -void Absorb(uint32_t *inp, uint32_t x) -{ - inp[0] ^= x; - Mangle(inp); -} - -__host__ __forceinline__ __device__ -uint32_t Squeeze(uint32_t *inp) -{ - uint32_t y = inp[0]; - Mangle(inp); - return y; -} - -__host__ __forceinline__ __device__ -uint32_t Br(uint32_t *sponge, uint32_t x) -{ - uint32_t r = Squeeze(sponge); - uint32_t t = ((r >> 8) & 0x1F); - uint32_t y = 1 << t; - - uint32_t a = (((r>>1) & 0x01) << t) & y; - uint32_t b = ((r & 0x01) << t) & y; - uint32_t c = x & y; - - uint32_t retVal = (x & ~y) | (~b & c) | (a & ~c); - return retVal; -} - -__device__ __forceinline__ -void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge) -{ - uint32_t tmpBr; - - uint32_t brG = Br(sponge, regs[6]); - uint32_t brF = Br(sponge, regs[5]); - uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K; - uint32_t brE = Br(sponge, regs[4]); - uint32_t tmp2 = tmp1 + S1(brE); - uint32_t brC = Br(sponge, regs[2]); - uint32_t brB = Br(sponge, regs[1]); - uint32_t brA = Br(sponge, regs[0]); - uint32_t tmp3 = Maj(brA, brB, brC); - tmpBr = Br(sponge, regs[0]); - uint32_t tmp4 = tmp3 + S0(tmpBr); - tmpBr = Br(sponge, tmp2); - - #pragma unroll 7 - 
for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; - regs[0] = tmp2 + tmp4; - regs[4] += tmpBr; -} - -__host__ -void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge) -{ - uint32_t tmpBr; - - uint32_t brG = Br(sponge, regs[6]); - uint32_t brF = Br(sponge, regs[5]); - uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K; - uint32_t brE = Br(sponge, regs[4]); - uint32_t tmp2 = tmp1 + S1(brE); - uint32_t brC = Br(sponge, regs[2]); - uint32_t brB = Br(sponge, regs[1]); - uint32_t brA = Br(sponge, regs[0]); - uint32_t tmp3 = Maj(brA, brB, brC); - tmpBr = Br(sponge, regs[0]); - uint32_t tmp4 = tmp3 + S0(tmpBr); - tmpBr = Br(sponge, tmp2); - - for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; - regs[0] = tmp2 + tmp4; - regs[4] += tmpBr; -} - -__global__ -void hefty_gpu_hash(uint32_t threads, uint32_t startNounce, uint32_t *outputHash) -{ -#if USE_SHARED - extern __shared__ unsigned char heftytab[]; - if(threadIdx.x < 64) - { - *((uint32_t*)heftytab + threadIdx.x) = hefty_gpu_constantTable[threadIdx.x]; - } - - __syncthreads(); -#endif - - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - // bestimme den aktuellen Zähler - uint32_t nounce = startNounce + thread; - - // jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory - // reduktion von 256 byte auf 128 byte - uint32_t W1[16]; - uint32_t W2[16]; - - // Initialisiere die register a bis h mit der Hash-Tabelle - uint32_t regs[8]; - uint32_t hash[8]; - uint32_t sponge[4]; - -#pragma unroll 4 - for(int k=0; k < 4; k++) - sponge[k] = hefty_gpu_sponge[k]; - - // pre -#pragma unroll 8 - for (int k=0; k < 8; k++) - { - regs[k] = hefty_gpu_register[k]; - hash[k] = regs[k]; - } - - //memcpy(W, &hefty_gpu_blockHeader[0], sizeof(uint32_t) * 16); // verbleibende 20 bytes aus Block 2 plus padding -#pragma unroll 16 - for(int k=0;k<16;k++) - W1[k] = hefty_gpu_blockHeader[k]; - W1[3] = SWAB32(nounce); - - // 2. 
Runde -#pragma unroll 16 - for(int j=0;j<16;j++) - Absorb(sponge, W1[j] ^ heftyLookUp(j)); - -// Progress W1 (Bytes 0...63) -#pragma unroll 16 - for(int j=0;j<16;j++) - { - Absorb(sponge, regs[3] ^ regs[7]); - hefty_gpu_round(regs, W1[j], heftyLookUp(j), sponge); - } - -// Progress W2 (Bytes 64...127) then W3 (Bytes 128...191) ... - -#pragma unroll 3 - for(int k=0;k<3;k++) - { - #pragma unroll 2 - for(int j=0;j<2;j++) - W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; - #pragma unroll 5 - for(int j=2;j<7;j++) - W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; - - #pragma unroll 8 - for(int j=7;j<15;j++) - W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; - - W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; - - #pragma unroll 16 - for(int j=0;j<16;j++) - { - Absorb(sponge, regs[3] + regs[7]); - hefty_gpu_round(regs, W2[j], heftyLookUp(j + ((k+1)<<4)), sponge); - } - #pragma unroll 16 - for(int j=0;j<16;j++) - W1[j] = W2[j]; - } - -#pragma unroll 8 - for(int k=0;k<8;k++) - hash[k] += regs[k]; - -#pragma unroll 8 - for(int k=0;k<8;k++) - ((uint32_t*)outputHash)[(thread<<3)+k] = SWAB32(hash[k]); - } -} - -__host__ -void hefty_cpu_init(int thr_id, uint32_t threads) -{ - CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); - - // Kopiere die Hash-Tabellen in den GPU-Speicher - cudaMemcpyToSymbol( hefty_gpu_constantTable, - hefty_cpu_constantTable, - sizeof(uint32_t) * 64 ); - - // Speicher für alle Hefty1 hashes belegen - CUDA_SAFE_CALL(cudaMalloc(&heavy_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads)); -} - -__host__ -void hefty_cpu_setBlock(int thr_id, uint32_t threads, void *data, int len) -// data muss 80/84-Byte haben! 
-{ - // Nachricht expandieren und setzen - uint32_t msgBlock[32]; - - memset(msgBlock, 0, sizeof(msgBlock)); - memcpy(&msgBlock[0], data, len); - if (len == 84) { - msgBlock[21] |= 0x80; - msgBlock[31] = 672; // bitlen - } else if (len == 80) { - msgBlock[20] |= 0x80; - msgBlock[31] = 640; // bitlen - } - - for(int i=0;i<31;i++) // Byteorder drehen - msgBlock[i] = SWAB32(msgBlock[i]); - - // die erste Runde wird auf der CPU durchgeführt, da diese für - // alle Threads gleich ist. Der Hash wird dann an die Threads - // übergeben - - // Erstelle expandierten Block W - uint32_t W[64]; - memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16); - for(int j=16;j<64;j++) - W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16]; - - // Initialisiere die register a bis h mit der Hash-Tabelle - uint32_t regs[8]; - uint32_t hash[8]; - uint32_t sponge[4]; - - // pre - memset(sponge, 0, sizeof(uint32_t) * 4); - for (int k=0; k < 8; k++) - { - regs[k] = hefty_cpu_hashTable[k]; - hash[k] = regs[k]; - } - - // 1. Runde - for(int j=0;j<16;j++) - Absorb(sponge, W[j] ^ hefty_cpu_constantTable[j]); - - for(int j=0;j<16;j++) - { - Absorb(sponge, regs[3] ^ regs[7]); - hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge); - } - - for(int j=16;j<64;j++) - { - Absorb(sponge, regs[3] + regs[7]); - hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge); - } - - for(int k=0;k<8;k++) - hash[k] += regs[k]; - - // sponge speichern - cudaMemcpyToSymbol(hefty_gpu_sponge, sponge, 16); - // hash speichern - cudaMemcpyToSymbol(hefty_gpu_register, hash, 32); - // Blockheader setzen (korrekte Nonce fehlt da drin noch) - CUDA_SAFE_CALL(cudaMemcpyToSymbol(hefty_gpu_blockHeader, &msgBlock[16], 64)); -} - -__host__ -void hefty_cpu_hash(int thr_id, uint32_t threads, int startNounce) -{ - uint32_t threadsperblock = 256; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - // Größe des dynamischen Shared 
Memory Bereichs -#if USE_SHARED - int shared_size = 8 * 64 * sizeof(uint32_t); -#else - int shared_size = 0; -#endif - - hefty_gpu_hash <<< grid, block, shared_size >>> (threads, startNounce, heavy_heftyHashes[thr_id]); - -} diff --git a/heavy/cuda_hefty1.h b/heavy/cuda_hefty1.h deleted file mode 100644 index 17b196c836..0000000000 --- a/heavy/cuda_hefty1.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _CUDA_HEFTY1_H -#define _CUDA_HEFTY1_H - -void hefty_cpu_hash(int thr_id, int threads, int startNounce); -void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len); -void hefty_cpu_init(int thr_id, int threads); - -#endif \ No newline at end of file diff --git a/heavy/cuda_keccak512.cu b/heavy/cuda_keccak512.cu deleted file mode 100644 index 76872e9824..0000000000 --- a/heavy/cuda_keccak512.cu +++ /dev/null @@ -1,276 +0,0 @@ -#include -#include - -#include "cuda_helper.h" - -// globaler Speicher für alle HeftyHashes aller Threads -extern uint32_t *heavy_heftyHashes[MAX_GPUS]; -extern uint32_t *heavy_nonceVector[MAX_GPUS]; - -// globaler Speicher für unsere Ergebnisse -uint32_t *d_hash3output[MAX_GPUS]; -extern uint32_t *d_hash4output[MAX_GPUS]; -extern uint32_t *d_hash5output[MAX_GPUS]; - -// der Keccak512 State nach der ersten Runde (72 Bytes) -__constant__ uint64_t c_State[25]; - -// die Message (72 Bytes) für die zweite Runde auf der GPU -__constant__ uint32_t c_PaddedMessage2[18]; // 44 bytes of remaining message (Nonce at offset 4) plus padding - -// ---------------------------- BEGIN CUDA keccak512 functions ------------------------------------ - -#define U32TO64_LE(p) \ - (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) - -#define U64TO32_LE(p, v) \ - *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32); - -static __device__ void mycpy72(uint32_t *d, const uint32_t *s) { - #pragma unroll 18 - for (int k=0; k < 18; ++k) d[k] = s[k]; -} - -static __device__ void mycpy32(uint32_t *d, const uint32_t *s) { - #pragma unroll 8 - for (int k=0; k < 8; ++k) d[k] 
= s[k]; -} - -typedef struct keccak_hash_state_t { - uint64_t state[25]; // 25*2 - uint32_t buffer[72/4]; // 72 -} keccak_hash_state; - -__device__ void statecopy(uint64_t *d, uint64_t *s) -{ - #pragma unroll 25 - for (int i=0; i < 25; ++i) - d[i] = s[i]; -} - - -static const uint64_t host_keccak_round_constants[24] = { - 0x0000000000000001ull, 0x0000000000008082ull, - 0x800000000000808aull, 0x8000000080008000ull, - 0x000000000000808bull, 0x0000000080000001ull, - 0x8000000080008081ull, 0x8000000000008009ull, - 0x000000000000008aull, 0x0000000000000088ull, - 0x0000000080008009ull, 0x000000008000000aull, - 0x000000008000808bull, 0x800000000000008bull, - 0x8000000000008089ull, 0x8000000000008003ull, - 0x8000000000008002ull, 0x8000000000000080ull, - 0x000000000000800aull, 0x800000008000000aull, - 0x8000000080008081ull, 0x8000000000008080ull, - 0x0000000080000001ull, 0x8000000080008008ull -}; - -__constant__ uint64_t c_keccak_round_constants[24]; - -__host__ __device__ void -keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) { - int i; - uint64_t t[5], u[5], v, w; - - /* absorb input */ - for (i = 0; i < 9 /* 72/8 */; i++, in += 2) - s[i] ^= U32TO64_LE(in); - - for (i = 0; i < 24; i++) { - /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROTL64(t[1], 1); - u[1] = t[0] ^ ROTL64(t[2], 1); - u[2] = t[1] ^ ROTL64(t[3], 1); - u[3] = t[2] ^ ROTL64(t[4], 1); - u[4] = t[3] ^ ROTL64(t[0], 1); - - /* theta: a[0,i], a[1,i], .. 
a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[ 1]; - s[ 1] = ROTL64(s[ 6], 44); - s[ 6] = ROTL64(s[ 9], 20); - s[ 9] = ROTL64(s[22], 61); - s[22] = ROTL64(s[14], 39); - s[14] = ROTL64(s[20], 18); - s[20] = ROTL64(s[ 2], 62); - s[ 2] = ROTL64(s[12], 43); - s[12] = ROTL64(s[13], 25); - s[13] = ROTL64(s[19], 8); - s[19] = ROTL64(s[23], 56); - s[23] = ROTL64(s[15], 41); - s[15] = ROTL64(s[ 4], 27); - s[ 4] = ROTL64(s[24], 14); - s[24] = ROTL64(s[21], 2); - s[21] = ROTL64(s[ 8], 55); - s[ 8] = ROTL64(s[16], 45); - s[16] = ROTL64(s[ 5], 36); - s[ 5] = ROTL64(s[ 3], 28); - s[ 3] = ROTL64(s[18], 21); - s[18] = ROTL64(s[17], 15); - s[17] = ROTL64(s[11], 10); - s[11] = ROTL64(s[ 7], 6); - s[ 7] = ROTL64(s[10], 3); - s[10] = ROTL64( v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; - v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] ^= keccak_round_constants[i]; - } -} - -// Die 
Hash-Funktion -template __global__ void keccak512_gpu_hash(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - // bestimme den aktuellen Zähler - //uint32_t nounce = startNounce + thread; - uint32_t nounce = nonceVector[thread]; - - // Index-Position des Hashes in den Hash Puffern bestimmen (Hefty1 und outputHash) - uint32_t hashPosition = nounce - startNounce; - - // erstmal den State der ersten Runde holen - uint64_t keccak_gpu_state[25]; -#pragma unroll 25 - for (int i=0; i < 25; ++i) - keccak_gpu_state[i] = c_State[i]; - - // Message2 in den Puffer holen - uint32_t msgBlock[18]; - mycpy72(msgBlock, c_PaddedMessage2); - - // die individuelle Nonce einsetzen - msgBlock[1] = nounce; - - // den individuellen Hefty1 Hash einsetzen - mycpy32(&msgBlock[(BLOCKSIZE-72)/sizeof(uint32_t)], &heftyHashes[8 * hashPosition]); - - // den Block einmal gut durchschütteln - keccak_block(keccak_gpu_state, msgBlock, c_keccak_round_constants); - - // das Hash erzeugen - uint32_t hash[16]; - -#pragma unroll 8 - for (int i = 0; i < 64; i += 8) { - U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]); - } - - // und ins Global Memory rausschreiben -#pragma unroll 16 - for(int k=0;k<16;k++) - ((uint32_t*)outputHash)[16*hashPosition+k] = hash[k]; - } -} - -// ---------------------------- END CUDA keccak512 functions ------------------------------------ - -__host__ -void keccak512_cpu_init(int thr_id, uint32_t threads) -{ - // Kopiere die Hash-Tabellen in den GPU-Speicher - cudaMemcpyToSymbol( c_keccak_round_constants, - host_keccak_round_constants, - sizeof(host_keccak_round_constants), - 0, cudaMemcpyHostToDevice); - - // Speicher für alle Ergebnisse belegen - cudaMalloc(&d_hash3output[thr_id], 16 * sizeof(uint32_t) * threads); -} - -// ----------------BEGIN keccak512 CPU version from scrypt-jane code -------------------- - -#define 
SCRYPT_HASH_DIGEST_SIZE 64 -#define SCRYPT_KECCAK_F 1600 -#define SCRYPT_KECCAK_C (SCRYPT_HASH_DIGEST_SIZE * 8 * 2) /* 1024 */ -#define SCRYPT_KECCAK_R (SCRYPT_KECCAK_F - SCRYPT_KECCAK_C) /* 576 */ -#define SCRYPT_HASH_BLOCK_SIZE (SCRYPT_KECCAK_R / 8) /* 72 */ - -// --------------- END keccak512 CPU version from scrypt-jane code -------------------- - -static int BLOCKSIZE = 84; - -__host__ -void keccak512_cpu_setBlock(void *data, int len) - // data muss 80 oder 84-Byte haben! - // heftyHash hat 32-Byte -{ - // CH - // state init - uint64_t keccak_cpu_state[25]; - memset(keccak_cpu_state, 0, sizeof(keccak_cpu_state)); - - // erste Runde - keccak_block((uint64_t*)&keccak_cpu_state, (const uint32_t*)data, host_keccak_round_constants); - - // state kopieren - cudaMemcpyToSymbol( c_State, keccak_cpu_state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); - - // keccak hat 72-Byte blöcke, d.h. in unserem Fall zwei Blöcke - // zu jeweils - uint32_t msgBlock[18]; - memset(msgBlock, 0, 18 * sizeof(uint32_t)); - - // kopiere die restlichen Daten rein (aber nur alles nach Byte 72) - if (len == 84) - memcpy(&msgBlock[0], &((uint8_t*)data)[72], 12); - else if (len == 80) - memcpy(&msgBlock[0], &((uint8_t*)data)[72], 8); - - // Nachricht abschließen - if (len == 84) - msgBlock[11] = 0x01; - else if (len == 80) - msgBlock[10] = 0x01; - msgBlock[17] = 0x80000000; - - // Message 2 ins Constant Memory kopieren (die variable Nonce und - // der Hefty1 Anteil muss aber auf der GPU erst noch ersetzt werden) - cudaMemcpyToSymbol( c_PaddedMessage2, msgBlock, 18*sizeof(uint32_t), 0, cudaMemcpyHostToDevice ); - - BLOCKSIZE = len; -} - -__host__ -void keccak512_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy) -{ - // Hefty1 Hashes kopieren - if (copy) - CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice)); - //else cudaDeviceSynchronize(); -} - -__host__ -void keccak512_cpu_hash(int thr_id, 
uint32_t threads, uint32_t startNounce) -{ - const uint32_t threadsperblock = 128; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - if (BLOCKSIZE==84) - keccak512_gpu_hash<84><<>>(threads, startNounce, d_hash3output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]); - else if (BLOCKSIZE==80) - keccak512_gpu_hash<80><<>>(threads, startNounce, d_hash3output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]); -} diff --git a/heavy/cuda_keccak512.h b/heavy/cuda_keccak512.h deleted file mode 100644 index 1182447573..0000000000 --- a/heavy/cuda_keccak512.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef _CUDA_KECCAK512_H -#define _CUDA_KECCAK512_H - -void keccak512_cpu_init(int thr_id, int threads); -void keccak512_cpu_setBlock(void *data, int len); -void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy); -void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce); - -#endif diff --git a/heavy/cuda_sha256.cu b/heavy/cuda_sha256.cu deleted file mode 100644 index a4c309ba33..0000000000 --- a/heavy/cuda_sha256.cu +++ /dev/null @@ -1,272 +0,0 @@ -#include -#include - -#include "cuda_helper.h" - -// globaler Speicher für alle HeftyHashes aller Threads -extern uint32_t *heavy_heftyHashes[MAX_GPUS]; -extern uint32_t *heavy_nonceVector[MAX_GPUS]; - -// globaler Speicher für unsere Ergebnisse -uint32_t *d_hash2output[MAX_GPUS]; - - -/* Hash-Tabellen */ -__constant__ uint32_t sha256_gpu_constantTable[64]; - -// muss expandiert werden -__constant__ uint32_t sha256_gpu_blockHeader[16]; // 2x512 Bit Message -__constant__ uint32_t sha256_gpu_register[8]; - -uint32_t sha256_cpu_hashTable[] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; -uint32_t sha256_cpu_constantTable[] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, -}; - -#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) -#define R(x, n) ((x) >> (n)) -#define Ch(x, y, z) ((x & (y ^ z)) ^ z) -#define Maj(x, y, z) ((x & (y | z)) | (y & z)) -#define S0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22)) -#define S1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25)) -#define s0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3)) -#define s1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10)) - -#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) - -// Die Hash-Funktion -template __global__ void sha256_gpu_hash(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - // bestimme den aktuellen Zähler - uint32_t nounce = startNounce + thread; - nonceVector[thread] = nounce; - - // jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory - uint32_t W1[16]; - uint32_t W2[16]; - - // Initialisiere die register a bis h mit der Hash-Tabelle - uint32_t regs[8]; - uint32_t hash[8]; - - // pre -#pragma unroll 8 - for (int k=0; k < 8; k++) - { - regs[k] = sha256_gpu_register[k]; - hash[k] = regs[k]; - } - - // 2. 
Runde - //memcpy(W, &sha256_gpu_blockHeader[0], sizeof(uint32_t) * 16); // TODO: aufsplitten in zwei Teilblöcke - //memcpy(&W[5], &heftyHashes[8 * (blockDim.x * blockIdx.x + threadIdx.x)], sizeof(uint32_t) * 8); // den richtigen Hefty1 Hash holen -#pragma unroll 16 - for(int k=0;k<16;k++) - W1[k] = sha256_gpu_blockHeader[k]; - - uint32_t offset = 8 * (blockDim.x * blockIdx.x + threadIdx.x); -#pragma unroll 8 - for(int k=0;k<8;k++) - W1[((BLOCKSIZE-64)/4)+k] = heftyHashes[offset + k]; - -#pragma unroll 8 - for (int i=((BLOCKSIZE-64)/4); i < ((BLOCKSIZE-64)/4)+8; ++i) W1[i] = SWAB32(W1[i]); // die Hefty1 Hashes brauchen eine Drehung ;) - W1[3] = SWAB32(nounce); - -// Progress W1 -#pragma unroll 16 - for(int j=0;j<16;j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j] + W1[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - - #pragma unroll 7 - for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; - regs[0] = T1 + T2; - regs[4] += T1; - } - -// Progress W2...W3 -#pragma unroll 3 - for(int k=0;k<3;k++) - { - #pragma unroll 2 - for(int j=0;j<2;j++) - W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; - #pragma unroll 5 - for(int j=2;j<7;j++) - W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; - - #pragma unroll 8 - for(int j=7;j<15;j++) - W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; - - W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; - - // Rundenfunktion - #pragma unroll 16 - for(int j=0;j<16;j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - - #pragma unroll 7 - for (int l=6; l >= 0; l--) regs[l+1] = regs[l]; - regs[0] = T1 + T2; - regs[4] += T1; - } - - #pragma unroll 16 - for(int j=0;j<16;j++) - W1[j] = W2[j]; - } - -/* - for(int j=16;j<64;j++) - W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16]; - -#pragma unroll 64 - for(int j=0;j<64;j++) 
- { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j] + W[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - - #pragma unroll 7 - for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; - regs[0] = T1 + T2; - regs[4] += T1; - } -*/ -#pragma unroll 8 - for(int k=0;k<8;k++) - hash[k] += regs[k]; - -#pragma unroll 8 - for(int k=0;k<8;k++) - ((uint32_t*)outputHash)[8*thread+k] = SWAB32(hash[k]); - } -} - -// Setup-Funktionen -__host__ void sha256_cpu_init(int thr_id, uint32_t threads) -{ - // Kopiere die Hash-Tabellen in den GPU-Speicher - cudaMemcpyToSymbol( sha256_gpu_constantTable, - sha256_cpu_constantTable, - sizeof(uint32_t) * 64 ); - - // Speicher für alle Ergebnisse belegen - cudaMalloc(&d_hash2output[thr_id], 8 * sizeof(uint32_t) * threads); -} - -static int BLOCKSIZE = 84; - -__host__ void sha256_cpu_setBlock(void *data, int len) - // data muss 80/84-Byte haben! - // heftyHash hat 32-Byte -{ - // Nachricht expandieren und setzen - uint32_t msgBlock[32]; - - memset(msgBlock, 0, sizeof(uint32_t) * 32); - memcpy(&msgBlock[0], data, len); - if (len == 84) { - memset(&msgBlock[21], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen - msgBlock[29] |= 0x80; - msgBlock[31] = 928; // bitlen - } else if (len == 80) { - memset(&msgBlock[20], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen - msgBlock[28] |= 0x80; - msgBlock[31] = 896; // bitlen - } - - for(int i=0;i<31;i++) // Byteorder drehen - msgBlock[i] = SWAB32(msgBlock[i]); - - // die erste Runde wird auf der CPU durchgeführt, da diese für - // alle Threads gleich ist. 
Der Hash wird dann an die Threads - // übergeben - uint32_t W[64]; - - // Erstelle expandierten Block W - memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16); - for(int j=16;j<64;j++) - W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16]; - - // Initialisiere die register a bis h mit der Hash-Tabelle - uint32_t regs[8]; - uint32_t hash[8]; - - // pre - for (int k=0; k < 8; k++) - { - regs[k] = sha256_cpu_hashTable[k]; - hash[k] = regs[k]; - } - - // 1. Runde - for(int j=0;j<64;j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_cpu_constantTable[j] + W[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - - //#pragma unroll 7 - for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; - // sollte mal noch durch memmov ersetzt werden! -// memcpy(®s[1], ®s[0], sizeof(uint32_t) * 7); - regs[0] = T1 + T2; - regs[4] += T1; - } - - for(int k=0;k<8;k++) - hash[k] += regs[k]; - - // hash speichern - cudaMemcpyToSymbol( sha256_gpu_register, - hash, - sizeof(uint32_t) * 8 ); - - // Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch) - cudaMemcpyToSymbol( sha256_gpu_blockHeader, - &msgBlock[16], - 64); - - BLOCKSIZE = len; -} - -__host__ void sha256_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy) -{ - // Hefty1 Hashes kopieren - if (copy) - CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice)); - //else cudaDeviceSynchronize(); -} - -__host__ void sha256_cpu_hash(int thr_id, uint32_t threads, int startNounce) -{ - const uint32_t threadsperblock = 256; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - if (BLOCKSIZE == 84) - sha256_gpu_hash<84><<>>(threads, startNounce, d_hash2output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]); - else if (BLOCKSIZE == 80) { - sha256_gpu_hash<80><<>>(threads, startNounce, 
d_hash2output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]); - } -} diff --git a/heavy/cuda_sha256.h b/heavy/cuda_sha256.h deleted file mode 100644 index 03385d125a..0000000000 --- a/heavy/cuda_sha256.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _CUDA_SHA256_H -#define _CUDA_SHA256_H - -void sha256_cpu_init(int thr_id, int threads); -void sha256_cpu_setBlock(void *data, int len); -void sha256_cpu_hash(int thr_id, int threads, int startNounce); -void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy); -#endif diff --git a/heavy/heavy.cu b/heavy/heavy.cu deleted file mode 100644 index 99b0357f57..0000000000 --- a/heavy/heavy.cu +++ /dev/null @@ -1,352 +0,0 @@ -#include -#include -#include -// include thrust -#include -#include - -#include "miner.h" - -extern "C" { -#include "sph/sph_keccak.h" -#include "sph/sph_blake.h" -#include "sph/sph_groestl.h" -} -#include "hefty1.h" -#include "heavy/heavy.h" -#include "cuda_helper.h" - -extern uint32_t *d_hash2output[MAX_GPUS]; -extern uint32_t *d_hash3output[MAX_GPUS]; -extern uint32_t *d_hash4output[MAX_GPUS]; -extern uint32_t *d_hash5output[MAX_GPUS]; - -#define HEAVYCOIN_BLKHDR_SZ 84 -#define MNR_BLKHDR_SZ 80 - -// nonce-array für die threads -uint32_t *heavy_nonceVector[MAX_GPUS]; - -extern uint32_t *heavy_heftyHashes[MAX_GPUS]; - -/* Combines top 64-bits from each hash into a single hash */ -static void combine_hashes(uint32_t *out, const uint32_t *hash1, const uint32_t *hash2, const uint32_t *hash3, const uint32_t *hash4) -{ - const uint32_t *hash[4] = { hash1, hash2, hash3, hash4 }; - int bits; - unsigned int i; - uint32_t mask; - unsigned int k; - - /* Transpose first 64 bits of each hash into out */ - memset(out, 0, 32); - bits = 0; - for (i = 7; i >= 6; i--) { - for (mask = 0x80000000; mask; mask >>= 1) { - for (k = 0; k < 4; k++) { - out[(255 - bits)/32] <<= 1; - if ((hash[k][i] & mask) != 0) - out[(255 - bits)/32] |= 1; - bits++; - } - } - } -} - -#ifdef _MSC_VER -#include 
-static uint32_t __inline bitsset( uint32_t x ) -{ - DWORD r = 0; - _BitScanReverse(&r, x); - return r; -} -#else -static uint32_t bitsset( uint32_t x ) -{ - return 31-__builtin_clz(x); -} -#endif - -// Finde das high bit in einem Multiword-Integer. -static int findhighbit(const uint32_t *ptarget, int words) -{ - int i; - int highbit = 0; - for (i=words-1; i >= 0; --i) - { - if (ptarget[i] != 0) { - highbit = i*32 + bitsset(ptarget[i])+1; - break; - } - } - return highbit; -} - -// Generiere ein Multiword-Integer das die Zahl -// (2 << highbit) - 1 repräsentiert. -static void genmask(uint32_t *ptarget, int words, int highbit) -{ - int i; - for (i=words-1; i >= 0; --i) - { - if ((i+1)*32 <= highbit) - ptarget[i] = UINT32_MAX; - else if (i*32 > highbit) - ptarget[i] = 0x00000000; - else - ptarget[i] = (1 << (highbit-i*32)) - 1; - } -} - -struct check_nonce_for_remove -{ - check_nonce_for_remove(uint64_t target, uint32_t *hashes, uint32_t hashlen, uint32_t startNonce) : - m_target(target), - m_hashes(hashes), - m_hashlen(hashlen), - m_startNonce(startNonce) { } - - uint64_t m_target; - uint32_t *m_hashes; - uint32_t m_hashlen; - uint32_t m_startNonce; - - __device__ - bool operator()(const uint32_t x) - { - // Position im Hash Buffer - uint32_t hashIndex = x - m_startNonce; - // Wert des Hashes (als uint64_t) auslesen. - // Steht im 6. und 7. Wort des Hashes (jeder dieser Hashes hat 512 Bits) - uint64_t hashValue = *((uint64_t*)(&m_hashes[m_hashlen*hashIndex + 6])); - bool res = (hashValue & m_target) != hashValue; - //printf("ndx=%x val=%08x target=%lx\n", hashIndex, hashValue, m_target); - // gegen das Target prüfen. Es dürfen nur Bits aus dem Target gesetzt sein. 
- return res; - } -}; - -static bool init[MAX_GPUS] = { 0 }; - -__host__ -int scanhash_heavy(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done, uint32_t maxvote, int blocklen) -{ - const uint32_t first_nonce = pdata[19]; - // CUDA will process thousands of threads. - uint32_t throughput = device_intensity(thr_id, __func__, (1U << 19) - 256); - throughput = min(throughput, (max_nonce - first_nonce)); - - int rc = 0; - uint32_t *hash = NULL; - uint32_t *cpu_nonceVector = NULL; - CUDA_SAFE_CALL(cudaMallocHost(&hash, throughput*8*sizeof(uint32_t))); - CUDA_SAFE_CALL(cudaMallocHost(&cpu_nonceVector, throughput*sizeof(uint32_t))); - - int nrmCalls[6]; - memset(nrmCalls, 0, sizeof(int) * 6); - - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x00ff; - - // für jeden Hash ein individuelles Target erstellen basierend - // auf dem höchsten Bit, das in ptarget gesetzt ist. - int highbit = findhighbit(ptarget, 8); - uint32_t target2[2], target3[2], target4[2], target5[2]; - genmask(target2, 2, highbit/4+(((highbit%4)>3)?1:0) ); // SHA256 - genmask(target3, 2, highbit/4+(((highbit%4)>2)?1:0) ); // keccak512 - genmask(target4, 2, highbit/4+(((highbit%4)>1)?1:0) ); // groestl512 - genmask(target5, 2, highbit/4+(((highbit%4)>0)?1:0) ); // blake512 - - if (!init[thr_id]) - { - hefty_cpu_init(thr_id, throughput); - sha256_cpu_init(thr_id, throughput); - keccak512_cpu_init(thr_id, throughput); - groestl512_cpu_init(thr_id, throughput); - blake512_cpu_init(thr_id, throughput); - combine_cpu_init(thr_id, throughput); - - CUDA_SAFE_CALL(cudaMalloc(&heavy_nonceVector[thr_id], sizeof(uint32_t) * throughput)); - - init[thr_id] = true; - } - - if (blocklen == HEAVYCOIN_BLKHDR_SZ) - { - uint16_t *ext = (uint16_t *)&pdata[20]; - - if (opt_vote > maxvote) { - applog(LOG_WARNING, "Your block reward vote (%hu) exceeds " - "the maxvote reported by the pool (%hu).", - opt_vote, maxvote); - } - - if (opt_trust_pool && opt_vote > maxvote) { - 
applog(LOG_WARNING, "Capping block reward vote to maxvote reported by pool."); - ext[0] = maxvote; - } - else - ext[0] = opt_vote; - } - - // Setze die Blockdaten - hefty_cpu_setBlock(thr_id, throughput, pdata, blocklen); - sha256_cpu_setBlock(pdata, blocklen); - keccak512_cpu_setBlock(pdata, blocklen); - groestl512_cpu_setBlock(pdata, blocklen); - blake512_cpu_setBlock(pdata, blocklen); - - do { - - ////// Compaction init - thrust::device_ptr devNoncePtr(heavy_nonceVector[thr_id]); - thrust::device_ptr devNoncePtrEnd((heavy_nonceVector[thr_id]) + throughput); - uint32_t actualNumberOfValuesInNonceVectorGPU = throughput; - uint64_t *t; - - hefty_cpu_hash(thr_id, throughput, pdata[19]); - //cudaDeviceSynchronize(); - sha256_cpu_hash(thr_id, throughput, pdata[19]); - //cudaDeviceSynchronize(); - - - ////// Compaction - t = (uint64_t*) target2; - devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash2output[thr_id], 8, pdata[19])); - actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); - if(actualNumberOfValuesInNonceVectorGPU == 0) - goto emptyNonceVector; - - keccak512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); - //cudaDeviceSynchronize(); - - ////// Compaction - t = (uint64_t*) target3; - devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash3output[thr_id], 16, pdata[19])); - actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); - if(actualNumberOfValuesInNonceVectorGPU == 0) - goto emptyNonceVector; - - blake512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); - //cudaDeviceSynchronize(); - - ////// Compaction - t = (uint64_t*) target5; - devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash5output[thr_id], 16, pdata[19])); - actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); - 
if(actualNumberOfValuesInNonceVectorGPU == 0) - goto emptyNonceVector; - - groestl512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); - //cudaDeviceSynchronize(); - - ////// Compaction - t = (uint64_t*) target4; - devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash4output[thr_id], 16, pdata[19])); - actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); - if(actualNumberOfValuesInNonceVectorGPU == 0) - goto emptyNonceVector; - - // combine - combine_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19], hash); - - if (opt_tracegpu) { - applog(LOG_BLUE, "heavy GPU hash:"); - applog_hash((uchar*)hash); - } - - // Ergebnisse kopieren - if(actualNumberOfValuesInNonceVectorGPU > 0) - { - size_t size = sizeof(uint32_t) * actualNumberOfValuesInNonceVectorGPU; - CUDA_SAFE_CALL(cudaMemcpy(cpu_nonceVector, heavy_nonceVector[thr_id], size, cudaMemcpyDeviceToHost)); - - for (uint32_t i=0; i < actualNumberOfValuesInNonceVectorGPU; i++) - { - uint32_t nonce = cpu_nonceVector[i]; - uint32_t *foundhash = &hash[8*i]; - if (foundhash[7] <= ptarget[7]) { - if (fulltest(foundhash, ptarget)) { - uint32_t verification[8]; - pdata[19] += nonce - pdata[19]; - heavycoin_hash((uchar*)verification, (uchar*)pdata, blocklen); - if (memcmp(verification, foundhash, 8*sizeof(uint32_t))) { - applog(LOG_ERR, "hash for nonce=$%08X does not validate on CPU!\n", nonce); - } else { - *hashes_done = pdata[19] - first_nonce; - rc = 1; - goto exit; - } - } - } - } - } - -emptyNonceVector: - - pdata[19] += throughput; - - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); - *hashes_done = pdata[19] - first_nonce; - -exit: - cudaFreeHost(cpu_nonceVector); - cudaFreeHost(hash); - return rc; -} - -__host__ -void heavycoin_hash(uchar* output, const uchar* input, int len) -{ - unsigned char hash1[32]; - unsigned char hash2[32]; - uint32_t hash3[16]; - uint32_t hash4[16]; - uint32_t hash5[16]; - 
uint32_t *final; - SHA256_CTX ctx; - sph_keccak512_context keccakCtx; - sph_groestl512_context groestlCtx; - sph_blake512_context blakeCtx; - - HEFTY1(input, len, hash1); - - /* HEFTY1 is new, so take an extra security measure to eliminate - * the possiblity of collisions: - * - * Hash(x) = SHA256(x + HEFTY1(x)) - * - * N.B. '+' is concatenation. - */ - SHA256_Init(&ctx); - SHA256_Update(&ctx, input, len); - SHA256_Update(&ctx, hash1, sizeof(hash1)); - SHA256_Final(hash2, &ctx); - - /* Additional security: Do not rely on a single cryptographic hash - * function. Instead, combine the outputs of 4 of the most secure - * cryptographic hash functions-- SHA256, KECCAK512, GROESTL512 - * and BLAKE512. - */ - - sph_keccak512_init(&keccakCtx); - sph_keccak512(&keccakCtx, input, len); - sph_keccak512(&keccakCtx, hash1, sizeof(hash1)); - sph_keccak512_close(&keccakCtx, (void *)&hash3); - - sph_groestl512_init(&groestlCtx); - sph_groestl512(&groestlCtx, input, len); - sph_groestl512(&groestlCtx, hash1, sizeof(hash1)); - sph_groestl512_close(&groestlCtx, (void *)&hash4); - - sph_blake512_init(&blakeCtx); - sph_blake512(&blakeCtx, input, len); - sph_blake512(&blakeCtx, (unsigned char *)&hash1, sizeof(hash1)); - sph_blake512_close(&blakeCtx, (void *)&hash5); - - final = (uint32_t *)output; - combine_hashes(final, (uint32_t *)hash2, hash3, hash4, hash5); -} diff --git a/heavy/heavy.h b/heavy/heavy.h deleted file mode 100644 index 59f39139ba..0000000000 --- a/heavy/heavy.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef _CUDA_HEAVY_H -#define _CUDA_HEAVY_H - -void blake512_cpu_init(int thr_id, uint32_t threads); -void blake512_cpu_setBlock(void *pdata, int len); -void blake512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce); - -void groestl512_cpu_init(int thr_id, uint32_t threads); -void groestl512_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy); -void groestl512_cpu_setBlock(void *data, int len); -void groestl512_cpu_hash(int thr_id, uint32_t 
threads, uint32_t startNounce); - -void hefty_cpu_hash(int thr_id, uint32_t threads, int startNounce); -void hefty_cpu_setBlock(int thr_id, uint32_t threads, void *data, int len); -void hefty_cpu_init(int thr_id, uint32_t threads); - -void keccak512_cpu_init(int thr_id, uint32_t threads); -void keccak512_cpu_setBlock(void *data, int len); -void keccak512_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy); -void keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce); - -void sha256_cpu_init(int thr_id, uint32_t threads); -void sha256_cpu_setBlock(void *data, int len); -void sha256_cpu_hash(int thr_id, uint32_t threads, int startNounce); -void sha256_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy); - -void combine_cpu_init(int thr_id, uint32_t threads); -void combine_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *hash); - -#endif diff --git a/install-sh b/install-sh deleted file mode 100644 index a9244eb078..0000000000 --- a/install-sh +++ /dev/null @@ -1,527 +0,0 @@ -#!/bin/sh -# install - install a program, script, or datafile - -scriptversion=2011-01-19.21; # UTC - -# This originates from X11R5 (mit/util/scripts/install.sh), which was -# later released in X11R6 (xc/config/util/install.sh) with the -# following copyright and license. -# -# Copyright (C) 1994 X Consortium -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to -# deal in the Software without restriction, including without limitation the -# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -# sell copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. 
-# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN -# AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNEC- -# TION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# Except as contained in this notice, the name of the X Consortium shall not -# be used in advertising or otherwise to promote the sale, use or other deal- -# ings in this Software without prior written authorization from the X Consor- -# tium. -# -# -# FSF changes to this file are in the public domain. -# -# Calling this script install-sh is preferred over install.sh, to prevent -# `make' implicit rules from creating a file called install from it -# when there is no Makefile. -# -# This script is compatible with the BSD install script, but was written -# from scratch. - -nl=' -' -IFS=" "" $nl" - -# set DOITPROG to echo to test this script - -# Don't use :- since 4.3BSD and earlier shells don't like it. -doit=${DOITPROG-} -if test -z "$doit"; then - doit_exec=exec -else - doit_exec=$doit -fi - -# Put in absolute file names if you don't have them in your path; -# or use environment vars. - -chgrpprog=${CHGRPPROG-chgrp} -chmodprog=${CHMODPROG-chmod} -chownprog=${CHOWNPROG-chown} -cmpprog=${CMPPROG-cmp} -cpprog=${CPPROG-cp} -mkdirprog=${MKDIRPROG-mkdir} -mvprog=${MVPROG-mv} -rmprog=${RMPROG-rm} -stripprog=${STRIPPROG-strip} - -posix_glob='?' -initialize_posix_glob=' - test "$posix_glob" != "?" || { - if (set -f) 2>/dev/null; then - posix_glob= - else - posix_glob=: - fi - } -' - -posix_mkdir= - -# Desired mode of installed file. 
-mode=0755 - -chgrpcmd= -chmodcmd=$chmodprog -chowncmd= -mvcmd=$mvprog -rmcmd="$rmprog -f" -stripcmd= - -src= -dst= -dir_arg= -dst_arg= - -copy_on_change=false -no_target_directory= - -usage="\ -Usage: $0 [OPTION]... [-T] SRCFILE DSTFILE - or: $0 [OPTION]... SRCFILES... DIRECTORY - or: $0 [OPTION]... -t DIRECTORY SRCFILES... - or: $0 [OPTION]... -d DIRECTORIES... - -In the 1st form, copy SRCFILE to DSTFILE. -In the 2nd and 3rd, copy all SRCFILES to DIRECTORY. -In the 4th, create DIRECTORIES. - -Options: - --help display this help and exit. - --version display version info and exit. - - -c (ignored) - -C install only if different (preserve the last data modification time) - -d create directories instead of installing files. - -g GROUP $chgrpprog installed files to GROUP. - -m MODE $chmodprog installed files to MODE. - -o USER $chownprog installed files to USER. - -s $stripprog installed files. - -t DIRECTORY install into DIRECTORY. - -T report an error if DSTFILE is a directory. - -Environment variables override the default commands: - CHGRPPROG CHMODPROG CHOWNPROG CMPPROG CPPROG MKDIRPROG MVPROG - RMPROG STRIPPROG -" - -while test $# -ne 0; do - case $1 in - -c) ;; - - -C) copy_on_change=true;; - - -d) dir_arg=true;; - - -g) chgrpcmd="$chgrpprog $2" - shift;; - - --help) echo "$usage"; exit $?;; - - -m) mode=$2 - case $mode in - *' '* | *' '* | *' -'* | *'*'* | *'?'* | *'['*) - echo "$0: invalid mode: $mode" >&2 - exit 1;; - esac - shift;; - - -o) chowncmd="$chownprog $2" - shift;; - - -s) stripcmd=$stripprog;; - - -t) dst_arg=$2 - # Protect names problematic for `test' and other utilities. 
- case $dst_arg in - -* | [=\(\)!]) dst_arg=./$dst_arg;; - esac - shift;; - - -T) no_target_directory=true;; - - --version) echo "$0 $scriptversion"; exit $?;; - - --) shift - break;; - - -*) echo "$0: invalid option: $1" >&2 - exit 1;; - - *) break;; - esac - shift -done - -if test $# -ne 0 && test -z "$dir_arg$dst_arg"; then - # When -d is used, all remaining arguments are directories to create. - # When -t is used, the destination is already specified. - # Otherwise, the last argument is the destination. Remove it from $@. - for arg - do - if test -n "$dst_arg"; then - # $@ is not empty: it contains at least $arg. - set fnord "$@" "$dst_arg" - shift # fnord - fi - shift # arg - dst_arg=$arg - # Protect names problematic for `test' and other utilities. - case $dst_arg in - -* | [=\(\)!]) dst_arg=./$dst_arg;; - esac - done -fi - -if test $# -eq 0; then - if test -z "$dir_arg"; then - echo "$0: no input file specified." >&2 - exit 1 - fi - # It's OK to call `install-sh -d' without argument. - # This can happen when creating conditional directories. - exit 0 -fi - -if test -z "$dir_arg"; then - do_exit='(exit $ret); exit $ret' - trap "ret=129; $do_exit" 1 - trap "ret=130; $do_exit" 2 - trap "ret=141; $do_exit" 13 - trap "ret=143; $do_exit" 15 - - # Set umask so as not to create temps with too-generous modes. - # However, 'strip' requires both read and write access to temps. - case $mode in - # Optimize common cases. - *644) cp_umask=133;; - *755) cp_umask=22;; - - *[0-7]) - if test -z "$stripcmd"; then - u_plus_rw= - else - u_plus_rw='% 200' - fi - cp_umask=`expr '(' 777 - $mode % 1000 ')' $u_plus_rw`;; - *) - if test -z "$stripcmd"; then - u_plus_rw= - else - u_plus_rw=,u+rw - fi - cp_umask=$mode$u_plus_rw;; - esac -fi - -for src -do - # Protect names problematic for `test' and other utilities. - case $src in - -* | [=\(\)!]) src=./$src;; - esac - - if test -n "$dir_arg"; then - dst=$src - dstdir=$dst - test -d "$dstdir" - dstdir_status=$? 
- else - - # Waiting for this to be detected by the "$cpprog $src $dsttmp" command - # might cause directories to be created, which would be especially bad - # if $src (and thus $dsttmp) contains '*'. - if test ! -f "$src" && test ! -d "$src"; then - echo "$0: $src does not exist." >&2 - exit 1 - fi - - if test -z "$dst_arg"; then - echo "$0: no destination specified." >&2 - exit 1 - fi - dst=$dst_arg - - # If destination is a directory, append the input filename; won't work - # if double slashes aren't ignored. - if test -d "$dst"; then - if test -n "$no_target_directory"; then - echo "$0: $dst_arg: Is a directory" >&2 - exit 1 - fi - dstdir=$dst - dst=$dstdir/`basename "$src"` - dstdir_status=0 - else - # Prefer dirname, but fall back on a substitute if dirname fails. - dstdir=` - (dirname "$dst") 2>/dev/null || - expr X"$dst" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$dst" : 'X\(//\)[^/]' \| \ - X"$dst" : 'X\(//\)$' \| \ - X"$dst" : 'X\(/\)' \| . 2>/dev/null || - echo X"$dst" | - sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ - s//\1/ - q - } - /^X\(\/\/\)[^/].*/{ - s//\1/ - q - } - /^X\(\/\/\)$/{ - s//\1/ - q - } - /^X\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q' - ` - - test -d "$dstdir" - dstdir_status=$? - fi - fi - - obsolete_mkdir_used=false - - if test $dstdir_status != 0; then - case $posix_mkdir in - '') - # Create intermediate dirs using mode 755 as modified by the umask. - # This is like FreeBSD 'install' as of 1997-10-28. - umask=`umask` - case $stripcmd.$umask in - # Optimize common cases. - *[2367][2367]) mkdir_umask=$umask;; - .*0[02][02] | .[02][02] | .[02]) mkdir_umask=22;; - - *[0-7]) - mkdir_umask=`expr $umask + 22 \ - - $umask % 100 % 40 + $umask % 20 \ - - $umask % 10 % 4 + $umask % 2 - `;; - *) mkdir_umask=$umask,go-w;; - esac - - # With -d, create the new directory with the user-specified mode. - # Otherwise, rely on $mkdir_umask. 
- if test -n "$dir_arg"; then - mkdir_mode=-m$mode - else - mkdir_mode= - fi - - posix_mkdir=false - case $umask in - *[123567][0-7][0-7]) - # POSIX mkdir -p sets u+wx bits regardless of umask, which - # is incompatible with FreeBSD 'install' when (umask & 300) != 0. - ;; - *) - tmpdir=${TMPDIR-/tmp}/ins$RANDOM-$$ - trap 'ret=$?; rmdir "$tmpdir/d" "$tmpdir" 2>/dev/null; exit $ret' 0 - - if (umask $mkdir_umask && - exec $mkdirprog $mkdir_mode -p -- "$tmpdir/d") >/dev/null 2>&1 - then - if test -z "$dir_arg" || { - # Check for POSIX incompatibilities with -m. - # HP-UX 11.23 and IRIX 6.5 mkdir -m -p sets group- or - # other-writeable bit of parent directory when it shouldn't. - # FreeBSD 6.1 mkdir -m -p sets mode of existing directory. - ls_ld_tmpdir=`ls -ld "$tmpdir"` - case $ls_ld_tmpdir in - d????-?r-*) different_mode=700;; - d????-?--*) different_mode=755;; - *) false;; - esac && - $mkdirprog -m$different_mode -p -- "$tmpdir" && { - ls_ld_tmpdir_1=`ls -ld "$tmpdir"` - test "$ls_ld_tmpdir" = "$ls_ld_tmpdir_1" - } - } - then posix_mkdir=: - fi - rmdir "$tmpdir/d" "$tmpdir" - else - # Remove any dirs left behind by ancient mkdir implementations. - rmdir ./$mkdir_mode ./-p ./-- 2>/dev/null - fi - trap '' 0;; - esac;; - esac - - if - $posix_mkdir && ( - umask $mkdir_umask && - $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir" - ) - then : - else - - # The umask is ridiculous, or mkdir does not conform to POSIX, - # or it failed possibly due to a race condition. Create the - # directory the slow way, step by step, checking for races as we go. 
- - case $dstdir in - /*) prefix='/';; - [-=\(\)!]*) prefix='./';; - *) prefix='';; - esac - - eval "$initialize_posix_glob" - - oIFS=$IFS - IFS=/ - $posix_glob set -f - set fnord $dstdir - shift - $posix_glob set +f - IFS=$oIFS - - prefixes= - - for d - do - test X"$d" = X && continue - - prefix=$prefix$d - if test -d "$prefix"; then - prefixes= - else - if $posix_mkdir; then - (umask=$mkdir_umask && - $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir") && break - # Don't fail if two instances are running concurrently. - test -d "$prefix" || exit 1 - else - case $prefix in - *\'*) qprefix=`echo "$prefix" | sed "s/'/'\\\\\\\\''/g"`;; - *) qprefix=$prefix;; - esac - prefixes="$prefixes '$qprefix'" - fi - fi - prefix=$prefix/ - done - - if test -n "$prefixes"; then - # Don't fail if two instances are running concurrently. - (umask $mkdir_umask && - eval "\$doit_exec \$mkdirprog $prefixes") || - test -d "$dstdir" || exit 1 - obsolete_mkdir_used=true - fi - fi - fi - - if test -n "$dir_arg"; then - { test -z "$chowncmd" || $doit $chowncmd "$dst"; } && - { test -z "$chgrpcmd" || $doit $chgrpcmd "$dst"; } && - { test "$obsolete_mkdir_used$chowncmd$chgrpcmd" = false || - test -z "$chmodcmd" || $doit $chmodcmd $mode "$dst"; } || exit 1 - else - - # Make a couple of temp file names in the proper directory. - dsttmp=$dstdir/_inst.$$_ - rmtmp=$dstdir/_rm.$$_ - - # Trap to clean up those temp files at exit. - trap 'ret=$?; rm -f "$dsttmp" "$rmtmp" && exit $ret' 0 - - # Copy the file name to the temp name. - (umask $cp_umask && $doit_exec $cpprog "$src" "$dsttmp") && - - # and set any options; do chmod last to preserve setuid bits. - # - # If any of these fail, we abort the whole thing. If we want to - # ignore errors from any of these, just make sure not to ignore - # errors from the above "$doit $cpprog $src $dsttmp" command. 
- # - { test -z "$chowncmd" || $doit $chowncmd "$dsttmp"; } && - { test -z "$chgrpcmd" || $doit $chgrpcmd "$dsttmp"; } && - { test -z "$stripcmd" || $doit $stripcmd "$dsttmp"; } && - { test -z "$chmodcmd" || $doit $chmodcmd $mode "$dsttmp"; } && - - # If -C, don't bother to copy if it wouldn't change the file. - if $copy_on_change && - old=`LC_ALL=C ls -dlL "$dst" 2>/dev/null` && - new=`LC_ALL=C ls -dlL "$dsttmp" 2>/dev/null` && - - eval "$initialize_posix_glob" && - $posix_glob set -f && - set X $old && old=:$2:$4:$5:$6 && - set X $new && new=:$2:$4:$5:$6 && - $posix_glob set +f && - - test "$old" = "$new" && - $cmpprog "$dst" "$dsttmp" >/dev/null 2>&1 - then - rm -f "$dsttmp" - else - # Rename the file to the real destination. - $doit $mvcmd -f "$dsttmp" "$dst" 2>/dev/null || - - # The rename failed, perhaps because mv can't rename something else - # to itself, or perhaps because mv is so ancient that it does not - # support -f. - { - # Now remove or move aside any old file at destination location. - # We try this two ways since rm can't unlink itself on some - # systems and the destination file might be busy for other - # reasons. In this case, the final cleanup might fail but the new - # file should still install successfully. - { - test ! -f "$dst" || - $doit $rmcmd -f "$dst" 2>/dev/null || - { $doit $mvcmd -f "$dst" "$rmtmp" 2>/dev/null && - { $doit $rmcmd -f "$rmtmp" 2>/dev/null; :; } - } || - { echo "$0: cannot unlink or rename $dst" >&2 - (exit 1); exit 1 - } - } && - - # Now rename the file to the real destination. 
- $doit $mvcmd "$dsttmp" "$dst" - } - fi || exit 1 - - trap '' 0 - fi -done - -# Local variables: -# eval: (add-hook 'write-file-hooks 'time-stamp) -# time-stamp-start: "scriptversion=" -# time-stamp-format: "%:y-%02m-%02d.%02H" -# time-stamp-time-zone: "UTC" -# time-stamp-end: "; # UTC" -# End: diff --git a/lyra2/Lyra2.c b/lyra2/Lyra2.c index 452f37cab7..d99659a624 100644 --- a/lyra2/Lyra2.c +++ b/lyra2/Lyra2.c @@ -21,7 +21,6 @@ #include #include #include - #include "Lyra2.h" #include "Sponge.h" @@ -44,8 +43,176 @@ * * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation) */ -int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols) -{ +int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols) { + + //============================= Basic variables ============================// + int64_t row = 2; //index of row to be processed + int64_t prev = 1; //index of prev (last row ever computed/modified) + int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering) + int64_t tau; //Time Loop iterator + int64_t step = 1; //Visitation step (used during Setup and Wandering phases) + int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) + int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 + int64_t i; //auxiliary iteration counter + //==========================================================================/ + + //========== Initializing the Memory Matrix and pointers to it =============// + //Tries to allocate enough space for the whole memory matrix + + + const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; + const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + + i = (int64_t) ((int64_t) nRows * 
(int64_t) ROW_LEN_BYTES); + uint64_t *wholeMatrix = malloc(i); + if (wholeMatrix == NULL) { + return -1; + } + memset(wholeMatrix, 0, i); + + //Allocates pointers to each row of the matrix + uint64_t **memMatrix = malloc(nRows * sizeof (uint64_t*)); + if (memMatrix == NULL) { + return -1; + } + //Places the pointers in the correct positions + uint64_t *ptrWord = wholeMatrix; + for (i = 0; i < nRows; i++) { + memMatrix[i] = ptrWord; + ptrWord += ROW_LEN_INT64; + } + //==========================================================================/ + + //============= Getting the password + salt + basil padded with 10*1 ===============// + //OBS.:The memory matrix will temporarily hold the password: not for saving memory, + //but this ensures that the password copied locally will be overwritten as soon as possible + + //First, we clean enough blocks for the password, salt, basil and padding + uint64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof (uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1; + byte *ptrByte = (byte*) wholeMatrix; + memset(ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES); + + //Prepends the password + memcpy(ptrByte, pwd, pwdlen); + ptrByte += pwdlen; + + //Concatenates the salt + memcpy(ptrByte, salt, saltlen); + ptrByte += saltlen; + + //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface + memcpy(ptrByte, &kLen, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &pwdlen, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &saltlen, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &timeCost, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &nRows, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &nCols, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + + //Now comes the padding + *ptrByte = 0x80; //first byte of padding: right after the password + ptrByte = (byte*) wholeMatrix; //resets 
the pointer to the start of the memory matrix + ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block + *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block + //==========================================================================/ + + //======================= Initializing the Sponge State ====================// + //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) + uint64_t *state = malloc(16 * sizeof (uint64_t)); + if (state == NULL) { + return -1; + } + initState(state); + //==========================================================================/ + + //================================ Setup Phase =============================// + //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits + ptrWord = wholeMatrix; + for (i = 0; i < nBlocksInput; i++) { + absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil) + ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil) + } + //Initializes M[0] and M[1] + reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here + reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols); + + do { + //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) + reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); + + + //updates the value of row* (deterministically picked during Setup)) + rowa = (rowa + step) & (window - 1); + //update prev: it now points to the last row ever computed + prev = row; + //updates row: goes to the next row to be computed + row++; + + //Checks if all rows in the window where visited. 
+ if (rowa == 0) { + step = window + gap; //changes the step: approximately doubles its value + window *= 2; //doubles the size of the re-visitation window + gap = -gap; //inverts the modifier to the step + } + + } while (row < nRows); + //==========================================================================/ + + //============================ Wandering Phase =============================// + row = 0; //Resets the visitation to the first row of the memory matrix + for (tau = 1; tau <= timeCost; tau++) { + //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1 + step = (tau % 2 == 0) ? -1 : nRows / 2 - 1; + do { + //Selects a pseudorandom index row* + //------------------------------------------------------------------------------------------ + //rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //------------------------------------------------------------------------------------------ + + //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row] + reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); + + //update prev: it now points to the last row ever computed + prev = row; + + //updates row: goes to the next row to be computed + //------------------------------------------------------------------------------------------ + //row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //------------------------------------------------------------------------------------------ + + } while (row != 0); + } + //==========================================================================/ + + //============================ Wrap-up Phase ===============================// + //Absorbs the last block of the memory matrix + absorbBlock(state, 
memMatrix[rowa]); + + //Squeezes the key + squeeze(state, K, kLen); + //==========================================================================/ + + //========================= Freeing the memory =============================// + free(memMatrix); + free(wholeMatrix); + + //Wiping out the sponge's internal state before freeing it + memset(state, 0, 16 * sizeof (uint64_t)); + free(state); + //==========================================================================/ + + return 0; +} + +int LYRA2_old(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols) { + //============================= Basic variables ============================// int64_t row = 2; //index of row to be processed int64_t prev = 1; //index of prev (last row ever computed/modified) @@ -59,21 +226,26 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * //========== Initializing the Memory Matrix and pointers to it =============// //Tries to allocate enough space for the whole memory matrix - i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES); - uint64_t *wholeMatrix = (uint64_t*) malloc((size_t) i); + + + const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; + const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + + i = (int64_t)((int64_t)nRows * (int64_t)ROW_LEN_BYTES); + uint64_t *wholeMatrix = malloc(i); if (wholeMatrix == NULL) { return -1; } - memset(wholeMatrix, 0, (size_t) i); + memset(wholeMatrix, 0, i); //Allocates pointers to each row of the matrix - uint64_t **memMatrix = malloc((size_t) nRows * sizeof(uint64_t*)); + uint64_t **memMatrix = malloc(nRows * sizeof(uint64_t*)); if (memMatrix == NULL) { return -1; } //Places the pointers in the correct positions uint64_t *ptrWord = wholeMatrix; - for (i = 0; i < (int64_t) nRows; i++) { + for (i = 0; i < nRows; i++) { memMatrix[i] = ptrWord; ptrWord += ROW_LEN_INT64; } @@ -84,43 +256,42 @@ int LYRA2(void *K, uint64_t 
kLen, const void *pwd, uint64_t pwdlen, const void * //but this ensures that the password copied locally will be overwritten as soon as possible //First, we clean enough blocks for the password, salt, basil and padding - uint64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof (uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1; - - byte *ptrByte = (byte*) wholeMatrix; - memset(ptrByte, 0, (size_t) nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES); + uint64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof(uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1; + byte *ptrByte = (byte*)wholeMatrix; + memset(ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES); //Prepends the password - memcpy(ptrByte, pwd, (size_t) pwdlen); + memcpy(ptrByte, pwd, pwdlen); ptrByte += pwdlen; //Concatenates the salt - memcpy(ptrByte, salt, (size_t) saltlen); + memcpy(ptrByte, salt, saltlen); ptrByte += saltlen; //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface - memcpy(ptrByte, &kLen, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &pwdlen, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &saltlen, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &timeCost, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &nRows, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &nCols, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &kLen, sizeof(uint64_t)); + ptrByte += sizeof(uint64_t); + memcpy(ptrByte, &pwdlen, sizeof(uint64_t)); + ptrByte += sizeof(uint64_t); + memcpy(ptrByte, &saltlen, sizeof(uint64_t)); + ptrByte += sizeof(uint64_t); + memcpy(ptrByte, &timeCost, sizeof(uint64_t)); + ptrByte += sizeof(uint64_t); + memcpy(ptrByte, &nRows, sizeof(uint64_t)); + ptrByte += sizeof(uint64_t); + memcpy(ptrByte, &nCols, sizeof(uint64_t)); + ptrByte += sizeof(uint64_t); //Now comes the padding *ptrByte = 0x80; //first byte of padding: 
right after the password - ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix + ptrByte = (byte*)wholeMatrix; //resets the pointer to the start of the memory matrix ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block //==========================================================================/ //======================= Initializing the Sponge State ====================// //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) - uint64_t *state = malloc(16 * sizeof (uint64_t)); + uint64_t *state = malloc(16 * sizeof(uint64_t)); if (state == NULL) { return -1; } @@ -130,20 +301,18 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * //================================ Setup Phase =============================// //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits ptrWord = wholeMatrix; - for (i = 0; i < (int64_t) nBlocksInput; i++) { + for (i = 0; i < nBlocksInput; i++) { absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil) ptrWord += BLOCK_LEN_BLAKE2_SAFE_BYTES; //goes to next block of pad(pwd || salt || basil) } - //Initializes M[0] and M[1] - reducedSqueezeRow0(state, memMatrix[0]); //The locally copied password is most likely overwritten here - - reducedDuplexRow1(state, memMatrix[0], memMatrix[1]); + reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here + reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols); do { //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) + reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); - reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], 
memMatrix[row]); //updates the value of row* (deterministically picked during Setup)) rowa = (rowa + step) & (window - 1); @@ -154,35 +323,35 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * //Checks if all rows in the window where visited. if (rowa == 0) { - step = window + gap; //changes the step: approximately doubles its value - window *= 2; //doubles the size of the re-visitation window - gap = -gap; //inverts the modifier to the step - } + step = window + gap; //changes the step: approximately doubles its value + window *= 2; //doubles the size of the re-visitation window + gap = -gap; //inverts the modifier to the step + } - } while (row < (int64_t) nRows); + } while (row < nRows); //==========================================================================/ //============================ Wandering Phase =============================// row = 0; //Resets the visitation to the first row of the memory matrix - for (tau = 1; tau <= (int64_t) timeCost; tau++) { + for (tau = 1; tau <= timeCost; tau++) { //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1 - step = ((tau & 1) == 0) ? -1 : nRows / 2 - 1; + step = (tau % 2 == 0) ? 
-1 : nRows / 2 - 1; do { //Selects a pseudorandom index row* //------------------------------------------------------------------------------------------ - //rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) - rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + rowa = ((uint64_t)(state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE) //------------------------------------------------------------------------------------------ //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row] - reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]); + reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); //update prev: it now points to the last row ever computed prev = row; //updates row: goes to the next row to be computed //------------------------------------------------------------------------------------------ - //row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + //row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) //------------------------------------------------------------------------------------------ @@ -195,7 +364,7 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * absorbBlock(state, memMatrix[rowa]); //Squeezes the key - squeeze(state, K, (size_t) kLen); + squeeze(state, K, kLen); //==========================================================================/ //========================= Freeing the memory =============================// @@ -203,7 +372,7 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * free(wholeMatrix); //Wiping out the sponge's internal state before freeing it - memset(state, 0, 16 * sizeof (uint64_t)); + memset(state, 0, 16 * 
sizeof(uint64_t)); free(state); //==========================================================================/ diff --git a/lyra2/Lyra2.h b/lyra2/Lyra2.h index 229b2c9cc3..e595ecea95 100644 --- a/lyra2/Lyra2.h +++ b/lyra2/Lyra2.h @@ -18,9 +18,13 @@ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef LYRA2_H_ -#define LYRA2_H_ - +#define LYRA2_H_ + +#ifdef __cplusplus +#include +#else #include +#endif typedef unsigned char byte; @@ -37,14 +41,7 @@ typedef unsigned char byte; #define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes #endif -#ifndef N_COLS - #define N_COLS 8 //Number of columns in the memory matrix: fixed to 64 by default -#endif - -#define ROW_LEN_INT64 (BLOCK_LEN_INT64 * N_COLS) //Total length of a row: N_COLS blocks -#define ROW_LEN_BYTES (ROW_LEN_INT64 * 8) //Number of bytes per row - - int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols); +int LYRA2_old(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols); #endif /* LYRA2_H_ */ diff --git a/lyra2/Sponge.c b/lyra2/Sponge.c index e0a001e0ee..104c188f7a 100644 --- a/lyra2/Sponge.c +++ b/lyra2/Sponge.c @@ -41,7 +41,6 @@ //First 512 bis are zeros memset(state, 0, 64); //Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV - state[8] = blake2b_IV[0]; state[9] = blake2b_IV[1]; state[10] = blake2b_IV[2]; @@ -50,7 +49,6 @@ state[13] = blake2b_IV[5]; state[14] = blake2b_IV[6]; state[15] = blake2b_IV[7]; - } /** @@ -80,7 +78,7 @@ __inline static void blake2bLyra(uint64_t *v) { __inline static void reducedBlake2bLyra(uint64_t *v) { ROUND_LYRA(0); } - + /** * Performs a squeeze operation, using Blake2b's G function as the * internal permutation @@ -95,9 +93,9 @@ __inline static void reducedBlake2bLyra(uint64_t *v) { int i; //Squeezes full blocks for (i = 0; i < fullBlocks; i++) { - 
memcpy(ptr, state, BLOCK_LEN_BYTES); - blake2bLyra(state); - ptr += BLOCK_LEN_BYTES; + memcpy(ptr, state, BLOCK_LEN_BYTES); + blake2bLyra(state); + ptr += BLOCK_LEN_BYTES; } //Squeezes remaining bytes @@ -111,7 +109,7 @@ __inline static void reducedBlake2bLyra(uint64_t *v) { * @param state The current state of the sponge * @param in The block to be absorbed (BLOCK_LEN_INT64 words) */ -void absorbBlock(uint64_t *state, const uint64_t *in) { + void absorbBlock(uint64_t *state, const uint64_t *in) { //XORs the first BLOCK_LEN_INT64 words of "in" with the current state state[0] ^= in[0]; state[1] ^= in[1]; @@ -137,9 +135,10 @@ void absorbBlock(uint64_t *state, const uint64_t *in) { * @param state The current state of the sponge * @param in The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words) */ -void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) { + void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) { //XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state - state[0] ^= in[0]; + + state[0] ^= in[0]; state[1] ^= in[1]; state[2] ^= in[2]; state[3] ^= in[3]; @@ -148,14 +147,10 @@ void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) { state[6] ^= in[6]; state[7] ^= in[7]; + //Applies the transformation f to the sponge's state blake2bLyra(state); -/* - for(int i = 0; i<16; i++) { - printf(" final state %d %08x %08x in %08x %08x\n", i, (uint32_t)(state[i] & 0xFFFFFFFFULL), (uint32_t)(state[i] >> 32), - (uint32_t)(in[i] & 0xFFFFFFFFULL), (uint32_t)(in[i] >> 32)); - } -*/ + } /** @@ -166,12 +161,11 @@ void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) { * @param state The current state of the sponge * @param rowOut Row to receive the data squeezed */ -void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut) { - uint64_t* ptrWord = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1] + void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut, uint64_t nCols) { + uint64_t* 
ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1] int i; //M[row][C-1-col] = H.reduced_squeeze() - for (i = 0; i < N_COLS; i++) { - + for (i = 0; i < nCols; i++) { ptrWord[0] = state[0]; ptrWord[1] = state[1]; ptrWord[2] = state[2]; @@ -184,12 +178,7 @@ void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut) { ptrWord[9] = state[9]; ptrWord[10] = state[10]; ptrWord[11] = state[11]; - /* -for (int i = 0; i<12; i++) { - printf(" after reducedSqueezeRow0 %d %08x %08x in %08x %08x\n", i, (uint32_t)(ptrWord[i] & 0xFFFFFFFFULL), (uint32_t)(ptrWord[i] >> 32), - (uint32_t)(state[i] & 0xFFFFFFFFULL), (uint32_t)(state[i] >> 32)); - } -*/ + //Goes to next block (column) that will receive the squeezed data ptrWord -= BLOCK_LEN_INT64; @@ -207,12 +196,12 @@ for (int i = 0; i<12; i++) { * @param rowIn Row to feed the sponge * @param rowOut Row to receive the sponge's output */ - void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut) { + void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols) { uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row int i; - for (i = 0; i < N_COLS; i++) { + for (i = 0; i < nCols; i++) { //Absorbing "M[prev][col]" state[0] ^= (ptrWordIn[0]); @@ -267,12 +256,13 @@ for (int i = 0; i<12; i++) { * @param rowOut Row receiving the output * */ - void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { + void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols) { uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + uint64_t* ptrWordOut = 
rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row int i; - for (i = 0; i < N_COLS; i++) { + + for (i = 0; i < nCols; i++) { //Absorbing "M[prev] [+] M[row*]" state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); @@ -340,13 +330,13 @@ for (int i = 0; i<12; i++) { * @param rowOut Row receiving the output * */ -void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { + void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols) { uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row int i; - for (i = 0; i < N_COLS; i++) { + for (i = 0; i < nCols; i++) { //Absorbing "M[prev] [+] M[row*]" state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); @@ -744,12 +734,13 @@ inline void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInO Prints an array of unsigned chars */ void printArray(unsigned char *array, unsigned int size, char *name) { - unsigned int i; - printf("%s: ", name); - for (i = 0; i < size; i++) { - printf("%2x|", array[i]); - } - printf("\n"); + int i; + printf("%s: ", name); + for (i = 0; i < size; i++) { + printf("%2x|", array[i]); + } + printf("\n"); } //////////////////////////////////////////////////////////////////////////////////////////////// + \ No newline at end of file diff --git a/lyra2/Sponge.h b/lyra2/Sponge.h index 9bd8ed664e..2ce23d876d 100644 --- a/lyra2/Sponge.h +++ b/lyra2/Sponge.h @@ -22,7 +22,11 @@ #ifndef SPONGE_H_ #define SPONGE_H_ +#ifdef __cplusplus +#include +#else #include +#endif #if defined(__GNUC__) #define ALIGN __attribute__ ((aligned(32))) @@ -74,20 +78,20 @@ static __inline uint64_t rotr64( const uint64_t w, const unsigned c ){ //---- Housekeeping -void initState(uint64_t state[/*16*/]); + void initState(uint64_t state[/*16*/]); //---- Squeezes -void 
squeeze(uint64_t *state, unsigned char *out, unsigned int len); -void reducedSqueezeRow0(uint64_t* state, uint64_t* row); + void squeeze(uint64_t *state, unsigned char *out, unsigned int len); + void reducedSqueezeRow0(uint64_t* state, uint64_t* row, uint64_t nCols); //---- Absorbs -void absorbBlock(uint64_t *state, const uint64_t *in); -void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in); + void absorbBlock(uint64_t *state, const uint64_t *in); + void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in); //---- Duplexes -void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut); -void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); -void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); + void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols); + void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols); + void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols); //---- Misc void printArray(unsigned char *array, unsigned int size, char *name); diff --git a/lyra2/cuda_lyra2.cu b/lyra2/cuda_lyra2.cu deleted file mode 100644 index 55986bc793..0000000000 --- a/lyra2/cuda_lyra2.cu +++ /dev/null @@ -1,295 +0,0 @@ -#include - -#include "cuda_helper.h" - -#define TPB 160 - -static __constant__ uint2 blake2b_IV[8] = { - { 0xf3bcc908, 0x6a09e667 }, - { 0x84caa73b, 0xbb67ae85 }, - { 0xfe94f82b, 0x3c6ef372 }, - { 0x5f1d36f1, 0xa54ff53a }, - { 0xade682d1, 0x510e527f }, - { 0x2b3e6c1f, 0x9b05688c }, - { 0xfb41bd6b, 0x1f83d9ab }, - { 0x137e2179, 0x5be0cd19 } -}; - - -#define reduceDuplexRow(rowIn, rowInOut, rowOut) { \ - for (int i = 0; i < 8; i++) { \ - for (int j = 0; j < 12; j++) \ - state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \ - round_lyra_v35(state); \ - for (int j = 0; j < 12; j++) \ 
- Matrix[j + 12 * i][rowOut] ^= state[j]; \ - Matrix[0 + 12 * i][rowInOut] ^= state[11]; \ - Matrix[1 + 12 * i][rowInOut] ^= state[0]; \ - Matrix[2 + 12 * i][rowInOut] ^= state[1]; \ - Matrix[3 + 12 * i][rowInOut] ^= state[2]; \ - Matrix[4 + 12 * i][rowInOut] ^= state[3]; \ - Matrix[5 + 12 * i][rowInOut] ^= state[4]; \ - Matrix[6 + 12 * i][rowInOut] ^= state[5]; \ - Matrix[7 + 12 * i][rowInOut] ^= state[6]; \ - Matrix[8 + 12 * i][rowInOut] ^= state[7]; \ - Matrix[9 + 12 * i][rowInOut] ^= state[8]; \ - Matrix[10+ 12 * i][rowInOut] ^= state[9]; \ - Matrix[11+ 12 * i][rowInOut] ^= state[10]; \ - } \ - } - -#define absorbblock(in) { \ - state[0] ^= Matrix[0][in]; \ - state[1] ^= Matrix[1][in]; \ - state[2] ^= Matrix[2][in]; \ - state[3] ^= Matrix[3][in]; \ - state[4] ^= Matrix[4][in]; \ - state[5] ^= Matrix[5][in]; \ - state[6] ^= Matrix[6][in]; \ - state[7] ^= Matrix[7][in]; \ - state[8] ^= Matrix[8][in]; \ - state[9] ^= Matrix[9][in]; \ - state[10] ^= Matrix[10][in]; \ - state[11] ^= Matrix[11][in]; \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - } - -//// test version -#define reduceDuplexRowSetup_test(rowIn, rowInOut, rowOut) { \ - for (int i = 0; i < 8; i++) { \ - for (int j = 0; j < 12; j++) \ - state[j] ^= Matrix[j][i][rowIn] + Matrix[j][i][rowInOut]; \ - round_lyra_v35(state); \ - for (int j = 0; j < 12; j++) \ - Matrix[j][7-i][rowOut] = Matrix[j][i][rowIn] ^ state[j]; \ - Matrix[0][i][rowInOut] ^= state[11]; \ - Matrix[1][i][rowInOut] ^= state[0]; \ - Matrix[2][i][rowInOut] ^= state[1]; \ - Matrix[3][i][rowInOut] ^= state[2]; \ - Matrix[4][i][rowInOut] ^= state[3]; \ - Matrix[5][i][rowInOut] ^= state[4]; \ - Matrix[6][i][rowInOut] ^= state[5]; \ - Matrix[7][i][rowInOut] ^= state[6]; \ 
- Matrix[8][i][rowInOut] ^= state[7]; \ - Matrix[9][i][rowInOut] ^= state[8]; \ - Matrix[10][i][rowInOut] ^= state[9]; \ - Matrix[11][i][rowInOut] ^= state[10]; \ - } \ - } - -#define reduceDuplexRow_test(rowIn, rowInOut, rowOut) { \ - for (int i = 0; i < 8; i++) { \ - for (int j = 0; j < 12; j++) \ - state[j] ^= Matrix[j][i][rowIn] + Matrix[j][i][rowInOut]; \ - round_lyra_v35(state); \ - for (int j = 0; j < 12; j++) \ - Matrix[j][i][rowOut] ^= state[j]; \ - Matrix[0][i][rowInOut] ^= state[11]; \ - Matrix[1][i][rowInOut] ^= state[0]; \ - Matrix[2][i][rowInOut] ^= state[1]; \ - Matrix[3][i][rowInOut] ^= state[2]; \ - Matrix[4][i][rowInOut] ^= state[3]; \ - Matrix[5][i][rowInOut] ^= state[4]; \ - Matrix[6][i][rowInOut] ^= state[5]; \ - Matrix[7][i][rowInOut] ^= state[6]; \ - Matrix[8][i][rowInOut] ^= state[7]; \ - Matrix[9][i][rowInOut] ^= state[8]; \ - Matrix[10][i][rowInOut] ^= state[9]; \ - Matrix[11][i][rowInOut] ^= state[10]; \ - } \ - } - -#define absorbblock_test(in) { \ - state[0] ^= Matrix[0][0][ in]; \ - state[1] ^= Matrix[1][0][in]; \ - state[2] ^= Matrix[2][0][in]; \ - state[3] ^= Matrix[3][0][in]; \ - state[4] ^= Matrix[4][0][in]; \ - state[5] ^= Matrix[5][0][in]; \ - state[6] ^= Matrix[6][0][in]; \ - state[7] ^= Matrix[7][0][in]; \ - state[8] ^= Matrix[8][0][in]; \ - state[9] ^= Matrix[9][0][in]; \ - state[10] ^= Matrix[10][0][in]; \ - state[11] ^= Matrix[11][0][in]; \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - } - -static __device__ __forceinline__ -void Gfunc_v35(uint2 & a, uint2 &b, uint2 &c, uint2 &d) -{ - a += b; d = SWAPINT2(d ^ a); - c += d; b = ROR2(b ^ c, 24); - a += b; d = ROR2(d ^ a, 16); - c += d; b = ROR2(b ^ c, 63); -} - - -#define round_lyra_v35_new(state) { \ 
- Gfunc_v35(state[0], state[4], state[8], state[12]); \ - Gfunc_v35(state[1], state[5], state[9], state[13]); \ - Gfunc_v35(state[2], state[6], state[10], state[14]); \ - Gfunc_v35(state[3], state[7], state[11], state[15]); \ - Gfunc_v35(state[0], state[5], state[10], state[15]); \ - Gfunc_v35(state[1], state[6], state[11], state[12]); \ - Gfunc_v35(state[2], state[7], state[8], state[13]); \ - Gfunc_v35(state[3], state[4], state[9], state[14]); \ -} - -static __device__ __forceinline__ void round_lyra_v35(uint2 *s) -{ - Gfunc_v35(s[0], s[4], s[8], s[12]); - Gfunc_v35(s[1], s[5], s[9], s[13]); - Gfunc_v35(s[2], s[6], s[10], s[14]); - Gfunc_v35(s[3], s[7], s[11], s[15]); - Gfunc_v35(s[0], s[5], s[10], s[15]); - Gfunc_v35(s[1], s[6], s[11], s[12]); - Gfunc_v35(s[2], s[7], s[8], s[13]); - Gfunc_v35(s[3], s[4], s[9], s[14]); -} - -__device__ __forceinline__ void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2 state[16], uint2 Matrix[96][8]) -{ - for (int i = 0; i < 8; i++) - { - #pragma unroll - for (int j = 0; j < 12; j++) - state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; - round_lyra_v35(state); - #pragma unroll - for (int j = 0; j < 12; j++) - Matrix[j + 84 - 12 * i][rowOut] = Matrix[12 * i + j][rowIn] ^ state[j]; - Matrix[0 + 12 * i][rowInOut] ^= state[11]; - Matrix[1 + 12 * i][rowInOut] ^= state[0]; - Matrix[2 + 12 * i][rowInOut] ^= state[1]; - Matrix[3 + 12 * i][rowInOut] ^= state[2]; - Matrix[4 + 12 * i][rowInOut] ^= state[3]; - Matrix[5 + 12 * i][rowInOut] ^= state[4]; - Matrix[6 + 12 * i][rowInOut] ^= state[5]; - Matrix[7 + 12 * i][rowInOut] ^= state[6]; - Matrix[8 + 12 * i][rowInOut] ^= state[7]; - Matrix[9 + 12 * i][rowInOut] ^= state[8]; - Matrix[10 + 12 * i][rowInOut] ^= state[9]; - Matrix[11 + 12 * i][rowInOut] ^= state[10]; - } -} - -__global__ __launch_bounds__(TPB, 1) -void lyra2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash) -{ - uint32_t thread = (blockDim.x * 
blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint2 state[16]; - #pragma unroll - for (int i = 0; i<4; i++) - { - LOHI(state[i].x, state[i].y, outputHash[threads*i + thread]); - } //password - #pragma unroll - for (int i = 0; i<4; i++) { state[i + 4] = state[i]; } //salt - #pragma unroll - for (int i = 0; i<8; i++) { state[i + 8] = blake2b_IV[i]; } - - // blake2blyra x2 - //#pragma unroll 24 - for (int i = 0; i<24; i++) { round_lyra_v35(state); } //because 12 is not enough - - uint2 Matrix[96][8]; // not cool - - // reducedSqueezeRow0 - #pragma unroll 8 - for (int i = 0; i < 8; i++) - { - #pragma unroll 12 - for (int j = 0; j<12; j++) { Matrix[j + 84 - 12 * i][0] = state[j]; } - round_lyra_v35(state); - } - - // reducedSqueezeRow1 - #pragma unroll 8 - for (int i = 0; i < 8; i++) - { - #pragma unroll 12 - for (int j = 0; j<12; j++) { state[j] ^= Matrix[j + 12 * i][0]; } - round_lyra_v35(state); - #pragma unroll 12 - for (int j = 0; j<12; j++) { Matrix[j + 84 - 12 * i][1] = Matrix[j + 12 * i][0] ^ state[j]; } - } - - - - reduceDuplexRowSetup(1, 0, 2,state, Matrix); - reduceDuplexRowSetup(2, 1, 3, state, Matrix); - reduceDuplexRowSetup(3, 0, 4, state, Matrix); - reduceDuplexRowSetup(4, 3, 5, state, Matrix); - reduceDuplexRowSetup(5, 2, 6, state, Matrix); - reduceDuplexRowSetup(6, 1, 7, state, Matrix); - - uint32_t rowa; - rowa = state[0].x & 7; - reduceDuplexRow(7, rowa, 0); - rowa = state[0].x & 7; - reduceDuplexRow(0, rowa, 3); - rowa = state[0].x & 7; - reduceDuplexRow(3, rowa, 6); - rowa = state[0].x & 7; - reduceDuplexRow(6, rowa, 1); - rowa = state[0].x & 7; - reduceDuplexRow(1, rowa, 4); - rowa = state[0].x & 7; - reduceDuplexRow(4, rowa, 7); - rowa = state[0].x & 7; - reduceDuplexRow(7, rowa, 2); - rowa = state[0].x & 7; - reduceDuplexRow(2, rowa, 5); - - absorbblock(rowa); - - #pragma unroll - for (int i = 0; i<4; i++) { - outputHash[threads*i + thread] = devectorize(state[i]); - } //password - - } //thread -} - -__host__ -void lyra2_cpu_init(int 
thr_id, uint32_t threads) -{ - //not used -} - -__host__ -void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order) -{ - dim3 grid((threads + TPB - 1) / TPB); - dim3 block(TPB); - - lyra2_gpu_hash_32 <<>> (threads, startNounce, d_outputHash); - - //MyStreamSynchronize(NULL, order, thr_id); -} - diff --git a/lyra2/cuda_lyra2_vectors.h b/lyra2/cuda_lyra2_vectors.h new file mode 100644 index 0000000000..716158eedb --- /dev/null +++ b/lyra2/cuda_lyra2_vectors.h @@ -0,0 +1,735 @@ +/* DJM CRAP to strip (again) made for SM 3.2+ */ + +#ifndef CUDA_LYRA_VECTOR_H +#define CUDA_LYRA_VECTOR_H + +/////////////////////////////////////////////////////////////////////////////////// +#include "cuda_helper.h" + +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif + +#if __CUDA_ARCH__ < 300 +#define __shfl(x, y) (x) +#endif + +#if __CUDA_ARCH__ < 320 && !defined(__ldg4) +#define __ldg4(x) (*(x)) +#endif + +typedef struct __align__(32) uint8{ + unsigned int s0, s1, s2, s3, s4, s5, s6, s7; +} uint8; + +typedef struct __align__(64) uint2_8{ + uint2 s0, s1, s2, s3, s4, s5, s6, s7; +} uint2_8; + +typedef struct __align__(64) ulonglong2to8{ + ulonglong2 l0, l1, l2, l3; +} ulonglong2to8; + +typedef struct __align__(128) ulonglong8to16{ + ulonglong2to8 lo, hi; +} ulonglong8to16; + +typedef struct __align__(256) ulonglong16to32{ + ulonglong8to16 lo, hi; +} ulonglong16to32; + +typedef struct __align__(512) ulonglong32to64{ + ulonglong16to32 lo, hi; +} ulonglong32to64; + +typedef struct __align__(128) ulonglonglong{ + ulonglong2 s0, s1, s2, s3, s4, s5, s6, s7; +} ulonglonglong; + +typedef struct __align__(64) uint16{ + union + { + struct + { + unsigned int s0, s1, s2, s3, s4, s5, s6, s7; + }; + uint8 lo; + }; + union + { + struct + { + unsigned int s8, s9, sa, sb, sc, sd, se, sf; + }; + uint8 hi; + }; +} uint16; + +typedef struct __align__(128) uint2_16{ + union + { + 
struct + { + uint2 s0, s1, s2, s3, s4, s5, s6, s7; + }; + uint2_8 lo; + }; + union + { + struct + { + uint2 s8, s9, sa, sb, sc, sd, se, sf; + }; + uint2_8 hi; + }; +} uint2_16; + +typedef struct __align__(128) uint32{ + uint16 lo, hi; +} uint32; + +struct __align__(128) ulong8{ + ulonglong4 s0, s1, s2, s3; +}; +typedef __device_builtin__ struct ulong8 ulong8; + +typedef struct __align__(256) ulonglong16{ + ulonglong4 s0, s1, s2, s3, s4, s5, s6, s7; +} ulonglong16; + +typedef struct __align__(16) uint28{ + uint2 x, y, z, w; +} uint2x4; +typedef uint2x4 uint28; /* name deprecated */ + +typedef struct __builtin_align__(32) uint48{ + uint4 s0, s1; +} uint48; + +typedef struct __align__(256) uint4x16{ + uint4 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; +} uint4x16; + +static __inline__ __device__ ulonglong2to8 make_ulonglong2to8(ulonglong2 s0, ulonglong2 s1, ulonglong2 s2, ulonglong2 s3) +{ + ulonglong2to8 t; t.l0 = s0; t.l1 = s1; t.l2 = s2; t.l3 = s3; + return t; +} + +static __inline__ __device__ ulonglong8to16 make_ulonglong8to16(const ulonglong2to8 &s0, const ulonglong2to8 &s1) +{ + ulonglong8to16 t; t.lo = s0; t.hi = s1; + return t; +} + +static __inline__ __device__ ulonglong16to32 make_ulonglong16to32(const ulonglong8to16 &s0, const ulonglong8to16 &s1) +{ + ulonglong16to32 t; t.lo = s0; t.hi = s1; + return t; +} + +static __inline__ __device__ ulonglong32to64 make_ulonglong32to64(const ulonglong16to32 &s0, const ulonglong16to32 &s1) +{ + ulonglong32to64 t; t.lo = s0; t.hi = s1; + return t; +} + +static __inline__ __host__ __device__ ulonglonglong make_ulonglonglong( + const ulonglong2 &s0, const ulonglong2 &s1, const ulonglong2 &s2, const ulonglong2 &s3, + const ulonglong2 &s4, const ulonglong2 &s5) +{ + ulonglonglong t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; + return t; +} + +static __inline__ __device__ uint48 make_uint48(uint4 s0, uint4 s1) +{ + uint48 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static 
__inline__ __device__ uint28 make_uint28(uint2 s0, uint2 s1, uint2 s2, uint2 s3) +{ + uint28 t; t.x = s0; t.y = s1; t.z = s2; t.w = s3; + return t; +} + +static __inline__ __host__ __device__ uint4x16 make_uint4x16( + uint4 s0, uint4 s1, uint4 s2, uint4 s3, uint4 s4, uint4 s5, uint4 s6, uint4 s7, + uint4 s8, uint4 s9, uint4 sa, uint4 sb, uint4 sc, uint4 sd, uint4 se, uint4 sf) +{ + uint4x16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.s10 = sa; t.s11 = sb; t.s12 = sc; t.s13 = sd; t.s14 = se; t.s15 = sf; + return t; +} + +static __inline__ __device__ uint2_16 make_uint2_16( + uint2 s0, uint2 s1, uint2 s2, uint2 s3, uint2 s4, uint2 s5, uint2 s6, uint2 s7, + uint2 s8, uint2 s9, uint2 sa, uint2 sb, uint2 sc, uint2 sd, uint2 se, uint2 sf) +{ + uint2_16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf; + return t; +} + +static __inline__ __host__ __device__ uint16 make_uint16( + unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7, + unsigned int s8, unsigned int s9, unsigned int sa, unsigned int sb, unsigned int sc, unsigned int sd, unsigned int se, unsigned int sf) +{ + uint16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf; + return t; +} + +static __inline__ __host__ __device__ uint16 make_uint16(const uint8 &a, const uint8 &b) +{ + uint16 t; t.lo = a; t.hi = b; return t; +} + +static __inline__ __host__ __device__ uint32 make_uint32(const uint16 &a, const uint16 &b) +{ + uint32 t; t.lo = a; t.hi = b; return t; +} + + +static __inline__ __host__ __device__ uint8 make_uint8( + unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int 
s4, unsigned int s5, unsigned int s6, unsigned int s7) +{ + uint8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + +static __inline__ __host__ __device__ uint2_8 make_uint2_8( + uint2 s0, uint2 s1, uint2 s2, uint2 s3, uint2 s4, uint2 s5, uint2 s6, uint2 s7) +{ + uint2_8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + +static __inline__ __host__ __device__ ulonglong16 make_ulonglong16(const ulonglong4 &s0, const ulonglong4 &s1, + const ulonglong4 &s2, const ulonglong4 &s3, const ulonglong4 &s4, const ulonglong4 &s5, const ulonglong4 &s6, const ulonglong4 &s7) +{ + ulonglong16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + +static __inline__ __host__ __device__ ulong8 make_ulong8( + ulonglong4 s0, ulonglong4 s1, ulonglong4 s2, ulonglong4 s3) +{ + ulong8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3;// t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + +static __forceinline__ __device__ ulonglong4 operator^ (ulonglong4 a, ulonglong4 b) +{ + return make_ulonglong4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); +} +static __forceinline__ __device__ ulonglong4 operator+ (ulonglong4 a, ulonglong4 b) +{ + return make_ulonglong4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +static __forceinline__ __device__ ulonglong2 operator^ (ulonglong2 a, ulonglong2 b) +{ + return make_ulonglong2(a.x ^ b.x, a.y ^ b.y); +} +static __forceinline__ __device__ ulonglong2 operator+ (ulonglong2 a, ulonglong2 b) +{ + return make_ulonglong2(a.x + b.x, a.y + b.y); +} + +static __forceinline__ __device__ ulong8 operator^ (const ulong8 &a, const ulong8 &b) +{ + return make_ulong8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3); +} + +static __forceinline__ __device__ ulong8 operator+ (const ulong8 &a, const ulong8 &b) +{ + return make_ulong8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3); +} + +static 
__forceinline__ __device__ __host__ uint8 operator^ (const uint8 &a, const uint8 &b) +{ + return make_uint8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); +} + +static __forceinline__ __device__ __host__ uint8 operator+ (const uint8 &a, const uint8 &b) +{ + return make_uint8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); +} + +static __forceinline__ __device__ uint2_8 operator^ (const uint2_8 &a, const uint2_8 &b) +{ + return make_uint2_8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); +} + +static __forceinline__ __device__ uint2_8 operator+ (const uint2_8 &a, const uint2_8 &b) +{ + return make_uint2_8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); +} + + +////////////// mess++ ////// + +static __forceinline__ __device__ uint28 operator^ (const uint28 &a, const uint28 &b) +{ + return make_uint28(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); +} + +static __forceinline__ __device__ uint28 operator+ (const uint28 &a, const uint28 &b) +{ + return make_uint28(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} + +static __forceinline__ __device__ uint48 operator+ (const uint48 &a, const uint48 &b) +{ + return make_uint48(a.s0 + b.s0, a.s1 + b.s1); +} + +///////////////////////// + +static __forceinline__ __device__ __host__ uint16 operator^ (const uint16 &a, const uint16 &b) +{ + return make_uint16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7, + a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf); +} + +static __forceinline__ __device__ __host__ uint16 operator+ (const uint16 &a, const uint16 &b) +{ + return make_uint16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7, + a.s8 + b.s8, 
a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf); +} + +static __forceinline__ __device__ uint2_16 operator^ (const uint2_16 &a, const uint2_16 &b) +{ + return make_uint2_16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7, + a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf); +} + +static __forceinline__ __device__ uint2_16 operator+ (const uint2_16 &a, const uint2_16 &b) +{ + return make_uint2_16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7, + a.s8 + b.s8, a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf); +} + +static __forceinline__ __device__ uint32 operator^ (const uint32 &a, const uint32 &b) +{ + return make_uint32(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ uint32 operator+ (const uint32 &a, const uint32 &b) +{ + return make_uint32(a.lo + b.lo, a.hi + b.hi); +} + +static __forceinline__ __device__ ulonglong16 operator^ (const ulonglong16 &a, const ulonglong16 &b) +{ + return make_ulonglong16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); +} + +static __forceinline__ __device__ ulonglong16 operator+ (const ulonglong16 &a, const ulonglong16 &b) +{ + return make_ulonglong16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); +} + +static __forceinline__ __device__ void operator^= (ulong8 &a, const ulong8 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator^= (uint28 &a, const uint28 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator+= (uint28 &a, const uint28 &b) +{ + a = a + b; +} + +static __forceinline__ __device__ void operator^= (uint2_8 &a, const uint2_8 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator+= (uint2_8 
&a, const uint2_8 &b) +{ + a = a + b; +} + +static __forceinline__ __device__ void operator^= (uint32 &a, const uint32 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator+= (uint32 &a, const uint32 &b) +{ + a = a + b; +} + +static __forceinline__ __device__ __host__ void operator^= (uint8 &a, const uint8 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ __host__ void operator^= (uint16 &a, const uint16 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator^= (ulonglong16 &a, const ulonglong16 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator^= (ulonglong4 &a, const ulonglong4 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator+= (ulonglong4 &a, const ulonglong4 &b) +{ + a = a + b; +} + +static __forceinline__ __device__ void operator^= (ulonglong2 &a, const ulonglong2 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator+= (ulonglong2 &a, const ulonglong2 &b) +{ + a = a + b; +} + +static __forceinline__ __device__ +ulonglong2to8 operator^ (const ulonglong2to8 &a, const ulonglong2to8 &b) +{ + return make_ulonglong2to8(a.l0 ^ b.l0, a.l1 ^ b.l1, a.l2 ^ b.l2, a.l3 ^ b.l3); +} +static __forceinline__ __device__ +ulonglong2to8 operator+ (const ulonglong2to8 &a, const ulonglong2to8 &b) +{ + return make_ulonglong2to8(a.l0 + b.l0, a.l1 + b.l1, a.l2 + b.l2, a.l3 + b.l3); +} + +static __forceinline__ __device__ +ulonglong8to16 operator^ (const ulonglong8to16 &a, const ulonglong8to16 &b) +{ + return make_ulonglong8to16(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ +ulonglong8to16 operator+ (const ulonglong8to16 &a, const ulonglong8to16 &b) +{ + return make_ulonglong8to16(a.lo + b.lo, a.hi + b.hi); +} + +static __forceinline__ __device__ +ulonglong16to32 operator^ (const ulonglong16to32 &a, const ulonglong16to32 &b) +{ + return make_ulonglong16to32(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ +ulonglong16to32 operator+ (const 
ulonglong16to32 &a, const ulonglong16to32 &b) +{ + return make_ulonglong16to32(a.lo + b.lo, a.hi + b.hi); +} + +static __forceinline__ __device__ +ulonglong32to64 operator^ (const ulonglong32to64 &a, const ulonglong32to64 &b) +{ + return make_ulonglong32to64(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ +ulonglong32to64 operator+ (const ulonglong32to64 &a, const ulonglong32to64 &b) +{ + return make_ulonglong32to64(a.lo + b.lo, a.hi + b.hi); +} + +static __forceinline__ __device__ ulonglonglong operator^ (const ulonglonglong &a, const ulonglonglong &b) +{ + return make_ulonglonglong(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5); +} + +static __forceinline__ __device__ ulonglonglong operator+ (const ulonglonglong &a, const ulonglonglong &b) +{ + return make_ulonglonglong(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5); +} + +static __forceinline__ __device__ void operator^= (ulonglong2to8 &a, const ulonglong2to8 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator+= (uint4 &a, uint4 b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator+= (uchar4 &a, uchar4 b) +{ + a = a + b; +} +static __forceinline__ __device__ __host__ void operator+= (uint8 &a, const uint8 &b) +{ + a = a + b; +} +static __forceinline__ __device__ __host__ void operator+= (uint16 &a, const uint16 &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator+= (uint2_16 &a, const uint2_16 &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator^= (uint2_16 &a, const uint2_16 &b) +{ + a = a + b; +} + +static __forceinline__ __device__ void operator+= (ulong8 &a, const ulong8 &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator+= (ulonglong16 &a, const ulonglong16 &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator+= (ulonglong8to16 &a, const ulonglong8to16 &b) +{ + a = a + b; +} +static __forceinline__ __device__ void 
operator^= (ulonglong8to16 &a, const ulonglong8to16 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator+= (ulonglong16to32 &a, const ulonglong16to32 &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator^= (ulonglong16to32 &a, const ulonglong16to32 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator+= (ulonglong32to64 &a, const ulonglong32to64 &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator^= (ulonglong32to64 &a, const ulonglong32to64 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator+= (ulonglonglong &a, const ulonglonglong &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator^= (ulonglonglong &a, const ulonglonglong &b) +{ + a = a ^ b; +} + +#if __CUDA_ARCH__ < 320 + +#define rotate ROTL32 +#define rotateR ROTR32 + +#else + +static __forceinline__ __device__ uint4 rotate4(uint4 vec4, uint32_t shift) +{ + uint4 ret; + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.x) : "r"(vec4.x), "r"(vec4.x), "r"(shift)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.y) : "r"(vec4.y), "r"(vec4.y), "r"(shift)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.z) : "r"(vec4.z), "r"(vec4.z), "r"(shift)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.w) : "r"(vec4.w), "r"(vec4.w), "r"(shift)); + return ret; +} + +static __forceinline__ __device__ uint4 rotate4R(uint4 vec4, uint32_t shift) +{ + uint4 ret; + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.x) : "r"(vec4.x), "r"(vec4.x), "r"(shift)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.y) : "r"(vec4.y), "r"(vec4.y), "r"(shift)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.z) : "r"(vec4.z), "r"(vec4.z), "r"(shift)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.w) : "r"(vec4.w), "r"(vec4.w), "r"(shift)); + return ret; +} + +static __forceinline__ __device__ uint32_t rotate(uint32_t vec4, uint32_t shift) +{ + uint32_t ret; + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : 
"r"(vec4), "r"(vec4), "r"(shift)); + return ret; +} + +static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift) +{ + uint32_t ret; + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(vec4), "r"(vec4), "r"(shift)); + return ret; +} + +static __device__ __inline__ ulonglong4 __ldg4(const ulonglong4 *ptr) +{ + ulonglong4 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.z), "=l"(ret.w) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ void ldg4(const ulonglong4 *ptr, ulonglong4 *ret) +{ + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret[0].x), "=l"(ret[0].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret[0].z), "=l"(ret[0].w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret[1].x), "=l"(ret[1].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret[1].z), "=l"(ret[1].w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret[2].x), "=l"(ret[2].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret[2].z), "=l"(ret[2].w) : __LDG_PTR(ptr)); +} + +static __device__ __inline__ uint28 __ldg4(const uint28 *ptr) +{ + uint28 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x.x), "=r"(ret.x.y), "=r"(ret.y.x), "=r"(ret.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.z.x), "=r"(ret.z.y), "=r"(ret.w.x), "=r"(ret.w.y) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ uint48 __ldg4(const uint48 *ptr) +{ + uint48 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.x), "=r"(ret.s0.y), "=r"(ret.s0.z), "=r"(ret.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s1.x), "=r"(ret.s1.y), "=r"(ret.s1.z), "=r"(ret.s1.w) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ void 
ldg4(const uint28 *ptr, uint28 *ret) +{ + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret[0].x.x), "=r"(ret[0].x.y), "=r"(ret[0].y.x), "=r"(ret[0].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret[0].z.x), "=r"(ret[0].z.y), "=r"(ret[0].w.x), "=r"(ret[0].w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret[1].x.x), "=r"(ret[1].x.y), "=r"(ret[1].y.x), "=r"(ret[1].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret[1].z.x), "=r"(ret[1].z.y), "=r"(ret[1].w.x), "=r"(ret[1].w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret[2].x.x), "=r"(ret[2].x.y), "=r"(ret[2].y.x), "=r"(ret[2].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret[2].z.x), "=r"(ret[2].z.y), "=r"(ret[2].w.x), "=r"(ret[2].w.y) : __LDG_PTR(ptr)); +} + +#endif /* __CUDA_ARCH__ < 320 */ + + +static __forceinline__ __device__ uint8 swapvec(const uint8 &buf) +{ + uint8 vec; + vec.s0 = cuda_swab32(buf.s0); + vec.s1 = cuda_swab32(buf.s1); + vec.s2 = cuda_swab32(buf.s2); + vec.s3 = cuda_swab32(buf.s3); + vec.s4 = cuda_swab32(buf.s4); + vec.s5 = cuda_swab32(buf.s5); + vec.s6 = cuda_swab32(buf.s6); + vec.s7 = cuda_swab32(buf.s7); + return vec; +} + +static __forceinline__ __device__ uint8 swapvec(const uint8 *buf) +{ + uint8 vec; + vec.s0 = cuda_swab32(buf[0].s0); + vec.s1 = cuda_swab32(buf[0].s1); + vec.s2 = cuda_swab32(buf[0].s2); + vec.s3 = cuda_swab32(buf[0].s3); + vec.s4 = cuda_swab32(buf[0].s4); + vec.s5 = cuda_swab32(buf[0].s5); + vec.s6 = cuda_swab32(buf[0].s6); + vec.s7 = cuda_swab32(buf[0].s7); + return vec; +} + +static __forceinline__ __device__ uint16 swapvec(const uint16 *buf) +{ + uint16 vec; + vec.s0 = cuda_swab32(buf[0].s0); + vec.s1 = cuda_swab32(buf[0].s1); + vec.s2 = cuda_swab32(buf[0].s2); + vec.s3 = cuda_swab32(buf[0].s3); + vec.s4 = cuda_swab32(buf[0].s4); + vec.s5 = cuda_swab32(buf[0].s5); + vec.s6 = 
cuda_swab32(buf[0].s6); + vec.s7 = cuda_swab32(buf[0].s7); + vec.s8 = cuda_swab32(buf[0].s8); + vec.s9 = cuda_swab32(buf[0].s9); + vec.sa = cuda_swab32(buf[0].sa); + vec.sb = cuda_swab32(buf[0].sb); + vec.sc = cuda_swab32(buf[0].sc); + vec.sd = cuda_swab32(buf[0].sd); + vec.se = cuda_swab32(buf[0].se); + vec.sf = cuda_swab32(buf[0].sf); + return vec; +} + +static __forceinline__ __device__ uint16 swapvec(const uint16 &buf) +{ + uint16 vec; + vec.s0 = cuda_swab32(buf.s0); + vec.s1 = cuda_swab32(buf.s1); + vec.s2 = cuda_swab32(buf.s2); + vec.s3 = cuda_swab32(buf.s3); + vec.s4 = cuda_swab32(buf.s4); + vec.s5 = cuda_swab32(buf.s5); + vec.s6 = cuda_swab32(buf.s6); + vec.s7 = cuda_swab32(buf.s7); + vec.s8 = cuda_swab32(buf.s8); + vec.s9 = cuda_swab32(buf.s9); + vec.sa = cuda_swab32(buf.sa); + vec.sb = cuda_swab32(buf.sb); + vec.sc = cuda_swab32(buf.sc); + vec.sd = cuda_swab32(buf.sd); + vec.se = cuda_swab32(buf.se); + vec.sf = cuda_swab32(buf.sf); + return vec; +} + +static __device__ __forceinline__ uint28 shuffle4(const uint28 &var, int lane) +{ +#if __CUDA_ARCH__ >= 300 + uint28 res; + res.x.x = __shfl(var.x.x, lane); + res.x.y = __shfl(var.x.y, lane); + res.y.x = __shfl(var.y.x, lane); + res.y.y = __shfl(var.y.y, lane); + res.z.x = __shfl(var.z.x, lane); + res.z.y = __shfl(var.z.y, lane); + res.w.x = __shfl(var.w.x, lane); + res.w.y = __shfl(var.w.y, lane); + return res; +#else + return var; +#endif +} + +static __device__ __forceinline__ ulonglong4 shuffle4(ulonglong4 var, int lane) +{ +#if __CUDA_ARCH__ >= 300 + ulonglong4 res; + uint2 temp; + temp = vectorize(var.x); + temp.x = __shfl(temp.x, lane); + temp.y = __shfl(temp.y, lane); + res.x = devectorize(temp); + temp = vectorize(var.y); + temp.x = __shfl(temp.x, lane); + temp.y = __shfl(temp.y, lane); + res.y = devectorize(temp); + temp = vectorize(var.z); + temp.x = __shfl(temp.x, lane); + temp.y = __shfl(temp.y, lane); + res.z = devectorize(temp); + temp = vectorize(var.w); + temp.x = __shfl(temp.x, lane); + 
temp.y = __shfl(temp.y, lane); + res.w = devectorize(temp); + return res; +#else + return var; +#endif +} + +#endif // #ifndef CUDA_LYRA_VECTOR_H diff --git a/lyra2/cuda_lyra2v2.cu b/lyra2/cuda_lyra2v2.cu new file mode 100644 index 0000000000..05e75838d3 --- /dev/null +++ b/lyra2/cuda_lyra2v2.cu @@ -0,0 +1,493 @@ +/* +* Lyra2 (v2) CUDA Implementation +* +* Based on tpruvot/djm34/VTC sources and incredible 2x boost by Nanashi Meiyo-Meijin (May 2016) +*/ + +#include +#include + +#include "cuda_lyra2v2_sm3.cuh" + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#define __CUDA_ARCH__ 500 +#endif + +__device__ __forceinline__ +uint2 SWAPUINT2(uint2 value) +{ + return make_uint2(value.y, value.x); +} + +#define TPB5x 128 + +#if __CUDA_ARCH__ >= 500 + +#include "cuda_lyra2_vectors.h" + +#define Nrow 4 +#define Ncol 4 +#define memshift 3 + +__device__ uint2x4 *DMatrix; + +__device__ __forceinline__ uint2 LD4S(uint2 *shared_mem, const int index) +{ + return shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; +} + +__device__ __forceinline__ void ST4S(uint2 *shared_mem, const int index, const uint2 data) +{ + shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data; +} + +__device__ __forceinline__ uint2 shuffle2(uint2 a, uint32_t b, uint32_t c) +{ + return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c)); +} + +__device__ __forceinline__ +void Gfunc_v5(uint2 &a, uint2 &b, uint2 &c, uint2 &d) +{ + a += b; d = eorswap32(a, d); + c += d; b ^= c; b = ROR24(b); + a += b; d ^= a; d = ROR16(d); + c += d; b ^= c; b = ROR2(b, 63); +} + +__device__ __forceinline__ +void round_lyra_v5(uint2x4 s[4]) +{ + Gfunc_v5(s[0].x, s[1].x, s[2].x, s[3].x); + Gfunc_v5(s[0].y, s[1].y, s[2].y, s[3].y); + Gfunc_v5(s[0].z, s[1].z, s[2].z, s[3].z); + Gfunc_v5(s[0].w, s[1].w, s[2].w, s[3].w); + + Gfunc_v5(s[0].x, s[1].y, s[2].z, s[3].w); + Gfunc_v5(s[0].y, s[1].z, s[2].w, s[3].x); + Gfunc_v5(s[0].z, s[1].w, s[2].x, s[3].y); + Gfunc_v5(s[0].w, 
s[1].x, s[2].y, s[3].z); +} + +__device__ __forceinline__ +void round_lyra_v5(uint2 s[4]) +{ + Gfunc_v5(s[0], s[1], s[2], s[3]); + s[1] = shuffle2(s[1], threadIdx.x + 1, 4); + s[2] = shuffle2(s[2], threadIdx.x + 2, 4); + s[3] = shuffle2(s[3], threadIdx.x + 3, 4); + Gfunc_v5(s[0], s[1], s[2], s[3]); + s[1] = shuffle2(s[1], threadIdx.x + 3, 4); + s[2] = shuffle2(s[2], threadIdx.x + 2, 4); + s[3] = shuffle2(s[3], threadIdx.x + 1, 4); +} + +__device__ __forceinline__ +void reduceDuplexRowSetup2(uint2 *shared_mem, uint2 state[4]) +{ + uint2 state1[Ncol][3], state0[Ncol][3], state2[3]; + int i, j; + +#pragma unroll + for(int i = 0; i < Ncol; i++) + { +#pragma unroll + for(j = 0; j < 3; j++) + state0[Ncol - i - 1][j] = state[j]; + round_lyra_v5(state); + } + + //#pragma unroll 4 + for(i = 0; i < Ncol; i++) + { +#pragma unroll + for(j = 0; j < 3; j++) + state[j] ^= state0[i][j]; + + round_lyra_v5(state); + +#pragma unroll + for(j = 0; j < 3; j++) + state1[Ncol - i - 1][j] = state0[i][j]; + +#pragma unroll + for(j = 0; j < 3; j++) + state1[Ncol - i - 1][j] ^= state[j]; + } + + for(i = 0; i < Ncol; i++) + { + const uint32_t s0 = memshift * Ncol * 0 + i * memshift; + const uint32_t s2 = memshift * Ncol * 2 + memshift * (Ncol - 1) - i*memshift; + +#pragma unroll + for(j = 0; j < 3; j++) + state[j] ^= state1[i][j] + state0[i][j]; + + round_lyra_v5(state); + +#pragma unroll + for(j = 0; j < 3; j++) + state2[j] = state1[i][j]; + +#pragma unroll + for(j = 0; j < 3; j++) + state2[j] ^= state[j]; + +#pragma unroll + for(j = 0; j < 3; j++) + ST4S(shared_mem, s2 + j, state2[j]); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + + if(threadIdx.x == 0) + { + state0[i][0] ^= Data2; + state0[i][1] ^= Data0; + state0[i][2] ^= Data1; + } + else + { + state0[i][0] ^= Data0; + state0[i][1] ^= Data1; + state0[i][2] ^= Data2; + } + +#pragma unroll + for(j = 0; j < 3; j++) + 
ST4S(shared_mem, s0 + j, state0[i][j]); + +#pragma unroll + for(j = 0; j < 3; j++) + state0[i][j] = state2[j]; + + } + + for(i = 0; i < Ncol; i++) + { + const uint32_t s1 = memshift * Ncol * 1 + i*memshift; + const uint32_t s3 = memshift * Ncol * 3 + memshift * (Ncol - 1) - i*memshift; + +#pragma unroll + for(j = 0; j < 3; j++) + state[j] ^= state1[i][j] + state0[Ncol - i - 1][j]; + + round_lyra_v5(state); + +#pragma unroll + for(j = 0; j < 3; j++) + state0[Ncol - i - 1][j] ^= state[j]; + +#pragma unroll + for(j = 0; j < 3; j++) + ST4S(shared_mem, s3 + j, state0[Ncol - i - 1][j]); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + + if(threadIdx.x == 0) + { + state1[i][0] ^= Data2; + state1[i][1] ^= Data0; + state1[i][2] ^= Data1; + } + else + { + state1[i][0] ^= Data0; + state1[i][1] ^= Data1; + state1[i][2] ^= Data2; + } + +#pragma unroll + for(j = 0; j < 3; j++) + ST4S(shared_mem, s1 + j, state1[i][j]); + } + __syncthreads(); +} + +__device__ +void reduceDuplexRowt2(uint2 *shared_mem, const int rowIn, const int rowInOut, const int rowOut, uint2 state[4]) +{ + uint2 state1[3], state2[3]; + const uint32_t ps1 = memshift * Ncol * rowIn; + const uint32_t ps2 = memshift * Ncol * rowInOut; + const uint32_t ps3 = memshift * Ncol * rowOut; + + for(int i = 0; i < Ncol; i++) + { + const uint32_t s1 = ps1 + i*memshift; + const uint32_t s2 = ps2 + i*memshift; + const uint32_t s3 = ps3 + i*memshift; + +#pragma unroll + for(int j = 0; j < 3; j++) + state1[j] = LD4S(shared_mem, s1 + j); + +#pragma unroll + for(int j = 0; j < 3; j++) + state2[j] = LD4S(shared_mem, s2 + j); + +#pragma unroll + for(int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra_v5(state); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + + 
if(threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + +#pragma unroll + for(int j = 0; j < 3; j++) + ST4S(shared_mem, s2 + j, state2[j]); + __syncthreads(); + +#pragma unroll + for(int j = 0; j < 3; j++) + ST4S(shared_mem, s3 + j, LD4S(shared_mem, s3 + j) ^ state[j]); + __syncthreads(); + } +} + +__device__ +void reduceDuplexRowt2x4(uint2 *shared_mem, const int rowInOut, uint2 state[4]) +{ + const int rowIn = 2; + const int rowOut = 3; + + int i, j; + uint2 last[3]; + const uint32_t ps1 = memshift * Ncol * rowIn; + const uint32_t ps2 = memshift * Ncol * rowInOut; + +#pragma unroll + for(int j = 0; j < 3; j++) + last[j] = LD4S(shared_mem, ps2 + j); + +#pragma unroll + for(int j = 0; j < 3; j++) + state[j] ^= LD4S(shared_mem, ps1 + j) + last[j]; + + round_lyra_v5(state); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + + if(threadIdx.x == 0) + { + last[0] ^= Data2; + last[1] ^= Data0; + last[2] ^= Data1; + } + else + { + last[0] ^= Data0; + last[1] ^= Data1; + last[2] ^= Data2; + } + + if(rowInOut == rowOut) + { +#pragma unroll + for(j = 0; j < 3; j++) + last[j] ^= state[j]; + } + + for(i = 1; i < Ncol; i++) + { + const uint32_t s1 = ps1 + i*memshift; + const uint32_t s2 = ps2 + i*memshift; + +#pragma unroll + for(j = 0; j < 3; j++) + state[j] ^= LD4S(shared_mem, s1 + j) + LD4S(shared_mem, s2 + j); + + round_lyra_v5(state); + } + +#pragma unroll + for(int j = 0; j < 3; j++) + state[j] ^= last[j]; +} + +__global__ +__launch_bounds__(TPB5x, 1) +void lyra2v2_gpu_hash_32_1(uint32_t threads, uint2 *inputHash) +{ + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + + const uint2x4 blake2b_IV[2] = { + 0xf3bcc908UL, 0x6a09e667UL, 0x84caa73bUL, 0xbb67ae85UL, + 0xfe94f82bUL, 0x3c6ef372UL, 0x5f1d36f1UL, 0xa54ff53aUL, + 
0xade682d1UL, 0x510e527fUL, 0x2b3e6c1fUL, 0x9b05688cUL, + 0xfb41bd6bUL, 0x1f83d9abUL, 0x137e2179UL, 0x5be0cd19UL + }; + + const uint2x4 Mask[2] = { + 0x00000020UL, 0x00000000UL, 0x00000020UL, 0x00000000UL, + 0x00000020UL, 0x00000000UL, 0x00000001UL, 0x00000000UL, + 0x00000004UL, 0x00000000UL, 0x00000004UL, 0x00000000UL, + 0x00000080UL, 0x00000000UL, 0x00000000UL, 0x01000000UL + }; + + uint2x4 state[4]; + + if(thread < threads) + { + state[0].x = state[1].x = __ldg(&inputHash[thread + threads * 0]); + state[0].y = state[1].y = __ldg(&inputHash[thread + threads * 1]); + state[0].z = state[1].z = __ldg(&inputHash[thread + threads * 2]); + state[0].w = state[1].w = __ldg(&inputHash[thread + threads * 3]); + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; + + for(int i = 0; i<12; i++) + round_lyra_v5(state); + + state[0] ^= Mask[0]; + state[1] ^= Mask[1]; + + for(int i = 0; i<12; i++) + round_lyra_v5(state); + + DMatrix[blockDim.x * gridDim.x * 0 + thread] = state[0]; + DMatrix[blockDim.x * gridDim.x * 1 + thread] = state[1]; + DMatrix[blockDim.x * gridDim.x * 2 + thread] = state[2]; + DMatrix[blockDim.x * gridDim.x * 3 + thread] = state[3]; + } +} + +__global__ +__launch_bounds__(32, 1) +void lyra2v2_gpu_hash_32_2(uint32_t threads) +{ + const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y; + __shared__ uint2 shared_mem[1536]; + if(thread < threads) + { + uint2 state[4]; + state[0] = ((uint2*)DMatrix)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[1] = ((uint2*)DMatrix)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[2] = ((uint2*)DMatrix)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[3] = ((uint2*)DMatrix)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + + reduceDuplexRowSetup2(shared_mem, state); + + uint32_t rowa; + int prev = 3; + + for(int i = 0; i < 3; i++) + { + rowa = __shfl(state[0].x, 0, 4) & 3; + reduceDuplexRowt2(shared_mem, prev, rowa, i, 
state); + prev = i; + } + + rowa = __shfl(state[0].x, 0, 4) & 3; + reduceDuplexRowt2x4(shared_mem, rowa, state); + + ((uint2*)DMatrix)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[0]; + ((uint2*)DMatrix)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[1]; + ((uint2*)DMatrix)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[2]; + ((uint2*)DMatrix)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[3]; + } +} + +__global__ +__launch_bounds__(TPB5x, 1) +void lyra2v2_gpu_hash_32_3(uint32_t threads, uint2 *outputHash) +{ + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + + uint2x4 state[4]; + + if(thread < threads) + { + state[0] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 0 + thread]); + state[1] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 1 + thread]); + state[2] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 2 + thread]); + state[3] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 3 + thread]); + + for(int i = 0; i < 12; i++) + round_lyra_v5(state); + + outputHash[thread + threads * 0] = state[0].x; + outputHash[thread + threads * 1] = state[0].y; + outputHash[thread + threads * 2] = state[0].z; + outputHash[thread + threads * 3] = state[0].w; + } +} + +#else +#include "cuda_helper.h" +#if __CUDA_ARCH__ < 200 +__device__ void* DMatrix; +#endif +__global__ void lyra2v2_gpu_hash_32_1(uint32_t threads, uint2 *inputHash) +{} +__global__ void lyra2v2_gpu_hash_32_2(uint32_t threads) +{} +__global__ void lyra2v2_gpu_hash_32_3(uint32_t threads, uint2 *outputHash) +{} +#endif + + +__host__ +void lyra2v2_cpu_init(int thr_id, uint64_t *d_matrix) +{ + get_cuda_arch(&cuda_arch[thr_id]); + // just assign the device pointer allocated in main loop + cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice); +} + +__host__ +void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash) +{ + if(cuda_arch[thr_id] >= 500) + 
{ + + const uint32_t tpb = TPB5x; + + dim3 grid2((threads + tpb - 1) / tpb); + dim3 block2(tpb); + dim3 grid4((threads * 4 + 32 - 1) / 32); + dim3 block4(4, 32 / 4); + + lyra2v2_gpu_hash_32_1 << < grid2, block2, 0, gpustream[thr_id] >> > (threads, (uint2*)g_hash); + lyra2v2_gpu_hash_32_2 << < grid4, block4, 0, gpustream[thr_id] >> > (threads); + lyra2v2_gpu_hash_32_3 << < grid2, block2, 0, gpustream[thr_id] >> > (threads, (uint2*)g_hash); + + } + else + { + + uint32_t tpb = 16; + if(cuda_arch[thr_id] >= 350) tpb = TPB35; + else if(cuda_arch[thr_id] >= 300) tpb = TPB30; + else if(cuda_arch[thr_id] >= 200) tpb = TPB20; + + dim3 grid((threads + tpb - 1) / tpb); + dim3 block(tpb); + lyra2v2_gpu_hash_32_v3 << < grid, block, 0, gpustream[thr_id] >> > (threads, startNounce, (uint2*)g_hash); + + } + CUDA_SAFE_CALL(cudaGetLastError()); +} \ No newline at end of file diff --git a/lyra2/cuda_lyra2v2_sm3.cuh b/lyra2/cuda_lyra2v2_sm3.cuh new file mode 100644 index 0000000000..56c6ccfe38 --- /dev/null +++ b/lyra2/cuda_lyra2v2_sm3.cuh @@ -0,0 +1,345 @@ +/* SM 2/3/3.5 Variant for lyra2REv2 */ +#include +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#undef __CUDA_ARCH__ +#define __CUDA_ARCH__ 350 +#endif + +#define TPB20 64 +#define TPB30 64 +#define TPB35 64 + +#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ < 500 + +#include "cuda_lyra2_vectors.h" + +#define Nrow 4 +#define Ncol 4 + +#define vectype ulonglong4 +#define memshift 4 + +__device__ vectype *DMatrix; + +static __device__ __forceinline__ +void Gfunc_v35(unsigned long long &a, unsigned long long &b, unsigned long long &c, unsigned long long &d) +{ + a += b; d ^= a; d = ROTR64(d, 32); + c += d; b ^= c; b = ROTR64(b, 24); + a += b; d ^= a; d = ROTR64(d, 16); + c += d; b ^= c; b = ROTR64(b, 63); +} + +static __device__ __forceinline__ +void round_lyra_v35(vectype* s) +{ + Gfunc_v35(s[0].x, s[1].x, s[2].x, s[3].x); + Gfunc_v35(s[0].y, s[1].y, s[2].y, s[3].y); + Gfunc_v35(s[0].z, s[1].z, s[2].z, s[3].z); + 
Gfunc_v35(s[0].w, s[1].w, s[2].w, s[3].w); + + Gfunc_v35(s[0].x, s[1].y, s[2].z, s[3].w); + Gfunc_v35(s[0].y, s[1].z, s[2].w, s[3].x); + Gfunc_v35(s[0].z, s[1].w, s[2].x, s[3].y); + Gfunc_v35(s[0].w, s[1].x, s[2].y, s[3].z); +} + +static __device__ __forceinline__ +void reduceDuplexV3(vectype state[4], uint32_t thread) +{ + vectype state1[3]; + uint32_t ps1 = (Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * (Ncol - 1) * Nrow + memshift * 1 + Nrow * Ncol * memshift * thread); + +#pragma unroll 4 + for(int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + Nrow * i *memshift; + uint32_t s2 = ps2 - Nrow * i *memshift; + + for(int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1)[j]); + + for(int j = 0; j < 3; j++) + state[j] ^= state1[j]; + round_lyra_v35(state); + + for(int j = 0; j < 3; j++) + state1[j] ^= state[j]; + + for(int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state1[j]; + } +} + +static __device__ __forceinline__ +void reduceDuplexRowSetupV3(const int rowIn, const int rowInOut, const int rowOut, vectype state[4], uint32_t thread) +{ + vectype state2[3], state1[3]; + + uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread); + uint32_t ps3 = (Nrow * memshift * (Ncol - 1) + memshift * rowOut + Nrow * Ncol * memshift * thread); + + for(int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + Nrow*i*memshift; + uint32_t s2 = ps2 + Nrow*i*memshift; + uint32_t s3 = ps3 - Nrow*i*memshift; + + for(int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1)[j]); + for(int j = 0; j < 3; j++) + state2[j] = __ldg4(&(DMatrix + s2)[j]); + for(int j = 0; j < 3; j++) + { + vectype tmp = state1[j] + state2[j]; + state[j] ^= tmp; + } + + round_lyra_v35(state); + + for(int j = 0; j < 3; j++) + { + state1[j] ^= state[j]; + (DMatrix + s3)[j] = state1[j]; + } + + ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + for(int j = 0; j < 11; j++) + ((uint2*)state2)[j + 1] ^= 
((uint2*)state)[j]; + + for(int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + } +} + +static __device__ __forceinline__ +void reduceDuplexRowtV3(const int rowIn, const int rowInOut, const int rowOut, vectype* state, uint32_t thread) +{ + vectype state1[3], state2[3]; + uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread); + uint32_t ps3 = (memshift * rowOut + Nrow * Ncol * memshift * thread); + +#pragma nounroll + for(int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + Nrow * i*memshift; + uint32_t s2 = ps2 + Nrow * i*memshift; + uint32_t s3 = ps3 + Nrow * i*memshift; + + for(int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1)[j]); + + for(int j = 0; j < 3; j++) + state2[j] = __ldg4(&(DMatrix + s2)[j]); + + for(int j = 0; j < 3; j++) + state1[j] += state2[j]; + + for(int j = 0; j < 3; j++) + state[j] ^= state1[j]; + + round_lyra_v35(state); + + ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + + for(int j = 0; j < 11; j++) + ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; + + if(rowInOut != rowOut) + { + + for(int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + + for(int j = 0; j < 3; j++) + (DMatrix + s3)[j] ^= state[j]; + + } + else + { + + for(int j = 0; j < 3; j++) + state2[j] ^= state[j]; + + for(int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + } + } +} + +#if __CUDA_ARCH__ >= 300 +__global__ __launch_bounds__(TPB35, 1) +void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + vectype state[4]; + vectype blake2b_IV[2]; + vectype padding[2]; + + if(threadIdx.x == 0) + { + + ((uint16*)blake2b_IV)[0] = make_uint16( + 0xf3bcc908, 0x6a09e667, 0x84caa73b, 0xbb67ae85, + 0xfe94f82b, 0x3c6ef372, 0x5f1d36f1, 0xa54ff53a, + 0xade682d1, 0x510e527f, 0x2b3e6c1f, 0x9b05688c, + 0xfb41bd6b, 0x1f83d9ab, 0x137e2179, 0x5be0cd19 + ); + ((uint16*)padding)[0] = make_uint16( 
+ 0x20, 0x0, 0x20, 0x0, 0x20, 0x0, 0x01, 0x0, + 0x04, 0x0, 0x04, 0x0, 0x80, 0x0, 0x0, 0x01000000 + ); + } + + if(thread < threads) + { + ((uint2*)state)[0] = __ldg(&outputHash[thread]); + ((uint2*)state)[1] = __ldg(&outputHash[thread + threads]); + ((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]); + ((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]); + + state[1] = state[0]; + state[2] = shuffle4(((vectype*)blake2b_IV)[0], 0); + state[3] = shuffle4(((vectype*)blake2b_IV)[1], 0); + + for(int i = 0; i<12; i++) + round_lyra_v35(state); + + state[0] ^= shuffle4(((vectype*)padding)[0], 0); + state[1] ^= shuffle4(((vectype*)padding)[1], 0); + + for(int i = 0; i<12; i++) + round_lyra_v35(state); + + uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread); + + //#pragma unroll 4 + for(int i = 0; i < 4; i++) + { + uint32_t s1 = ps1 - 4 * memshift * i; + for(int j = 0; j < 3; j++) + (DMatrix + s1)[j] = (state)[j]; + + round_lyra_v35(state); + } + + reduceDuplexV3(state, thread); + reduceDuplexRowSetupV3(1, 0, 2, state, thread); + reduceDuplexRowSetupV3(2, 1, 3, state, thread); + + uint32_t rowa; + int prev = 3; + for(int i = 0; i < 4; i++) + { + rowa = ((uint2*)state)[0].x & 3; reduceDuplexRowtV3(prev, rowa, i, state, thread); + prev = i; + } + + uint32_t shift = (memshift * rowa + 16 * memshift * thread); + + for(int j = 0; j < 3; j++) + state[j] ^= __ldg4(&(DMatrix + shift)[j]); + + for(int i = 0; i < 12; i++) + round_lyra_v35(state); + + outputHash[thread] = ((uint2*)state)[0]; + outputHash[thread + threads] = ((uint2*)state)[1]; + outputHash[thread + 2 * threads] = ((uint2*)state)[2]; + outputHash[thread + 3 * threads] = ((uint2*)state)[3]; + + } //thread +} +#elif __CUDA_ARCH__ >= 200 +__global__ __launch_bounds__(TPB20, 1) +void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + vectype state[4]; + vectype blake2b_IV[2]; + vectype padding[2]; + 
+ ((uint16*)blake2b_IV)[0] = make_uint16( + 0xf3bcc908, 0x6a09e667, 0x84caa73b, 0xbb67ae85, + 0xfe94f82b, 0x3c6ef372, 0x5f1d36f1, 0xa54ff53a, + 0xade682d1, 0x510e527f, 0x2b3e6c1f, 0x9b05688c, + 0xfb41bd6b, 0x1f83d9ab, 0x137e2179, 0x5be0cd19 + ); + ((uint16*)padding)[0] = make_uint16( + 0x20, 0x0, 0x20, 0x0, 0x20, 0x0, 0x01, 0x0, + 0x04, 0x0, 0x04, 0x0, 0x80, 0x0, 0x0, 0x01000000 + ); + + if(thread < threads) + { + + ((uint2*)state)[0] = outputHash[thread]; + ((uint2*)state)[1] = outputHash[thread + threads]; + ((uint2*)state)[2] = outputHash[thread + 2 * threads]; + ((uint2*)state)[3] = outputHash[thread + 3 * threads]; + + state[1] = state[0]; + state[2] = ((vectype*)blake2b_IV)[0]; + state[3] = ((vectype*)blake2b_IV)[1]; + + for(int i = 0; i<12; i++) + round_lyra_v35(state); + + state[0] ^= ((vectype*)padding)[0]; + state[1] ^= ((vectype*)padding)[1]; + + for(int i = 0; i<12; i++) + round_lyra_v35(state); + + uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread); + + //#pragma unroll 4 + for(int i = 0; i < 4; i++) + { + uint32_t s1 = ps1 - 4 * memshift * i; + for(int j = 0; j < 3; j++) + (DMatrix + s1)[j] = (state)[j]; + + round_lyra_v35(state); + } + + reduceDuplexV3(state, thread); + reduceDuplexRowSetupV3(1, 0, 2, state, thread); + reduceDuplexRowSetupV3(2, 1, 3, state, thread); + + uint32_t rowa; + int prev = 3; + for(int i = 0; i < 4; i++) + { + rowa = ((uint2*)state)[0].x & 3; reduceDuplexRowtV3(prev, rowa, i, state, thread); + prev = i; + } + + uint32_t shift = (memshift * rowa + 16 * memshift * thread); + + for(int j = 0; j < 3; j++) + state[j] ^= __ldg4(&(DMatrix + shift)[j]); + + for(int i = 0; i < 12; i++) + round_lyra_v35(state); + + outputHash[thread] = ((uint2*)state)[0]; + outputHash[thread + threads] = ((uint2*)state)[1]; + outputHash[thread + 2 * threads] = ((uint2*)state)[2]; + outputHash[thread + 3 * threads] = ((uint2*)state)[3]; + + } //thread +} +#endif + +#else +/* host & sm5+ */ +__global__ void lyra2v2_gpu_hash_32_v3(uint32_t 
threads, uint32_t startNounce, uint2 *outputHash) +{} +#endif \ No newline at end of file diff --git a/lyra2/lyra2RE.cu b/lyra2/lyra2RE.cu index 083caeb734..a05b48ae19 100644 --- a/lyra2/lyra2RE.cu +++ b/lyra2/lyra2RE.cu @@ -8,22 +8,25 @@ extern "C" { #include "miner.h" #include "cuda_helper.h" - +#include static _ALIGN(64) uint64_t *d_hash[MAX_GPUS]; +static THREAD uint32_t *foundNonce; + -extern void blake256_cpu_init(int thr_id, uint32_t threads); -extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); -extern void blake256_cpu_setBlock_80(uint32_t *pdata); -extern void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); +extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash); +extern void blake256_cpu_setBlock_80(int thr_id, uint32_t *pdata); +extern void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash); extern void keccak256_cpu_init(int thr_id, uint32_t threads); -extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); +extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash); extern void skein256_cpu_init(int thr_id, uint32_t threads); -extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); -extern void lyra2_cpu_init(int thr_id, uint32_t threads); +extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash); +extern void lyra2_cpu_hash_32_multi(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash); -extern void groestl256_setTarget(const void *ptarget); -extern void groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order, 
uint32_t *resultnonces); +extern void groestl256_setTarget(int thr_id, const void *ptarget); +extern void lyra2_cpu_init(int thr_id, uint32_t threads); +extern void lyra2_cpu_init_multi(int thr_id, uint32_t threads, uint64_t *hash, uint64_t* hash2); +extern void groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, uint32_t *resultnonces); extern void groestl256_cpu_init(int thr_id, uint32_t threads); extern "C" void lyra2_hash(void *state, const void *input) @@ -43,8 +46,7 @@ extern "C" void lyra2_hash(void *state, const void *input) sph_keccak256(&ctx_keccak, hashA, 32); sph_keccak256_close(&ctx_keccak, hashB); - LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8); - + LYRA2_old(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8); sph_skein256_init(&ctx_skein); sph_skein256(&ctx_skein, hashA, 32); sph_skein256_close(&ctx_skein, hashB); @@ -56,63 +58,61 @@ extern "C" void lyra2_hash(void *state, const void *input) memcpy(state, hashA, 32); } -static bool init[MAX_GPUS] = { 0 }; +static volatile bool init[MAX_GPUS] = { false }; -extern "C" int scanhash_lyra2(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_lyra2(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { const uint32_t first_nonce = pdata[19]; - unsigned int intensity = (device_sm[device_map[thr_id]] > 500) ? 256 * 256 * 25 : 256 * 256 * 14; - uint32_t throughput = device_intensity(thr_id, __func__, 1U << intensity); // 18=256*256*4; - throughput = min(throughput, (max_nonce - first_nonce)); + unsigned int intensity = (device_sm[device_map[thr_id]] > 500) ? 256 * 256 * 4 : 256 * 256 * 4 ; + intensity = (device_sm[device_map[thr_id]] == 500) ? 
256 * 256 * 2 : intensity; + uint32_t throughput = device_intensity(device_map[thr_id], __func__, intensity); // 18=256*256*4; + if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x000f; + ptarget[7] = 0x00ff; - if (!init[thr_id]) - { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + + if(!init[thr_id]) + { + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); - - blake256_cpu_init(thr_id, throughput); - keccak256_cpu_init(thr_id,throughput); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + CUDA_SAFE_CALL(cudaProfilerStop()); + CUDA_SAFE_CALL(cudaMallocHost(&foundNonce, 2 * 4)); + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 8 * sizeof(uint32_t) * throughput)); + keccak256_cpu_init(thr_id, throughput); skein256_cpu_init(thr_id, throughput); groestl256_cpu_init(thr_id, throughput); lyra2_cpu_init(thr_id, throughput); - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput)); - - init[thr_id] = true; + init[thr_id] = true; } + else + CUDA_SAFE_CALL(cudaProfilerStart()); uint32_t endiandata[20]; for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); - - blake256_cpu_setBlock_80(pdata); - groestl256_setTarget(ptarget); + be32enc(&endiandata[k], pdata[k]); + blake256_cpu_setBlock_80(thr_id, pdata); + groestl256_setTarget(thr_id, ptarget); do { - int order = 0; - uint32_t foundNonce[2] = { 0, 0 }; - - blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - //MyStreamSynchronize(NULL, 2, thr_id); - groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++, foundNonce); - if (foundNonce[0] != 0) + blake256_cpu_hash_80(thr_id, throughput, 
pdata[19], d_hash[thr_id]); + keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id]); + lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id]); + skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id]); + groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonce); + CUDA_SAFE_CALL(cudaGetLastError()); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(foundNonce[0] != 0) { - CUDA_SAFE_CALL(cudaGetLastError()); const uint32_t Htarg = ptarget[7]; uint32_t vhash64[8]; be32enc(&endiandata[19], foundNonce[0]); lyra2_hash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { int res = 1; @@ -120,22 +120,34 @@ extern "C" int scanhash_lyra2(int thr_id, uint32_t *pdata, *hashes_done = pdata[19] - first_nonce + throughput; if (foundNonce[1] != 0) { - pdata[21] = foundNonce[1]; - res++; - if (opt_benchmark) applog(LOG_INFO, "GPU #%d Found second nounce %08x", thr_id, foundNonce[1], vhash64[7], Htarg); + be32enc(&endiandata[19], foundNonce[1]); + lyra2_hash(vhash64, endiandata); + + if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + pdata[21] = foundNonce[1]; + res++; + if (opt_benchmark) applog(LOG_INFO, "GPU #%d: Found second nounce %08x", device_map[thr_id], foundNonce[1]); + } + else + { + if (vhash64[7] != Htarg) // don't show message if it is equal but fails fulltest + applog(LOG_WARNING, "GPU #%d: result %08x does not validate on CPU!", device_map[thr_id], foundNonce[1]); + } } pdata[19] = foundNonce[0]; - if (opt_benchmark) applog(LOG_INFO, "GPU #%d Found nounce % 08x", thr_id, foundNonce[0], vhash64[7], Htarg); + if (opt_benchmark) applog(LOG_INFO, "GPU #%d: Found nounce %08x", device_map[thr_id], foundNonce[0]); return res; } else { - if (vhash64[7] > Htarg) // don't show message if it is equal but fails fulltest - applog(LOG_WARNING, "GPU #%d: result does not validate on CPU!", thr_id); + if (vhash64[7] 
!= Htarg) // don't show message if it is equal but fails fulltest + applog(LOG_WARNING, "GPU #%d: result %08x does not validate on CPU!", device_map[thr_id], foundNonce[0]); } } pdata[19] += throughput; + } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); *hashes_done = pdata[19] - first_nonce + 1; diff --git a/lyra2/lyra2REv2.cu b/lyra2/lyra2REv2.cu new file mode 100644 index 0000000000..076d4217da --- /dev/null +++ b/lyra2/lyra2REv2.cu @@ -0,0 +1,230 @@ +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_skein.h" +#include "sph/sph_keccak.h" +#include "sph/sph_cubehash.h" +#include "lyra2/Lyra2.h" +} + +#include "miner.h" +#include "cuda_helper.h" + +extern void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash); +extern void blake256_cpu_setBlock_80(int thr_id, uint32_t *pdata); + +extern void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash); +extern void keccak256_cpu_init(int thr_id, uint32_t threads); + +extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash); +extern void skein256_cpu_init(int thr_id, uint32_t threads); + +extern void skeinCube256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash); + + +extern void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash); +extern void lyra2v2_cpu_init(int thr_id, uint64_t* matrix); + +extern void bmw256_cpu_init(int thr_id); +extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces, uint32_t target); + +extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash); + +extern "C" void lyra2v2_hash(void *state, const void *input) +{ + sph_blake256_context ctx_blake; + 
sph_keccak256_context ctx_keccak; + sph_skein256_context ctx_skein; + sph_bmw256_context ctx_bmw; + sph_cubehash256_context ctx_cube; + + uint32_t hashA[8], hashB[8]; + + sph_blake256_init(&ctx_blake); + sph_blake256(&ctx_blake, input, 80); + sph_blake256_close(&ctx_blake, hashA); + + sph_keccak256_init(&ctx_keccak); + sph_keccak256(&ctx_keccak, hashA, 32); + sph_keccak256_close(&ctx_keccak, hashB); + + sph_cubehash256_init(&ctx_cube); + sph_cubehash256(&ctx_cube, hashB, 32); + sph_cubehash256_close(&ctx_cube, hashA); + + + LYRA2(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4); + + sph_skein256_init(&ctx_skein); + sph_skein256(&ctx_skein, hashB, 32); + sph_skein256_close(&ctx_skein, hashA); + + sph_cubehash256_init(&ctx_cube); + sph_cubehash256(&ctx_cube, hashA, 32); + sph_cubehash256_close(&ctx_cube, hashB); + + + sph_bmw256_init(&ctx_bmw); + sph_bmw256(&ctx_bmw, hashB, 32); + sph_bmw256_close(&ctx_bmw, hashA); + + memcpy(state, hashA, 32); +} + +int scanhash_lyra2v2(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) +{ + static THREAD uint64_t *d_hash = nullptr; + static THREAD uint64_t *d_hash2 = nullptr; + + const uint32_t first_nonce = pdata[19]; + uint32_t intensity = 256 * 256 * 8; + + cudaDeviceProp props; + cudaGetDeviceProperties(&props, device_map[thr_id]); + + if(strstr(props.name, "1080")) + { + intensity = 256 * 256 * 15; +#ifdef _WIN64 + intensity = 256 * 256 * 22; +#endif + } + else if(strstr(props.name, "1070")) + { + intensity = 256 * 256 * 15; +#ifdef _WIN64 + intensity = 256 * 256 * 22; +#endif + } + else if(strstr(props.name, "970")) + { + intensity = 256 * 256 * 15; +#ifdef _WIN64 + intensity = 256 * 256 * 22; +#endif + } + else if (strstr(props.name, "980")) + { + intensity = 256 * 256 * 15; +#ifdef _WIN64 + intensity = 256 * 256 * 22; +#endif + } + else if (strstr(props.name, "750 Ti")) + { + intensity = 256 * 256 * 12; + } + else if (strstr(props.name, "750")) + { + intensity = 256 * 256 * 5; + } 
+ else if (strstr(props.name, "960")) + { + intensity = 256 * 256 * 8; + } + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, intensity); + uint32_t throughput = min(throughputmax, max_nonce - first_nonce) & 0xfffffe00; + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x004f; + + static THREAD bool init = false; + if (!init) + { + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (16 * 4 * 4 * sizeof(uint64_t))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif + CUDA_SAFE_CALL(cudaMalloc(&d_hash2, 16 * 4 * 4 * sizeof(uint64_t) * throughputmax)); + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 8 * sizeof(uint32_t) * throughputmax)); + + bmw256_cpu_init(thr_id); + lyra2v2_cpu_init(thr_id, d_hash2); + mining_has_stopped[thr_id] = false; + + init = true; + } + + uint32_t endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + + blake256_cpu_setBlock_80(thr_id, pdata); + + do { + uint32_t foundNonce[2] = { 0, 0 }; + + blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); +// keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash); + cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash); + lyra2v2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash); + skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash); + cubehash256_cpu_hash_32(thr_id, throughput,pdata[19], d_hash); + bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash, foundNonce, ptarget[7]); + if(stop_mining) + { + mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr); + } + if(foundNonce[0] != 0) + { + const uint32_t 
Htarg = ptarget[7]; + uint32_t vhash64[8]={0}; + if(opt_verify) + { + be32enc(&endiandata[19], foundNonce[0]); + lyra2v2_hash(vhash64, endiandata); + } + if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + int res = 1; + // check if there was some other ones... + *hashes_done = pdata[19] - first_nonce + throughput; + if (foundNonce[1] != 0) + { + if(opt_verify) + { + be32enc(&endiandata[19], foundNonce[1]); + lyra2v2_hash(vhash64, endiandata); + } + if(vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + pdata[21] = foundNonce[1]; + res++; + if(opt_benchmark) applog(LOG_INFO, "GPU #%d Found second nonce %08x", thr_id, foundNonce[1]); + } + else + { + if(vhash64[7] != Htarg) // don't show message if it is equal but fails fulltest + applog(LOG_WARNING, "GPU #%d: result does not validate on CPU!", thr_id); + } + } + pdata[19] = foundNonce[0]; + if (opt_benchmark) applog(LOG_INFO, "GPU #%d Found nonce % 08x", thr_id, foundNonce[0]); + return res; + } + else + { + if (vhash64[7] != Htarg) // don't show message if it is equal but fails fulltest + applog(LOG_WARNING, "GPU #%d: result does not validate on CPU!", thr_id); + } + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); + + *hashes_done = pdata[19] - first_nonce ; + return 0; +} diff --git a/miner.h b/miner.h index f84a3b9a11..12fc2f1fab 100644 --- a/miner.h +++ b/miner.h @@ -1,13 +1,15 @@ #ifndef __MINER_H__ #define __MINER_H__ -#ifdef __cplusplus -extern "C" { +#ifndef WIN32 +#include "ccminer-config.h" +#else +#include "ccminer-config-win.h" #endif -#include "cpuminer-config.h" - +#ifndef __cplusplus #include +#endif #include #include #include @@ -15,6 +17,9 @@ extern "C" { #include #ifdef WIN32 +#ifndef __cplusplus +#define inline __inline +#endif #define snprintf(...) 
_snprintf(__VA_ARGS__) #define strdup(x) _strdup(x) #define strncasecmp(x,y,z) _strnicmp(x,y,z) @@ -54,19 +59,10 @@ void *alloca (size_t); #include "compat.h" -#ifdef __INTELLISENSE__ -/* should be in stdint.h but... */ -typedef __int64 int64_t; -typedef unsigned __int64 uint64_t; -typedef __int32 int32_t; -typedef unsigned __int32 uint32_t; -typedef __int16 int16_t; -typedef unsigned __int16 uint16_t; -typedef __int16 int8_t; -typedef unsigned __int16 uint8_t; - -typedef unsigned __int32 time_t; -typedef char * va_list; +#ifdef _MSC_VER +#define THREAD __declspec(thread) +#else +#define THREAD __thread #endif #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 0 @@ -130,28 +126,31 @@ static inline bool is_windows(void) { #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) #define WANT_BUILTIN_BSWAP -#else -#define bswap_32(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \ - | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) -#define bswap_64(x) (((uint64_t) bswap_32((uint32_t)((x) & 0xffffffffu)) << 32) \ - | (uint64_t) bswap_32((uint32_t)((x) >> 32))) #endif -static inline uint32_t swab32(uint32_t v) +static inline uint32_t swab32(uint32_t x) { #ifdef WANT_BUILTIN_BSWAP - return __builtin_bswap32(v); + return __builtin_bswap32(x); #else - return bswap_32(v); +#ifdef _MSC_VER + return _byteswap_ulong(x); +#else + return ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)); +#endif #endif } -static inline uint64_t swab64(uint64_t v) +static inline uint64_t swab64(uint64_t x) { #ifdef WANT_BUILTIN_BSWAP - return __builtin_bswap64(v); + return __builtin_bswap64(x); #else - return bswap_64(v); +#ifdef _MSC_VER + return _byteswap_uint64(x); +#else + return (((uint64_t)bswap_32((uint32_t)((x)& 0xffffffffu)) << 32) | (uint64_t)bswap_32((uint32_t)((x) >> 32))); +#endif #endif } @@ -177,9 +176,7 @@ static inline void swab256(void *dest_p, const void *src_p) #if !HAVE_DECL_BE32DEC 
static inline uint32_t be32dec(const void *pp) { - const uint8_t *p = (uint8_t const *)pp; - return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) + - ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); + return swab32(*((uint32_t*)pp)); } #endif @@ -195,11 +192,7 @@ static inline uint32_t le32dec(const void *pp) #if !HAVE_DECL_BE32ENC static inline void be32enc(void *pp, uint32_t x) { - uint8_t *p = (uint8_t *)pp; - p[3] = x & 0xff; - p[2] = (x >> 8) & 0xff; - p[1] = (x >> 16) & 0xff; - p[0] = (x >> 24) & 0xff; + *((uint32_t*)pp) = swab32(x); } #endif @@ -260,134 +253,142 @@ void aligned_free(void *ptr); #define USER_AGENT PACKAGE_NAME "/" PACKAGE_VERSION -void sha256_init(uint32_t *state); -void sha256_transform(uint32_t *state, const uint32_t *block, int swap); -void sha256d(unsigned char *hash, const unsigned char *data, int len); - -#if defined(__ARM_NEON__) || defined(__i386__) || defined(__x86_64__) -#define HAVE_SHA256_4WAY 0 -int sha256_use_4way(); -void sha256_init_4way(uint32_t *state); -void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap); +#ifdef __cplusplus +extern "C" { #endif -#if defined(__x86_64__) && defined(USE_AVX2) -#define HAVE_SHA256_8WAY 0 -int sha256_use_8way(); -void sha256_init_8way(uint32_t *state); -void sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap); + void sha256_init(uint32_t *state); + void sha256_transform(uint32_t *state, const uint32_t *block, int swap); + void sha256d(unsigned char *hash, const unsigned char *data, int len); + +#ifdef __cplusplus +} #endif -extern int scanhash_sha256d(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); +struct work_restart +{ + volatile unsigned long restart; + char padding[128 - sizeof(unsigned long)]; +}; +extern struct work_restart *work_restart; -extern unsigned char *scrypt_buffer_alloc(); +bool fulltest(const uint32_t *hash, const uint32_t *target); extern int scanhash_deep(int thr_id, 
uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_doom(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_fugue256(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_groestlcoin(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); -extern int scanhash_heavy(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done, uint32_t maxvote, int blocklen); +extern int scanhash_c11(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_keccak256(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_myriad(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_jackpot(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_quark(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); -extern int scanhash_anime(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); 
extern int scanhash_blake256(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done, int8_t blakerounds); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done, int8_t blakerounds); extern int scanhash_fresh(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); -extern int scanhash_lyra2(int thr_id, uint32_t *pdata, +extern int scanhash_lyra2v2(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *hashes_done); extern int scanhash_nist5(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_pentablake(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_qubit(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); + -extern int scanhash_scrypt(int thr_id, uint32_t *pdata, - unsigned char *scratchbuf, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_skeincoin(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_s3(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_whc(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); + +extern int scanhash_whirlpoolx(int thr_id, 
uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_x11(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_x13(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_x14(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_x15(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_x17(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_bitcoin(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); + +extern int scanhash_neoscrypt(bool stratum, int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); +extern int scanhash_sia(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); /* api related */ void *api_thread(void *userdata); void api_set_throughput(int thr_id, uint32_t throughput); -struct cgpu_info { +struct cgpu_info +{ uint8_t gpu_id; uint8_t thr_id; int accepted; @@ -403,7 +404,7 @@ struct cgpu_info { int gpu_clock; int gpu_memclock; size_t gpu_mem; - uint32_t gpu_usage; + uint32_t gpu_power; double gpu_vddc; int16_t gpu_pstate; int16_t gpu_bus; @@ -459,23 +460,24 @@ struct thr_info { struct cgpu_info gpu; }; -struct 
work_restart { - volatile unsigned long restart; - char padding[128 - sizeof(unsigned long)]; -}; - +extern int cuda_num_devices(); +extern int cuda_version(); +extern int cuda_gpu_clocks(struct cgpu_info *gpu); +extern bool opt_verify; extern bool opt_benchmark; extern bool opt_debug; extern bool opt_quiet; extern bool opt_protocol; extern bool opt_tracegpu; extern int opt_n_threads; +extern int num_cpus; extern int active_gpus; extern int opt_timeout; extern bool want_longpoll; extern bool have_longpoll; extern bool want_stratum; extern bool have_stratum; +extern bool opt_stratum_stats; extern char *opt_cert; extern char *opt_proxy; extern long opt_proxy_type; @@ -486,14 +488,12 @@ extern struct thr_info *thr_info; extern int longpoll_thr_id; extern int stratum_thr_id; extern int api_thr_id; -extern struct work_restart *work_restart; extern bool opt_trust_pool; -extern uint16_t opt_vote; extern uint64_t global_hashrate; extern double global_diff; -#define MAX_GPUS 16 +#define MAX_GPUS 8 extern char* device_name[MAX_GPUS]; extern int device_map[MAX_GPUS]; extern long device_sm[MAX_GPUS]; @@ -530,18 +530,16 @@ extern uint32_t gpus_intensity[MAX_GPUS]; #define CL_WHT "\x1B[01;37m" /* white */ -extern void applog(int prio, const char *fmt, ...); -extern json_t *json_rpc_call(CURL *curl, const char *url, const char *userpass, - const char *rpc_req, bool, bool, int *); -extern void cbin2hex(char *out, const char *in, size_t len); -extern char *bin2hex(const unsigned char *in, size_t len); -extern bool hex2bin(unsigned char *p, const char *hexstr, size_t len); -extern int timeval_subtract(struct timeval *result, struct timeval *x, - struct timeval *y); -extern bool fulltest(const uint32_t *hash, const uint32_t *target); -extern void diff_to_target(uint32_t *target, double diff); -extern void get_currentalgo(char* buf, int sz); -extern uint32_t device_intensity(int thr_id, const char *func, uint32_t defcount); +void format_hashrate(double hashrate, char *output); +void 
applog(int prio, const char *fmt, ...); +json_t *json_rpc_call(CURL *curl, const char *url, const char *userpass, const char *rpc_req, bool, bool, int *); +void cbin2hex(char *out, const char *in, size_t len); +char *bin2hex(const unsigned char *in, size_t len); +bool hex2bin(unsigned char *p, const char *hexstr, size_t len); +int timeval_subtract(struct timeval *result, struct timeval *x, struct timeval *y); +void diff_to_target(uint32_t *target, double diff); +void get_currentalgo(char* buf, int sz); +uint32_t device_intensity(int thr_id, const char *func, uint32_t defcount); struct stratum_job { char *job_id; @@ -565,7 +563,6 @@ struct stratum_ctx { CURL *curl; char *curl_url; - char curl_err_str[CURL_ERROR_SIZE]; curl_socket_t sock; size_t sockbuf_size; char *sockbuf; @@ -589,9 +586,10 @@ struct stratum_ctx { }; struct work { - uint32_t data[32]; + uint32_t data[64]; + size_t datasize; + uint32_t midstate[8]; uint32_t target[8]; - uint32_t maxvote; char job_id[128]; size_t xnonce2_len; @@ -609,13 +607,49 @@ struct work { uint32_t scanned_to; }; +enum sha_algos +{ + ALGO_BITCOIN, + ALGO_BLAKE, + ALGO_BLAKECOIN, + ALGO_C11, + ALGO_DEEP, + ALGO_DMD_GR, + ALGO_DOOM, + ALGO_FRESH, + ALGO_FUGUE256, /* Fugue256 */ + ALGO_GROESTL, + ALGO_KECCAK, + ALGO_JACKPOT, + ALGO_LUFFA_DOOM, + ALGO_LYRA2v2, + ALGO_MYR_GR, + ALGO_NIST5, + ALGO_PENTABLAKE, + ALGO_QUARK, + ALGO_QUBIT, + ALGO_SIA, + ALGO_SKEIN, + ALGO_S3, + ALGO_SPREADX11, + ALGO_WHC, + ALGO_WHCX, + ALGO_X11, + ALGO_X13, + ALGO_X14, + ALGO_X15, + ALGO_X17, + ALGO_VANILLA, + ALGO_NEO +}; + bool stratum_socket_full(struct stratum_ctx *sctx, int timeout); bool stratum_send_line(struct stratum_ctx *sctx, char *s); char *stratum_recv_line(struct stratum_ctx *sctx); bool stratum_connect(struct stratum_ctx *sctx, const char *url); void stratum_disconnect(struct stratum_ctx *sctx); bool stratum_subscribe(struct stratum_ctx *sctx); -bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass); +bool 
stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass,bool extranonce); bool stratum_handle_method(struct stratum_ctx *sctx, const char *s); void hashlog_remember_submit(struct work* work, uint32_t nonce); @@ -647,6 +681,7 @@ extern void tq_freeze(struct thread_q *tq); extern void tq_thaw(struct thread_q *tq); void proper_exit(int reason); +void restart_threads(void); size_t time2str(char* buf, time_t timer); char* atime2str(time_t timer); @@ -655,22 +690,21 @@ void applog_hash(unsigned char *hash); void applog_compare_hash(unsigned char *hash, unsigned char *hash2); void print_hash_tests(void); -void animehash(void *state, const void *input); + void blake256hash(void *output, const void *input, int8_t rounds); void deephash(void *state, const void *input); void doomhash(void *state, const void *input); void fresh_hash(void *state, const void *input); void fugue256_hash(unsigned char* output, const unsigned char* input, int len); -void heavycoin_hash(unsigned char* output, const unsigned char* input, int len); void keccak256_hash(void *state, const void *input); unsigned int jackpothash(void *state, const void *input); void groestlhash(void *state, const void *input); -void lyra2_hash(void *state, const void *input); void myriadhash(void *state, const void *input); void nist5hash(void *state, const void *input); void pentablakehash(void *output, const void *input); void quarkhash(void *state, const void *input); void qubithash(void *state, const void *input); +void skeincoinhash(void *output, const void *input); void s3hash(void *output, const void *input); void wcoinhash(void *state, const void *input); void x11hash(void *output, const void *input); @@ -679,8 +713,4 @@ void x14hash(void *output, const void *input); void x15hash(void *output, const void *input); void x17hash(void *output, const void *input); -#ifdef __cplusplus -} -#endif - #endif /* __MINER_H__ */ diff --git a/myriadgroestl.cpp b/myriadgroestl.cpp index 
76ed46732c..39774c3a4c 100644 --- a/myriadgroestl.cpp +++ b/myriadgroestl.cpp @@ -1,25 +1,23 @@ #include +#ifdef __cplusplus +#include +#else #include +#endif #include -#include "uint256.h" #include "sph/sph_groestl.h" #include "miner.h" #include - -static bool init[MAX_GPUS] = { 0 }; -static uint32_t *h_found[MAX_GPUS]; +extern bool stop_mining; +extern volatile bool mining_has_stopped[MAX_GPUS]; void myriadgroestl_cpu_init(int thr_id, uint32_t threads); void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn); void myriadgroestl_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *nounce); -#define SWAP32(x) \ - ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ - (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) - -extern "C" void myriadhash(void *state, const void *input) +void myriadhash(void *state, const void *input) { uint32_t hashA[16], hashB[16]; sph_groestl512_context ctx_groestl; @@ -36,25 +34,38 @@ extern "C" void myriadhash(void *state, const void *input) memcpy(state, hashB, 32); } -extern "C" int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) +extern int scanhash_myriad(int thr_id, uint32_t *pdata, uint32_t *ptarget, + uint32_t max_nonce, uint32_t *hashes_done) { - uint32_t start_nonce = pdata[19]++; - uint32_t throughput = device_intensity(thr_id, __func__, 1 << 17); - throughput = min(throughput, max_nonce - start_nonce); + static THREAD uint32_t *h_found = nullptr; + + uint32_t start_nonce = pdata[19]; + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, 1 << 19); + uint32_t throughput = min(throughputmax, max_nonce - start_nonce) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; + ptarget[7] = 0x0000ff; // init - if(!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { #if BIG_DEBUG #else - myriadgroestl_cpu_init(thr_id, throughput); +#if defined WIN32 && 
!defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (16 * sizeof(uint32_t))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + proper_exit(2); + } +#endif + myriadgroestl_cpu_init(thr_id, throughputmax); #endif - cudaMallocHost(&(h_found[thr_id]), 4 * sizeof(uint32_t)); - init[thr_id] = true; + cudaMallocHost(&h_found, 4 * sizeof(uint32_t)); + mining_has_stopped[thr_id] = false; + init = true; } uint32_t endiandata[32]; @@ -67,39 +78,64 @@ extern "C" int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptar do { const uint32_t Htarg = ptarget[7]; - myriadgroestl_cpu_hash(thr_id, throughput, pdata[19], h_found[thr_id]); + myriadgroestl_cpu_hash(thr_id, throughput, pdata[19], h_found); - if (h_found[thr_id][0] < 0xffffffff) + if(stop_mining) {mining_has_stopped[thr_id] = true; pthread_exit(nullptr);} + if(h_found[0] != 0xffffffff) { - uint32_t tmpHash[8]; - endiandata[19] = SWAP32(h_found[thr_id][0]); - myriadhash(tmpHash, endiandata); - if (tmpHash[7] <= Htarg && fulltest(tmpHash, ptarget)) + const uint32_t Htarg = ptarget[7]; + uint32_t vhash64[8]={0}; + if(opt_verify){ be32enc(&endiandata[19], h_found[0]); + myriadhash(vhash64, endiandata); + + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { int res = 1; *hashes_done = pdata[19] - start_nonce + throughput; - if (h_found[thr_id][1] != 0xffffffff) + if (h_found[1] != 0xffffffff) { - if (opt_benchmark) applog(LOG_INFO, "found second nounce %08x", thr_id, h_found[thr_id][1]); - pdata[21] = h_found[thr_id][1]; - res++; + if(opt_verify){ be32enc(&endiandata[19], h_found[1]); + myriadhash(vhash64, endiandata); + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + + pdata[21] = h_found[1]; + res++; + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d Found second nounce %08x", device_map[thr_id], h_found[1]); + } + else + { + if (vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", 
device_map[thr_id], h_found[1]); + } + } + } - pdata[19] = h_found[thr_id][0]; + pdata[19] = h_found[0]; if (opt_benchmark) - applog(LOG_INFO, "found nounce %08x", thr_id, h_found[thr_id][0]); + applog(LOG_INFO, "GPU #%d Found nounce %08x", device_map[thr_id], h_found[0]); return res; } else { - if (tmpHash[7] != Htarg) // don't show message if it is equal but fails fulltest - applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", thr_id, h_found[thr_id][0]); + if (vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[0]); + } } } - pdata[19] += throughput; + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + applog(LOG_ERR, "GPU #%d: %s", device_map[thr_id], cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - start_nonce + 1; + *hashes_done = pdata[19] - start_nonce; return 0; } diff --git a/neoscrypt/cuda_neoscrypt.cu b/neoscrypt/cuda_neoscrypt.cu new file mode 100644 index 0000000000..273a6b7e72 --- /dev/null +++ b/neoscrypt/cuda_neoscrypt.cu @@ -0,0 +1,1498 @@ +// originally from djm34 (https://github.com/djm34/ccminer-sp-neoscrypt/) + +#include +#include +#include "cuda_helper.h" +#include "cuda_vector.h" + +#define vectype uintx64bis +#define vectypeS uint28 + +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif + +#ifdef _MSC_VER +#define THREAD __declspec(thread) +#else +#define THREAD __thread +#endif + +static THREAD cudaStream_t stream[2]; + +__device__ __align__(16) vectypeS * W; +__device__ __align__(16) vectypeS * W2; +__device__ __align__(16) vectypeS* Tr; +__device__ __align__(16) vectypeS* Tr2; +__device__ __align__(16) vectypeS* Input; +__device__ __align__(16) vectypeS* B2; + +static uint32_t *d_NNonce[MAX_GPUS]; + 
+__constant__ uint32_t pTarget[8]; +__constant__ uint32_t key_init[16]; +__constant__ uint32_t input_init[16]; +__constant__ uint32_t c_data[64]; + +#define SALSA_SMALL_UNROLL 1 +#define CHACHA_SMALL_UNROLL 1 +#define BLAKE2S_BLOCK_SIZE 64U +#define BLAKE2S_OUT_SIZE 32U +#define BLAKE2S_KEY_SIZE 32U +#define BLOCK_SIZE 64U +#define FASTKDF_BUFFER_SIZE 256U +#define PASSWORD_LEN 80U +/// constants /// + +static const __constant__ uint8 BLAKE2S_IV_Vec = +{ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + + +static const uint8 BLAKE2S_IV_Vechost = +{ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +static const uint32_t BLAKE2S_SIGMA_host[10][16] = +{ + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, + {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, + {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, + {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, + {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, + {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, + {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, +}; + +__constant__ uint32_t BLAKE2S_SIGMA[10][16] = +{ + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, + {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, + {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, + {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, + {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, + {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, + {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, +}; + 
+#define SALSA(a,b,c,d) { \ + b^=rotate(a+d, 7); \ + c^=rotate(b+a, 9); \ + d^=rotate(c+b, 13); \ + a^=rotate(d+c, 18); \ +} + +#define SALSA_CORE(state) { \ +\ +SALSA(state.s0,state.s4,state.s8,state.sc); \ +SALSA(state.s5,state.s9,state.sd,state.s1); \ +SALSA(state.sa,state.se,state.s2,state.s6); \ +SALSA(state.sf,state.s3,state.s7,state.sb); \ +SALSA(state.s0,state.s1,state.s2,state.s3); \ +SALSA(state.s5,state.s6,state.s7,state.s4); \ +SALSA(state.sa,state.sb,state.s8,state.s9); \ +SALSA(state.sf,state.sc,state.sd,state.se); \ + } + +static __forceinline__ __device__ void shift256R4(uint32_t * ret, const uint8 &vec4, uint32_t shift2) +{ + uint32_t shift = 32 - shift2; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[0]) : "r"(0), "r"(vec4.s0), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[1]) : "r"(vec4.s0), "r"(vec4.s1), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[2]) : "r"(vec4.s1), "r"(vec4.s2), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[3]) : "r"(vec4.s2), "r"(vec4.s3), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[4]) : "r"(vec4.s3), "r"(vec4.s4), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[5]) : "r"(vec4.s4), "r"(vec4.s5), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[6]) : "r"(vec4.s5), "r"(vec4.s6), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[7]) : "r"(vec4.s6), "r"(vec4.s7), "r"(shift)); + asm("shr.b32 %0, %1, %2;" : "=r"(ret[8]) : "r"(vec4.s7), "r"(shift)); +} + +/*static __device__ __inline__ void chacha_step(uint32_t &a, uint32_t &b, uint32_t &c, uint32_t &d) +{ + asm("{\n\t" + "add.u32 %0,%0,%1; \n\t" + "xor.b32 %3,%3,%0; \n\t" + "prmt.b32 %3, %3, 0, 0x1032; \n\t" + "add.u32 %2,%2,%3; \n\t" + "xor.b32 %1,%1,%2; \n\t" + "shf.l.wrap.b32 %1, %1, %1, 12; \n\t" + "add.u32 %0,%0,%1; \n\t" + "xor.b32 %3,%3,%0; \n\t" + "prmt.b32 %3, %3, 0, 0x2103; \n\t" + "add.u32 %2,%2,%3; \n\t" + "xor.b32 %1,%1,%2; \n\t" + 
"shf.l.wrap.b32 %1, %1, %1, 7; \n\t}" + : "+r"(a), "+r"(b), "+r"(c), "+r"(d)); +} +*/ +#if __CUDA_ARCH__ >=500 + +#define CHACHA_STEP(a,b,c,d) { \ +a += b; d = __byte_perm(d^a,0,0x1032); \ +c += d; b = rotate(b^c, 12); \ +a += b; d = __byte_perm(d^a,0,0x2103); \ +c += d; b = rotate(b^c, 7); \ + } + +//#define CHACHA_STEP(a,b,c,d) chacha_step(a,b,c,d) +#else +#define CHACHA_STEP(a,b,c,d) { \ +a += b; d = rotate(d^a,16); \ +c += d; b = rotate(b^c, 12); \ +a += b; d = rotate(d^a,8); \ +c += d; b = rotate(b^c, 7); \ + } +#endif + +#define CHACHA_CORE_PARALLEL(state) { \ + \ + CHACHA_STEP(state.lo.s0, state.lo.s4, state.hi.s0, state.hi.s4); \ + CHACHA_STEP(state.lo.s1, state.lo.s5, state.hi.s1, state.hi.s5); \ + CHACHA_STEP(state.lo.s2, state.lo.s6, state.hi.s2, state.hi.s6); \ + CHACHA_STEP(state.lo.s3, state.lo.s7, state.hi.s3, state.hi.s7); \ + CHACHA_STEP(state.lo.s0, state.lo.s5, state.hi.s2, state.hi.s7); \ + CHACHA_STEP(state.lo.s1, state.lo.s6, state.hi.s3, state.hi.s4); \ + CHACHA_STEP(state.lo.s2, state.lo.s7, state.hi.s0, state.hi.s5); \ + CHACHA_STEP(state.lo.s3, state.lo.s4, state.hi.s1, state.hi.s6); \ +\ + } + +#define CHACHA_CORE_PARALLEL2(i0,state) { \ + \ + CHACHA_STEP(state[2*i0].x.x, state[2*i0].z.x, state[2*i0+1].x.x, state[2*i0+1].z.x); \ + CHACHA_STEP(state[2*i0].x.y, state[2*i0].z.y, state[2*i0+1].x.y, state[2*i0+1].z.y); \ + CHACHA_STEP(state[2*i0].y.x, state[2*i0].w.x, state[2*i0+1].y.x, state[2*i0+1].w.x); \ + CHACHA_STEP(state[2*i0].y.y, state[2*i0].w.y, state[2*i0+1].y.y, state[2*i0+1].w.y); \ + CHACHA_STEP(state[2*i0].x.x, state[2*i0].z.y, state[2*i0+1].y.x, state[2*i0+1].w.y); \ + CHACHA_STEP(state[2*i0].x.y, state[2*i0].w.x, state[2*i0+1].y.y, state[2*i0+1].z.x); \ + CHACHA_STEP(state[2*i0].y.x, state[2*i0].w.y, state[2*i0+1].x.x, state[2*i0+1].z.y); \ + CHACHA_STEP(state[2*i0].y.y, state[2*i0].z.x, state[2*i0+1].x.y, state[2*i0+1].w.x); \ +\ + } + +#define BLAKE2S_BLOCK_SIZE 64U +#define BLAKE2S_OUT_SIZE 32U +#define BLAKE2S_KEY_SIZE 32U 
+ +#if __CUDA_ARCH__ >= 500 +#define BLAKE_G(idx0, idx1, a, b, c, d, key) { \ + idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ + a += b; d = __byte_perm(d^a,0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \ + a += b; d = __byte_perm(d^a,0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} +#else +#define BLAKE_G(idx0, idx1, a, b, c, d, key) { \ + idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ + a += b; d = rotate(d ^ a,16); \ + c += d; b = rotateR(b ^ c, 12); \ + idx = BLAKE2S_SIGMA[idx0][idx1 + 1]; a += key[idx]; \ + a += b; d = rotateR(d ^ a,8); \ + c += d; b = rotateR(b ^ c, 7); \ +} +#endif + +#if __CUDA_ARCH__ >= 500 + +#define BLAKE(a, b, c, d, key1,key2) { \ + a += b + key1; \ + d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += b + key2; \ + d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE(idx0, idx1, a, b, c, d, key) { \ + a += b + key[idx0]; \ + d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += b + key[idx1]; \ + d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE0(idx0, idx1, a, b, c, d, key) { \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE1(idx0, idx1, a, b, c, d, key) { \ + a += b + key[idx0]; \ + d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE2(idx0, idx1, a, b, c, d, key) { \ + a += b; d = __byte_perm(d^a,0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += b + key[idx1]; \ + d = __byte_perm(d^a,0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#else +#define BLAKE(a, b, c, d, key1,key2) { \ + \ + a += key1; \ + a += b; d = rotate(d^a,16); \ + c += d; b = rotateR(b^c, 12); \ + a += key2; \ + a += 
b; d = rotateR(d^a,8); \ + c += d; b = rotateR(b^c, 7); \ + } + +#define BLAKE_G_PRE(idx0, idx1, a, b, c, d, key) { \ + a += key[idx0]; \ + a += b; d = rotate(d^a,16); \ + c += d; b = rotateR(b^c, 12); \ + a += key[idx1]; \ + a += b; d = rotateR(d^a,8); \ + c += d; b = rotateR(b^c, 7); \ + } + +#define BLAKE_G_PRE0(idx0, idx1, a, b, c, d, key) { \ + \ + a += b; d = rotate(d^a,16); \ + c += d; b = rotateR(b^c, 12); \ + \ + a += b; d = rotateR(d^a,8); \ + c += d; b = rotateR(b^c, 7); \ + } + +#define BLAKE_G_PRE1(idx0, idx1, a, b, c, d, key) { \ + a += key[idx0]; \ + a += b; d = rotate(d^a,16); \ + c += d; b = rotateR(b^c, 12); \ + a += b; d = rotateR(d^a,8); \ + c += d; b = rotateR(b^c, 7); \ + } + +#define BLAKE_G_PRE2(idx0, idx1, a, b, c, d, key) { \ + \ + a += b; d = rotate(d^a,16); \ + c += d; b = rotateR(b^c, 12); \ + a += key[idx1]; \ + a += b; d = rotateR(d^a,8); \ + c += d; b = rotateR(b^c, 7); \ + } +#endif + +#define BLAKE_Ghost(idx0, idx1, a, b, c, d, key) { \ + idx = BLAKE2S_SIGMA_host[idx0][idx1]; \ + a += b + key[idx]; \ + d = ROTR32(d ^ a, 16); \ + c += d; b = ROTR32(b ^ c, 12); \ + idx = BLAKE2S_SIGMA_host[idx0][idx1 + 1]; \ + a += b + key[idx]; \ + d = ROTR32(d ^ a, 8); \ + c += d; b = ROTR32(b ^ c, 7); \ +} +#if __CUDA_ARCH__ < 500 +static __forceinline__ __device__ void Blake2S(uint32_t * __restrict__ out, const uint32_t* __restrict__ inout, const uint32_t * __restrict__ TheKey) +{ + uint16 V; + uint32_t idx; + uint8 tmpblock; + + V.hi = BLAKE2S_IV_Vec; + V.lo = BLAKE2S_IV_Vec; + V.lo.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; + + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(8, 9, V.lo.s0, V.lo.s5, 
V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE0(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE0(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE1(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE1(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(15, 8, V.lo.s3, V.lo.s4, 
V.hi.s1, V.hi.s6, TheKey); + + + + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + BLAKE_G_PRE2(9, 0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(5, 7, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(10, 15, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(14, 1, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(11, 12, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 8, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(3, 13, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + BLAKE_G_PRE1(2, 12, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 10, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE1(0, 11, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(8, 3, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(4, 13, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(7, 5, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(15, 14, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(1, 9, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + BLAKE_G_PRE2(12, 5, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(1, 15, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 13, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(4, 10, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 7, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(6, 3, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(9, 2, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(8, 11, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + + // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + BLAKE_G_PRE0(13, 11, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(7, 14, V.lo.s1, V.lo.s5, 
V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(12, 1, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(3, 9, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(5, 0, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(15, 4, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 6, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 10, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + BLAKE_G_PRE1(6, 15, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE0(14, 9, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(11, 3, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(0, 8, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(12, 2, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(13, 7, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(1, 4, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(10, 5, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + BLAKE_G_PRE2(10, 2, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 4, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(7, 6, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(1, 5, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(15, 11, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(9, 14, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(3, 12, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(13, 0, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + V.lo ^= V.hi ^ tmpblock; + + V.hi = BLAKE2S_IV_Vec; + tmpblock = V.lo; + + V.hi.s4 ^= 128; + V.hi.s6 = ~V.hi.s6; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + 
BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, 
V.hi.s5, inout); + BLAKE_G_PRE(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + for(int x = 4; x < 10; ++x) + { + BLAKE_G(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + } + + V.lo ^= V.hi ^ tmpblock; + + ((uint8*)out)[0] = V.lo; +} +#else +static __forceinline__ __device__ void Blake2S_v2(uint32_t * __restrict__ out, const uint32_t* __restrict__ inout, const uint32_t * __restrict__ TheKey) +{ + uint16 V; + uint8 tmpblock; + + V.hi = BLAKE2S_IV_Vec; + V.lo = BLAKE2S_IV_Vec; + V.lo.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE0(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(1, 12, 
V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE0(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE1(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE1(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + BLAKE_G_PRE2(9, 0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(5, 7, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(10, 15, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(14, 1, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(11, 12, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 8, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(3, 13, V.lo.s3, 
V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + BLAKE_G_PRE1(2, 12, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 10, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE1(0, 11, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(8, 3, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(4, 13, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(7, 5, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(15, 14, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(1, 9, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + BLAKE_G_PRE2(12, 5, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(1, 15, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 13, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(4, 10, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 7, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(6, 3, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(9, 2, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(8, 11, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + + // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + BLAKE_G_PRE0(13, 11, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(7, 14, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(12, 1, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(3, 9, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(5, 0, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(15, 4, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 6, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 10, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + BLAKE_G_PRE1(6, 15, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE0(14, 9, V.lo.s1, 
V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(11, 3, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(0, 8, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(12, 2, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(13, 7, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(1, 4, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(10, 5, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + BLAKE_G_PRE2(10, 2, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 4, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(7, 6, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(1, 5, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(15, 11, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(9, 14, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(3, 12, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(13, 0, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + V.lo ^= V.hi; + V.lo ^= tmpblock; + + V.hi = BLAKE2S_IV_Vec; + tmpblock = V.lo; + + V.hi.s4 ^= 128; + V.hi.s6 = ~V.hi.s6; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, 
inout); + BLAKE_G_PRE(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + //#pragma unroll + + // 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10, + // 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5, + // 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0, + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[9], inout[0]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[5], inout[7]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[2], inout[4]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[10], inout[15]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, 
V.hi.s7, inout[14], inout[1]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[11], inout[12]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[6], inout[8]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[3], inout[13]); + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[2], inout[12]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[6], inout[10]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[0], inout[11]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[8], inout[3]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[4], inout[13]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[7], inout[5]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[15], inout[14]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[1], inout[9]); + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[12], inout[5]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[1], inout[15]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[14], inout[13]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[4], inout[10]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[0], inout[7]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[6], inout[3]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[9], inout[2]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[8], inout[11]); + + // 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10, + // 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5, + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[13], inout[11]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[7], inout[14]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[12], inout[1]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[3], inout[9]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[5], inout[0]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[15], inout[4]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[8], inout[6]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[2], inout[10]); + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[6], 
inout[15]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[14], inout[9]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[11], inout[3]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[0], inout[8]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[12], inout[2]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[13], inout[7]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[1], inout[4]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[10], inout[5]); + // 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0, + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[10], inout[2]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[8], inout[4]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[7], inout[6]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[1], inout[5]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[15], inout[11]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[9], inout[14]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[3], inout[12]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[13], inout[0]); + + V.lo ^= V.hi; + V.lo ^= tmpblock; + + ((uint8*)out)[0] = V.lo; +} +#endif + +static __forceinline__ __device__ uint16 salsa_small_scalar_rnd(const uint16 &X) +{ + uint16 state = X; + +#pragma unroll 1 + for(int i = 0; i < 10; ++i) + { + SALSA_CORE(state); + } + + return(X + state); +} + +static __device__ __forceinline__ uint16 chacha_small_parallel_rnd(const uint16 &X) +{ + uint16 st = X; +#pragma nounroll + for(int i = 0; i < 10; ++i) + { + CHACHA_CORE_PARALLEL(st); + } + return(X + st); +} + +static __device__ __forceinline__ void neoscrypt_chacha(uint16 *XV) +{ + uint16 temp; + + XV[0] = chacha_small_parallel_rnd(XV[0] ^ XV[3]); + temp = chacha_small_parallel_rnd(XV[1] ^ XV[0]); + XV[1] = chacha_small_parallel_rnd(XV[2] ^ temp); + XV[3] = chacha_small_parallel_rnd(XV[3] ^ XV[1]); + XV[2] = temp; +} + +static __device__ __forceinline__ void neoscrypt_salsa(uint16 *XV) +{ + uint16 temp; + + XV[0] = 
salsa_small_scalar_rnd(XV[0] ^ XV[3]); + temp = salsa_small_scalar_rnd(XV[1] ^ XV[0]); + XV[1] = salsa_small_scalar_rnd(XV[2] ^ temp); + XV[3] = salsa_small_scalar_rnd(XV[3] ^ XV[1]); + XV[2] = temp; +} + +static __forceinline__ __host__ void Blake2Shost(uint32_t * inout, const uint32_t * inkey) +{ + uint16 V; + uint32_t idx; + uint8 tmpblock; + + V.hi = BLAKE2S_IV_Vechost; + V.lo = BLAKE2S_IV_Vechost; + V.lo.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; + + for(int x = 0; x < 10; ++x) + { + BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inkey); + BLAKE_Ghost(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inkey); + BLAKE_Ghost(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inkey); + BLAKE_Ghost(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inkey); + BLAKE_Ghost(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inkey); + BLAKE_Ghost(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inkey); + BLAKE_Ghost(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inkey); + BLAKE_Ghost(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inkey); + } + + V.lo ^= V.hi; + V.lo ^= tmpblock; + + V.hi = BLAKE2S_IV_Vechost; + tmpblock = V.lo; + + V.hi.s4 ^= 128; + V.hi.s6 = ~V.hi.s6; + + for(int x = 0; x < 10; ++x) + { + BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_Ghost(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_Ghost(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_Ghost(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_Ghost(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_Ghost(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_Ghost(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_Ghost(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + } + + V.lo ^= V.hi ^ tmpblock; + + ((uint8*)inout)[0] = V.lo; +} + +#if __CUDA_ARCH__ < 500 +static __forceinline__ __device__ void fastkdf256_v1(uint32_t thread, const uint32_t nonce, const uint32_t 
* __restrict__ s_data) //, vectypeS * output) +{ + vectypeS __align__(16) output[8]; + uint8_t bufidx; + uchar4 bufhelper; + uint32_t B[64]; + uint32_t qbuf, rbuf, bitbuf; + uint32_t input[BLAKE2S_BLOCK_SIZE / 4]; + uint32_t key[BLAKE2S_BLOCK_SIZE / 4] = {0}; + + const uint32_t data18 = s_data[18]; + const uint32_t data20 = s_data[0]; + + ((uintx64*)(B))[0] = ((uintx64*)s_data)[0]; + ((uint32_t*)B)[19] = nonce; + ((uint32_t*)B)[39] = nonce; + ((uint32_t*)B)[59] = nonce; + + ((uint816*)input)[0] = ((uint816*)input_init)[0]; + ((uint48*)key)[0] = ((uint48*)key_init)[0]; + +#pragma unroll 1 + for(int i = 0; i < 31; ++i) + { + bufhelper = ((uchar4*)input)[0]; + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + bufhelper += ((uchar4*)input)[x]; + } + bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; + + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + uint32_t shifted[9]; + + shift256R4(shifted, ((uint8*)input)[0], bitbuf); + + //#pragma unroll + uint32_t temp[9]; + + for(int k = 0; k < 9; ++k) + { + uint32_t indice = (k + qbuf) & 0x0000003f; + temp[k] = B[indice] ^ shifted[k]; + B[indice] = temp[k]; + } + + uint32_t a = s_data[qbuf & 0x0000003f], b; + //#pragma unroll + for(int k = 0; k<16; k+=2) + { + b = s_data[(qbuf + k + 1) & 0x0000003f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = s_data[(qbuf + k + 2) & 0x0000003f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19 - qbuf % 20; + if(noncepos <= 16 && qbuf<60) + { + if(noncepos != 0) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if(noncepos != 16) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + for(int k = 0; k<8; k++) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[k]) : "r"(temp[k]), "r"(temp[k + 1]), "r"(bitbuf)); + + 
Blake2S(input, input, key); //yeah right... + } + bufhelper = ((uchar4*)input)[0]; + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + bufhelper += ((uchar4*)input)[x]; + } + bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; + + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + for(int i = 0; i<64; i++) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(((uint32_t*)output)[i]) : "r"(B[(qbuf + i) & 0x0000003f]), "r"(B[(qbuf + i + 1) & 0x0000003f]), "r"(bitbuf)); + + ((ulonglong4*)output)[0] ^= ((ulonglong4*)input)[0]; + + ((uintx64*)output)[0] ^= ((uintx64*)s_data)[0]; + ((uint32_t*)output)[19] ^= nonce; + ((uint32_t*)output)[39] ^= nonce; + ((uint32_t*)output)[59] ^= nonce; + + for(int i = 0; i<8; i++) + (Input + 8 * thread)[i] = output[i]; +} + +static __forceinline__ __device__ void fastkdf32_v1(uint32_t thread, const uint32_t nonce, const uint32_t * __restrict__ salt, const uint32_t *__restrict__ s_data, uint32_t &output) +{ + uint8_t bufidx; + uchar4 bufhelper; + uint32_t temp[9]; + +#define Bshift 16*thread + + uint32_t* const B0 = (uint32_t*)&B2[Bshift]; + const uint32_t cdata7 = s_data[7]; + const uint32_t data18 = s_data[18]; + const uint32_t data20 = s_data[0]; + + ((uintx64*)B0)[0] = ((uintx64*)salt)[0]; + uint32_t input[BLAKE2S_BLOCK_SIZE / 4]; uint32_t key[BLAKE2S_BLOCK_SIZE / 4] = {0}; + ((uint816*)input)[0] = ((uint816*)s_data)[0]; + ((uint48*)key)[0] = ((uint48*)salt)[0]; + uint32_t qbuf, rbuf, bitbuf; + +#pragma nounroll + for(int i = 0; i < 31; i++) + { + Blake2S(input, input, key); + + bufidx = 0; + bufhelper = ((uchar4*)input)[0]; + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + bufhelper += ((uchar4*)input)[x]; + } + bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + uint32_t shifted[9]; + + shift256R4(shifted, ((uint8*)input)[0], bitbuf); + + for(int k = 0; k < 9; k++) + { + temp[k] = B0[(k + qbuf) & 0x0000003f]; + } + + 
((uint28*)temp)[0] ^= ((uint28*)shifted)[0]; + temp[8] ^= shifted[8]; + + uint32_t a = s_data[qbuf & 0x0000003f], b; + //#pragma unroll + for(int k = 0; k<16; k += 2) + { + b = s_data[(qbuf + k + 1) & 0x0000003f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = s_data[(qbuf + k + 2) & 0x0000003f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19 - qbuf % 20; + if(noncepos <= 16 && qbuf<60) + { + if(noncepos != 0) asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if(noncepos != 16) asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + + for(int k = 0; k < 9; k++) + { + B0[(k + qbuf) & 0x0000003f] = temp[k]; + } + } + + Blake2S(input, input, key); + + bufidx = 0; + bufhelper = ((uchar4*)input)[0]; + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + bufhelper += ((uchar4*)input)[x]; + } + bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + for(int k = 7; k < 9; k++) + { + 
temp[k] = B0[(k + qbuf) & 0x0000003f]; + } + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(output) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + output ^= input[7] ^ cdata7; +} + +#else +static __forceinline__ __device__ void fastkdf256_v2(uint32_t thread, const uint32_t nonce, const uint32_t* __restrict__ s_data) //, vectypeS * output) +{ + vectypeS __align__(16) output[8]; + uint8_t bufidx; + uchar4 bufhelper; + const uint32_t data18 = s_data[18]; + const uint32_t data20 = s_data[0]; + uint32_t input[16]; + uint32_t key[16] = {0}; + uint32_t qbuf, rbuf, bitbuf; + +#define Bshift 16*thread + + uint32_t *const B = (uint32_t*)&B2[Bshift]; + ((uintx64*)(B))[0] = ((uintx64*)s_data)[0]; + + B[19] = nonce; + B[39] = nonce; + B[59] = nonce; + + ((ulonglong4*)input)[0] = ((ulonglong4*)input_init)[0]; + ((uint28*)key)[0] = ((uint28*)key_init)[0]; + + +#pragma unroll 1 + for(int i = 0; i < 31; ++i) + { + bufhelper = ((uchar4*)input)[0]; + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + bufhelper += ((uchar4*)input)[x]; + } + bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; + + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + uint32_t shifted[9]; + + shift256R4(shifted, ((uint8*)input)[0], bitbuf); + + uint32_t temp[9]; + + for(int k = 0; k < 9; ++k) + temp[k] = __ldg(&B[(k + qbuf) & 0x0000003f]) ^ shifted[k]; + + uint32_t a = s_data[qbuf & 0x0000003f], b; + //#pragma unroll + + for(int k = 0; k<16; k+=2) + { + b = s_data[(qbuf + k + 1) & 0x0000003f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = s_data[(qbuf + k + 2) & 0x0000003f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19 - qbuf % 20; + if(noncepos <= 16 && qbuf<60) + { + if(noncepos != 0) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if(noncepos != 16) + asm("shf.r.clamp.b32 %0, %1, %2, 
%3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + + Blake2S_v2(input, input, key); + + for(int k = 0; k < 9; k++) + B[(k + qbuf) & 0x0000003f] = temp[k]; + } + + bufhelper = ((uchar4*)input)[0]; + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + bufhelper += ((uchar4*)input)[x]; + } + bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; + + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + for(int i = 0; i<64; i++) + { + const uint32_t a = (qbuf + i) & 0x0000003f, b = (qbuf + i + 1) & 0x0000003f; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(((uint32_t*)output)[i]) : "r"(__ldg(&B[a])), "r"(__ldg(&B[b])), "r"(bitbuf)); + } + + output[0] ^= ((uint28*)input)[0]; + for(int i = 0; i<8; i++) + output[i] ^= ((uint28*)s_data)[i]; + // ((ulonglong16 *)output)[0] ^= ((ulonglong16*)s_data)[0]; + ((uint32_t*)output)[19] ^= nonce; + ((uint32_t*)output)[39] ^= nonce; + ((uint32_t*)output)[59] ^= nonce;; + ((ulonglong16 *)(Input + 8 * thread))[0] = ((ulonglong16*)output)[0]; +} + +static __forceinline__ __device__ void fastkdf32_v3(uint32_t thread, const uint32_t nonce, const uint32_t * __restrict__ salt, const uint32_t * __restrict__ s_data, 
uint32_t &output) +{ + uint32_t temp[9]; + uint8_t bufidx; + uchar4 bufhelper; + +#define Bshift 16*thread + + uint32_t*const B0 = (uint32_t*)&B2[Bshift]; + const uint32_t cdata7 = s_data[7]; + const uint32_t data18 = s_data[18]; + const uint32_t data20 = s_data[0]; + + ((uintx64*)B0)[0] = ((uintx64*)salt)[0]; + uint32_t input[BLAKE2S_BLOCK_SIZE / 4]; uint32_t key[BLAKE2S_BLOCK_SIZE / 4] = {0}; + ((uint816*)input)[0] = ((uint816*)s_data)[0]; + ((uint48*)key)[0] = ((uint48*)salt)[0]; + uint32_t qbuf, rbuf, bitbuf; + +#pragma nounroll + for(int i = 0; i < 31; i++) + { + Blake2S_v2(input, input, key); + + bufidx = 0; + bufhelper = ((uchar4*)input)[0]; + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + bufhelper += ((uchar4*)input)[x]; + } + bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + uint32_t shifted[9]; + + shift256R4(shifted, ((uint8*)input)[0], bitbuf); + + for(int k = 0; k < 9; k++) + { + temp[k] = __ldg(&B0[(k + qbuf) & 0x0000003f]); + } + + ((uint28*)temp)[0] ^= ((uint28*)shifted)[0]; + temp[8] ^= shifted[8]; + + uint32_t a = s_data[qbuf & 0x0000003f], b; + //#pragma unroll + for(int k = 0; k<16; k += 2) + { + b = s_data[(qbuf + k + 1) & 0x0000003f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = s_data[(qbuf + k + 2) & 0x0000003f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19 - qbuf % 20; + if(noncepos <= 16 && qbuf<60) + { + if(noncepos != 0) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if(noncepos != 16) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : 
"r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + + for(int k = 0; k < 9; k++) + { + B0[(k + qbuf) & 0x0000003f] = temp[k]; + } + } + + Blake2S_v2(input, input, key); + + bufidx = 0; + bufhelper = ((uchar4*)input)[0]; + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + bufhelper += ((uchar4*)input)[x]; + } + bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + temp[7] = __ldg(&B0[(qbuf + 7) & 0x0000003f]); + temp[8] = __ldg(&B0[(qbuf + 8) & 0x0000003f]); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(output) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + output ^= input[7] ^ cdata7; +} + +#endif + + +#define SHIFT 128 +#define TPB 128 +#define TPB2 64 + +__global__ __launch_bounds__(TPB2, 1) void neoscrypt_gpu_hash_start(int stratum, uint32_t threads, uint32_t startNonce) +{ + __shared__ uint32_t s_data[64]; + +#if TPB2<64 +#error TPB2 too low +#else +#if TPB2>64 + if(threadIdx.x<64) +#endif +#endif + s_data[threadIdx.x] = c_data[threadIdx.x]; + __syncthreads(); + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t nonce = startNonce + thread; + + const uint32_t ZNonce = (stratum) ? cuda_swab32(nonce) : nonce; //freaking morons !!! 
+ +#if __CUDA_ARCH__ < 500 + fastkdf256_v1(thread, ZNonce, s_data); +#else + fastkdf256_v2(thread, ZNonce, s_data); +#endif + +} + +__global__ __launch_bounds__(TPB, 1) void neoscrypt_gpu_hash_chacha1_stream1(uint32_t threads, uint32_t startNonce) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const int shift = SHIFT * 8 * thread; + const unsigned int shiftTr = 8 * thread; + + vectypeS __align__(16) X[8]; + for(int i = 0; i<8; i++) + X[i] = __ldg4(&(Input + shiftTr)[i]); + +#pragma nounroll + for(int i = 0; i < 128; ++i) + { + uint32_t offset = shift + i * 8; + for(int j = 0; j<8; j++) + (W + offset)[j] = X[j]; + neoscrypt_chacha((uint16*)X); + + } + for(int i = 0; i<8; i++) + (Tr + shiftTr)[i] = X[i]; +} + +__global__ __launch_bounds__(TPB, 1) void neoscrypt_gpu_hash_chacha2_stream1(uint32_t threads, uint32_t startNonce) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const int shift = SHIFT * 8 * thread; + const int shiftTr = 8 * thread; + + vectypeS __align__(16) X[8]; +#pragma unroll + for(int i = 0; i<8; i++) + X[i] = __ldg4(&(Tr + shiftTr)[i]); + +#pragma nounroll + for(int t = 0; t < 128; t++) + { + int idx = (X[6].x.x & 0x7F) << 3; + + for(int j = 0; j<8; j++) + X[j] ^= __ldg4(&(W + shift + idx)[j]); + neoscrypt_chacha((uint16*)X); + } +#pragma unroll + for(int i = 0; i<8; i++) + (Tr + shiftTr)[i] = X[i]; // best checked +} + +__global__ __launch_bounds__(TPB, 1) void neoscrypt_gpu_hash_salsa1_stream1(uint32_t threads, uint32_t startNonce) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const int shift = SHIFT * 8 * thread; + const int shiftTr = 8 * thread; + + vectypeS __align__(16) Z[8]; +#pragma unroll + for(int i = 0; i<8; i++) + Z[i] = __ldg4(&(Input + shiftTr)[i]); + +#pragma nounroll + for(int i = 0; i < 128; ++i) + { + for(int j = 0; j<8; j++) + (W2 + shift + i * 8)[j] = Z[j]; + neoscrypt_salsa((uint16*)Z); + } +#pragma unroll + for(int i = 0; i<8; i++) + (Tr2 + shiftTr)[i] = Z[i]; 
+} + +__global__ __launch_bounds__(TPB, 1) void neoscrypt_gpu_hash_salsa2_stream1(uint32_t threads, uint32_t startNonce) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const int shift = SHIFT * 8 * thread; + const int shiftTr = 8 * thread; + + vectypeS __align__(16) X[8]; +#pragma unroll + for(int i = 0; i<8; i++) + X[i] = __ldg4(&(Tr2 + shiftTr)[i]); + +#pragma nounroll + for(int t = 0; t < 128; t++) + { + int idx = (X[6].x.x & 0x7F) << 3; + + for(int j = 0; j<8; j++) + X[j] ^= __ldg4(&(W2 + shift + idx)[j]); + neoscrypt_salsa((uint16*)X); + } +#pragma unroll + for(int i = 0; i<8; i++) + (Tr2 + shiftTr)[i] = X[i]; // best checked +} + +__global__ __launch_bounds__(TPB2, 8) void neoscrypt_gpu_hash_ending(int stratum, uint32_t threads, uint32_t startNonce, uint32_t *nonceVector) +{ + __shared__ uint32_t s_data[64]; + +#if TPB2<64 +#error TPB2 too low +#else +#if TPB2>64 + if(threadIdx.x<64) +#endif +#endif + s_data[threadIdx.x] = c_data[threadIdx.x]; + __syncthreads(); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t nonce = startNonce + thread; + + const int shiftTr = 8 * thread; + vectypeS __align__(16) Z[8]; + uint32_t outbuf; + + const uint32_t ZNonce = (stratum) ? 
cuda_swab32(nonce) : nonce; + +#pragma unroll + for(int i = 0; i<8; i++) + Z[i] = (Tr2 + shiftTr)[i] ^ (Tr + shiftTr)[i]; + +#if __CUDA_ARCH__ < 500 + fastkdf32_v1(thread, ZNonce, (uint32_t*)Z, s_data, outbuf); +#else + fastkdf32_v3(thread, ZNonce, (uint32_t*)Z, s_data, outbuf); +#endif + if(outbuf <= pTarget[7]) + { + uint32_t tmp = atomicExch(nonceVector, nonce); + if(tmp != 0xffffffff) + nonceVector[1] = tmp; + } +} + +void neoscrypt_cpu_init_2stream(int thr_id, uint32_t threads) +{ + uint32_t *hash1; + uint32_t *hash2; // 2 streams + uint32_t *Trans1; + uint32_t *Trans2; // 2 streams + uint32_t *Trans3; // 2 streams + uint32_t *Bhash; + + CUDA_SAFE_CALL(cudaStreamCreate(&stream[0])); + CUDA_SAFE_CALL(cudaStreamCreate(&stream[1])); + + CUDA_SAFE_CALL(cudaMalloc(&d_NNonce[thr_id], 2 * sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMalloc(&hash1, 32 * 128 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&hash2, 32 * 128 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&Trans1, 32 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&Trans2, 32 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&Trans3, 32 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&Bhash, 128 * sizeof(uint32_t) * threads)); + + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(B2, &Bhash, sizeof(uint28*), 0, cudaMemcpyHostToDevice, stream[0])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(W, &hash1, sizeof(uint28*), 0, cudaMemcpyHostToDevice, stream[0])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(W2, &hash2, sizeof(uint28*), 0, cudaMemcpyHostToDevice, stream[0])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(Tr, &Trans1, sizeof(uint28*), 0, cudaMemcpyHostToDevice, stream[0])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(Tr2, &Trans2, sizeof(uint28*), 0, cudaMemcpyHostToDevice, stream[0])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(Input, &Trans3, sizeof(uint28*), 0, cudaMemcpyHostToDevice, stream[0])); +} + +__host__ void neoscrypt_cpu_hash_k4_2stream(bool stratum, int 
thr_id, uint32_t threads, uint32_t startNounce, uint32_t *result) +{ + const uint32_t threadsperblock = TPB; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + const uint32_t threadsperblock2 = TPB2; + dim3 grid2((threads + threadsperblock2 - 1) / threadsperblock2); + dim3 block2(threadsperblock2); + + neoscrypt_gpu_hash_start << > >(stratum, threads, startNounce); //fastkdf + + CUDA_SAFE_CALL(cudaStreamSynchronize(stream[0])); + + neoscrypt_gpu_hash_salsa1_stream1 << > >(threads, startNounce); //chacha + neoscrypt_gpu_hash_chacha1_stream1 << > >(threads, startNounce); //salsa + + neoscrypt_gpu_hash_salsa2_stream1 << > >(threads, startNounce); //chacha + neoscrypt_gpu_hash_chacha2_stream1 << > >(threads, startNounce); //salsa + + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + + neoscrypt_gpu_hash_ending << > >(stratum, threads, startNounce, d_NNonce[thr_id]); //fastkdf+end + + CUDA_SAFE_CALL(cudaMemcpy(result, d_NNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); +} + +__host__ void neoscrypt_setBlockTarget(int thr_id, uint32_t* pdata, const void *target) +{ + uint32_t PaddedMessage[64]; + uint32_t input[16], key[16] = {0}; + + for(int i = 0; i < 19; i++) + { + PaddedMessage[i ] = pdata[i]; + PaddedMessage[i + 20] = pdata[i]; + PaddedMessage[i + 40] = pdata[i]; + } + for(int i = 0; i<4; i++) + PaddedMessage[i + 60] = pdata[i]; + + PaddedMessage[19] = 0; + PaddedMessage[39] = 0; + PaddedMessage[59] = 0; + + for(int i = 0; i < 16; i++) + input[i] = pdata[i]; + for(int i = 0; i < 8; i++) + key[i] = pdata[i]; + + Blake2Shost(input, key); + + cudaMemcpyToSymbolAsync(pTarget, target, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream[1]); + cudaMemcpyToSymbolAsync(input_init, input, 16 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream[0]); + cudaMemcpyToSymbolAsync(key_init, key, 16 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream[1]); + cudaMemcpyToSymbolAsync(c_data, PaddedMessage, 64 * 
sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream[0]); + + CUDA_SAFE_CALL(cudaMemsetAsync(d_NNonce[thr_id], 0xff, 2 * sizeof(uint32_t), stream[1])); +} diff --git a/neoscrypt/cuda_neoscrypt_tpruvot.cu b/neoscrypt/cuda_neoscrypt_tpruvot.cu new file mode 100644 index 0000000000..6edd267729 --- /dev/null +++ b/neoscrypt/cuda_neoscrypt_tpruvot.cu @@ -0,0 +1,1559 @@ +// originally from djm34 - github.com/djm34/ccminer-sp-neoscrypt +// kernel code from Nanashi Meiyo-Meijin 1.7.6-r10 (July 2016) +// modified by tpruvot + +#include +#include +#include "cuda_helper.h" +#include "cuda_vector_uint2x4.cuh" +#include "cuda_vector_tpruvot.cuh" +#include "miner.h" + +#ifdef _MSC_VER +#define THREAD __declspec(thread) +#else +#define THREAD __thread +#endif + +#define rotate ROTL32 +#define rotateR ROTR32 +#define rotateL ROTL32 + +typedef uint48 uint4x2; + +static uint32_t* d_NNonce[MAX_GPUS]; + +__device__ uint2x4* W; +__device__ uint2x4* Tr; +__device__ uint2x4* Tr2; +__device__ uint2x4* Input; + +__constant__ uint32_t c_data[64]; +__constant__ uint32_t c_target[2]; +__constant__ uint32_t key_init[16]; +__constant__ uint32_t input_init[16]; + +static const __constant__ uint8 BLAKE2S_IV_Vec = { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +static const uint8 BLAKE2S_IV_Vechost = { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +static const uint32_t BLAKE2S_SIGMA_host[10][16] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, + {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, + {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, + {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, + {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, + {6, 15, 14, 9, 11, 3, 0, 
8, 12, 2, 13, 7, 1, 4, 10, 5}, + {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, +}; + +__constant__ uint32_t BLAKE2S_SIGMA[10][16] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, + {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, + {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, + {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, + {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, + {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, + {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, +}; + +#define BLOCK_SIZE 64U +#define BLAKE2S_BLOCK_SIZE 64U +#define BLAKE2S_OUT_SIZE 32U + +#define SALSA(a,b,c,d) { \ + t = rotateL(a + d, 7U); b ^= t; \ + t = rotateL(b + a, 9U); c ^= t; \ + t = rotateL(c + b, 13U); d ^= t; \ + t = rotateL(d + c, 18U); a ^= t; \ +} + +#define shf_r_clamp32(out,a,b,shift) \ + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(out) : "r"(a), "r"(b), "r"(shift)); + +#if __CUDA_ARCH__ >= 300 +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + return __shfl(a, b, c); +} + +__device__ __forceinline__ void WarpShuffle3(uint32_t &a1, uint32_t &a2, uint32_t &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + a1 = WarpShuffle(a1, b1, c); + a2 = WarpShuffle(a2, b2, c); + a3 = WarpShuffle(a3, b3, c); +} + +#else +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + __shared__ uint32_t shared_mem[32]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + shared_mem[thread] = a; + __threadfence_block(); + + uint32_t result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))]; + __threadfence_block(); + + return result; +} + +__device__ __forceinline__ void WarpShuffle3(uint32_t &a1, uint32_t &a2, uint32_t &a3, uint32_t b1, uint32_t b2, uint32_t 
b3, uint32_t c) +{ + __shared__ uint32_t shared_mem[32]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + shared_mem[thread] = a1; + __threadfence_block(); + + a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))]; + __threadfence_block(); + + shared_mem[thread] = a2; + __threadfence_block(); + + a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))]; + __threadfence_block(); + + shared_mem[thread] = a3; + __threadfence_block(); + + a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))]; + __threadfence_block(); +} + +#endif + +#define CHACHA_STEP(a,b,c,d) { \ + a += b; d = __byte_perm(d ^ a, 0, 0x1032); \ + c += d; b = rotateL(b ^ c, 12); \ + a += b; d = __byte_perm(d ^ a, 0, 0x2103); \ + c += d; b = rotateL(b ^ c, 7); \ +} + +#if __CUDA_ARCH__ < 500 + +__device__ __forceinline__ +static void shift256R4(uint32_t* ret, const uint8 &vec4, const uint32_t shift2) +{ +#if __CUDA_ARCH__ >= 320 + uint32_t shift = 32U - shift2; + asm("shf.r.clamp.b32 %0, 0, %1, %2;" : "=r"(ret[0]) : "r"(vec4.s0), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[1]) : "r"(vec4.s0), "r"(vec4.s1), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[2]) : "r"(vec4.s1), "r"(vec4.s2), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[3]) : "r"(vec4.s2), "r"(vec4.s3), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[4]) : "r"(vec4.s3), "r"(vec4.s4), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[5]) : "r"(vec4.s4), "r"(vec4.s5), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[6]) : "r"(vec4.s5), "r"(vec4.s6), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[7]) : "r"(vec4.s6), "r"(vec4.s7), "r"(shift)); + asm("shr.b32 %0, %1, %2;" : "=r"(ret[8]) : "r"(vec4.s7), "r"(shift)); +#else + // to check + shift256R(ret, vec4, shift2); +#endif +} + +#define BLAKE(a, b, c, d, key1, key2) { \ + a += key1; \ + a += b; d = rotateL(d ^ a, 16); \ + c += d; b = rotateR(b ^ c, 12); \ + a += key2; \ + a 
+= b; d = rotateR(d ^ a, 8); \ + c += d; b = rotateR(b ^ c, 7); \ +} + +#define BLAKE_G(idx0, idx1, a, b, c, d, key) { \ + idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ + a += b; d = rotate(d ^ a, 16); \ + c += d; b = rotateR(b ^ c, 12); \ + idx = BLAKE2S_SIGMA[idx0][idx1 + 1]; a += key[idx]; \ + a += b; d = rotateR(d ^ a, 8); \ + c += d; b = rotateR(b ^ c, 7); \ +} + +#define BLAKE_G_PRE(idx0, idx1, a, b, c, d, key) { \ + a += key[idx0]; \ + a += b; d = rotateL(d ^ a, 16); \ + c += d; b = rotateR(b ^ c, 12); \ + a += key[idx1]; \ + a += b; d = rotateR(d ^ a, 8); \ + c += d; b = rotateR(b ^ c, 7); \ +} + +#define BLAKE_G_PRE0(idx0, idx1, a, b, c, d, key) { \ + a += b; d = rotateL(d ^ a, 16); \ + c += d; b = rotateR(b ^ c, 12); \ + a += b; d = rotateR(d ^ a, 8); \ + c += d; b = rotateR(b ^ c, 7); \ +} + +#define BLAKE_G_PRE1(idx0, idx1, a, b, c, d, key) { \ + a += key[idx0]; \ + a += b; d = rotateL(d ^ a, 16); \ + c += d; b = rotateR(b ^ c, 12); \ + a += b; d = rotateR(d ^ a, 8); \ + c += d; b = rotateR(b ^ c, 7); \ +} + +#define BLAKE_G_PRE2(idx0, idx1, a, b, c, d, key) { \ + a += b; d = rotateL(d ^ a, 16); \ + c += d; b = rotateR(b ^ c, 12); \ + a += key[idx1]; \ + a += b; d = rotateR(d ^ a, 8); \ + c += d; b = rotateR(b ^ c, 7); \ +} + +static __forceinline__ __device__ +void Blake2S(uint32_t *out, const uint32_t* const __restrict__ inout, const uint32_t * const __restrict__ TheKey) +{ + uint16 V; + uint32_t idx; + uint8 tmpblock; + + V.hi = BLAKE2S_IV_Vec; + V.lo = BLAKE2S_IV_Vec; + V.lo.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, 
V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE0(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE0(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE1(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE1(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, 
TheKey); + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + BLAKE_G_PRE2(9, 0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(5, 7, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(10, 15, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(14, 1, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(11, 12, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 8, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(3, 13, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + BLAKE_G_PRE1(2, 12, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 10, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE1(0, 11, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(8, 3, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(4, 13, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(7, 5, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(15, 14, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(1, 9, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + BLAKE_G_PRE2(12, 5, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(1, 15, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 13, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(4, 10, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 7, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(6, 3, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(9, 2, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(8, 11, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + BLAKE_G_PRE0(13, 11, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(7, 14, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + 
BLAKE_G_PRE2(12, 1, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(3, 9, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(5, 0, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(15, 4, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 6, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 10, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + BLAKE_G_PRE1(6, 15, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE0(14, 9, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(11, 3, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(0, 8, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(12, 2, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(13, 7, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(1, 4, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(10, 5, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + BLAKE_G_PRE2(10, 2, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 4, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(7, 6, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(1, 5, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(15, 11, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(9, 14, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(3, 12, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(13, 0, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + V.lo ^= V.hi ^ tmpblock; + + V.hi = BLAKE2S_IV_Vec; + tmpblock = V.lo; + + V.hi.s4 ^= 128; + V.hi.s6 = ~V.hi.s6; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, 
V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(15, 8, V.lo.s3, 
V.lo.s4, V.hi.s1, V.hi.s6, inout); + + for(int x = 4; x < 10; x++) + { + BLAKE_G(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + } + + V.lo ^= V.hi ^ tmpblock; + + ((uint8*)out)[0] = V.lo; +} +#endif + +#if __CUDA_ARCH__ >= 500 + +#define BLAKE_G(idx0, idx1, a, b, c, d, key) { \ + idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ + a += b; d = __byte_perm(d ^ a, 0, 0x1032); \ + c += d; b = rotateR(b ^ c, 12); \ + idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \ + a += b; d = __byte_perm(d ^ a, 0, 0x0321); \ + c += d; b = rotateR(b ^ c, 7); \ +} + +#define BLAKE(a, b, c, d, key1,key2) { \ + a += key1; \ + a += b; d = __byte_perm(d ^ a, 0, 0x1032); \ + c += d; b = rotateR(b ^ c, 12); \ + a += key2; \ + a += b; d = __byte_perm(d ^ a, 0, 0x0321); \ + c += d; b = rotateR(b ^ c, 7); \ +} + +#define BLAKE_G_PRE(idx0,idx1, a, b, c, d, key) { \ + a += key[idx0]; \ + a += b; d = __byte_perm(d ^ a, 0, 0x1032); \ + c += d; b = rotateR(b ^ c, 12); \ + a += key[idx1]; \ + a += b; d = __byte_perm(d ^ a, 0, 0x0321); \ + c += d; b = rotateR(b ^ c, 7); \ +} + +#define BLAKE_G_PRE0(idx0,idx1, a, b, c, d, key) { \ + a += b; d = __byte_perm(d ^ a, 0, 0x1032); \ + c += d; b = rotateR(b ^ c, 12); \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE1(idx0,idx1, a, b, c, d, key) { \ + a += key[idx0]; \ + a += b; d = __byte_perm(d ^ a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += b; d = __byte_perm(d ^ a, 0, 0x0321); \ + c += d; b = rotateR(b ^ c, 7); \ +} + +#define 
BLAKE_G_PRE2(idx0,idx1, a, b, c, d, key) { \ + a += b; d = __byte_perm(d ^ a, 0, 0x1032); \ + c += d; b = rotateR(b ^ c, 12); \ + a += key[idx1]; \ + a += b; d = __byte_perm(d ^ a, 0, 0x0321); \ + c += d; b = rotateR(b ^ c, 7); \ +} + +static __forceinline__ __device__ +void Blake2S_v2(uint32_t *out, const uint32_t* __restrict__ inout, const uint32_t * __restrict__ TheKey) +{ + uint16 V; + uint8 tmpblock; + + V.hi = BLAKE2S_IV_Vec; + V.lo = BLAKE2S_IV_Vec; + V.lo.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE0(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE0(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, 
V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE1(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE1(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + BLAKE_G_PRE2(9, 0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(5, 7, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(10, 15, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(14, 1, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(11, 12, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 8, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(3, 13, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + BLAKE_G_PRE1(2, 12, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 10, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE1(0, 11, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(8, 3, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(4, 13, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(7, 5, V.lo.s1, V.lo.s6, V.hi.s3, 
V.hi.s4, TheKey); + BLAKE_G_PRE0(15, 14, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(1, 9, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + BLAKE_G_PRE2(12, 5, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(1, 15, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 13, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(4, 10, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 7, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(6, 3, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(9, 2, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(8, 11, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + BLAKE_G_PRE0(13, 11, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(7, 14, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(12, 1, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(3, 9, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(5, 0, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(15, 4, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 6, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 10, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + BLAKE_G_PRE1(6, 15, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE0(14, 9, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(11, 3, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(0, 8, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(12, 2, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(13, 7, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(1, 4, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(10, 5, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 
}, + BLAKE_G_PRE2(10, 2, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 4, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(7, 6, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(1, 5, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(15, 11, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(9, 14, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(3, 12, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(13, 0, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + V.lo ^= V.hi; + V.lo ^= tmpblock; + + V.hi = BLAKE2S_IV_Vec; + tmpblock = V.lo; + + V.hi.s4 ^= 128; + V.hi.s6 = ~V.hi.s6; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(12, 0, V.lo.s1, V.lo.s5, 
V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[9], inout[0]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[5], inout[7]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[2], inout[4]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[10], inout[15]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[14], inout[1]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[11], inout[12]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[6], inout[8]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[3], inout[13]); + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[2], inout[12]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[6], inout[10]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[0], inout[11]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[8], inout[3]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[4], inout[13]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[7], inout[5]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[15], inout[14]); + 
BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[1], inout[9]); + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[12], inout[5]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[1], inout[15]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[14], inout[13]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[4], inout[10]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[0], inout[7]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[6], inout[3]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[9], inout[2]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[8], inout[11]); + // 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10, + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[13], inout[11]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[7], inout[14]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[12], inout[1]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[3], inout[9]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[5], inout[0]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[15], inout[4]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[8], inout[6]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[2], inout[10]); + // 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5, + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[6], inout[15]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[14], inout[9]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[11], inout[3]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[0], inout[8]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[12], inout[2]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[13], inout[7]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[1], inout[4]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[10], inout[5]); + // 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0, + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[10], inout[2]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[8], inout[4]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, 
inout[7], inout[6]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[1], inout[5]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[15], inout[11]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[9], inout[14]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[3], inout[12]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[13], inout[0]); + + V.lo ^= V.hi; + V.lo ^= tmpblock; + + ((uint8*)out)[0] = V.lo; +} + +#endif /* __CUDA_ARCH__ >= 500 */ + +#define SALSA_CORE(state) { \ + uint32_t t; \ + SALSA(state.x, state.y, state.z, state.w); \ + WarpShuffle3(state.y, state.z, state.w, threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1,4); \ + SALSA(state.x, state.w, state.z, state.y); \ + WarpShuffle3(state.y, state.z, state.w, threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3,4); \ +} + +#define CHACHA_CORE_PARALLEL(state) { \ + CHACHA_STEP(state.x, state.y, state.z, state.w); \ + WarpShuffle3(state.y, state.z, state.w, threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3,4); \ + CHACHA_STEP(state.x, state.y, state.z, state.w); \ + WarpShuffle3(state.y, state.z, state.w, threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1,4); \ +} + +__forceinline__ __device__ +uint4 salsa_small_scalar_rnd(const uint4 X) +{ + uint4 state = X; + +#pragma nounroll + for(int i = 0; i < 10; i++) + { + SALSA_CORE(state); + } + + return (X + state); +} + +__device__ __forceinline__ +uint4 chacha_small_parallel_rnd(const uint4 X) +{ + uint4 state = X; + +#pragma nounroll + for(int i = 0; i < 10; i++) + { + CHACHA_CORE_PARALLEL(state); + } + return (X + state); +} + +__device__ __forceinline__ +void neoscrypt_chacha(uint4 XV[4]) +{ + uint4 temp; + + XV[0] = chacha_small_parallel_rnd(XV[0] ^ XV[3]); + temp = chacha_small_parallel_rnd(XV[1] ^ XV[0]); + XV[1] = chacha_small_parallel_rnd(XV[2] ^ temp); + XV[3] = chacha_small_parallel_rnd(XV[3] ^ XV[1]); + XV[2] = temp; +} + +__device__ __forceinline__ +void neoscrypt_salsa(uint4 XV[4]) +{ + uint4 temp; + + XV[0] = salsa_small_scalar_rnd(XV[0] ^ 
XV[3]); + temp = salsa_small_scalar_rnd(XV[1] ^ XV[0]); + XV[1] = salsa_small_scalar_rnd(XV[2] ^ temp); + XV[3] = salsa_small_scalar_rnd(XV[3] ^ XV[1]); + XV[2] = temp; +} + + +#if __CUDA_ARCH__ < 500 +static __forceinline__ __device__ +void fastkdf256_v1(const uint32_t thread, const uint32_t nonce, uint32_t* const s_data) +{ + uint2x4 output[8]; + uint32_t* B = (uint32_t*)&s_data[threadIdx.x * 64U]; + uint32_t qbuf, rbuf, bitbuf; + uint32_t input[BLAKE2S_BLOCK_SIZE / 4]; + uint32_t key[BLAKE2S_BLOCK_SIZE / 4] = {0}; + + const uint32_t data18 = c_data[18]; + const uint32_t data20 = c_data[0]; + + ((uintx64*)(B))[0] = ((uintx64*)c_data)[0]; + ((uint32_t*)B)[19] = nonce; + ((uint32_t*)B)[39] = nonce; + ((uint32_t*)B)[59] = nonce; + + ((uint816*)input)[0] = ((uint816*)input_init)[0]; + ((uint4x2*)key)[0] = ((uint4x2*)key_init)[0]; + +#pragma unroll 1 + for(int i = 0; i < 31; i++) + { + uint32_t bufidx = 0; +#pragma unroll + for(int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + uint32_t shifted[9]; + shift256R4(shifted, ((uint8*)input)[0], bitbuf); + + uint32_t temp[9]; + //#pragma unroll + for(int k = 0; k < 9; k++) + { + uint32_t indice = (k + qbuf) & 0x3f; + temp[k] = B[indice] ^ shifted[k]; + B[indice] = temp[k]; + } +#if __CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__) + uint32_t a = c_data[qbuf & 0x3f], b; + //#pragma unroll + for(int k = 0; k<16; k += 2) + { + b = c_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = c_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19U - qbuf % 20U; + if(noncepos <= 16U && qbuf < 60U) + { + if(noncepos != 0) + 
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if(noncepos != 16U) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + for(int k = 0; k<8; k++) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[k]) : "r"(temp[k]), "r"(temp[k + 1]), "r"(bitbuf)); +#else + //#error SM 3.0 code missing here + printf("", data18, data20); +#endif + Blake2S(input, input, key); + } + + uint32_t bufidx = 0; +#pragma unroll + for(int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + +#if __CUDA_ARCH__ >= 320 + for(int i = 0; i<64; i++) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(((uint32_t*)output)[i]) : "r"(B[(qbuf + i) & 0x3f]), "r"(B[(qbuf + i + 1) & 0x3f4]), "r"(bitbuf)); +#endif + + ((ulonglong4*)output)[0] ^= ((ulonglong4*)input)[0]; + ((uintx64*)output)[0] ^= ((uintx64*)c_data)[0]; + ((uint32_t*)output)[19] ^= nonce; + ((uint32_t*)output)[39] ^= nonce; + ((uint32_t*)output)[59] ^= nonce; + + for(int i = 0; i<8; i++) + (Input + 8U * thread)[i] = output[i]; +} +#endif + +#if __CUDA_ARCH__ >= 500 +static __forceinline__ __device__ +void fastkdf256_v2(const uint32_t thread, const uint32_t nonce, uint32_t* const s_data) +{ + const uint32_t data18 = c_data[18]; + const uint32_t data20 = c_data[0]; + uint32_t input[16]; + uint32_t key[16] = {0}; + uint32_t qbuf, rbuf, bitbuf; + + uint32_t* B = (uint32_t*)&s_data[threadIdx.x * 64U]; + ((uintx64*)(B))[0] = ((uintx64*)c_data)[0]; + + B[19] = nonce; + B[39] = nonce; + B[59] = nonce; + + { + uint32_t bufidx = 0; +#pragma unroll + for(int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input_init[x] & 0x00ff00ff) + ((input_init[x] & 0xff00ff00) >> 8); + bufhelper = 
bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + uint32_t temp[9]; + + uint32_t shifted; + uint32_t shift = 32U - bitbuf; + asm("shl.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input_init[0]), "r"(bitbuf)); + temp[0] = B[(0 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[0]), "r"(input_init[1]), "r"(shift)); + temp[1] = B[(1 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[1]), "r"(input_init[2]), "r"(shift)); + temp[2] = B[(2 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[2]), "r"(input_init[3]), "r"(shift)); + temp[3] = B[(3 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[3]), "r"(input_init[4]), "r"(shift)); + temp[4] = B[(4 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[4]), "r"(input_init[5]), "r"(shift)); + temp[5] = B[(5 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[5]), "r"(input_init[6]), "r"(shift)); + temp[6] = B[(6 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[6]), "r"(input_init[7]), "r"(shift)); + temp[7] = B[(7 + qbuf) & 0x3f] ^ shifted; + asm("shr.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input_init[7]), "r"(shift)); + temp[8] = B[(8 + qbuf) & 0x3f] ^ shifted; + + uint32_t a = c_data[qbuf & 0x3f], b; + +#pragma unroll + for(int k = 0; k<16; k += 2) + { + b = c_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = c_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19 - qbuf % 20U; + if(noncepos <= 16U && qbuf < 60U) + { + 
if(noncepos) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if(noncepos != 16U) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + + Blake2S_v2(input, input, key); + +#pragma unroll + for(int k = 0; k < 9; k++) + B[(k + qbuf) & 0x3f] = temp[k]; + } + + for(int i = 1; i < 31; i++) + { + uint32_t bufidx = 0; +#pragma unroll + for(int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + uint32_t temp[9]; + + uint32_t shifted; + uint32_t shift = 32U - bitbuf; + asm("shl.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input[0]), "r"(bitbuf)); + temp[0] = B[(0 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[0]), "r"(input[1]), "r"(shift)); + temp[1] = B[(1 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[1]), "r"(input[2]), "r"(shift)); + temp[2] = B[(2 + 
qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[2]), "r"(input[3]), "r"(shift)); + temp[3] = B[(3 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[3]), "r"(input[4]), "r"(shift)); + temp[4] = B[(4 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[4]), "r"(input[5]), "r"(shift)); + temp[5] = B[(5 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[5]), "r"(input[6]), "r"(shift)); + temp[6] = B[(6 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[6]), "r"(input[7]), "r"(shift)); + temp[7] = B[(7 + qbuf) & 0x3f] ^ shifted; + asm("shr.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input[7]), "r"(shift)); + temp[8] = B[(8 + qbuf) & 0x3f] ^ shifted; + + uint32_t a = c_data[qbuf & 0x3f], b; + +#pragma unroll + for(int k = 0; k<16; k += 2) + { + b = c_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = c_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19 - qbuf % 20U; + if(noncepos <= 16U && qbuf < 60U) + { + if(noncepos) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if(noncepos != 16U) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, 
%3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + + Blake2S_v2(input, input, key); + +#pragma unroll + for(int k = 0; k < 9; k++) + B[(k + qbuf) & 0x3f] = temp[k]; + } + + { + uint32_t bufidx = 0; +#pragma unroll + for(int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + } + + uint2x4 output[8]; + for(int i = 0; i<64; i++) + { + const uint32_t a = (qbuf + i) & 0x3f, b = (qbuf + i + 1) & 0x3f; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(((uint32_t*)output)[i]) : "r"(B[a]), "r"(B[b]), "r"(bitbuf)); + } + + output[0] ^= ((uint2x4*)input)[0]; +#pragma unroll + for(int i = 0; i<8; i++) + output[i] ^= ((uint2x4*)c_data)[i]; + + ((uint32_t*)output)[19] ^= nonce; + ((uint32_t*)output)[39] ^= nonce; + ((uint32_t*)output)[59] ^= nonce;; + ((ulonglong16 *)(Input + 8U * thread))[0] = ((ulonglong16*)output)[0]; +} +#endif + +#if __CUDA_ARCH__ < 500 +static __forceinline__ __device__ +uint32_t fastkdf32_v1(uint32_t thread, const uint32_t nonce, uint32_t* const salt, uint32_t* const s_data) +{ + const uint32_t cdata7 = c_data[7]; + const uint32_t data18 = c_data[18]; + const uint32_t data20 = c_data[0]; + + uint32_t* B0 = (uint32_t*)&s_data[threadIdx.x * 64U]; + ((uintx64*)B0)[0] = ((uintx64*)salt)[0]; + + uint32_t input[BLAKE2S_BLOCK_SIZE / 4]; + ((uint816*)input)[0] = ((uint816*)c_data)[0]; + + uint32_t key[BLAKE2S_BLOCK_SIZE / 4]; + ((uint4x2*)key)[0] = ((uint4x2*)salt)[0]; + ((uint4*)key)[2] = make_uint4(0, 0, 0, 0); + 
((uint4*)key)[3] = make_uint4(0, 0, 0, 0); + + uint32_t qbuf, rbuf, bitbuf; + uint32_t temp[9]; + +#pragma nounroll + for(int i = 0; i < 31; i++) + { + Blake2S(input, input, key); + + uint32_t bufidx = 0; +#pragma unroll + for(int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + uint32_t shifted[9]; + + shift256R4(shifted, ((uint8*)input)[0], bitbuf); + + for(int k = 0; k < 9; k++) + { + temp[k] = B0[(k + qbuf) & 0x3f]; + } + + ((uint2x4*)temp)[0] ^= ((uint2x4*)shifted)[0]; + temp[8] ^= shifted[8]; + +#if __CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__) + uint32_t a = c_data[qbuf & 0x3f], b; + //#pragma unroll + for(int k = 0; k<16; k += 2) + { + b = c_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = c_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19U - qbuf % 20U; + if(noncepos <= 16U && qbuf < 60U) + { + if(noncepos != 0) asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if(noncepos != 16U) asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + 
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); +#else + //#error SM 3.0 code missing here + printf("", data18, data20); +#endif + for(int k = 0; k < 9; k++) + { + B0[(k + qbuf) & 0x3f] = temp[k]; + } + } + + Blake2S(input, input, key); + + uint32_t bufidx = 0; +#pragma unroll + for(int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + for(int k = 7; k < 9; k++) + { + temp[k] = B0[(k + qbuf) & 0x3f]; + } + + uint32_t output; +#if __CUDA_ARCH__ >= 320 + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(output) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); +#else + output = (MAKE_ULONGLONG(temp[7], temp[8]) >> bitbuf); // to check maybe 7/8 reversed +#endif + output ^= input[7] ^ cdata7; + return output; +} +#endif + +#if __CUDA_ARCH__ >= 500 +static __forceinline__ __device__ +uint32_t fastkdf32_v3(uint32_t thread, const uint32_t nonce, uint32_t* const salt, uint32_t* const s_data) +{ + const uint32_t cdata7 = c_data[7]; + const uint32_t data18 = c_data[18]; + const uint32_t data20 = c_data[0]; + + uint32_t* B0 = (uint32_t*)&s_data[threadIdx.x * 64U]; + ((uintx64*)B0)[0] = ((uintx64*)salt)[0]; + + uint32_t input[BLAKE2S_BLOCK_SIZE / 4]; + ((uint816*)input)[0] = ((uint816*)c_data)[0]; + + uint32_t key[BLAKE2S_BLOCK_SIZE / 4]; + ((uint4x2*)key)[0] = ((uint4x2*)salt)[0]; + ((uint4*)key)[2] = make_uint4(0, 0, 0, 0); + ((uint4*)key)[3] = make_uint4(0, 0, 0, 0); + + uint32_t qbuf, rbuf, bitbuf; + uint32_t temp[9]; + +#pragma nounroll + for(int i = 0; i < 31; i++) + { + Blake2S_v2(input, input, key); + + 
uint32_t bufidx = 0; +#pragma unroll + for(int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + uint32_t shifted; + uint32_t shift = 32U - bitbuf; + asm("shl.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input[0]), "r"(bitbuf)); + temp[0] = B0[(0 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[0]), "r"(input[1]), "r"(shift)); + temp[1] = B0[(1 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[1]), "r"(input[2]), "r"(shift)); + temp[2] = B0[(2 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[2]), "r"(input[3]), "r"(shift)); + temp[3] = B0[(3 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[3]), "r"(input[4]), "r"(shift)); + temp[4] = B0[(4 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[4]), "r"(input[5]), "r"(shift)); + temp[5] = B0[(5 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[5]), "r"(input[6]), "r"(shift)); + temp[6] = B0[(6 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[6]), "r"(input[7]), "r"(shift)); + temp[7] = B0[(7 + qbuf) & 0x3f] ^ shifted; + asm("shr.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input[7]), "r"(shift)); + temp[8] = B0[(8 + qbuf) & 0x3f] ^ shifted; + + uint32_t a = c_data[qbuf & 0x3f], b; +#pragma unroll + for(int k = 0; k<16; k += 2) + { + b = c_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = c_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } 
+ + const uint32_t noncepos = 19U - qbuf % 20U; + if(noncepos <= 16U && qbuf < 60U) + { + if(noncepos != 0) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if(noncepos != 16U) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + +#pragma unroll + for(int k = 0; k < 9; k++) + { + B0[(k + qbuf) & 0x3f] = temp[k]; + } + } + + Blake2S_v2(input, input, key); + + uint32_t bufidx = 0; +#pragma unroll + for(int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + temp[7] = B0[(qbuf + 7) & 0x3f]; + temp[8] = B0[(qbuf + 8) & 0x3f]; + + uint32_t output; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(output) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + output ^= input[7] ^ cdata7; + return output; +} +#endif + + +#define BLAKE_Ghost(idx0, idx1, a, b, c, d, key) { \ + idx = BLAKE2S_SIGMA_host[idx0][idx1]; a += key[idx]; \ + a += b; d = ROTR32(d ^ a, 
16); \ + c += d; b = ROTR32(b ^ c, 12); \ + idx = BLAKE2S_SIGMA_host[idx0][idx1 + 1]; a += key[idx]; \ + a += b; d = ROTR32(d ^ a, 8); \ + c += d; b = ROTR32(b ^ c, 7); \ +} + +static void Blake2Shost(uint32_t * inout, const uint32_t * inkey) +{ + uint16 V; + uint32_t idx; + uint8 tmpblock; + + V.hi = BLAKE2S_IV_Vechost; + V.lo = BLAKE2S_IV_Vechost; + V.lo.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; + + for(int x = 0; x < 10; ++x) + { + BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inkey); + BLAKE_Ghost(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inkey); + BLAKE_Ghost(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inkey); + BLAKE_Ghost(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inkey); + BLAKE_Ghost(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inkey); + BLAKE_Ghost(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inkey); + BLAKE_Ghost(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inkey); + BLAKE_Ghost(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inkey); + } + + V.lo ^= V.hi; + V.lo ^= tmpblock; + + V.hi = BLAKE2S_IV_Vechost; + tmpblock = V.lo; + + V.hi.s4 ^= 128; + V.hi.s6 = ~V.hi.s6; + + for(int x = 0; x < 10; ++x) + { + BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_Ghost(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_Ghost(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_Ghost(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_Ghost(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_Ghost(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_Ghost(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_Ghost(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + } + + V.lo ^= V.hi ^ tmpblock; + + ((uint8*)inout)[0] = V.lo; +} + + +#define SHIFT 128U +#define TPB 32 +#define TPB2 64 + +__global__ +__launch_bounds__(TPB2, 1) +void neoscrypt_gpu_hash_start(const int stratum, const uint32_t startNonce) +{ + 
__shared__ uint32_t s_data[64 * TPB2]; + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t nonce = startNonce + thread; + const uint32_t ZNonce = (stratum) ? cuda_swab32(nonce) : nonce; //freaking morons !!! + + __syncthreads(); +#if __CUDA_ARCH__ < 500 + fastkdf256_v1(thread, ZNonce, s_data); +#else + fastkdf256_v2(thread, ZNonce, s_data); +#endif +} + +__global__ +__launch_bounds__(TPB, 1) +void neoscrypt_gpu_hash_chacha1() +{ + const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y); + const uint32_t shift = SHIFT * 8U * (thread & 8191); + const uint32_t shiftTr = 8U * thread; + + uint4 X[4]; + for(int i = 0; i < 4; i++) + { + X[i].x = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + 0 * 4 + threadIdx.x); + X[i].y = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + 1 * 4 + threadIdx.x); + X[i].z = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + 2 * 4 + threadIdx.x); + X[i].w = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + 3 * 4 + threadIdx.x); + } + +#pragma nounroll + for(int i = 0; i < 128; i++) + { + uint32_t offset = shift + i * 8U; + for(int j = 0; j < 4; j++) + ((uint4*)(W + offset))[j * 4 + threadIdx.x] = X[j]; + neoscrypt_chacha(X); + } + +#pragma nounroll + for(int t = 0; t < 128; t++) + { + uint32_t offset = shift + (WarpShuffle(X[3].x, 0, 4) & 0x7F) * 8U; + for(int j = 0; j < 4; j++) + X[j] ^= ((uint4*)(W + offset))[j * 4 + threadIdx.x]; + neoscrypt_chacha(X); + } + +#pragma unroll + for(int i = 0; i < 4; i++) + { + *((uint32_t*)&(Tr + shiftTr)[i * 2] + 0 * 4 + threadIdx.x) = X[i].x; + *((uint32_t*)&(Tr + shiftTr)[i * 2] + 1 * 4 + threadIdx.x) = X[i].y; + *((uint32_t*)&(Tr + shiftTr)[i * 2] + 2 * 4 + threadIdx.x) = X[i].z; + *((uint32_t*)&(Tr + shiftTr)[i * 2] + 3 * 4 + threadIdx.x) = X[i].w; + } +} + +__global__ +__launch_bounds__(TPB, 1) +void neoscrypt_gpu_hash_salsa1() +{ + const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y); + const uint32_t shift = SHIFT * 8U * (thread & 8191); + const uint32_t shiftTr = 
8U * thread; + + uint4 Z[4]; + for(int i = 0; i < 4; i++) + { + Z[i].x = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + ((0 + threadIdx.x) & 3) * 4 + threadIdx.x); + Z[i].y = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + ((1 + threadIdx.x) & 3) * 4 + threadIdx.x); + Z[i].z = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + ((2 + threadIdx.x) & 3) * 4 + threadIdx.x); + Z[i].w = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + ((3 + threadIdx.x) & 3) * 4 + threadIdx.x); + } + +#pragma nounroll + for(int i = 0; i < 128; i++) + { + uint32_t offset = shift + i * 8U; + for(int j = 0; j < 4; j++) + ((uint4*)(W + offset))[j * 4 + threadIdx.x] = Z[j]; + neoscrypt_salsa(Z); + } + +#pragma nounroll + for(int t = 0; t < 128; t++) + { + uint32_t offset = shift + (WarpShuffle(Z[3].x, 0, 4) & 0x7F) * 8U; + for(int j = 0; j < 4; j++) + Z[j] ^= ((uint4*)(W + offset))[j * 4 + threadIdx.x]; + neoscrypt_salsa(Z); + } +#pragma unroll + for(int i = 0; i < 4; i++) + { + *((uint32_t*)&(Tr2 + shiftTr)[i * 2] + ((0 + threadIdx.x) & 3) * 4 + threadIdx.x) = Z[i].x; + *((uint32_t*)&(Tr2 + shiftTr)[i * 2] + ((1 + threadIdx.x) & 3) * 4 + threadIdx.x) = Z[i].y; + *((uint32_t*)&(Tr2 + shiftTr)[i * 2] + ((2 + threadIdx.x) & 3) * 4 + threadIdx.x) = Z[i].z; + *((uint32_t*)&(Tr2 + shiftTr)[i * 2] + ((3 + threadIdx.x) & 3) * 4 + threadIdx.x) = Z[i].w; + } +} + +__global__ +__launch_bounds__(TPB2, 8) +void neoscrypt_gpu_hash_ending(const int stratum, const uint32_t startNonce, uint32_t *resNonces) +{ + __shared__ uint32_t s_data[64 * TPB2]; + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t shiftTr = thread * 8U; + const uint32_t nonce = startNonce + thread; + const uint32_t ZNonce = (stratum) ? 
cuda_swab32(nonce) : nonce; + + __syncthreads(); + + uint2x4 Z[8]; +#pragma unroll + for(int i = 0; i<8; i++) + Z[i] = __ldg4(&(Tr2 + shiftTr)[i]) ^ __ldg4(&(Tr + shiftTr)[i]); + +#if __CUDA_ARCH__ < 500 + uint32_t outbuf = fastkdf32_v1(thread, ZNonce, (uint32_t*)Z, s_data); +#else + uint32_t outbuf = fastkdf32_v3(thread, ZNonce, (uint32_t*)Z, s_data); +#endif + + if(outbuf <= c_target[1]) + { + resNonces[0] = nonce; + //uint32_t tmp = atomicExch(resNonces, nonce); + //if(tmp != UINT32_MAX) + // resNonces[1] = tmp; + } +} + +static THREAD uint32_t *hash1 = NULL; +static THREAD uint32_t *Trans1 = NULL; +static THREAD uint32_t *Trans2 = NULL; // 2 streams +static THREAD uint32_t *Trans3 = NULL; // 2 streams + +__host__ +void neoscrypt_init(int thr_id, uint32_t threads) +{ + CUDA_SAFE_CALL(cudaMalloc(&d_NNonce[thr_id], 2 * sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMalloc(&hash1, 32 * 128 * sizeof(uint64_t) * min(8192, threads))); + CUDA_SAFE_CALL(cudaMalloc(&Trans1, 32 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&Trans2, 32 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&Trans3, 32 * sizeof(uint64_t) * threads)); + + CUDA_SAFE_CALL(cudaMemcpyToSymbol(W, &hash1, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(Tr, &Trans1, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(Tr2, &Trans2, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(Input, &Trans3, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice)); +} +/* +__host__ +void neoscrypt_free(int thr_id) +{ + cudaFree(d_NNonce[thr_id]); + + cudaFree(hash1); + cudaFree(Trans1); + cudaFree(Trans2); + cudaFree(Trans3); +} +*/ +__host__ +void neoscrypt_hash_tpruvot(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum) +{ + CUDA_SAFE_CALL(cudaMemset(d_NNonce[thr_id], 0xff, 2 * sizeof(uint32_t))); + + const int threadsperblock2 = TPB2; + dim3 grid2((threads + threadsperblock2 
- 1) / threadsperblock2);
+	dim3 block2(threadsperblock2);
+
+	const int threadsperblock = TPB;
+	dim3 grid3((threads * 4 + threadsperblock - 1) / threadsperblock);
+	dim3 block3(4, threadsperblock >> 2);
+
+	neoscrypt_gpu_hash_start <<<grid2, block2>>> (stratum, startNounce); //fastkdf
+
+	neoscrypt_gpu_hash_salsa1 <<<grid3, block3>>> ();
+	neoscrypt_gpu_hash_chacha1 <<<grid3, block3>>> ();
+
+	neoscrypt_gpu_hash_ending <<<grid2, block2>>> (stratum, startNounce, d_NNonce[thr_id]); //fastkdf+end
+
+	CUDA_SAFE_CALL(cudaMemcpy(resNonces, d_NNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost));
+}
+
+__host__
+void neoscrypt_setBlockTarget_tpruvot(uint32_t* const pdata, uint32_t* const target)
+{
+	uint32_t PaddedMessage[64];
+	uint32_t input[16], key[16] = {0};
+
+	for(int i = 0; i < 19; i++)
+	{
+		PaddedMessage[i] = pdata[i];
+		PaddedMessage[i + 20] = pdata[i];
+		PaddedMessage[i + 40] = pdata[i];
+	}
+	for(int i = 0; i<4; i++)
+		PaddedMessage[i + 60] = pdata[i];
+
+	PaddedMessage[19] = 0;
+	PaddedMessage[39] = 0;
+	PaddedMessage[59] = 0;
+
+	((uint16*)input)[0] = ((uint16*)pdata)[0];
+	((uint8*)key)[0] = ((uint8*)pdata)[0];
+
+	Blake2Shost(input, key);
+
+	cudaMemcpyToSymbol(input_init, input, 64, 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(key_init, key, 64, 0, cudaMemcpyHostToDevice);
+
+	cudaMemcpyToSymbol(c_target, &target[6], 2 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(c_data, PaddedMessage, 64 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
+	CUDA_SAFE_CALL(cudaGetLastError());
+}
diff --git a/neoscrypt/cuda_vector_tpruvot.cuh b/neoscrypt/cuda_vector_tpruvot.cuh
new file mode 100644
index 0000000000..c9e09411a2
--- /dev/null
+++ b/neoscrypt/cuda_vector_tpruvot.cuh
@@ -0,0 +1,720 @@
+#ifndef CUDA_VECTOR_H
+#define CUDA_VECTOR_H
+
+
+///////////////////////////////////////////////////////////////////////////////////
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
+#define __LDG_PTR "l"
+#else
+#define __LDG_PTR "r"
+#endif
+
+#include "cuda_helper.h"
+
+//typedef __device_builtin__ struct ulong16 ulong16; + + +typedef struct __align__(32) uint8 +{ + unsigned int s0, s1, s2, s3, s4, s5, s6, s7; +} uint8; + +typedef struct __align__(64) ulonglong2to8 +{ + ulonglong2 l0, l1, l2, l3; +} ulonglong2to8; + +typedef struct __align__(128) ulonglong8to16 +{ + ulonglong2to8 lo, hi; +} ulonglong8to16; + +typedef struct __align__(256) ulonglong16to32 +{ + ulonglong8to16 lo, hi; +} ulonglong16to32; + +typedef struct __align__(512) ulonglong32to64 +{ + ulonglong16to32 lo, hi; +} ulonglong32to64; + + + +typedef struct __align__(1024) ulonglonglong +{ + ulonglong8to16 s0, s1, s2, s3, s4, s5, s6, s7; +} ulonglonglong; + + + + +typedef struct __align__(64) uint16 +{ + union + { + struct + { + unsigned int s0, s1, s2, s3, s4, s5, s6, s7; + }; + uint8 lo; + }; + union + { + struct + { + unsigned int s8, s9, sa, sb, sc, sd, se, sf; + }; + uint8 hi; + }; +} uint16; + +typedef struct __align__(128) uint32 +{ + + uint16 lo, hi; +} uint32; + + + +struct __align__(128) ulong8 +{ + ulonglong4 s0, s1, s2, s3; +}; +typedef __device_builtin__ struct ulong8 ulong8; + + +typedef struct __align__(256) ulonglong16 +{ + ulonglong2 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sa, sb, sc, sd, se, sf; +} ulonglong16; + +typedef struct __align__(32) uint48 +{ + uint4 s0, s1; + +} uint48; + +typedef struct __align__(64) uint816 +{ + uint48 s0, s1; + +} uint816; + +typedef struct __align__(128) uint1632 +{ + uint816 s0, s1; + +} uint1632; + +typedef struct __align__(256) uintx64 +{ + uint1632 s0, s1; + +} uintx64; + +typedef struct __align__(512) uintx128 +{ + uintx64 s0, s1; + +} uintx128; + +typedef struct __align__(1024) uintx256 +{ + uintx128 s0, s1; + +} uintx256; + + + +typedef struct __align__(256) uint4x16 +{ + uint4 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; +} uint4x16; + +static __inline__ __device__ ulonglong2to8 make_ulonglong2to8(ulonglong2 s0, ulonglong2 s1, ulonglong2 s2, ulonglong2 s3) +{ + ulonglong2to8 t; t.l0 = 
s0; t.l1 = s1; t.l2 = s2; t.l3 = s3; + return t; +} + +static __inline__ __device__ ulonglong8to16 make_ulonglong8to16(const ulonglong2to8 &s0, const ulonglong2to8 &s1) +{ + ulonglong8to16 t; t.lo = s0; t.hi = s1; + return t; +} + +static __inline__ __device__ ulonglong16to32 make_ulonglong16to32(const ulonglong8to16 &s0, const ulonglong8to16 &s1) +{ + ulonglong16to32 t; t.lo = s0; t.hi = s1; + return t; +} + +static __inline__ __device__ ulonglong32to64 make_ulonglong32to64(const ulonglong16to32 &s0, const ulonglong16to32 &s1) +{ + ulonglong32to64 t; t.lo = s0; t.hi = s1; + return t; +} + + +static __inline__ __host__ __device__ ulonglonglong make_ulonglonglong( + const ulonglong8to16 &s0, const ulonglong8to16 &s1, const ulonglong8to16 &s2, const ulonglong8to16 &s3, + const ulonglong8to16 &s4, const ulonglong8to16 &s5, const ulonglong8to16 &s6, const ulonglong8to16 &s7) +{ + ulonglonglong t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + + +static __inline__ __device__ uint48 make_uint48(uint4 s0, uint4 s1) +{ + uint48 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uint816 make_uint816(const uint48 &s0, const uint48 &s1) +{ + uint816 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uint1632 make_uint1632(const uint816 &s0, const uint816 &s1) +{ + uint1632 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uintx64 make_uintx64(const uint1632 &s0, const uint1632 &s1) +{ + uintx64 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uintx128 make_uintx128(const uintx64 &s0, const uintx64 &s1) +{ + uintx128 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uintx256 make_uintx256(const uintx128 &s0, const uintx128 &s1) +{ + uintx256 t; t.s0 = s0; t.s1 = s1; + return t; +} + + +static __inline__ __device__ uintx256 make_uintx64(const uintx128 &s0, const uintx128 &s1) +{ + uintx256 t; t.s0 = s0; t.s1 = s1; 
+ return t; +} + + +static __inline__ __host__ __device__ uint4x16 make_uint4x16( + uint4 s0, uint4 s1, uint4 s2, uint4 s3, uint4 s4, uint4 s5, uint4 s6, uint4 s7, + uint4 s8, uint4 s9, uint4 sa, uint4 sb, uint4 sc, uint4 sd, uint4 se, uint4 sf) +{ + uint4x16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.s10 = sa; t.s11 = sb; t.s12 = sc; t.s13 = sd; t.s14 = se; t.s15 = sf; + return t; +} + + + + +static __inline__ __host__ __device__ uint16 make_uint16( + unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7, + unsigned int s8, unsigned int s9, unsigned int sa, unsigned int sb, unsigned int sc, unsigned int sd, unsigned int se, unsigned int sf) +{ + uint16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf; + return t; +} + +static __inline__ __host__ __device__ uint16 make_uint16(const uint8 &a, const uint8 &b) +{ + uint16 t; t.lo = a; t.hi = b; return t; +} + +static __inline__ __host__ __device__ uint32 make_uint32(const uint16 &a, const uint16 &b) +{ + uint32 t; t.lo = a; t.hi = b; return t; +} + + +static __inline__ __host__ __device__ uint8 make_uint8( + unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7) +{ + uint8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + + +static __inline__ __host__ __device__ ulonglong16 make_ulonglong16(const ulonglong2 &s0, const ulonglong2 &s1, + const ulonglong2 &s2, const ulonglong2 &s3, const ulonglong2 &s4, const ulonglong2 &s5, const ulonglong2 &s6, const ulonglong2 &s7, + const ulonglong2 &s8, const ulonglong2 &s9, + const ulonglong2 &sa, const ulonglong2 &sb, const ulonglong2 &sc, const ulonglong2 &sd, 
const ulonglong2 &se, const ulonglong2 &sf +) +{ + ulonglong16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf; + return t; +} + + + +static __inline__ __host__ __device__ ulong8 make_ulong8( + ulonglong4 s0, ulonglong4 s1, ulonglong4 s2, ulonglong4 s3) +{ + ulong8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3;// t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + +/* +static __forceinline__ __device__ uchar4 operator^ (uchar4 a, uchar4 b) +{ + return make_uchar4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); +} +static __forceinline__ __device__ uchar4 operator+ (uchar4 a, uchar4 b) +{ + return make_uchar4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} + + + + + +static __forceinline__ __device__ uint4 operator^ (uint4 a, uint4 b) +{ + return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); +} +static __forceinline__ __device__ uint4 operator+ (uint4 a, uint4 b) +{ + return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +*/ + +static __forceinline__ __device__ ulonglong4 operator^ (ulonglong4 a, ulonglong4 b) +{ + return make_ulonglong4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); +} +static __forceinline__ __device__ ulonglong4 operator+ (ulonglong4 a, ulonglong4 b) +{ + return make_ulonglong4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +static __forceinline__ __device__ ulonglong2 operator^ (ulonglong2 a, ulonglong2 b) +{ + return make_ulonglong2(a.x ^ b.x, a.y ^ b.y); +} +static __forceinline__ __device__ ulonglong2 operator+ (ulonglong2 a, ulonglong2 b) +{ + return make_ulonglong2(a.x + b.x, a.y + b.y); +} + +static __forceinline__ __device__ ulong8 operator^ (const ulong8 &a, const ulong8 &b) +{ + return make_ulong8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3); +} //, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); } + +static __forceinline__ __device__ ulong8 operator+ (const ulong8 &a, 
const ulong8 &b) +{ + return make_ulong8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3); +} //, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); } + + +static __forceinline__ __device__ __host__ uint8 operator^ (const uint8 &a, const uint8 &b) +{ + return make_uint8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); +} + +static __forceinline__ __device__ __host__ uint8 operator+ (const uint8 &a, const uint8 &b) +{ + return make_uint8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); +} + +////////////// mess++ ////// + +static __forceinline__ __device__ uint48 operator^ (const uint48 &a, const uint48 &b) +{ + return make_uint48(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +static __forceinline__ __device__ uint816 operator^ (const uint816 &a, const uint816 &b) +{ + return make_uint816(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +static __forceinline__ __device__ uint1632 operator^ (const uint1632 &a, const uint1632 &b) +{ + return make_uint1632(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + + +static __forceinline__ __device__ uintx64 operator^ (const uintx64 &a, const uintx64 &b) +{ + return make_uintx64(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +static __forceinline__ __device__ uintx128 operator^ (const uintx128 &a, const uintx128 &b) +{ + return make_uintx128(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +static __forceinline__ __device__ uintx256 operator^ (const uintx256 &a, const uintx256 &b) +{ + return make_uintx256(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +///////////////////////// + +static __forceinline__ __device__ __host__ uint16 operator^ (const uint16 &a, const uint16 &b) +{ + return make_uint16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7, + a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf); +} + +static __forceinline__ __device__ __host__ uint16 operator+ (const uint16 &a, const uint16 &b) +{ 
+ return make_uint16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7, + a.s8 + b.s8, a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf); +} + +static __forceinline__ __device__ uint32 operator^ (const uint32 &a, const uint32 &b) +{ + return make_uint32(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ uint32 operator+ (const uint32 &a, const uint32 &b) +{ + return make_uint32(a.lo + b.lo, a.hi + b.hi); +} + +static __forceinline__ __device__ ulonglong16 operator^ (const ulonglong16 &a, const ulonglong16 &b) +{ + return make_ulonglong16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7, + a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf + ); +} + +static __forceinline__ __device__ ulonglong16 operator+ (const ulonglong16 &a, const ulonglong16 &b) +{ + return make_ulonglong16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7, + a.s8 + b.s8, a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf + ); +} + +static __forceinline__ __device__ void operator^= (ulong8 &a, const ulong8 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator^= (uintx64 &a, const uintx64 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator^= (uintx128 &a, const uintx128 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator^= (uintx256 &a, const uintx256 &b) +{ + a = a ^ b; +} + + +static __forceinline__ __device__ void operator^= (uint816 &a, const uint816 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator^= (uint48 &a, const uint48 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator^= (uint32 &a, const uint32 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator+= (uint32 
&a, const uint32 &b) +{ + a = a + b; +} + +/* +static __forceinline__ __device__ void operator^= (uint4 &a, uint4 b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator^= (uchar4 &a, uchar4 b) +{ + a = a ^ b; +} +*/ +static __forceinline__ __device__ __host__ void operator^= (uint8 &a, const uint8 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ __host__ void operator^= (uint16 &a, const uint16 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator^= (ulonglong16 &a, const ulonglong16 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator^= (ulonglong4 &a, const ulonglong4 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator^= (ulonglong2 &a, const ulonglong2 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator+= (ulonglong2 &a, const ulonglong2 &b) +{ + a = a + b; +} + +static __forceinline__ __device__ +ulonglong2to8 operator^ (const ulonglong2to8 &a, const ulonglong2to8 &b) +{ + return make_ulonglong2to8(a.l0 ^ b.l0, a.l1 ^ b.l1, a.l2 ^ b.l2, a.l3 ^ b.l3); +} +static __forceinline__ __device__ +ulonglong2to8 operator+ (const ulonglong2to8 &a, const ulonglong2to8 &b) +{ + return make_ulonglong2to8(a.l0 + b.l0, a.l1 + b.l1, a.l2 + b.l2, a.l3 + b.l3); +} + + +static __forceinline__ __device__ +ulonglong8to16 operator^ (const ulonglong8to16 &a, const ulonglong8to16 &b) +{ + return make_ulonglong8to16(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ +ulonglong8to16 operator+ (const ulonglong8to16 &a, const ulonglong8to16 &b) +{ + return make_ulonglong8to16(a.lo + b.lo, a.hi + b.hi); +} + +static __forceinline__ __device__ +ulonglong16to32 operator^ (const ulonglong16to32 &a, const ulonglong16to32 &b) +{ + return make_ulonglong16to32(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ +ulonglong16to32 operator+ (const ulonglong16to32 &a, const ulonglong16to32 &b) +{ + return make_ulonglong16to32(a.lo + b.lo, a.hi + b.hi); +} + +static 
__forceinline__ __device__ +ulonglong32to64 operator^ (const ulonglong32to64 &a, const ulonglong32to64 &b) +{ + return make_ulonglong32to64(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ +ulonglong32to64 operator+ (const ulonglong32to64 &a, const ulonglong32to64 &b) +{ + return make_ulonglong32to64(a.lo + b.lo, a.hi + b.hi); +} + + +static __forceinline__ __device__ ulonglonglong operator^ (const ulonglonglong &a, const ulonglonglong &b) +{ + return make_ulonglonglong(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); +} + +static __forceinline__ __device__ ulonglonglong operator+ (const ulonglonglong &a, const ulonglonglong &b) +{ + return make_ulonglonglong(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); +} + + +static __forceinline__ __device__ void operator^= (ulonglong2to8 &a, const ulonglong2to8 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator+= (uint4 &a, uint4 b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator+= (uchar4 &a, uchar4 b) +{ + a = a + b; +} +static __forceinline__ __device__ __host__ void operator+= (uint8 &a, const uint8 &b) +{ + a = a + b; +} +static __forceinline__ __device__ __host__ void operator+= (uint16 &a, const uint16 &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator+= (ulong8 &a, const ulong8 &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator+= (ulonglong16 &a, const ulonglong16 &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator+= (ulonglong8to16 &a, const ulonglong8to16 &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator^= (ulonglong8to16 &a, const ulonglong8to16 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator+= (ulonglong16to32 &a, const ulonglong16to32 &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator^= (ulonglong16to32 &a, const 
ulonglong16to32 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator+= (ulonglong32to64 &a, const ulonglong32to64 &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator^= (ulonglong32to64 &a, const ulonglong32to64 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator+= (ulonglonglong &a, const ulonglonglong &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator^= (ulonglonglong &a, const ulonglonglong &b) +{ + a = a ^ b; +} + +#if __CUDA_ARCH__ < 320 + +#define rotateL ROTL32 +#define rotateR ROTR32 + +#else + +static __forceinline__ __device__ uint32_t rotateL(uint32_t vec4, uint32_t shift) +{ + uint32_t ret; + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(vec4), "r"(vec4), "r"(shift)); + return ret; +} + +static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift) +{ + uint32_t ret; + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(vec4), "r"(vec4), "r"(shift)); + return ret; +} + +#endif + +#if __CUDA_ARCH__ < 320 + +// right shift a 64-bytes integer (256-bits) by 0 8 16 24 bits +// require a uint32_t[9] ret array +// note: djm neoscrypt implementation is near the limits of gpu capabilities +// and weird behaviors can happen when tuning device functions code... +__device__ static void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift) +{ + uint8_t *v = (uint8_t*)&vec4.s0; + uint8_t *r = (uint8_t*)ret; + uint8_t bytes = (uint8_t)(shift >> 3); + ret[0] = 0; + for(uint8_t i = bytes; i<32; i++) + r[i] = v[i - bytes]; + ret[8] = vec4.s7 >> (32 - shift); // shuffled part required +} + +#else + +// same for SM 3.5+, really faster ? 
+__device__ static void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift) +{ + uint32_t truc = 0, truc2 = cuda_swab32(vec4.s7), truc3 = 0; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[8] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s6); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[7] = cuda_swab32(truc); + truc2 = cuda_swab32(vec4.s5); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[6] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s4); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[5] = cuda_swab32(truc); + truc2 = cuda_swab32(vec4.s3); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[4] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s2); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[3] = cuda_swab32(truc); + truc2 = cuda_swab32(vec4.s1); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[2] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s0); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[1] = cuda_swab32(truc); + asm("shr.b32 %0, %1, %2;" : "=r"(truc) : "r"(truc3), "r"(shift)); + ret[0] = cuda_swab32(truc); +} +#endif + +#if __CUDA_ARCH__ < 320 + +// copy 256 bytes +static __device__ __inline__ uintx64 ldg256(const uint4 *ptr) +{ + uintx64 ret; + uint32_t *dst = (uint32_t*)&ret.s0; + uint32_t *src = (uint32_t*)&ptr[0].x; + for(int i = 0; i < (256 / sizeof(uint32_t)); i++) + { + dst[i] = src[i]; + } + return ret; +} + +#else + +// complicated way to copy 256 bytes ;) +static __device__ __inline__ uintx64 ldg256(const uint4 *ptr) +{ + uintx64 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.s0.s0.s0.x), "=r"(ret.s0.s0.s0.s0.y), 
"=r"(ret.s0.s0.s0.s0.z), "=r"(ret.s0.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s0.s0.s0.s1.x), "=r"(ret.s0.s0.s0.s1.y), "=r"(ret.s0.s0.s0.s1.z), "=r"(ret.s0.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s0.s0.s1.s0.x), "=r"(ret.s0.s0.s1.s0.y), "=r"(ret.s0.s0.s1.s0.z), "=r"(ret.s0.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.s0.s0.s1.s1.x), "=r"(ret.s0.s0.s1.s1.y), "=r"(ret.s0.s0.s1.s1.z), "=r"(ret.s0.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.s0.s1.s0.s0.x), "=r"(ret.s0.s1.s0.s0.y), "=r"(ret.s0.s1.s0.s0.z), "=r"(ret.s0.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.s0.s1.s0.s1.x), "=r"(ret.s0.s1.s0.s1.y), "=r"(ret.s0.s1.s0.s1.z), "=r"(ret.s0.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.s0.s1.s1.s0.x), "=r"(ret.s0.s1.s1.s0.y), "=r"(ret.s0.s1.s1.s0.z), "=r"(ret.s0.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.s0.s1.s1.s1.x), "=r"(ret.s0.s1.s1.s1.y), "=r"(ret.s0.s1.s1.s1.z), "=r"(ret.s0.s1.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+128];" : "=r"(ret.s1.s0.s0.s0.x), "=r"(ret.s1.s0.s0.s0.y), "=r"(ret.s1.s0.s0.s0.z), "=r"(ret.s1.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+144];" : "=r"(ret.s1.s0.s0.s1.x), "=r"(ret.s1.s0.s0.s1.y), "=r"(ret.s1.s0.s0.s1.z), "=r"(ret.s1.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+160];" : "=r"(ret.s1.s0.s1.s0.x), "=r"(ret.s1.s0.s1.s0.y), "=r"(ret.s1.s0.s1.s0.z), "=r"(ret.s1.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+176];" : "=r"(ret.s1.s0.s1.s1.x), "=r"(ret.s1.s0.s1.s1.y), "=r"(ret.s1.s0.s1.s1.z), "=r"(ret.s1.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 
{%0,%1,%2,%3}, [%4+192];" : "=r"(ret.s1.s1.s0.s0.x), "=r"(ret.s1.s1.s0.s0.y), "=r"(ret.s1.s1.s0.s0.z), "=r"(ret.s1.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+208];" : "=r"(ret.s1.s1.s0.s1.x), "=r"(ret.s1.s1.s0.s1.y), "=r"(ret.s1.s1.s0.s1.z), "=r"(ret.s1.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+224];" : "=r"(ret.s1.s1.s1.s0.x), "=r"(ret.s1.s1.s1.s0.y), "=r"(ret.s1.s1.s1.s0.z), "=r"(ret.s1.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s1.s1.s1.s1.x), "=r"(ret.s1.s1.s1.s1.y), "=r"(ret.s1.s1.s1.s1.z), "=r"(ret.s1.s1.s1.s1.w) : __LDG_PTR(ptr)); + return ret; +} +#endif + +#endif // #ifndef CUDA_VECTOR_H \ No newline at end of file diff --git a/neoscrypt/cuda_vector_uint2x4.cuh b/neoscrypt/cuda_vector_uint2x4.cuh new file mode 100644 index 0000000000..780fb67077 --- /dev/null +++ b/neoscrypt/cuda_vector_uint2x4.cuh @@ -0,0 +1,72 @@ +// used in tpruvot's neoscrypt code + +#ifndef CUDA_VECTOR_UINT2x4_H +#define CUDA_VECTOR_UINT2x4_H + +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif + +#include "cuda_helper.h" + +typedef struct __align__(16) uint2x4 +{ + uint2 x, y, z, w; +} uint2x4; + + +static __inline__ __device__ uint2x4 make_uint2x4(uint2 s0, uint2 s1, uint2 s2, uint2 s3) +{ + uint2x4 t; + t.x = s0; t.y = s1; t.z = s2; t.w = s3; + return t; +} + +static __forceinline__ __device__ uint2x4 operator^ (const uint2x4 &a, const uint2x4 &b) +{ + return make_uint2x4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); +} + +static __forceinline__ __device__ uint2x4 operator+ (const uint2x4 &a, const uint2x4 &b) +{ + return make_uint2x4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} + +///////////////////////// + +static __forceinline__ __device__ void operator^= (uint2x4 &a, const uint2x4 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator+= (uint2x4 &a, const uint2x4 
&b) +{ + a = a + b; +} + +#if __CUDA_ARCH__ >= 320 + +static __device__ __inline__ uint2x4 __ldg4(const uint2x4 *ptr) +{ + uint2x4 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x.x), "=r"(ret.x.y), "=r"(ret.y.x), "=r"(ret.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.z.x), "=r"(ret.z.y), "=r"(ret.w.x), "=r"(ret.w.y) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ void ldg4(const uint2x4 *ptr, uint2x4 *ret) +{ + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret[0].x.x), "=r"(ret[0].x.y), "=r"(ret[0].y.x), "=r"(ret[0].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret[0].z.x), "=r"(ret[0].z.y), "=r"(ret[0].w.x), "=r"(ret[0].w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret[1].x.x), "=r"(ret[1].x.y), "=r"(ret[1].y.x), "=r"(ret[1].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret[1].z.x), "=r"(ret[1].z.y), "=r"(ret[1].w.x), "=r"(ret[1].w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret[2].x.x), "=r"(ret[2].x.y), "=r"(ret[2].y.x), "=r"(ret[2].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret[2].z.x), "=r"(ret[2].z.y), "=r"(ret[2].w.x), "=r"(ret[2].w.y) : __LDG_PTR(ptr)); +} +#elif !defined(__ldg4) +#define __ldg4(x) (*(x)) +#define ldg4(ptr, ret) { *(ret) = (*(ptr)); } +#endif + +#endif // H \ No newline at end of file diff --git a/neoscrypt/neoscrypt.cu b/neoscrypt/neoscrypt.cu new file mode 100644 index 0000000000..9d7e9a6272 --- /dev/null +++ b/neoscrypt/neoscrypt.cu @@ -0,0 +1,201 @@ +#include +#include "cuda_helper.h" +#include "miner.h" +#include "sph/neoscrypt.h" + +extern void neoscrypt_setBlockTarget(int thr_id, uint32_t* pdata, const void *target); +extern void neoscrypt_cpu_init_2stream(int thr_id, uint32_t threads); +extern void neoscrypt_cpu_hash_k4_2stream(bool stratum, int thr_id, uint32_t 
threads, uint32_t startNounce, uint32_t *result); +//extern void neoscrypt_cpu_hash_k4_52(int stratum, int thr_id, int threads, uint32_t startNounce, int order, uint32_t* foundnonce); +void neoscrypt_init(int thr_id, uint32_t threads); +void neoscrypt_setBlockTarget_tpruvot(uint32_t* const pdata, uint32_t* const target); +void neoscrypt_hash_tpruvot(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum); + +int scanhash_neoscrypt(bool stratum, int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) +{ + const uint32_t first_nonce = pdata[19]; + uint32_t throughput; + static THREAD uint32_t throughputmax; + + static THREAD volatile bool init = false; + static THREAD uint32_t hw_errors = 0; + static THREAD uint32_t *foundNonce = nullptr; + static THREAD bool use_tpruvot = false; + + if(opt_benchmark) + { + ptarget[7] = 0x01ff; + stratum = 0; + } + + if(!init) + { + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + + cudaDeviceProp props; + cudaGetDeviceProperties(&props, device_map[thr_id]); + unsigned int cc = props.major * 10 + props.minor; + if(cc < 32) + { + applog(LOG_ERR, "GPU #%d: this gpu is not supported", device_map[thr_id]); + mining_has_stopped[thr_id] = true; + proper_exit(2); + } + unsigned int intensity = (256 * 64 * 1); // -i 14 + if(strstr(props.name, "1080 Ti")) + { + intensity = 256 * 64 * 5; + use_tpruvot = true; + } + else if(strstr(props.name, "1080")) + { + intensity = 256 * 64 * 5; + } + else if(strstr(props.name, "1070")) + { + intensity = 256 * 64 * 5; + } + else if(strstr(props.name, "970")) + { + intensity = (256 * 64 * 5); + } + else if(strstr(props.name, "980")) + { + intensity = (256 * 64 * 5); + } + else if(strstr(props.name, "980 Ti")) + { + intensity = (256 * 64 * 5); + } + else if(strstr(props.name, "750 Ti")) + { + intensity = (256 * 64 * 3); + } + else if(strstr(props.name, "750")) + { + intensity = (256 * 64 * 1); + } + else if(strstr(props.name, "960")) + 
{ + intensity = (256 * 64 * 2); + } + else if(strstr(props.name, "950")) + { + intensity = (256 * 64 * 2); + } + + throughputmax = device_intensity(device_map[thr_id], __func__, intensity) / 2; + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + // cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaMallocHost(&foundNonce, 2 * 4)); + +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (32 * 128 * sizeof(uint64_t))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + proper_exit(2); + } +#endif + if(use_tpruvot) + neoscrypt_init(thr_id, throughputmax); + else + neoscrypt_cpu_init_2stream(thr_id, throughputmax); + mining_has_stopped[thr_id] = false; + init = true; + } + throughput = min(throughputmax, (max_nonce - first_nonce) / 2) & 0xffffff00; + + uint32_t endiandata[20]; + for(int k = 0; k < 20; k++) + { + if(stratum) + be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + else endiandata[k] = pdata[k]; + } + if(use_tpruvot) + neoscrypt_setBlockTarget_tpruvot(endiandata, ptarget); + else + neoscrypt_setBlockTarget(thr_id, endiandata, ptarget); + + + do + { + if(use_tpruvot) + neoscrypt_hash_tpruvot(thr_id, throughput, pdata[19], foundNonce, stratum); + else + neoscrypt_cpu_hash_k4_2stream(stratum, thr_id, throughput, pdata[19], foundNonce); + if(stop_mining) + { + mining_has_stopped[thr_id] = true; pthread_exit(nullptr); + } + if(foundNonce[0] != 0xffffffff) + { + uint32_t vhash64[8]={0}; + if(opt_verify) + { + if(stratum) + be32enc(&endiandata[19], foundNonce[0]); + else + endiandata[19] = foundNonce[0]; + neoscrypt((unsigned char*)endiandata, (unsigned char*)vhash64, 0x80000620); + } + if(vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) + { + *hashes_done = pdata[19] - first_nonce + throughput; + int res = 1; + if(opt_benchmark) + applog(LOG_INFO, "GPU #%d Found nonce %08x", device_map[thr_id], foundNonce[0]); + pdata[19] = foundNonce[0]; + 
if(foundNonce[1] != 0xffffffff) + { + if(opt_verify) + { + if(stratum) + { + be32enc(&endiandata[19], foundNonce[1]); + } + else + { + endiandata[19] = foundNonce[1]; + } + neoscrypt((unsigned char*)endiandata, (unsigned char*)vhash64, 0x80000620); + } + if(vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) + { + pdata[21] = foundNonce[1]; + res++; + if(opt_benchmark) + applog(LOG_INFO, "GPU #%d: Found second nonce %08x", device_map[thr_id], foundNonce[1]); + } + else + { + if(vhash64[7] != ptarget[7]) + { + applog(LOG_WARNING, "GPU #%d: Second nonce $%08X does not validate on CPU!", device_map[thr_id], foundNonce[1]); + hw_errors++; + } + } + + } + return res; + } + else + { + if(vhash64[7] != ptarget[7]) + { + applog(LOG_WARNING, "GPU #%d: Nonce $%08X does not validate on CPU!", device_map[thr_id], foundNonce[0]); + hw_errors++; + } + } +// if(hw_errors > 0) applog(LOG_WARNING, "Hardware errors: %u", hw_errors); + } + pdata[19] += throughput; + } while(!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); + *hashes_done = pdata[19] - first_nonce ; + return 0; +} + diff --git a/nvml.cpp b/nvml.cpp index 8be79b09ce..593abe73ce 100644 --- a/nvml.cpp +++ b/nvml.cpp @@ -15,20 +15,16 @@ * */ +#include #include #include #include -#ifndef _MSC_VER -#include -#endif +#include #include "miner.h" #include "nvml.h" #include "cuda_runtime.h" -// cuda.cpp -int cuda_num_devices(); - #ifdef USE_WRAPNVML extern nvml_handle *hnvml; @@ -36,6 +32,15 @@ extern char driver_version[32]; static uint32_t device_bus_ids[MAX_GPUS] = { 0 }; +extern uint32_t device_gpu_clocks[MAX_GPUS]; +extern uint32_t device_mem_clocks[MAX_GPUS]; +extern uint32_t device_plimit[MAX_GPUS]; +extern int8_t device_pstate[MAX_GPUS]; + +uint32_t clock_prev[MAX_GPUS] = { 0 }; +uint32_t clock_prev_mem[MAX_GPUS] = { 0 }; +uint32_t limit_prev[MAX_GPUS] = { 0 }; + /* * Wrappers to emulate dlopen() on other systems like Windows */ @@ -110,14 +115,12 @@ 
nvml_handle * nvml_create() nvmlh->nvml_dll = nvml_dll; - nvmlh->nvmlInit = (nvmlReturn_t (*)(void)) - wrap_dlsym(nvmlh->nvml_dll, "nvmlInit_v2"); - if (!nvmlh->nvmlInit) { - nvmlh->nvmlInit = (nvmlReturn_t (*)(void)) - wrap_dlsym(nvmlh->nvml_dll, "nvmlInit"); - } - nvmlh->nvmlDeviceGetCount = (nvmlReturn_t (*)(int *)) - wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetCount_v2"); + nvmlh->nvmlInit = (nvmlReturn_t (*)(void)) wrap_dlsym(nvmlh->nvml_dll, "nvmlInit_v2"); + if (!nvmlh->nvmlInit) + nvmlh->nvmlInit = (nvmlReturn_t (*)(void)) wrap_dlsym(nvmlh->nvml_dll, "nvmlInit"); + nvmlh->nvmlDeviceGetCount = (nvmlReturn_t (*)(int *)) wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetCount_v2"); + if (!nvmlh->nvmlDeviceGetCount) + nvmlh->nvmlDeviceGetCount = (nvmlReturn_t (*)(int *)) wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetCount"); nvmlh->nvmlDeviceGetHandleByIndex = (nvmlReturn_t (*)(int, nvmlDevice_t *)) wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetHandleByIndex_v2"); nvmlh->nvmlDeviceGetAPIRestriction = (nvmlReturn_t (*)(nvmlDevice_t, nvmlRestrictedAPI_t, nvmlEnableState_t *)) @@ -130,10 +133,37 @@ nvml_handle * nvml_create() wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetApplicationsClock"); nvmlh->nvmlDeviceSetApplicationsClocks = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int mem, unsigned int gpu)) wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceSetApplicationsClocks"); + nvmlh->nvmlDeviceResetApplicationsClocks = (nvmlReturn_t (*)(nvmlDevice_t)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceResetApplicationsClocks"); + nvmlh->nvmlDeviceGetSupportedGraphicsClocks = (nvmlReturn_t (*)(nvmlDevice_t, uint32_t mem, uint32_t *num, uint32_t *)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetSupportedGraphicsClocks"); + nvmlh->nvmlDeviceGetSupportedMemoryClocks = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *count, unsigned int *clocksMHz)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetSupportedMemoryClocks"); nvmlh->nvmlDeviceGetClockInfo = (nvmlReturn_t (*)(nvmlDevice_t, nvmlClockType_t, unsigned int *clock)) 
wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetClockInfo"); - nvmlh->nvmlDeviceGetPciInfo = (nvmlReturn_t (*)(nvmlDevice_t, nvmlPciInfo_t *)) - wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPciInfo"); + nvmlh->nvmlDeviceGetMaxClockInfo = (nvmlReturn_t (*)(nvmlDevice_t, nvmlClockType_t, unsigned int *clock)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetMaxClockInfo"); + nvmlh->nvmlDeviceGetPciInfo = (nvmlReturn_t (*)(nvmlDevice_t, nvmlPciInfo_t *)) wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPciInfo_v2"); + if (!nvmlh->nvmlDeviceGetPciInfo) + nvmlh->nvmlDeviceGetPciInfo = (nvmlReturn_t (*)(nvmlDevice_t, nvmlPciInfo_t *)) wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPciInfo"); + nvmlh->nvmlDeviceGetCurrPcieLinkGeneration = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *gen)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetCurrPcieLinkGeneration"); + nvmlh->nvmlDeviceGetCurrPcieLinkWidth = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *width)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetCurrPcieLinkWidth"); + nvmlh->nvmlDeviceGetMaxPcieLinkGeneration = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *gen)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetMaxPcieLinkGeneration"); + nvmlh->nvmlDeviceGetMaxPcieLinkWidth = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *width)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetMaxPcieLinkWidth"); + nvmlh->nvmlDeviceGetPowerUsage = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPowerUsage"); + nvmlh->nvmlDeviceGetPowerManagementDefaultLimit = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *limit)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPowerManagementDefaultLimit"); + nvmlh->nvmlDeviceGetPowerManagementLimit = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *limit)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPowerManagementLimit"); + nvmlh->nvmlDeviceGetPowerManagementLimitConstraints = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *min, unsigned int *max)) + wrap_dlsym(nvmlh->nvml_dll, 
"nvmlDeviceGetPowerManagementLimitConstraints"); + nvmlh->nvmlDeviceSetPowerManagementLimit = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int limit)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceSetPowerManagementLimit"); nvmlh->nvmlDeviceGetName = (nvmlReturn_t (*)(nvmlDevice_t, char *, int)) wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetName"); nvmlh->nvmlDeviceGetTemperature = (nvmlReturn_t (*)(nvmlDevice_t, int, unsigned int *)) @@ -141,7 +171,7 @@ nvml_handle * nvml_create() nvmlh->nvmlDeviceGetFanSpeed = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *)) wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetFanSpeed"); nvmlh->nvmlDeviceGetPerformanceState = (nvmlReturn_t (*)(nvmlDevice_t, int *)) - wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPowerUsage"); + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPerformanceState"); /* or nvmlDeviceGetPowerState */ nvmlh->nvmlDeviceGetSerial = (nvmlReturn_t (*)(nvmlDevice_t, char *, unsigned int)) wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetSerial"); nvmlh->nvmlDeviceGetUUID = (nvmlReturn_t (*)(nvmlDevice_t, char *, unsigned int)) @@ -154,17 +184,26 @@ nvml_handle * nvml_create() wrap_dlsym(nvmlh->nvml_dll, "nvmlErrorString"); nvmlh->nvmlShutdown = (nvmlReturn_t (*)()) wrap_dlsym(nvmlh->nvml_dll, "nvmlShutdown"); + // v331 + nvmlh->nvmlDeviceGetEnforcedPowerLimit = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *limit)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetEnforcedPowerLimit"); + // v340 + /* NVML_ERROR_NOT_SUPPORTED + nvmlh->nvmlDeviceGetAutoBoostedClocksEnabled = (nvmlReturn_t (*)(nvmlDevice_t, nvmlEnableState_t *isEnabled, nvmlEnableState_t *defaultIsEnabled)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetAutoBoostedClocksEnabled"); + nvmlh->nvmlDeviceSetAutoBoostedClocksEnabled = (nvmlReturn_t (*)(nvmlDevice_t, nvmlEnableState_t enabled)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceSetAutoBoostedClocksEnabled"); */ + // v346 + nvmlh->nvmlDeviceGetPcieThroughput = (nvmlReturn_t (*)(nvmlDevice_t, nvmlPcieUtilCounter_t, unsigned int *value)) + 
wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPcieThroughput"); if (nvmlh->nvmlInit == NULL || nvmlh->nvmlShutdown == NULL || nvmlh->nvmlErrorString == NULL || - nvmlh->nvmlSystemGetDriverVersion == NULL || nvmlh->nvmlDeviceGetCount == NULL || nvmlh->nvmlDeviceGetHandleByIndex == NULL || nvmlh->nvmlDeviceGetPciInfo == NULL || - nvmlh->nvmlDeviceGetName == NULL || - nvmlh->nvmlDeviceGetTemperature == NULL || - nvmlh->nvmlDeviceGetFanSpeed == NULL) + nvmlh->nvmlDeviceGetName == NULL) { if (opt_debug) applog(LOG_DEBUG, "Failed to obtain required NVML function pointers"); @@ -172,10 +211,20 @@ nvml_handle * nvml_create() free(nvmlh); return NULL; } + nvmlReturn_t rc; + rc = nvmlh->nvmlInit(); + if(rc != NVML_SUCCESS) + { + applog(LOG_WARNING, "nvmlInit() failed: %s", nvmlh->nvmlErrorString(rc)); + return NULL; + } - nvmlh->nvmlInit(); - nvmlh->nvmlSystemGetDriverVersion(driver_version, sizeof(driver_version)); - nvmlh->nvmlDeviceGetCount(&nvmlh->nvml_gpucount); + rc = nvmlh->nvmlSystemGetDriverVersion(driver_version, sizeof(driver_version)); + if(rc != NVML_SUCCESS) + applog(LOG_WARNING, "nvmlSystemGetDriverVersion() failed: %s", nvmlh->nvmlErrorString(rc)); + rc = nvmlh->nvmlDeviceGetCount(&nvmlh->nvml_gpucount); + if(rc != NVML_SUCCESS) + applog(LOG_WARNING, "nvmlDeviceGetCount() failed: %s", nvmlh->nvmlErrorString(rc)); /* Query CUDA device count, in case it doesn't agree with NVML, since */ /* CUDA will only report GPUs with compute capability greater than 1.0 */ @@ -197,8 +246,11 @@ nvml_handle * nvml_create() nvmlh->app_clocks = (nvmlEnableState_t*) calloc(nvmlh->nvml_gpucount, sizeof(nvmlEnableState_t)); /* Obtain GPU device handles we're going to need repeatedly... 
*/ - for (i=0; invml_gpucount; i++) { - nvmlh->nvmlDeviceGetHandleByIndex(i, &nvmlh->devs[i]); + for (i=0; invml_gpucount; i++) + { + rc = nvmlh->nvmlDeviceGetHandleByIndex(i, &nvmlh->devs[i]); + if(rc != NVML_SUCCESS) + applog(LOG_WARNING, "GPU %d: nvmlDeviceGetHandleByIndex() failed: %s", i, nvmlh->nvmlErrorString(rc)); } /* Query PCI info for each NVML device, and build table for mapping of */ @@ -210,32 +262,23 @@ nvml_handle * nvml_create() nvmlh->nvml_pci_domain_id[i] = pciinfo.domain; nvmlh->nvml_pci_bus_id[i] = pciinfo.bus; nvmlh->nvml_pci_device_id[i] = pciinfo.device; - nvmlh->nvml_pci_subsys_id[i] = pciinfo.pci_device_id; + nvmlh->nvml_pci_subsys_id[i] = pciinfo.pci_subsystem_id; nvmlh->app_clocks[i] = NVML_FEATURE_UNKNOWN; - if (nvmlh->nvmlDeviceSetAPIRestriction) { - nvmlh->nvmlDeviceSetAPIRestriction(nvmlh->devs[i], NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, + if (nvmlh->nvmlDeviceSetAPIRestriction) + { + rc = nvmlh->nvmlDeviceSetAPIRestriction(nvmlh->devs[i], NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, NVML_FEATURE_ENABLED); + if(rc != NVML_SUCCESS && opt_debug) + applog(LOG_WARNING, "Device %d: nvmlDeviceSetAPIRestriction() failed: %s", nvmlh->devs[i], nvmlh->nvmlErrorString(rc)); /* there is only this API_SET_APPLICATION_CLOCKS on the 750 Ti (340.58) */ } - if (nvmlh->nvmlDeviceGetAPIRestriction) { - nvmlh->nvmlDeviceGetAPIRestriction(nvmlh->devs[i], NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, + if (nvmlh->nvmlDeviceGetAPIRestriction) + { + rc = nvmlh->nvmlDeviceGetAPIRestriction(nvmlh->devs[i], NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &nvmlh->app_clocks[i]); - if (nvmlh->app_clocks[i] == NVML_FEATURE_ENABLED && opt_debug) { - applog(LOG_DEBUG, "NVML application clock feature is allowed"); -#if 0 - uint32_t mem; - nvmlReturn_t rc; - rc = nvmlh->nvmlDeviceGetDefaultApplicationsClock(nvmlh->devs[i], NVML_CLOCK_MEM, &mem); - if (rc == NVML_SUCCESS) - applog(LOG_DEBUG, "nvmlDeviceGetDefaultApplicationsClock: mem %u", mem); - else - 
applog(LOG_DEBUG, "nvmlDeviceGetDefaultApplicationsClock: %s", nvmlh->nvmlErrorString(rc)); - rc = nvmlh->nvmlDeviceSetApplicationsClocks(nvmlh->devs[i], mem, 1228000); - if (rc != NVML_SUCCESS) - applog(LOG_DEBUG, "nvmlDeviceSetApplicationsClocks: %s", nvmlh->nvmlErrorString(rc)); -#endif - } + if(rc != NVML_SUCCESS) + applog(LOG_WARNING, "Device %d: nvmlDeviceGetAPIRestriction() failed: %s", nvmlh->devs[i], nvmlh->nvmlErrorString(rc)); } } @@ -254,7 +297,7 @@ nvml_handle * nvml_create() (nvmlh->nvml_pci_bus_id[j] == (uint32_t) props.pciBusID) && (nvmlh->nvml_pci_device_id[j] == (uint32_t) props.pciDeviceID)) { if (opt_debug) - applog(LOG_DEBUG, "CUDA GPU#%d matches NVML GPU %d by busId %u", + applog(LOG_DEBUG, "CUDA GPU %d matches NVML GPU %d by busId %u", i, j, (uint32_t) props.pciBusID); nvmlh->nvml_cuda_device_id[j] = i; nvmlh->cuda_nvml_device_id[i] = j; @@ -266,6 +309,279 @@ nvml_handle * nvml_create() return nvmlh; } +#define MAXCLOCKS 255 +/* apply config clocks to an used device */ +int nvml_set_clocks(nvml_handle *nvmlh, int dev_id) +{ + nvmlReturn_t rc; + uint32_t gpu_clk = 0, mem_clk = 0; + int n = nvmlh->cuda_nvml_device_id[dev_id]; + if (n < 0 || n >= nvmlh->nvml_gpucount) + return -ENODEV; + + if (!device_gpu_clocks[dev_id] && !device_mem_clocks[dev_id]) + return 0; // nothing to do + + if (nvmlh->app_clocks[n] != NVML_FEATURE_ENABLED) { + applog(LOG_WARNING, "GPU #%d: NVML application clock feature is not allowed!", dev_id); + return -EPERM; + } + + uint32_t mem_prev = clock_prev_mem[dev_id]; + if(!mem_prev) + { + rc = nvmlh->nvmlDeviceGetApplicationsClock(nvmlh->devs[n], NVML_CLOCK_MEM, &mem_prev); + if(rc != NVML_SUCCESS) + { + applog(LOG_WARNING, "GPU #%d: unable to query memory clock", dev_id); + return -1; + } + } + uint32_t gpu_prev = clock_prev[dev_id]; + if(!gpu_prev) + { + rc = nvmlh->nvmlDeviceGetApplicationsClock(nvmlh->devs[n], NVML_CLOCK_GRAPHICS, &gpu_prev); + if(rc != NVML_SUCCESS) + { + applog(LOG_WARNING, "GPU #%d: unable to query 
graphics clock", dev_id); + return -1; + } + } + + rc = nvmlh->nvmlDeviceGetDefaultApplicationsClock(nvmlh->devs[n], NVML_CLOCK_MEM, &mem_clk); + if(rc != NVML_SUCCESS) + { + applog(LOG_WARNING, "GPU #%d: unable to query default memory clock", dev_id); + return -1; + } + rc = nvmlh->nvmlDeviceGetDefaultApplicationsClock(nvmlh->devs[n], NVML_CLOCK_GRAPHICS, &gpu_clk); + if (rc != NVML_SUCCESS) { + applog(LOG_WARNING, "GPU #%d: unable to query default graphics clock", dev_id); + return -1; + } + + if (opt_debug) + applog(LOG_DEBUG, "GPU #%d: default application clocks are %u/%u", dev_id, mem_clk, gpu_clk); + + // get application config values + if (device_mem_clocks[dev_id]) mem_clk = device_mem_clocks[dev_id]; + if (device_gpu_clocks[dev_id]) gpu_clk = device_gpu_clocks[dev_id]; + + // these functions works for the 960 and the 970 (346.72+), not for the 750 Ti + uint32_t nclocks = MAXCLOCKS; + uint32_t clocks[MAXCLOCKS] = {0}; + + rc = nvmlh->nvmlDeviceGetSupportedMemoryClocks(nvmlh->devs[n], &nclocks, clocks); + if(rc != NVML_SUCCESS) + { + applog(LOG_WARNING, "GPU #%d: unable to query supported memory clocks", dev_id); + return -1; + } + for (uint8_t u=0; u < nclocks; u++) { + // ordered by pstate (so highest is first memory clock - P0) + if(clocks[u] <= mem_clk) + { + mem_clk = clocks[u]; + break; + } + } + + nclocks = MAXCLOCKS; + rc = nvmlh->nvmlDeviceGetSupportedGraphicsClocks(nvmlh->devs[n], mem_clk, &nclocks, clocks); + if(rc != NVML_SUCCESS) + { + applog(LOG_WARNING, "GPU #%d: unable to query supported graphics clocks", dev_id); + return -1; + } + for (uint8_t u=0; u < nclocks; u++) { + // ordered desc, so get first + if (clocks[u] <= gpu_clk) { + gpu_clk = clocks[u]; + break; + } + } + + rc = nvmlh->nvmlDeviceSetApplicationsClocks(nvmlh->devs[n], mem_clk, gpu_clk); + if (rc == NVML_SUCCESS) + applog(LOG_INFO, "GPU #%d: application clocks set to %u/%u", dev_id, mem_clk, gpu_clk); + else { + applog(LOG_WARNING, "GPU #%d: %u/%u - %s", dev_id, mem_clk, 
gpu_clk, nvmlh->nvmlErrorString(rc)); + return -1; + } + + // store previous clocks for reset on exit (or during wait...) + clock_prev[dev_id] = gpu_prev; + clock_prev_mem[dev_id] = mem_prev; + return 1; +} + +/* reset default app clocks and limits on exit */ +int nvml_reset_clocks(nvml_handle *nvmlh, int dev_id) +{ + int ret = 0; + nvmlReturn_t rc; + uint32_t gpu_clk = 0, mem_clk = 0; + int n = nvmlh->cuda_nvml_device_id[dev_id]; + if (n < 0 || n >= nvmlh->nvml_gpucount) + return -ENODEV; + + if (clock_prev[dev_id]) { + rc = nvmlh->nvmlDeviceResetApplicationsClocks(nvmlh->devs[n]); + if (rc != NVML_SUCCESS) { + applog(LOG_WARNING, "GPU #%d: unable to reset application clocks", dev_id); + } + clock_prev[dev_id] = 0; + ret = 1; + } + + if (limit_prev[dev_id]) { + uint32_t plimit = limit_prev[dev_id]; + if (nvmlh->nvmlDeviceGetPowerManagementDefaultLimit && !plimit) { + rc = nvmlh->nvmlDeviceGetPowerManagementDefaultLimit(nvmlh->devs[n], &plimit); + } else if (plimit) { + rc = NVML_SUCCESS; + } + if (rc == NVML_SUCCESS) + nvmlh->nvmlDeviceSetPowerManagementLimit(nvmlh->devs[n], plimit); + ret = 1; + } + return ret; +} + + +/** + * Set power state of a device (9xx) + * Code is similar as clocks one, which allow the change of the pstate + */ +int nvml_set_pstate(nvml_handle *nvmlh, int dev_id) +{ + nvmlReturn_t rc; + uint32_t gpu_clk = 0, mem_clk = 0; + int n = nvmlh->cuda_nvml_device_id[dev_id]; + if (n < 0 || n >= nvmlh->nvml_gpucount) + return -ENODEV; + + if (device_pstate[dev_id] < 0) + return 0; + + if (nvmlh->app_clocks[n] != NVML_FEATURE_ENABLED) { + applog(LOG_WARNING, "GPU #%d: NVML app. 
clock feature is not allowed!", dev_id); + return -EPERM; + } + + rc = nvmlh->nvmlDeviceGetDefaultApplicationsClock(nvmlh->devs[n], NVML_CLOCK_MEM, &mem_clk); + if(rc != NVML_SUCCESS) + { + applog(LOG_WARNING, "GPU #%d: nvmlDeviceGetDefaultApplicationsClock: %s", dev_id, nvmlh->nvmlErrorString(rc)); + return -1; + } + rc = nvmlh->nvmlDeviceGetDefaultApplicationsClock(nvmlh->devs[n], NVML_CLOCK_GRAPHICS, &gpu_clk); + if (rc != NVML_SUCCESS) { + applog(LOG_WARNING, "GPU #%d: unable to query application clocks", dev_id); + return -EINVAL; + } + + // get application config values + if (device_mem_clocks[dev_id]) mem_clk = device_mem_clocks[dev_id]; + if (device_gpu_clocks[dev_id]) gpu_clk = device_gpu_clocks[dev_id]; + + // these functions works for the 960 and the 970 (346.72+), not for the 750 Ti + uint32_t clocks[MAXCLOCKS] = {0}; + uint32_t nclocks = MAXCLOCKS; + int8_t wanted_pstate = device_pstate[dev_id]; + rc = nvmlh->nvmlDeviceGetSupportedMemoryClocks(nvmlh->devs[n], &nclocks, clocks); + if(rc != NVML_SUCCESS) + { + applog(LOG_WARNING, "GPU #%d: nvmlDeviceGetSupportedMemoryClocks: %s", dev_id, nvmlh->nvmlErrorString(rc)); + return -1; + } + if(wanted_pstate < 0) + return -1; + if(wanted_pstate < nclocks) + { + mem_clk = clocks[wanted_pstate]; + } + else + { + applog(LOG_WARNING, "GPU #%d: pstate %d is unsupported"); + return -1; + } + + nclocks = MAXCLOCKS; + rc = nvmlh->nvmlDeviceGetSupportedGraphicsClocks(nvmlh->devs[n], mem_clk, &nclocks, clocks); + if(rc != NVML_SUCCESS) + { + applog(LOG_WARNING, "GPU #%d: nvmlDeviceGetSupportedGraphicsClocks: %s", dev_id, nvmlh->nvmlErrorString(rc)); + return -1; + } + if(device_gpu_clocks[dev_id] == 0) + gpu_clk = 9999; + for(uint8_t u = 0; u < nclocks; u++) + { + // ordered desc, so get first + if(clocks[u] <= gpu_clk) + { + gpu_clk = clocks[u]; + break; + } + } + + rc = nvmlh->nvmlDeviceSetApplicationsClocks(nvmlh->devs[n], mem_clk, gpu_clk); + if (rc != NVML_SUCCESS) { + applog(LOG_WARNING, "GPU #%d: pstate %s", 
dev_id, nvmlh->nvmlErrorString(rc)); + return -1; + } + + if (!opt_quiet) + applog(LOG_INFO, "GPU #%d: app clocks set to P%d (%u/%u)", dev_id, (int)wanted_pstate, mem_clk, gpu_clk); + + clock_prev[dev_id] = 1; + return 1; +} + +int nvml_set_plimit(nvml_handle *nvmlh, int dev_id) +{ + nvmlReturn_t rc = NVML_ERROR_UNKNOWN; + uint32_t gpu_clk = 0, mem_clk = 0; + int n = nvmlh->cuda_nvml_device_id[dev_id]; + if (n < 0 || n >= nvmlh->nvml_gpucount) + return -ENODEV; + + if (!device_plimit[dev_id]) + return 0; // nothing to do + + if (!nvmlh->nvmlDeviceSetPowerManagementLimit) + return -ENOSYS; + + uint32_t plimit = device_plimit[dev_id] * 1000; + uint32_t pmin = 1000, pmax = 0, prev_limit = 0; + if (nvmlh->nvmlDeviceGetPowerManagementLimitConstraints) + rc = nvmlh->nvmlDeviceGetPowerManagementLimitConstraints(nvmlh->devs[n], &pmin, &pmax); + + if (rc != NVML_SUCCESS) { + if (!nvmlh->nvmlDeviceGetPowerManagementLimit) + return -ENOSYS; + } + nvmlh->nvmlDeviceGetPowerManagementLimit(nvmlh->devs[n], &prev_limit); + if (!pmax) pmax = prev_limit; + + plimit = min(plimit, pmax); + plimit = max(plimit, pmin); + rc = nvmlh->nvmlDeviceSetPowerManagementLimit(nvmlh->devs[n], plimit); + if (rc != NVML_SUCCESS) { + applog(LOG_WARNING, "GPU #%d: plimit %s", dev_id, nvmlh->nvmlErrorString(rc)); + return -1; + } + + if (!opt_quiet) { + applog(LOG_INFO, "GPU #%d: power limit set to %uW (allowed range is %u-%u)", + dev_id, plimit/1000U, pmin/1000U, pmax/1000U); + } + + limit_prev[dev_id] = prev_limit; + return 1; +} + int nvml_get_gpucount(nvml_handle *nvmlh, int *gpucount) { *gpucount = nvmlh->nvml_gpucount; @@ -283,7 +599,10 @@ int nvml_get_gpu_name(nvml_handle *nvmlh, int cudaindex, char *namebuf, int bufs { int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) - return -1; + return -ENODEV; + + if (!nvmlh->nvmlDeviceGetName) + return -ENOSYS; if (nvmlh->nvmlDeviceGetName(nvmlh->devs[gpuindex], namebuf, bufsize) != NVML_SUCCESS) 
return -1; @@ -297,7 +616,10 @@ int nvml_get_tempC(nvml_handle *nvmlh, int cudaindex, unsigned int *tempC) nvmlReturn_t rc; int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) - return -1; + return -ENODEV; + + if (!nvmlh->nvmlDeviceGetTemperature) + return -ENOSYS; rc = nvmlh->nvmlDeviceGetTemperature(nvmlh->devs[gpuindex], 0u /* NVML_TEMPERATURE_GPU */, tempC); if (rc != NVML_SUCCESS) { @@ -313,7 +635,10 @@ int nvml_get_fanpcnt(nvml_handle *nvmlh, int cudaindex, unsigned int *fanpcnt) nvmlReturn_t rc; int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) - return -1; + return -ENODEV; + + if (!nvmlh->nvmlDeviceGetFanSpeed) + return -ENOSYS; rc = nvmlh->nvmlDeviceGetFanSpeed(nvmlh->devs[gpuindex], fanpcnt); if (rc != NVML_SUCCESS) { @@ -328,12 +653,15 @@ int nvml_get_power_usage(nvml_handle *nvmlh, int cudaindex, unsigned int *milliw { int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) - return -1; + return -ENODEV; + + if (!nvmlh->nvmlDeviceGetPowerUsage) + return -ENOSYS; nvmlReturn_t res = nvmlh->nvmlDeviceGetPowerUsage(nvmlh->devs[gpuindex], milliwatts); if (res != NVML_SUCCESS) { - if (opt_debug) - applog(LOG_DEBUG, "nvmlDeviceGetPowerUsage: %s", nvmlh->nvmlErrorString(res)); + //if (opt_debug) + // applog(LOG_DEBUG, "nvmlDeviceGetPowerUsage: %s", nvmlh->nvmlErrorString(res)); return -1; } @@ -345,7 +673,10 @@ int nvml_get_pstate(nvml_handle *nvmlh, int cudaindex, int *pstate) { int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) - return -1; + return -ENODEV; + + if (!nvmlh->nvmlDeviceGetPerformanceState) + return -ENOSYS; nvmlReturn_t res = nvmlh->nvmlDeviceGetPerformanceState(nvmlh->devs[gpuindex], pstate); if (res != NVML_SUCCESS) { @@ -361,7 +692,7 @@ int nvml_get_busid(nvml_handle *nvmlh, int cudaindex, int *busid) { int gpuindex = 
nvmlh->cuda_nvml_device_id[cudaindex]; if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) - return -1; + return -ENODEV; (*busid) = nvmlh->nvml_pci_bus_id[gpuindex]; return 0; @@ -374,13 +705,17 @@ int nvml_get_serial(nvml_handle *nvmlh, int cudaindex, char *sn, int maxlen) int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; nvmlReturn_t res; if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) - return -1; + return -ENODEV; - res = nvmlh->nvmlDeviceGetSerial(nvmlh->devs[gpuindex], sn, maxlen); - if (res == NVML_SUCCESS) { - return 0; + if (nvmlh->nvmlDeviceGetSerial) { + res = nvmlh->nvmlDeviceGetSerial(nvmlh->devs[gpuindex], sn, maxlen); + if (res == NVML_SUCCESS) + return 0; } + if (!nvmlh->nvmlDeviceGetUUID) + return -ENOSYS; + // nvmlDeviceGetUUID: GPU-f2bd642c-369f-5a14-e0b4-0d22dfe9a1fc // use a part of uuid to generate an unique serial // todo: check if there is vendor id is inside @@ -401,7 +736,10 @@ int nvml_get_bios(nvml_handle *nvmlh, int cudaindex, char *desc, int maxlen) uint32_t subids = 0; int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) - return -1; + return -ENODEV; + + if (!nvmlh->nvmlDeviceGetVbiosVersion) + return -ENOSYS; nvmlReturn_t res = nvmlh->nvmlDeviceGetVbiosVersion(nvmlh->devs[gpuindex], desc, maxlen); if (res != NVML_SUCCESS) { @@ -412,16 +750,17 @@ int nvml_get_bios(nvml_handle *nvmlh, int cudaindex, char *desc, int maxlen) return 0; } -int nvml_get_info(nvml_handle *nvmlh, int cudaindex, uint16_t *vid, uint16_t *pid) +int nvml_get_info(nvml_handle *nvmlh, int cudaindex, uint16_t &vid, uint16_t &pid) { uint32_t subids = 0; int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) - return -1; + return -ENODEV; subids = nvmlh->nvml_pci_subsys_id[gpuindex]; - (*pid) = subids >> 16; - (*vid) = subids & 0xFFFF; + if (!subids) subids = nvmlh->nvml_pci_device_id[gpuindex]; + pid = subids >> 16; + vid = subids & 0xFFFF; return 
0; } @@ -461,7 +800,7 @@ int nvapi_temperature(unsigned int devNum, unsigned int *temperature) NvAPI_Status ret; if (devNum >= nvapi_dev_cnt) - return -1; + return -ENODEV; NV_GPU_THERMAL_SETTINGS thermal; thermal.version = NV_GPU_THERMAL_SETTINGS_VER; @@ -484,7 +823,7 @@ int nvapi_fanspeed(unsigned int devNum, unsigned int *speed) NvAPI_Status ret; if (devNum >= nvapi_dev_cnt) - return -1; + return -ENODEV; NvU32 fanspeed = 0; ret = NvAPI_GPU_GetTachReading(phys[devNum], &fanspeed); @@ -506,7 +845,7 @@ int nvapi_getpstate(unsigned int devNum, unsigned int *power) NvAPI_Status ret; if (devNum >= nvapi_dev_cnt) - return -1; + return -ENODEV; NV_GPU_PERF_PSTATE_ID CurrentPstate = NVAPI_GPU_PERF_PSTATE_UNDEFINED; /* 16 */ ret = NvAPI_GPU_GetCurrentPstate(phys[devNum], &CurrentPstate); @@ -531,7 +870,7 @@ int nvapi_getusage(unsigned int devNum, unsigned int *pct) NvAPI_Status ret; if (devNum >= nvapi_dev_cnt) - return -1; + return -ENODEV; NV_GPU_DYNAMIC_PSTATES_INFO_EX info; info.version = NV_GPU_DYNAMIC_PSTATES_INFO_EX_VER; @@ -551,13 +890,13 @@ int nvapi_getusage(unsigned int devNum, unsigned int *pct) return 0; } -int nvapi_getinfo(unsigned int devNum, uint16_t *vid, uint16_t *pid) +int nvapi_getinfo(unsigned int devNum, uint16_t &vid, uint16_t &pid) { NvAPI_Status ret; NvU32 pDeviceId, pSubSystemId, pRevisionId, pExtDeviceId; if (devNum >= nvapi_dev_cnt) - return -1; + return -ENODEV; ret = NvAPI_GPU_GetPCIIdentifiers(phys[devNum], &pDeviceId, &pSubSystemId, &pRevisionId, &pExtDeviceId); if (ret != NVAPI_OK) { @@ -568,8 +907,12 @@ int nvapi_getinfo(unsigned int devNum, uint16_t *vid, uint16_t *pid) return -1; } - (*pid) = pDeviceId >> 16; - (*vid) = pDeviceId & 0xFFFF; + pid = pDeviceId >> 16; + vid = pDeviceId & 0xFFFF; + if (vid == 0x10DE && pSubSystemId) { + vid = pSubSystemId & 0xFFFF; + pid = pSubSystemId >> 16; + } return 0; } @@ -578,7 +921,7 @@ int nvapi_getserial(unsigned int devNum, char *serial, unsigned int maxlen) { // NvAPI_Status ret; if (devNum >= 
nvapi_dev_cnt) - return -1; + return -ENODEV; sprintf(serial, ""); @@ -602,7 +945,7 @@ int nvapi_getbios(unsigned int devNum, char *desc, unsigned int maxlen) { NvAPI_Status ret; if (devNum >= nvapi_dev_cnt) - return -1; + return -ENODEV; if (maxlen < 64) // Short String return -1; @@ -657,7 +1000,7 @@ int nvapi_init() if (ret == NVAPI_OK && busId == device_bus_ids[g]) { nvapi_dev_map[g] = i; if (opt_debug) - applog(LOG_DEBUG, "CUDA GPU#%d matches NVAPI GPU %d by busId %u", + applog(LOG_DEBUG, "CUDA GPU %d matches NVAPI GPU %d by busId %u", g, i, busId); break; } @@ -679,7 +1022,7 @@ int nvapi_init() NvAPI_ShortString str; ret = NvAPI_SYS_GetDriverAndBranchVersion(&udv, str); if (ret == NVAPI_OK) { - sprintf(driver_version,"%d.%d", udv/100, udv % 100); + sprintf(driver_version,"%d.%02d", udv / 100, udv % 100); } return 0; @@ -785,12 +1128,125 @@ unsigned int gpu_power(struct cgpu_info *gpu) mw = pct; // to fix } #endif + if(gpu->gpu_power > 0) + { + // average + mw = (gpu->gpu_power + mw) / 2; + } return mw; } +static int translate_vendor_id(uint16_t vid, char *vendorname) +{ + struct VENDORS { + const uint16_t vid; + const char *name; + } vendors[] = { + { 0x1043, "ASUS" }, + { 0x107D, "Leadtek" }, + { 0x10B0, "Gainward" }, + // { 0x10DE, "NVIDIA" }, + { 0x1458, "Gigabyte" }, + { 0x1462, "MSI" }, + { 0x154B, "PNY" }, + { 0x1682, "XFX" }, + { 0x196D, "Club3D" }, + { 0x19DA, "Zotac" }, + { 0x19F1, "BFG" }, + { 0x1ACC, "PoV" }, + { 0x1B4C, "KFA2" }, + { 0x3842, "EVGA" }, + { 0x7377, "Colorful" }, + { 0, "" } + }; + + if (!vendorname) + return -EINVAL; + + for(int v=0; v < ARRAY_SIZE(vendors); v++) { + if (vid == vendors[v].vid) { + strcpy(vendorname, vendors[v].name); + return vid; + } + } + if (opt_debug && vid != 0x10DE) + applog(LOG_DEBUG, "nvml: Unknown vendor %04x\n", vid); + return 0; +} + +#ifdef HAVE_PCIDEV +extern "C" { +#include +} +static int linux_gpu_vendor(uint8_t pci_bus_id, char* vendorname, uint16_t &pid) +{ + uint16_t subvendor = 0; + struct 
pci_access *pci; + struct pci_dev *dev; + uint16_t subdevice; + + if (!vendorname) + return -EINVAL; + + pci = pci_alloc(); + if (!pci) + return -ENODEV; + + pci_init(pci); + pci_scan_bus(pci); + + for(dev = pci->devices; dev; dev = dev->next) + { + if (dev->bus == pci_bus_id && dev->vendor_id == 0x10DE) + { + if (!(dev->known_fields & PCI_FILL_CLASS)) + pci_fill_info(dev, PCI_FILL_CLASS); + if (dev->device_class != PCI_CLASS_DISPLAY_VGA) + continue; + subvendor = pci_read_word(dev, PCI_SUBSYSTEM_VENDOR_ID); + subdevice = pci_read_word(dev, PCI_SUBSYSTEM_ID); // model + + translate_vendor_id(subvendor, vendorname); + } + } + pci_cleanup(pci); + return (int) subvendor; +} +#endif + +int gpu_vendor(uint8_t pci_bus_id, char *vendorname) +{ +#ifdef HAVE_PCIDEV + uint16_t pid = 0; + return linux_gpu_vendor(pci_bus_id, vendorname, pid); +#else + uint16_t vid = 0, pid = 0; + if (hnvml) { // may not be initialized on start... + for (int id=0; id < hnvml->nvml_gpucount; id++) { + if (hnvml->nvml_pci_bus_id[id] == pci_bus_id) { + int dev_id = hnvml->nvml_cuda_device_id[id]; + nvml_get_info(hnvml, dev_id, vid, pid); + } + } + } else { +#ifdef WIN32 + for (unsigned id = 0; id < nvapi_dev_cnt; id++) { + if (device_bus_ids[id] == pci_bus_id) { + nvapi_getinfo(nvapi_dev_map[id], vid, pid); + break; + } + } +#endif + } + return translate_vendor_id(vid, vendorname); +#endif +} + int gpu_info(struct cgpu_info *gpu) { + char vendorname[32] = { 0 }; int id = gpu->gpu_id; + uint8_t bus_id = 0; gpu->nvml_id = -1; gpu->nvapi_id = -1; @@ -800,13 +1256,19 @@ int gpu_info(struct cgpu_info *gpu) if (hnvml) { gpu->nvml_id = (int8_t) hnvml->cuda_nvml_device_id[id]; - nvml_get_info(hnvml, id, &gpu->gpu_vid, &gpu->gpu_pid); +#ifdef HAVE_PCIDEV + gpu->gpu_vid = linux_gpu_vendor(hnvml->nvml_pci_bus_id[id], vendorname, gpu->gpu_pid); + if (!gpu->gpu_vid || !gpu->gpu_pid) + nvml_get_info(hnvml, id, gpu->gpu_vid, gpu->gpu_pid); +#else + nvml_get_info(hnvml, id, gpu->gpu_vid, gpu->gpu_pid); +#endif 
nvml_get_serial(hnvml, id, gpu->gpu_sn, sizeof(gpu->gpu_sn)); nvml_get_bios(hnvml, id, gpu->gpu_desc, sizeof(gpu->gpu_desc)); } #ifdef WIN32 gpu->nvapi_id = (int8_t) nvapi_dev_map[id]; - nvapi_getinfo(nvapi_dev_map[id], &gpu->gpu_vid, &gpu->gpu_pid); + nvapi_getinfo(nvapi_dev_map[id], gpu->gpu_vid, gpu->gpu_pid); nvapi_getserial(nvapi_dev_map[id], gpu->gpu_sn, sizeof(gpu->gpu_sn)); nvapi_getbios(nvapi_dev_map[id], gpu->gpu_desc, sizeof(gpu->gpu_desc)); #endif diff --git a/nvml.h b/nvml.h index d9fa5e4a08..4e1df9ff3d 100644 --- a/nvml.h +++ b/nvml.h @@ -66,6 +66,20 @@ enum nvmlClockType_t { NVML_CLOCK_MEM = 2 }; +enum nvmlPcieUtilCounter_t { + NVML_PCIE_UTIL_TX_BYTES = 0, + NVML_PCIE_UTIL_RX_BYTES = 1, + NVML_PCIE_UTIL_COUNT +}; + +enum nvmlValueType_t { + NVML_VALUE_TYPE_DOUBLE = 0, + NVML_VALUE_TYPE_UNSIGNED_INT = 1, + NVML_VALUE_TYPE_UNSIGNED_LONG = 2, + NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3, + NVML_VALUE_TYPE_COUNT +}; + #define NVML_DEVICE_SERIAL_BUFFER_SIZE 30 #define NVML_DEVICE_UUID_BUFFER_SIZE 80 #define NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE 32 @@ -94,8 +108,20 @@ typedef struct { nvmlReturn_t (*nvmlDeviceGetDefaultApplicationsClock)(nvmlDevice_t, nvmlClockType_t, unsigned int *); nvmlReturn_t (*nvmlDeviceGetApplicationsClock)(nvmlDevice_t, nvmlClockType_t, unsigned int *); nvmlReturn_t (*nvmlDeviceSetApplicationsClocks)(nvmlDevice_t, unsigned int, unsigned int); + nvmlReturn_t (*nvmlDeviceResetApplicationsClocks)(nvmlDevice_t); + nvmlReturn_t (*nvmlDeviceGetSupportedGraphicsClocks)(nvmlDevice_t, uint32_t mem, uint32_t *num, uint32_t *arr); + nvmlReturn_t (*nvmlDeviceGetSupportedMemoryClocks)(nvmlDevice_t, unsigned int *count, unsigned int *clocksMHz); nvmlReturn_t (*nvmlDeviceGetClockInfo)(nvmlDevice_t, nvmlClockType_t, unsigned int *); + nvmlReturn_t (*nvmlDeviceGetMaxClockInfo)(nvmlDevice_t, nvmlClockType_t, unsigned int *); + nvmlReturn_t (*nvmlDeviceGetPowerManagementDefaultLimit)(nvmlDevice_t, unsigned int *limit); + nvmlReturn_t 
(*nvmlDeviceGetPowerManagementLimit)(nvmlDevice_t, unsigned int *limit); + nvmlReturn_t (*nvmlDeviceGetPowerManagementLimitConstraints)(nvmlDevice_t, unsigned int *min, unsigned int *max); + nvmlReturn_t (*nvmlDeviceSetPowerManagementLimit)(nvmlDevice_t device, unsigned int limit); nvmlReturn_t (*nvmlDeviceGetPciInfo)(nvmlDevice_t, nvmlPciInfo_t *); + nvmlReturn_t (*nvmlDeviceGetCurrPcieLinkGeneration)(nvmlDevice_t device, unsigned int *gen); + nvmlReturn_t (*nvmlDeviceGetCurrPcieLinkWidth)(nvmlDevice_t device, unsigned int *width); + nvmlReturn_t (*nvmlDeviceGetMaxPcieLinkGeneration)(nvmlDevice_t device, unsigned int *gen); + nvmlReturn_t (*nvmlDeviceGetMaxPcieLinkWidth)(nvmlDevice_t device, unsigned int *width); nvmlReturn_t (*nvmlDeviceGetName)(nvmlDevice_t, char *, int); nvmlReturn_t (*nvmlDeviceGetTemperature)(nvmlDevice_t, int, unsigned int *); nvmlReturn_t (*nvmlDeviceGetFanSpeed)(nvmlDevice_t, unsigned int *); @@ -107,6 +133,15 @@ typedef struct { nvmlReturn_t (*nvmlSystemGetDriverVersion)(char *version, unsigned int len); char* (*nvmlErrorString)(nvmlReturn_t); nvmlReturn_t (*nvmlShutdown)(void); + // v331 + nvmlReturn_t (*nvmlDeviceGetEnforcedPowerLimit)(nvmlDevice_t, unsigned int *limit); + // v340 + //nvmlReturn_t (*nvmlDeviceGetCpuAffinity)(nvmlDevice_t, unsigned int cpuSetSize, unsigned long* cpuSet); + //nvmlReturn_t (*nvmlDeviceSetCpuAffinity)(nvmlDevice_t); + //nvmlReturn_t (*nvmlDeviceGetAutoBoostedClocksEnabled)(nvmlDevice_t, nvmlEnableState_t *isEnabled, nvmlEnableState_t *defaultIsEnabled); + //nvmlReturn_t (*nvmlDeviceSetAutoBoostedClocksEnabled)(nvmlDevice_t, nvmlEnableState_t enabled); + // v346 + nvmlReturn_t (*nvmlDeviceGetPcieThroughput)(nvmlDevice_t, nvmlPcieUtilCounter_t, unsigned int *value); } nvml_handle; @@ -118,43 +153,11 @@ int nvml_destroy(nvml_handle *nvmlh); */ int nvml_get_gpucount(nvml_handle *nvmlh, int *gpucount); -/* - * Query the number of GPUs seen by CUDA - */ -int cuda_get_gpucount(nvml_handle *nvmlh, int *gpucount); 
+int nvml_set_plimit(nvml_handle *nvmlh, int dev_id); +int nvml_set_pstate(nvml_handle *nvmlh, int dev_id); - -/* - * query the name of the GPU model from the CUDA device ID - * - */ -int nvml_get_gpu_name(nvml_handle *nvmlh, - int gpuindex, - char *namebuf, - int bufsize); - -/* - * Query the current GPU temperature (Celsius), from the CUDA device ID - */ -int nvml_get_tempC(nvml_handle *nvmlh, - int gpuindex, unsigned int *tempC); - -/* - * Query the current GPU fan speed (percent) from the CUDA device ID - */ -int nvml_get_fanpcnt(nvml_handle *nvmlh, - int gpuindex, unsigned int *fanpcnt); - -/* - * Query the current GPU power usage in millwatts from the CUDA device ID - * - * This feature is only available on recent GPU generations and may be - * limited in some cases only to Tesla series GPUs. - * If the query is run on an unsupported GPU, this routine will return -1. - */ -int nvml_get_power_usage(nvml_handle *nvmlh, - int gpuindex, - unsigned int *milliwatts); +int nvml_set_clocks(nvml_handle *nvmlh, int dev_id); +int nvml_reset_clocks(nvml_handle *nvmlh, int dev_id); /* api functions */ @@ -162,12 +165,13 @@ unsigned int gpu_fanpercent(struct cgpu_info *gpu); unsigned int gpu_fanrpm(struct cgpu_info *gpu); float gpu_temp(struct cgpu_info *gpu); unsigned int gpu_power(struct cgpu_info *gpu); -unsigned int gpu_usage(struct cgpu_info *gpu); int gpu_pstate(struct cgpu_info *gpu); int gpu_busid(struct cgpu_info *gpu); /* pid/vid, sn and bios rev */ int gpu_info(struct cgpu_info *gpu); +int gpu_vendor(uint8_t pci_bus_id, char *vendorname); + /* nvapi functions */ #ifdef WIN32 diff --git a/pentablake.cu b/pentablake.cu index 3b184e53af..0c791dad8b 100644 --- a/pentablake.cu +++ b/pentablake.cu @@ -8,15 +8,21 @@ extern "C" { #include "sph/sph_blake.h" +} +#ifdef __cplusplus +#include +#else #include +#endif #include -} + + /* threads per block */ #define TPB 192 /* hash by cpu with blake 256 */ -extern "C" void pentablakehash(void *output, const void *input) +void 
pentablakehash(void *output, const void *input) { unsigned char hash[128]; #define hashB hash + 64 @@ -49,10 +55,9 @@ static uint32_t __align__(32) c_Target[8]; __constant__ static uint64_t __align__(32) c_data[32]; -static uint32_t *d_hash[MAX_GPUS]; static uint32_t *d_resNounce[MAX_GPUS]; static uint32_t *h_resNounce[MAX_GPUS]; -static uint32_t extra_results[2] = { UINT32_MAX, UINT32_MAX }; +static uint32_t extra_results[MAX_GPUS][2] = { UINT32_MAX }; /* prefer uint32_t to prevent size conversions = speed +5/10 % */ __constant__ @@ -103,7 +108,7 @@ const uint64_t c_u512[16] = #define G(a,b,c,d,x) { \ uint32_t idx1 = c_sigma[i][x]; \ - uint32_t idx2 = c_sigma[i][x+1]; \ + uint32_t idx2 = c_sigma[i][x + 1]; \ v[a] += (m[idx1] ^ c_u512[idx2]) + v[b]; \ v[d] = SWAPDWORDS(v[d] ^ v[a]); \ v[c] += v[d]; \ @@ -188,7 +193,7 @@ void pentablake_compress(uint64_t *h, const uint64_t *block, const uint32_t T0) __global__ void pentablake_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { const uint32_t nounce = startNounce + thread; @@ -268,12 +273,12 @@ void pentablake_compress(uint64_t *h, const uint64_t *block, const uint64_t T0) __global__ void pentablake_gpu_hash_80(uint32_t threads, const uint32_t startNounce, void *outputHash) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { uint64_t h[8]; uint64_t buf[16]; - uint32_t nounce = startNounce + thread; + const uint32_t nounce = startNounce + thread; //#pragma unroll 8 for(int i=0; i<8; i++) @@ -288,39 +293,27 @@ void pentablake_gpu_hash_80(uint32_t threads, const uint32_t startNounce, void * pentablake_compress(h, buf, 640ULL); -#if __CUDA_ARCH__ < 300 - uint32_t *outHash = (uint32_t *)outputHash + 16 * thread; - #pragma unroll 8 - for (uint32_t 
i=0; i < 8; i++) { - outHash[2*i] = cuda_swab32( _HIWORD(h[i]) ); - outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) ); - } -#else uint64_t *outHash = (uint64_t *)outputHash + 8 * thread; for (uint32_t i=0; i < 8; i++) { outHash[i] = cuda_swab64( h[i] ); } -#endif - } } __host__ -void pentablake_cpu_hash_80(int thr_id, uint32_t threads, const uint32_t startNounce, uint32_t *d_outputHash, int order) +void pentablake_cpu_hash_80(int thr_id, uint32_t threads, const uint32_t startNounce, uint32_t *d_outputHash) { dim3 grid((threads + TPB-1)/TPB); dim3 block(TPB); - pentablake_gpu_hash_80 <<>> (threads, startNounce, d_outputHash); - - //MyStreamSynchronize(NULL, order, thr_id); + pentablake_gpu_hash_80 <<>> (threads, startNounce, d_outputHash); } __global__ void pentablake_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -344,31 +337,20 @@ void pentablake_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_ // Ending round pentablake_compress(h, buf, 512); -#if __CUDA_ARCH__ < 300 - uint32_t *outHash = (uint32_t*)&g_hash[thread<<3]; - #pragma unroll 8 - for (int i=0; i < 8; i++) { - outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) ); - outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) ); - } -#else uint64_t *outHash = &g_hash[thread<<3]; for (int i=0; i < 8; i++) { outHash[i] = cuda_swab64(h[i]); } -#endif } } __host__ -void pentablake_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order) +void pentablake_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash) { dim3 grid((threads + TPB - 1) / TPB); dim3 block(TPB); - pentablake_gpu_hash_64 <<>> (threads, startNounce, (uint64_t*)d_outputHash); - - //MyStreamSynchronize(NULL, order, thr_id); + pentablake_gpu_hash_64 <<>> (threads, startNounce, (uint64_t*)d_outputHash); } #if 
0 @@ -385,11 +367,11 @@ uint32_t pentablake_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNoun if (cudaMemset(d_resNounce[thr_id], 0xff, 2*sizeof(uint32_t)) != cudaSuccess) return result; - pentablake_gpu_hash_80<<>>(threads, startNounce, d_resNounce[thr_id]); + pentablake_gpu_hash_80<<>>(threads, startNounce, d_resNounce[thr_id]); cudaDeviceSynchronize(); - if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { + if (cudaSuccess == cudaMemcpyAsync(h_resNounce[thr_id], d_resNounce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { result = h_resNounce[thr_id][0]; - extra_results[0] = h_resNounce[thr_id][1]; + extra_results[thr_id][0] = h_resNounce[thr_id][1]; } return result; } @@ -398,11 +380,11 @@ uint32_t pentablake_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNoun __global__ void pentablake_gpu_check_hash(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *resNounce) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = startNounce + thread; - uint32_t *inpHash = &g_hash[thread<<4]; + const uint32_t nounce = startNounce + thread; + const uint32_t *const inpHash = &g_hash[thread<<4]; if (cuda_hashisbelowtarget(inpHash, c_Target)) { @@ -414,7 +396,7 @@ void pentablake_gpu_check_hash(uint32_t threads, uint32_t startNounce, uint32_t } __host__ static -uint32_t pentablake_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, int order) +uint32_t pentablake_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash) { uint32_t result = UINT32_MAX; @@ -422,21 +404,21 @@ uint32_t pentablake_check_hash(int thr_id, uint32_t threads, uint32_t startNounc dim3 block(TPB); /* Check error on Ctrl+C or kill to prevent segfaults on exit */ - if (cudaMemset(d_resNounce[thr_id], 0xff, 
2*sizeof(uint32_t)) != cudaSuccess) + if (cudaMemsetAsync(d_resNounce[thr_id], 0xff, 2 * sizeof(uint32_t), gpustream[thr_id]) != cudaSuccess) return result; - pentablake_gpu_check_hash <<>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]); + pentablake_gpu_check_hash <<>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]); - if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { - result = h_resNounce[thr_id][0]; - extra_results[0] = h_resNounce[thr_id][1]; - } + CUDA_SAFE_CALL(cudaMemcpyAsync(h_resNounce[thr_id], d_resNounce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id])); + cudaStreamSynchronize(gpustream[thr_id]); + result = h_resNounce[thr_id][0]; + extra_results[thr_id][0] = h_resNounce[thr_id][1]; return result; } __host__ -void pentablake_cpu_setBlock_80(uint32_t *pdata, const uint32_t *ptarget) +void pentablake_cpu_setBlock_80(int thr_id, uint32_t *pdata, const uint32_t *ptarget) { uint8_t data[128]; memcpy((void*) data, (void*) pdata, 80); @@ -448,32 +430,47 @@ void pentablake_cpu_setBlock_80(uint32_t *pdata, const uint32_t *ptarget) data[126] = 0x02; data[127] = 0x80; - CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_data, data, sizeof(data), 0, cudaMemcpyHostToDevice)); - CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_sigma, host_sigma, sizeof(host_sigma), 0, cudaMemcpyHostToDevice)); - CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_Target, ptarget, 32, 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(c_data, data, sizeof(data), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(c_sigma, host_sigma, sizeof(host_sigma), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(c_Target, ptarget, 32, 0, cudaMemcpyHostToDevice, gpustream[thr_id])); } -static bool init[MAX_GPUS] = { 0 }; +static volatile bool init[MAX_GPUS] = { false }; -extern "C" int scanhash_pentablake(int thr_id, uint32_t *pdata, 
const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) +extern int scanhash_pentablake(int thr_id, uint32_t *pdata, uint32_t *ptarget, + uint32_t max_nonce, uint32_t *hashes_done) { + static THREAD uint32_t *d_hash = nullptr; + const uint32_t first_nonce = pdata[19]; uint32_t endiandata[20]; int rc = 0; - uint32_t throughput = device_intensity(thr_id, __func__, 128U * 2560); // 18.5 - throughput = min(throughput, (max_nonce - first_nonce)); + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, 128U * 2560); // 18.5 + uint32_t throughput = min(throughputmax, (max_nonce - first_nonce)) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x000F; + ptarget[7] = 0x000F; - if (!init[thr_id]) { - if (active_gpus > 1) { - CUDA_CALL_OR_RET_X(cudaSetDevice(device_map[thr_id]), 0); + if (!init[thr_id]) + { + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / 64) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); } - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 64 * throughput)); +#endif + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 64 * throughputmax)); CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], 2*sizeof(uint32_t))); CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], 2*sizeof(uint32_t))); + mining_has_stopped[thr_id] = false; init[thr_id] = true; } @@ -481,52 +478,55 @@ extern "C" int scanhash_pentablake(int thr_id, uint32_t *pdata, const uint32_t * for (int k=0; k < 20; k++) be32enc(&endiandata[k], pdata[k]); - pentablake_cpu_setBlock_80(endiandata, ptarget); + pentablake_cpu_setBlock_80(thr_id, endiandata, ptarget); do { - int order = 0; // GPU HASH - pentablake_cpu_hash_80(thr_id, 
throughput, pdata[19], d_hash[thr_id], order++); - - pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - - uint32_t foundNonce = pentablake_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - if (foundNonce != UINT32_MAX) + pentablake_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); + + pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + CUDA_SAFE_CALL(cudaGetLastError()); + uint32_t foundNonce = pentablake_check_hash(thr_id, throughput, pdata[19], d_hash); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(foundNonce != UINT32_MAX) { const uint32_t Htarg = ptarget[7]; - uint32_t vhashcpu[8]; + uint32_t vhashcpu[8] = { 0 }; - be32enc(&endiandata[19], foundNonce); - pentablakehash(vhashcpu, endiandata); - - if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) { + if(opt_verify) + { + be32enc(&endiandata[19], foundNonce); + pentablakehash(vhashcpu, endiandata); + } + if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) + { rc = 1; *hashes_done = pdata[19] - first_nonce + throughput; - if (extra_results[0] != UINT32_MAX) { + if (extra_results[thr_id][0] != UINT32_MAX) { // Rare but possible if the throughput is big applog(LOG_NOTICE, "GPU found more than one result yippee!"); - pdata[21] = extra_results[0]; - extra_results[0] = UINT32_MAX; + pdata[21] = extra_results[thr_id][0]; + extra_results[thr_id][0] = UINT32_MAX; rc++; } pdata[19] = foundNonce; return rc; } else if (vhashcpu[7] > Htarg) { - 
applog(LOG_WARNING, "GPU #%d: result for nounce %08x is not in range: %x > %x", thr_id, foundNonce, vhashcpu[7], Htarg); + applog(LOG_WARNING, "GPU #%d: result for nounce %08x is not in range: %x > %x", device_map[thr_id], foundNonce, vhashcpu[7], Htarg); } else { - applog(LOG_WARNING, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce); + applog(LOG_WARNING, "GPU #%d: result for nounce %08x does not validate on CPU!", device_map[thr_id], foundNonce); } } - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return rc; } diff --git a/quark/animecoin.cu b/quark/animecoin.cu index 4994e0a831..426d5e85d8 100644 --- a/quark/animecoin.cu +++ b/quark/animecoin.cu @@ -13,42 +13,39 @@ extern "C" static uint32_t *d_hash[MAX_GPUS]; // Speicher zur Generierung der Noncevektoren für die bedingten Hashes -static uint32_t *d_animeNonces[MAX_GPUS]; static uint32_t *d_branch1Nonces[MAX_GPUS]; static uint32_t *d_branch2Nonces[MAX_GPUS]; static uint32_t *d_branch3Nonces[MAX_GPUS]; -extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads); -extern void quark_bmw512_cpu_setBlock_80(void *pdata); -extern void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order); -extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order); +extern void quark_bmw512_cpu_setBlock_80(int thr_id, void *pdata); 
+extern void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash); +extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); -extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); +extern void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_skein512_cpu_init(int thr_id); -extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_keccak512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); +extern void quark_keccak512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void 
quark_jh512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); +extern void quark_jh512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads); extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, uint32_t *d_nonces1, uint32_t *nrm1, - uint32_t *d_nonces2, uint32_t *nrm2, - int order); + uint32_t *d_nonces2, uint32_t *nrm2); extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, - uint32_t *d_nonces1, uint32_t *nrm1, - int order); + uint32_t *d_nonces1, uint32_t *nrm1); -extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); -extern void cuda_check_quarkcoin(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order, uint32_t *foundnonces); +extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash); +extern void cuda_check_quarkcoin(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, uint32_t *foundnonces); /* CPU Hash */ extern "C" void animehash(void *state, const void *input) @@ -151,7 +148,7 @@ struct HashPredicate __device__ bool operator()(const uint32_t x) { - uint32_t *hash = &m_hashes[(x - m_startNonce)*16]; + uint32_t *const Hash = &m_hashes[(x - m_startNonce)*16]; return hash[0] & 0x8; } @@ -160,25 +157,26 @@ struct HashPredicate }; */ -static bool init[MAX_GPUS] = 
{ 0 }; +static volatile bool init[MAX_GPUS] = { false }; -extern "C" int scanhash_anime(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_anime(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { const uint32_t first_nonce = pdata[19]; - uint32_t throughput = device_intensity(thr_id, __func__, 1 << 19); // 256*256*8 - throughput = min(throughput, (max_nonce - first_nonce)); + uint32_t throughput = device_intensity(device_map[thr_id], __func__, 1 << 20); // 256*256*8 + throughput = min(throughput, (max_nonce - first_nonce)) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x00ff; + ptarget[7] = 0x00ff; if (!init[thr_id]) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + get_cuda_arch(&cuda_arch[thr_id]); CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput)); @@ -188,7 +186,6 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata, cuda_check_cpu_init(thr_id, throughput); quark_compactTest_cpu_init(thr_id, throughput); - CUDA_SAFE_CALL(cudaMalloc(&d_animeNonces[thr_id], sizeof(uint32_t)*throughput)); CUDA_SAFE_CALL(cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput)); CUDA_SAFE_CALL(cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput)); CUDA_SAFE_CALL(cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput)); @@ -198,92 +195,95 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata, uint32_t endiandata[20]; for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); - quark_bmw512_cpu_setBlock_80((void*)endiandata); - cuda_check_cpu_setTarget(ptarget); + quark_bmw512_cpu_setBlock_80(thr_id, 
(void*)endiandata); + cuda_check_cpu_setTarget(ptarget, thr_id); do { - int order = 0; uint32_t nrm1=0, nrm2=0, nrm3=0; // erstes BMW512 Hash mit CUDA - quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // das ist der unbedingte Branch für Blake512 - quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id]); quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL, - d_branch3Nonces[thr_id], &nrm3, - order++); + d_branch3Nonces[thr_id], &nrm3); // nur den Skein Branch weiterverfolgen - quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id]); // das ist der unbedingte Branch für Groestl512 - quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id]); // das ist der unbedingte Branch für JH512 - quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id]); // quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8) quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], d_branch1Nonces[thr_id], &nrm1, - d_branch2Nonces[thr_id], &nrm2, - order++); + d_branch2Nonces[thr_id], &nrm2); // das ist der bedingte Branch für Blake512 - quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); + quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id]); // das ist der bedingte 
Branch für Bmw512 - quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); + quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id]); // das ist der unbedingte Branch für Keccak512 - quark_keccak512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id]); // das ist der unbedingte Branch für Skein512 - quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id]); // quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8) quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], d_branch1Nonces[thr_id], &nrm1, - d_branch2Nonces[thr_id], &nrm2, - order++); + d_branch2Nonces[thr_id], &nrm2); - quark_keccak512_cpu_hash_64_final(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); - quark_jh512_cpu_hash_64_final(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64_final(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id]); + quark_jh512_cpu_hash_64_final(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id]); uint32_t foundnonces[2]; - cuda_check_quarkcoin(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++, foundnonces); - if (foundnonces[0] != 0xffffffff) + cuda_check_quarkcoin(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], foundnonces); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(foundnonces[0] != 0xffffffff) { const uint32_t Htarg = ptarget[7]; -/* uint32_t vhash64[8]; + uint32_t vhash64[8]; be32enc(&endiandata[19], foundnonces[0]); animehash(vhash64, 
endiandata); if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) -*/ { + { int res = 1; *hashes_done = pdata[19] - first_nonce + throughput; // check if there was some other ones... if (foundnonces[1] != 0xffffffff) { - pdata[21] = foundnonces[1]; - res++; - if (opt_benchmark) applog(LOG_INFO, "GPU #%d: Found second nonce $%08X", thr_id, foundnonces[1]); + be32enc(&endiandata[19], foundnonces[1]); + animehash(vhash64, endiandata); + if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + + pdata[21] = foundnonces[1]; + res++; + if (opt_benchmark) applog(LOG_INFO, "GPU #%d: Found second nonce $%08X", device_map[thr_id], foundnonces[1]); + } } pdata[19] = foundnonces[0]; - if (opt_benchmark) applog(LOG_INFO, "GPU #%d: Found nonce $%08X", thr_id, foundnonces[0]); + if (opt_benchmark) applog(LOG_INFO, "GPU #%d: Found nonce $%08X", device_map[thr_id], foundnonces[0]); return res; } -/* else + else { if (vhash64[7] != Htarg) // don't show message if it is equal but fails fulltest - applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundnonces[0]); + applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", device_map[thr_id], foundnonces[0]); } -*/ } - pdata[19] += throughput; + } + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); *hashes_done = pdata[19] - first_nonce + 1; diff --git a/quark/cuda_bmw512.cu b/quark/cuda_bmw512.cu index a425e9667d..1640630f4d 100644 --- a/quark/cuda_bmw512.cu +++ b/quark/cuda_bmw512.cu @@ -2,13 +2,8 @@ #include #include "cuda_helper.h" +#include "cuda_vector.h" - -// die Message it Padding zur Berechnung auf der GPU -__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) - -//#define SHL(x, n) ((x) << (n)) -//#define SHR(x, n) ((x) >> (n)) #define SHR(x, n) SHR2(x, n) #define SHL(x, n) SHL2(x, n) @@ -21,185 +16,78 @@ 
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + paddi q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \ q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15]) -__device__ void Compression512_64_first(uint2 *msg, uint2 *hash) -{ - // Compression ref. implementation - uint2 q[32]; - uint2 tmp; - - tmp = (msg[5] ^ hash[5]) - (msg[7] ^ hash[7]) + (hash[10]) + (hash[13]) + (hash[14]); - q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[1]; - tmp = (msg[6] ^ hash[6]) - (msg[8] ^ hash[8]) + (hash[11]) + (hash[14]) - (msg[15] ^ hash[15]); - q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[2]; - tmp = (msg[0] ^ hash[0]) + (msg[7] ^ hash[7]) + (hash[9]) - (hash[12]) + (msg[15] ^ hash[15]); - q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[3]; - tmp = (msg[0] ^ hash[0]) - (msg[1] ^ hash[1]) + (msg[8] ^ hash[8]) - (hash[10]) + (hash[13]); - q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[4]; - tmp = (msg[1] ^ hash[1]) + (msg[2] ^ hash[2]) + (hash[9]) - (hash[11]) - (hash[14]); - q[4] = (SHR(tmp, 1) ^ tmp) + hash[5]; - tmp = (msg[3] ^ hash[3]) - (msg[2] ^ hash[2]) + (hash[10]) - (hash[12]) + (msg[15] ^ hash[15]); - q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[6]; - tmp = (msg[4] ^ hash[4]) - (msg[0] ^ hash[0]) - (msg[3] ^ hash[3]) - (hash[11]) + (hash[13]); - q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[7]; - tmp = (msg[1] ^ hash[1]) - (msg[4] ^ hash[4]) - (msg[5] ^ hash[5]) - (hash[12]) - (hash[14]); - q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[8]; - tmp = (msg[2] ^ hash[2]) - (msg[5] ^ hash[5]) - (msg[6] ^ hash[6]) + (hash[13]) - (msg[15] ^ hash[15]); - q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[9]; - tmp = (msg[0] ^ hash[0]) - (msg[3] ^ hash[3]) + (msg[6] 
^ hash[6]) - (msg[7] ^ hash[7]) + (hash[14]); - q[9] = (SHR(tmp, 1) ^ tmp) + hash[10]; - tmp = (msg[8] ^ hash[8]) - (msg[1] ^ hash[1]) - (msg[4] ^ hash[4]) - (msg[7] ^ hash[7]) + (msg[15] ^ hash[15]); - q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[11]; - tmp = (msg[8] ^ hash[8]) - (msg[0] ^ hash[0]) - (msg[2] ^ hash[2]) - (msg[5] ^ hash[5]) + (hash[9]); - q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[12]; - tmp = (msg[1] ^ hash[1]) + (msg[3] ^ hash[3]) - (msg[6] ^ hash[6]) - (hash[9]) + (hash[10]); - q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[13]; - tmp = (msg[2] ^ hash[2]) + (msg[4] ^ hash[4]) + (msg[7] ^ hash[7]) + (hash[10]) + (hash[11]); - q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[14]; - tmp = (msg[3] ^ hash[3]) - (msg[5] ^ hash[5]) + (msg[8] ^ hash[8]) - (hash[11]) - (hash[12]); - q[14] = (SHR(tmp, 1) ^ tmp) + hash[15]; - tmp = (msg[12] ^ hash[12]) - (msg[4] ^ hash[4]) - (msg[6] ^ hash[6]) - (hash[9]) + (hash[13]); - q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[0]; - - q[0 + 16] = - (SHR(q[0], 1) ^ SHL(q[0], 2) ^ ROTL64(q[0], 13) ^ ROTL64(q[0], 43)) + - (SHR(q[0 + 1], 2) ^ SHL(q[0 + 1], 1) ^ ROTL64(q[0 + 1], 19) ^ ROTL64(q[0 + 1], 53)) + - (SHR(q[0 + 2], 2) ^ SHL(q[0 + 2], 2) ^ ROTL64(q[0 + 2], 28) ^ ROTL64(q[0 + 2], 59)) + - (SHR(q[0 + 3], 1) ^ SHL(q[0 + 3], 3) ^ ROTL64(q[0 + 3], 4) ^ ROTL64(q[0 + 3], 37)) + - (SHR(q[0 + 4], 1) ^ SHL(q[0 + 4], 2) ^ ROTL64(q[0 + 4], 13) ^ ROTL64(q[0 + 4], 43)) + - (SHR(q[0 + 5], 2) ^ SHL(q[0 + 5], 1) ^ ROTL64(q[0 + 5], 19) ^ ROTL64(q[0 + 5], 53)) + - (SHR(q[0 + 6], 2) ^ SHL(q[0 + 6], 2) ^ ROTL64(q[0 + 6], 28) ^ ROTL64(q[0 + 6], 59)) + - (SHR(q[0 + 7], 1) ^ SHL(q[0 + 7], 3) ^ ROTL64(q[0 + 7], 4) ^ ROTL64(q[0 + 7], 37)) + - (SHR(q[0 + 8], 1) ^ SHL(q[0 + 8], 2) ^ ROTL64(q[0 + 8], 13) ^ ROTL64(q[0 + 8], 43)) + - (SHR(q[0 + 9], 2) ^ SHL(q[0 + 9], 1) ^ ROTL64(q[0 
+ 9], 19) ^ ROTL64(q[0 + 9], 53)) + - (SHR(q[0 + 10], 2) ^ SHL(q[0 + 10], 2) ^ ROTL64(q[0 + 10], 28) ^ ROTL64(q[0 + 10], 59)) + - (SHR(q[0 + 11], 1) ^ SHL(q[0 + 11], 3) ^ ROTL64(q[0 + 11], 4) ^ ROTL64(q[0 + 11], 37)) + - (SHR(q[0 + 12], 1) ^ SHL(q[0 + 12], 2) ^ ROTL64(q[0 + 12], 13) ^ ROTL64(q[0 + 12], 43)) + - (SHR(q[0 + 13], 2) ^ SHL(q[0 + 13], 1) ^ ROTL64(q[0 + 13], 19) ^ ROTL64(q[0 + 13], 53)) + - (SHR(q[0 + 14], 2) ^ SHL(q[0 + 14], 2) ^ ROTL64(q[0 + 14], 28) ^ ROTL64(q[0 + 14], 59)) + - (SHR(q[0 + 15], 1) ^ SHL(q[0 + 15], 3) ^ ROTL64(q[0 + 15], 4) ^ ROTL64(q[0 + 15], 37)) + - ((make_uint2(0x55555550ul,0x55555555) + ROTL64(msg[0], 0 + 1) + - ROTL64(msg[0 + 3], 0 + 4)) ^ hash[0 + 7]); - q[1 + 16] = - (SHR(q[1], 1) ^ SHL(q[1], 2) ^ ROTL64(q[1], 13) ^ ROTL64(q[1], 43)) + - (SHR(q[1 + 1], 2) ^ SHL(q[1 + 1], 1) ^ ROTL64(q[1 + 1], 19) ^ ROTL64(q[1 + 1], 53)) + - (SHR(q[1 + 2], 2) ^ SHL(q[1 + 2], 2) ^ ROTL64(q[1 + 2], 28) ^ ROTL64(q[1 + 2], 59)) + - (SHR(q[1 + 3], 1) ^ SHL(q[1 + 3], 3) ^ ROTL64(q[1 + 3], 4) ^ ROTL64(q[1 + 3], 37)) + - (SHR(q[1 + 4], 1) ^ SHL(q[1 + 4], 2) ^ ROTL64(q[1 + 4], 13) ^ ROTL64(q[1 + 4], 43)) + - (SHR(q[1 + 5], 2) ^ SHL(q[1 + 5], 1) ^ ROTL64(q[1 + 5], 19) ^ ROTL64(q[1 + 5], 53)) + - (SHR(q[1 + 6], 2) ^ SHL(q[1 + 6], 2) ^ ROTL64(q[1 + 6], 28) ^ ROTL64(q[1 + 6], 59)) + - (SHR(q[1 + 7], 1) ^ SHL(q[1 + 7], 3) ^ ROTL64(q[1 + 7], 4) ^ ROTL64(q[1 + 7], 37)) + - (SHR(q[1 + 8], 1) ^ SHL(q[1 + 8], 2) ^ ROTL64(q[1 + 8], 13) ^ ROTL64(q[1 + 8], 43)) + - (SHR(q[1 + 9], 2) ^ SHL(q[1 + 9], 1) ^ ROTL64(q[1 + 9], 19) ^ ROTL64(q[1 + 9], 53)) + - (SHR(q[1 + 10], 2) ^ SHL(q[1 + 10], 2) ^ ROTL64(q[1 + 10], 28) ^ ROTL64(q[1 + 10], 59)) + - (SHR(q[1 + 11], 1) ^ SHL(q[1 + 11], 3) ^ ROTL64(q[1 + 11], 4) ^ ROTL64(q[1 + 11], 37)) + - (SHR(q[1 + 12], 1) ^ SHL(q[1 + 12], 2) ^ ROTL64(q[1 + 12], 13) ^ ROTL64(q[1 + 12], 43)) + - (SHR(q[1 + 13], 2) ^ SHL(q[1 + 13], 1) ^ ROTL64(q[1 + 13], 19) ^ ROTL64(q[1 + 13], 53)) + - (SHR(q[1 + 14], 2) ^ SHL(q[1 + 14], 2) ^ ROTL64(q[1 + 
14], 28) ^ ROTL64(q[1 + 14], 59)) + - (SHR(q[1 + 15], 1) ^ SHL(q[1 + 15], 3) ^ ROTL64(q[1 + 15], 4) ^ ROTL64(q[1 + 15], 37)) + - ((make_uint2(0xAAAAAAA5, 0x5AAAAAAA) + ROTL64(msg[1], 1 + 1) + - ROTL64(msg[1 + 3], 1 + 4)) ^ hash[1 + 7]); - - q[2 + 16] = CONST_EXP2(2) + - ((make_uint2(0xFFFFFFFA, 0x5FFFFFFF) + ROTL64(msg[2], 2 + 1) + - ROTL64(msg[2 + 3], 2 + 4) - ROTL64(msg[2 + 10], 2 + 11)) ^ hash[2 + 7]); - q[3 + 16] = CONST_EXP2(3) + - ((make_uint2(0x5555554F, 0x65555555) + ROTL64(msg[3], 3 + 1) + - ROTL64(msg[3 + 3], 3 + 4) - ROTL64(msg[3 + 10], 3 + 11)) ^ hash[3 + 7]); - q[4 + 16] = CONST_EXP2(4) + - ((make_uint2(0xAAAAAAA4, 0x6AAAAAAA) +ROTL64(msg[4], 4 + 1) + - ROTL64(msg[4 + 3], 4 + 4) - ROTL64(msg[4 + 10], 4 + 11)) ^ hash[4 + 7]); - q[5 + 16] = CONST_EXP2(5) + - ((make_uint2(0xFFFFFFF9, 0x6FFFFFFF) + ROTL64(msg[5], 5 + 1) + - ROTL64(msg[5 + 3], 5 + 4) - ROTL64(msg[5 + 10], 5 + 11)) ^ hash[5 + 7]); - - -#pragma unroll 3 - for (int i = 6; i<9; i++) { - q[i + 16] = CONST_EXP2(i) + - ((vectorize((i + 16)*(0x0555555555555555ull)) + ROTL64(msg[i], i + 1) - - ROTL64(msg[i - 6], (i - 6) + 1)) ^ hash[i + 7]); - } - -#pragma unroll 4 - for (int i = 9; i<13; i++) { - q[i + 16] = CONST_EXP2(i) + - ((vectorize((i + 16)*(0x0555555555555555ull)) + - ROTL64(msg[i + 3], i + 4) - ROTL64(msg[i - 6], (i - 6) + 1)) ^ hash[i - 9]); - } - - q[13 + 16] = CONST_EXP2(13) + - ((make_uint2(0xAAAAAAA1, 0x9AAAAAAA) + ROTL64(msg[13], 13 + 1) + - ROTL64(msg[13 - 13], (13 - 13) + 1) - ROTL64(msg[13 - 6], (13 - 6) + 1)) ^ hash[13 - 9]); - q[14 + 16] = CONST_EXP2(14) + - ((make_uint2(0xFFFFFFF6, 0x9FFFFFFF) + ROTL64(msg[14], 14 + 1) + - ROTL64(msg[14 - 13], (14 - 13) + 1) - ROTL64(msg[14 - 6], (14 - 6) + 1)) ^ hash[14 - 9]); - q[15 + 16] = CONST_EXP2(15) + - ((make_uint2(0x5555554B, 0xA5555555) + ROTL64(msg[15], 15 + 1) + - ROTL64(msg[15 - 13], (15 - 13) + 1) - ROTL64(msg[15 - 6], (15 - 6) + 1)) ^ hash[15 - 9]); +#define CONST_EXP3(i) ROTL64(q[i+1], 5) + ROTL64(q[i+3], 11) + \ + 
ROTL64(q[i+5], 27) + SWAPDWORDS2(q[i+7]) + \ + ROTL64(q[i+9], 37) + ROTL64(q[i+11], 43) + \ + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15]) +__device__ __forceinline__ void Compression512(const uint2 *msg, uint2 *hash) +{ + const uint2 precalc[16] = + { + { 0x55555550, 0x55555555 }, + { 0xAAAAAAA5, 0x5AAAAAAA }, + { 0xFFFFFFFA, 0x5FFFFFFF }, + { 0x5555554F, 0x65555555 }, + { 0xAAAAAAA4, 0x6AAAAAAA }, + { 0xFFFFFFF9, 0x6FFFFFFF }, + { 0x5555554E, 0x75555555 }, + { 0xAAAAAAA3, 0x7AAAAAAA }, + { 0xFFFFFFF8, 0x7FFFFFFF }, + { 0x5555554D, 0x85555555 }, + { 0xAAAAAAA2, 0x8AAAAAAA }, + { 0xFFFFFFF7, 0x8FFFFFFF }, + { 0x5555554C, 0x95555555 }, + { 0xAAAAAAA1, 0x9AAAAAAA }, + { 0xFFFFFFF6, 0x9FFFFFFF }, + { 0x5555554B, 0xA5555555 }, + }; - uint2 XL64 = q[16] ^ q[17] ^ q[18] ^ q[19] ^ q[20] ^ q[21] ^ q[22] ^ q[23]; - uint2 XH64 = XL64^q[24] ^ q[25] ^ q[26] ^ q[27] ^ q[28] ^ q[29] ^ q[30] ^ q[31]; - hash[0] = (SHL(XH64, 5) ^ SHR(q[16], 5) ^ msg[0]) + (XL64 ^ q[24] ^ q[0]); - hash[1] = (SHR(XH64, 7) ^ SHL(q[17], 8) ^ msg[1]) + (XL64 ^ q[25] ^ q[1]); - hash[2] = (SHR(XH64, 5) ^ SHL(q[18], 5) ^ msg[2]) + (XL64 ^ q[26] ^ q[2]); - hash[3] = (SHR(XH64, 1) ^ SHL(q[19], 5) ^ msg[3]) + (XL64 ^ q[27] ^ q[3]); - hash[4] = (SHR(XH64, 3) ^ q[20] ^ msg[4]) + (XL64 ^ q[28] ^ q[4]); - hash[5] = (SHL(XH64, 6) ^ SHR(q[21], 6) ^ msg[5]) + (XL64 ^ q[29] ^ q[5]); - hash[6] = (SHR(XH64, 4) ^ SHL(q[22], 6) ^ msg[6]) + (XL64 ^ q[30] ^ q[6]); - hash[7] = (SHR(XH64, 11) ^ SHL(q[23], 2) ^ msg[7]) + (XL64 ^ q[31] ^ q[7]); - - hash[8] = ROTL64(hash[4], 9) + (XH64 ^ q[24] ^ msg[8]) + (SHL(XL64, 8) ^ q[23] ^ q[8]); - hash[9] = ROTL64(hash[5], 10) + (XH64 ^ q[25]) + (SHR(XL64, 6) ^ q[16] ^ q[9]); - hash[10] = ROTL64(hash[6], 11) + (XH64 ^ q[26]) + (SHL(XL64, 6) ^ q[17] ^ q[10]); - hash[11] = ROTL64(hash[7], 12) + (XH64 ^ q[27]) + (SHL(XL64, 4) ^ q[18] ^ q[11]); - hash[12] = ROTL64(hash[0], 13) + (XH64 ^ q[28]) + (SHR(XL64, 3) ^ q[19] ^ q[12]); - hash[13] = ROTL64(hash[1], 14) 
+ (XH64 ^ q[29]) + (SHR(XL64, 4) ^ q[20] ^ q[13]); - hash[14] = ROTL64(hash[2], 15) + (XH64 ^ q[30]) + (SHR(XL64, 7) ^ q[21] ^ q[14]); - hash[15] = ROTL64(hash[3], 16) + (XH64 ^ q[31] ^ msg[15]) + (SHR(XL64, 2) ^ q[22] ^ q[15]); -} - -__device__ void Compression512(uint2 *msg, uint2 *hash) -{ - // Compression ref. implementation + // Compression ref. implementation uint2 q[32]; uint2 tmp; +// const uint2 pre = (msg[10] ^ hash[10]) + (msg[13] ^ hash[13]); + const uint2 pre2 = (msg[12] ^ hash[12]) + (msg[15] ^ hash[15]); + const uint2 pre3 = (msg[14] ^ hash[14]) - (msg[7] ^ hash[7]); + const uint2 pre4 = (msg[6] ^ hash[6]) + (msg[9] ^ hash[9]); + const uint2 pre5 = (msg[8] ^ hash[8]) - (msg[5] ^ hash[5]); + const uint2 pre6 = (msg[1] ^ hash[1]) - (msg[14] ^ hash[14]); + const uint2 pre7 = (msg[8] ^ hash[8]) - (msg[1] ^ hash[1]); - tmp = (msg[ 5] ^ hash[ 5]) - (msg[ 7] ^ hash[ 7]) + (msg[10] ^ hash[10]) + (msg[13] ^ hash[13]) + (msg[14] ^ hash[14]); + tmp = (msg[5] ^ hash[5]) + (msg[10] ^ hash[10]) + (msg[13] ^ hash[13]) + pre3; q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[1]; - tmp = (msg[ 6] ^ hash[ 6]) - (msg[ 8] ^ hash[ 8]) + (msg[11] ^ hash[11]) + (msg[14] ^ hash[14]) - (msg[15] ^ hash[15]); + tmp = (msg[6] ^ hash[6]) - (msg[8] ^ hash[8]) + pre2 - (msg[15] ^ hash[15]); q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[2]; - tmp = (msg[ 0] ^ hash[ 0]) + (msg[ 7] ^ hash[ 7]) + (msg[ 9] ^ hash[ 9]) - (msg[12] ^ hash[12]) + (msg[15] ^ hash[15]); + tmp = (msg[ 0] ^ hash[ 0]) + (msg[ 7] ^ hash[ 7]) + (msg[ 9] ^ hash[ 9]) - pre2; q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[3]; - tmp = (msg[ 0] ^ hash[ 0]) - (msg[ 1] ^ hash[ 1]) + (msg[ 8] ^ hash[ 8]) - (msg[10] ^ hash[10]) + (msg[13] ^ hash[13]); + tmp = (msg[0] ^ hash[0]) + pre7 - (msg[10] ^ hash[10]) + (msg[13] ^ hash[13]); q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[4]; - tmp = (msg[ 1] ^ 
hash[ 1]) + (msg[ 2] ^ hash[ 2]) + (msg[ 9] ^ hash[ 9]) - (msg[11] ^ hash[11]) - (msg[14] ^ hash[14]); + tmp = pre6 + (msg[2] ^ hash[2]) + (msg[9] ^ hash[9]) - (msg[11] ^ hash[11]); q[4] = (SHR(tmp, 1) ^ tmp) + hash[5]; - tmp = (msg[ 3] ^ hash[ 3]) - (msg[ 2] ^ hash[ 2]) + (msg[10] ^ hash[10]) - (msg[12] ^ hash[12]) + (msg[15] ^ hash[15]); + tmp = (msg[3] ^ hash[3]) - (msg[2] ^ hash[2]) + (msg[10] ^ hash[10]) - pre2; q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[6]; tmp = (msg[ 4] ^ hash[ 4]) - (msg[ 0] ^ hash[ 0]) - (msg[ 3] ^ hash[ 3]) - (msg[11] ^ hash[11]) + (msg[13] ^ hash[13]); q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[7]; - tmp = (msg[ 1] ^ hash[ 1]) - (msg[ 4] ^ hash[ 4]) - (msg[ 5] ^ hash[ 5]) - (msg[12] ^ hash[12]) - (msg[14] ^ hash[14]); + tmp = pre6 - (msg[ 4] ^ hash[ 4]) - (msg[ 5] ^ hash[ 5]) - (msg[12] ^ hash[12]); q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[8]; tmp = (msg[ 2] ^ hash[ 2]) - (msg[ 5] ^ hash[ 5]) - (msg[ 6] ^ hash[ 6]) + (msg[13] ^ hash[13]) - (msg[15] ^ hash[15]); q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[9]; - tmp = (msg[ 0] ^ hash[ 0]) - (msg[ 3] ^ hash[ 3]) + (msg[ 6] ^ hash[ 6]) - (msg[ 7] ^ hash[ 7]) + (msg[14] ^ hash[14]); + tmp = (msg[ 0] ^ hash[ 0]) - (msg[ 3] ^ hash[ 3]) + (msg[ 6] ^ hash[ 6])+pre3; q[9] = (SHR(tmp, 1) ^ tmp) + hash[10]; - tmp = (msg[ 8] ^ hash[ 8]) - (msg[ 1] ^ hash[ 1]) - (msg[ 4] ^ hash[ 4]) - (msg[ 7] ^ hash[ 7]) + (msg[15] ^ hash[15]); + tmp = pre7 - (msg[4] ^ hash[4]) - (msg[7] ^ hash[7]) + (msg[15] ^ hash[15]); q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[11]; - tmp = (msg[ 8] ^ hash[ 8]) - (msg[ 0] ^ hash[ 0]) - (msg[ 2] ^ hash[ 2]) - (msg[ 5] ^ hash[ 5]) + (msg[ 9] ^ hash[ 9]); + tmp = pre5 - (msg[ 0] ^ hash[ 0]) - (msg[ 2] ^ hash[ 2]) + (msg[ 9] ^ hash[ 9]); q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + 
hash[12]; - tmp = (msg[ 1] ^ hash[ 1]) + (msg[ 3] ^ hash[ 3]) - (msg[ 6] ^ hash[ 6]) - (msg[ 9] ^ hash[ 9]) + (msg[10] ^ hash[10]); + tmp = (msg[1] ^ hash[1]) + (msg[3] ^ hash[3]) - pre4 + (msg[10] ^ hash[10]); q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[13]; tmp = (msg[ 2] ^ hash[ 2]) + (msg[ 4] ^ hash[ 4]) + (msg[ 7] ^ hash[ 7]) + (msg[10] ^ hash[10]) + (msg[11] ^ hash[11]); q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[14]; - tmp = (msg[ 3] ^ hash[ 3]) - (msg[ 5] ^ hash[ 5]) + (msg[ 8] ^ hash[ 8]) - (msg[11] ^ hash[11]) - (msg[12] ^ hash[12]); + tmp = (msg[ 3] ^ hash[ 3]) +pre5 - (msg[11] ^ hash[11]) - (msg[12] ^ hash[12]); q[14] = (SHR(tmp, 1) ^ tmp) + hash[15]; - tmp = (msg[12] ^ hash[12]) - (msg[ 4] ^ hash[ 4]) - (msg[ 6] ^ hash[ 6]) - (msg[ 9] ^ hash[ 9]) + (msg[13] ^ hash[13]); + tmp = (msg[12] ^ hash[12]) - (msg[4] ^ hash[4]) - pre4 + (msg[13] ^ hash[13]); q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[0]; - q[0+16] = (SHR(q[0], 1) ^ SHL(q[0], 2) ^ ROTL64(q[0], 13) ^ ROTL64(q[0], 43)) + (SHR(q[0+1], 2) ^ SHL(q[0+1], 1) ^ ROTL64(q[0+1], 19) ^ ROTL64(q[0+1], 53)) + @@ -217,7 +105,7 @@ __device__ void Compression512(uint2 *msg, uint2 *hash) (SHR(q[0+13], 2) ^ SHL(q[0+13], 1) ^ ROTL64(q[0+13], 19) ^ ROTL64(q[0+13], 53)) + (SHR(q[0+14], 2) ^ SHL(q[0+14], 2) ^ ROTL64(q[0+14], 28) ^ ROTL64(q[0+14], 59)) + (SHR(q[0+15], 1) ^ SHL(q[0+15], 3) ^ ROTL64(q[0+15], 4) ^ ROTL64(q[0+15], 37)) + - ((make_uint2(0x55555550ul, 0x55555555) + ROTL64(msg[0], 0 + 1) + + ((precalc[0] + ROTL64(msg[0], 0 + 1) + ROTL64(msg[0+3], 0+4) - ROTL64(msg[0+10], 0+11) ) ^ hash[0+7]); q[1 + 16] = (SHR(q[1], 1) ^ SHL(q[1], 2) ^ ROTL64(q[1], 13) ^ ROTL64(q[1], 43)) + @@ -236,55 +124,56 @@ __device__ void Compression512(uint2 *msg, uint2 *hash) (SHR(q[1 + 13], 2) ^ SHL(q[1 + 13], 1) ^ ROTL64(q[1 + 13], 19) ^ ROTL64(q[1 + 13], 53)) + (SHR(q[1 + 14], 2) ^ SHL(q[1 + 14], 2) ^ ROTL64(q[1 + 14], 28) ^ 
ROTL64(q[1 + 14], 59)) + (SHR(q[1 + 15], 1) ^ SHL(q[1 + 15], 3) ^ ROTL64(q[1 + 15], 4) ^ ROTL64(q[1 + 15], 37)) + - ((make_uint2(0xAAAAAAA5, 0x5AAAAAAA) + ROTL64(msg[1], 1 + 1) + + ((precalc[1] + ROTL64(msg[1], 1 + 1) + ROTL64(msg[1 + 3], 1 + 4) - ROTL64(msg[1 + 10], 1 + 11)) ^ hash[1 + 7]); q[2 + 16] = CONST_EXP2(2) + - ((make_uint2(0xFFFFFFFA, 0x5FFFFFFF) + ROTL64(msg[2], 2 + 1) + + ((precalc[2] + ROTL64(msg[2], 2 + 1) + ROTL64(msg[2+3], 2+4) - ROTL64(msg[2+10], 2+11) ) ^ hash[2+7]); q[3 + 16] = CONST_EXP2(3) + - ((make_uint2(0x5555554F, 0x65555555) + ROTL64(msg[3], 3 + 1) + + ((precalc[3] + ROTL64(msg[3], 3 + 1) + ROTL64(msg[3 + 3], 3 + 4) - ROTL64(msg[3 + 10], 3 + 11)) ^ hash[3 + 7]); q[4 + 16] = CONST_EXP2(4) + - ((make_uint2(0xAAAAAAA4, 0x6AAAAAAA) + ROTL64(msg[4], 4 + 1) + - ROTL64(msg[4 + 3], 4 + 4) - ROTL64(msg[4 + 10], 4 + 11)) ^ hash[4 + 7]); + ((precalc[4] + ROTL64(msg[4], 4 + 1) + + ROL8(msg[4 + 3]) - ROTL64(msg[4 + 10], 4 + 11)) ^ hash[4 + 7]); q[5 + 16] = CONST_EXP2(5) + - ((make_uint2(0xFFFFFFF9, 0x6FFFFFFF) + ROTL64(msg[5], 5 + 1) + - ROTL64(msg[5 + 3], 5 + 4) - ROTL64(msg[5 + 10], 5 + 11)) ^ hash[5 + 7]); + ((precalc[5] + ROTL64(msg[5], 5 + 1) + + ROTL64(msg[5 + 3], 5 + 4) - ROL16(msg[5 + 10])) ^ hash[5 + 7]); + q[6 + 16] = CONST_EXP2(6) + - ((make_uint2(0x5555554E, 0x75555555)+ ROTL64(msg[6], 6 + 1) + + ((precalc[6]+ ROTL64(msg[6], 6 + 1) + ROTL64(msg[6 + 3], 6 + 4) - ROTL64(msg[6 - 6], (6 - 6) + 1)) ^ hash[6 + 7]); q[7 + 16] = CONST_EXP2(7) + - ((make_uint2(0xAAAAAAA3, 0x7AAAAAAA) + ROTL64(msg[7], 7 + 1) + + ((precalc[7] + ROL8(msg[7]) + ROTL64(msg[7 + 3], 7 + 4) - ROTL64(msg[7 - 6], (7 - 6) + 1)) ^ hash[7 + 7]); q[8 + 16] = CONST_EXP2(8) + - ((make_uint2(0xFFFFFFF8, 0x7FFFFFFF) + ROTL64(msg[8], 8 + 1) + + ((precalc[8] + ROTL64(msg[8], 8 + 1) + ROTL64(msg[8 + 3], 8 + 4) - ROTL64(msg[8 - 6], (8 - 6) + 1)) ^ hash[8 + 7]); q[9 + 16] = CONST_EXP2(9) + - ((make_uint2(0x5555554D, 0x85555555) + ROTL64(msg[9], 9 + 1) + + ((precalc[9] + ROTL64(msg[9], 9 
+ 1) + ROTL64(msg[9 + 3], 9 + 4) - ROTL64(msg[9 - 6], (9 - 6) + 1)) ^ hash[9 - 9]); q[10 + 16] = CONST_EXP2(10) + - ((make_uint2(0xAAAAAAA2, 0x8AAAAAAA) + ROTL64(msg[10], 10 + 1) + + ((precalc[10] + ROTL64(msg[10], 10 + 1) + ROTL64(msg[10 + 3], 10 + 4) - ROTL64(msg[10 - 6], (10 - 6) + 1)) ^ hash[10 - 9]); q[11 + 16] = CONST_EXP2(11) + - ((make_uint2(0xFFFFFFF7, 0x8FFFFFFF) + ROTL64(msg[11], 11 + 1) + + ((precalc[11] + ROTL64(msg[11], 11 + 1) + ROTL64(msg[11 + 3], 11 + 4) - ROTL64(msg[11 - 6], (11 - 6) + 1)) ^ hash[11 - 9]); q[12 + 16] = CONST_EXP2(12) + - ((make_uint2(0x5555554C, 0x95555555) + ROTL64(msg[12], 12 + 1) + - ROTL64(msg[12 + 3], 12 + 4) - ROTL64(msg[12 - 6], (12 - 6) + 1)) ^ hash[12 - 9]); + ((precalc[12] + ROTL64(msg[12], 12 + 1) + + ROL16(msg[12 + 3]) - ROTL64(msg[12 - 6], (12 - 6) + 1)) ^ hash[12 - 9]); q[13 + 16] = CONST_EXP2(13) + - ((make_uint2(0xAAAAAAA1, 0x9AAAAAAA) + ROTL64(msg[13], 13 + 1) + - ROTL64(msg[13 - 13], (13 - 13) + 1) - ROTL64(msg[13 - 6], (13 - 6) + 1)) ^ hash[13 - 9]); + ((precalc[13] + ROTL64(msg[13], 13 + 1) + + ROTL64(msg[13 - 13], (13 - 13) + 1) - ROL8(msg[13 - 6])) ^ hash[13 - 9]); q[14 + 16] = CONST_EXP2(14) + - ((make_uint2(0xFFFFFFF6, 0x9FFFFFFF) + ROTL64(msg[14], 14 + 1) + + ((precalc[14] + ROTL64(msg[14], 14 + 1) + ROTL64(msg[14 - 13], (14 - 13) + 1) - ROTL64(msg[14 - 6], (14 - 6) + 1)) ^ hash[14 - 9]); q[15 + 16] = CONST_EXP2(15) + - ((make_uint2(0x5555554B, 0xA5555555) + ROTL64(msg[15], 15 + 1) + + ((precalc[15] + ROL16(msg[15]) + ROTL64(msg[15 - 13], (15 - 13) + 1) - ROTL64(msg[15 - 6], (15 - 6) + 1)) ^ hash[15 - 9]); uint2 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; @@ -306,27 +195,22 @@ __device__ void Compression512(uint2 *msg, uint2 *hash) hash[12] = ROTL64(hash[0],13) + ( XH64 ^ q[28] ^ msg[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); hash[13] = ROTL64(hash[1],14) + ( XH64 ^ q[29] ^ msg[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); hash[14] = ROTL64(hash[2],15) + ( XH64 ^ q[30] ^ msg[14]) + (SHR(XL64,7) ^ q[21] ^ 
q[14]); - hash[15] = ROTL64(hash[3],16) + (XH64 ^ q[31] ^ msg[15]) + (SHR(XL64, 2) ^ q[22] ^ q[15]); + hash[15] = ROL16(hash[3]) + (XH64 ^ q[31] ^ msg[15]) + (SHR(XL64, 2) ^ q[22] ^ q[15]); } - -__global__ -#if __CUDA_ARCH__ > 500 -__launch_bounds__(32, 16) -#else -__launch_bounds__(64, 8) -#endif -void quark_bmw512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +__global__ __launch_bounds__(32, 16) +void quark_bmw512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + + const uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; - uint64_t *inpHash = &g_hash[8 * hashPosition]; + const uint32_t hashPosition = nounce - startNounce; + uint64_t *const inpHash = &g_hash[8 * hashPosition]; - // Init - uint2 h[16] = { + const uint2 hash[16] = + { { 0x84858687UL, 0x80818283UL }, { 0x8C8D8E8FUL, 0x88898A8BUL }, { 0x94959697UL, 0x90919293UL }, @@ -335,8 +219,8 @@ void quark_bmw512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t * { 0xACADAEAFUL, 0xA8A9AAABUL }, { 0xB4B5B6B7UL, 0xB0B1B2B3UL }, { 0xBCBDBEBFUL, 0xB8B9BABBUL }, - { 0xC4C5C6C7UL, 0xC0C1C2C3UL, }, - { 0xCCCDCECFUL, 0xC8C9CACBUL, }, + { 0xC4C5C6C7UL, 0xC0C1C2C3UL }, + { 0xCCCDCECFUL, 0xC8C9CACBUL }, { 0xD4D5D6D7UL, 0xD0D1D2D3UL }, { 0xDCDDDEDFUL, 0xD8D9DADBUL }, { 0xE4E5E6E7UL, 0xE0E1E2E3UL }, @@ -344,52 +228,470 @@ void quark_bmw512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t * { 0xF4F5F6F7UL, 0xF0F1F2F3UL }, { 0xFCFDFEFFUL, 0xF8F9FAFBUL } }; - // Nachricht kopieren (Achtung, die Nachricht hat 
64 Byte, - // BMW arbeitet mit 128 Byte!!! - uint2 message[16]; + + const uint64_t hash2[16] = + { + 0x8081828384858687, + 0x88898A8B8C8D8E8F, + 0x9091929394959697, + 0x98999A9B9C9D9E9F, + 0xA0A1A2A3A4A5A6A7, + 0xA8A9AAABACADAEAF, + 0xB0B1B2B3B4B5B6B7, + 0xB8B9BABBBCBDBEBF, + 0xC0C1C2C3C4C5C6C7^0x80, + 0xC8C9CACBCCCDCECF, + 0xD0D1D2D3D4D5D6D7, + 0xD8D9DADBDCDDDEDF, + 0xE0E1E2E3E4E5E6E7, + 0xE8E9EAEBECEDEEEF, + 0xF0F1F2F3F4F5F6F7, + 0xF8F9FAFBFCFDFEFF + }; + + uint64_t msg[16]; + uint2 msg2[16]; + uint64_t mxh[8]; + uint2 h[16]; + + uint28 *phash = (uint28*)inpHash; + uint28 *outpt = (uint28*)msg2; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + #pragma unroll 8 - for(int i=0;i<8;i++) - message[i] = vectorize(inpHash[i]); -#pragma unroll 6 - for(int i=9;i<15;i++) - message[i] = make_uint2(0,0); + for (int i = 0; i < 8; i++) + { + msg[i] = devectorize(msg2[i]); + } + + + mxh[0] = msg[0] ^ hash2[0]; + mxh[1] = msg[1] ^ hash2[1]; + mxh[2] = msg[2] ^ hash2[2]; + mxh[3] = msg[3] ^ hash2[3]; + mxh[4] = msg[4] ^ hash2[4]; + mxh[5] = msg[5] ^ hash2[5]; + mxh[6] = msg[6] ^ hash2[6]; + mxh[7] = msg[7] ^ hash2[7]; + + const uint2 precalcf[9] = + { + { 0x55555550ul, 0x55555555 }, + { 0xAAAAAAA5, 0x5AAAAAAA }, + { 0xFFFFFFFA, 0x5FFFFFFF }, + { 0x5555554F, 0x65555555 }, + { 0xAAAAAAA4, 0x6AAAAAAA }, + { 0xFE00FFF9, 0x6FFFFFFF }, + { 0xAAAAAAA1, 0x9AAAAAAA }, + { 0xFFFEFFF6, 0x9FFFFFFF }, + { 0x5755554B, 0xA5555555 }, + }; - // Padding einfügen (Byteorder?!?) - message[8] = make_uint2(0x80,0); - // Länge (in Bits, d.h. 
64 Byte * 8 = 512 Bits - message[15] = make_uint2(512,0); + uint2 q[32]; - // Compression 1 - Compression512_64_first(message, h); + uint2 tmp; + tmp = vectorize((mxh[5]) - (mxh[7]) + (hash2[10] + hash2[13] + hash2[14])); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[1]; + tmp = vectorize((mxh[6]) + (hash2[11] + hash2[14] - (512 ^ hash2[15]) - hash2[8])); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[2]; + tmp = vectorize((mxh[0] + mxh[7]) + hash2[9] - hash2[12] + (512 ^ hash2[15])); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[3]; + tmp = vectorize((mxh[0] - mxh[1]) + hash2[8] - hash2[10] + hash2[13]); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[4]; + tmp = vectorize((mxh[1] + mxh[2]) + hash2[9] - hash2[11] - hash2[14]); + q[4] = (SHR(tmp, 1) ^ tmp) + hash[5]; + tmp = vectorize((mxh[3] - mxh[2] + hash2[10] - hash2[12] + (512 ^ hash2[15]))); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[6]; + tmp = vectorize((mxh[4]) - (mxh[0]) - (mxh[3]) + hash2[13] - hash2[11]); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[7]; + tmp = vectorize((mxh[1]) - (mxh[4]) - (mxh[5]) - hash2[12] - hash2[14]); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[8]; + tmp = vectorize((mxh[2]) - (mxh[5]) - (mxh[6]) + hash2[13] - (512 ^ hash2[15])); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[9]; + tmp = vectorize((mxh[0]) - (mxh[3]) + (mxh[6]) - (mxh[7]) + (hash2[14])); + q[9] = (SHR(tmp, 1) ^ tmp) + hash[10]; + tmp = vectorize((512 ^ hash2[15]) + hash2[8] - (mxh[1]) - (mxh[4]) - (mxh[7])); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[11]; + tmp = vectorize(hash2[9] + hash2[8] - (mxh[0]) - (mxh[2]) - (mxh[5])); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + 
hash[12]; + tmp = vectorize((mxh[1]) + (mxh[3]) - (mxh[6]) + hash2[10] - hash2[9]); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[13]; + tmp = vectorize((mxh[2]) + (mxh[4]) + (mxh[7]) + hash2[10] + hash2[11]); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[14]; + tmp = vectorize((mxh[3]) - (mxh[5]) + hash2[8] - hash2[11] - hash2[12]); + q[14] = (SHR(tmp, 1) ^ tmp) + hash[15]; + tmp = vectorize(hash2[12] - hash2[9] + hash2[13] - (mxh[4]) - (mxh[6])); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[0]; + + q[0 + 16] = + (SHR(q[0], 1) ^ SHL(q[0], 2) ^ ROTL64(q[0], 13) ^ ROTL64(q[0], 43)) + + (SHR(q[0 + 1], 2) ^ SHL(q[0 + 1], 1) ^ ROTL64(q[0 + 1], 19) ^ ROTL64(q[0 + 1], 53)) + + (SHR(q[0 + 2], 2) ^ SHL(q[0 + 2], 2) ^ ROTL64(q[0 + 2], 28) ^ ROTL64(q[0 + 2], 59)) + + (SHR(q[0 + 3], 1) ^ SHL(q[0 + 3], 3) ^ ROTL64(q[0 + 3], 4) ^ ROTL64(q[0 + 3], 37)) + + (SHR(q[0 + 4], 1) ^ SHL(q[0 + 4], 2) ^ ROTL64(q[0 + 4], 13) ^ ROTL64(q[0 + 4], 43)) + + (SHR(q[0 + 5], 2) ^ SHL(q[0 + 5], 1) ^ ROTL64(q[0 + 5], 19) ^ ROTL64(q[0 + 5], 53)) + + (SHR(q[0 + 6], 2) ^ SHL(q[0 + 6], 2) ^ ROTL64(q[0 + 6], 28) ^ ROTL64(q[0 + 6], 59)) + + (SHR(q[0 + 7], 1) ^ SHL(q[0 + 7], 3) ^ ROTL64(q[0 + 7], 4) ^ ROTL64(q[0 + 7], 37)) + + (SHR(q[0 + 8], 1) ^ SHL(q[0 + 8], 2) ^ ROTL64(q[0 + 8], 13) ^ ROTL64(q[0 + 8], 43)) + + (SHR(q[0 + 9], 2) ^ SHL(q[0 + 9], 1) ^ ROTL64(q[0 + 9], 19) ^ ROTL64(q[0 + 9], 53)) + + (SHR(q[0 + 10], 2) ^ SHL(q[0 + 10], 2) ^ ROTL64(q[0 + 10], 28) ^ ROTL64(q[0 + 10], 59)) + + (SHR(q[0 + 11], 1) ^ SHL(q[0 + 11], 3) ^ ROTL64(q[0 + 11], 4) ^ ROTL64(q[0 + 11], 37)) + + (SHR(q[0 + 12], 1) ^ SHL(q[0 + 12], 2) ^ ROTL64(q[0 + 12], 13) ^ ROTL64(q[0 + 12], 43)) + + (SHR(q[0 + 13], 2) ^ SHL(q[0 + 13], 1) ^ ROTL64(q[0 + 13], 19) ^ ROTL64(q[0 + 13], 53)) + + (SHR(q[0 + 14], 2) ^ SHL(q[0 + 14], 2) ^ ROTL64(q[0 + 14], 28) ^ ROTL64(q[0 + 14], 59)) + + (SHR(q[0 + 15], 1) ^ SHL(q[0 + 15], 3) ^ ROTL64(q[0 + 
15], 4) ^ ROTL64(q[0 + 15], 37)) + + ((precalcf[0] + ROTL64(msg2[0], 0 + 1) + + ROTL64(msg2[0 + 3], 0 + 4)) ^ hash[0 + 7]); + q[1 + 16] = + (SHR(q[1], 1) ^ SHL(q[1], 2) ^ ROTL64(q[1], 13) ^ ROTL64(q[1], 43)) + + (SHR(q[1 + 1], 2) ^ SHL(q[1 + 1], 1) ^ ROTL64(q[1 + 1], 19) ^ ROTL64(q[1 + 1], 53)) + + (SHR(q[1 + 2], 2) ^ SHL(q[1 + 2], 2) ^ ROTL64(q[1 + 2], 28) ^ ROTL64(q[1 + 2], 59)) + + (SHR(q[1 + 3], 1) ^ SHL(q[1 + 3], 3) ^ ROTL64(q[1 + 3], 4) ^ ROTL64(q[1 + 3], 37)) + + (SHR(q[1 + 4], 1) ^ SHL(q[1 + 4], 2) ^ ROTL64(q[1 + 4], 13) ^ ROTL64(q[1 + 4], 43)) + + (SHR(q[1 + 5], 2) ^ SHL(q[1 + 5], 1) ^ ROTL64(q[1 + 5], 19) ^ ROTL64(q[1 + 5], 53)) + + (SHR(q[1 + 6], 2) ^ SHL(q[1 + 6], 2) ^ ROTL64(q[1 + 6], 28) ^ ROTL64(q[1 + 6], 59)) + + (SHR(q[1 + 7], 1) ^ SHL(q[1 + 7], 3) ^ ROTL64(q[1 + 7], 4) ^ ROTL64(q[1 + 7], 37)) + + (SHR(q[1 + 8], 1) ^ SHL(q[1 + 8], 2) ^ ROTL64(q[1 + 8], 13) ^ ROTL64(q[1 + 8], 43)) + + (SHR(q[1 + 9], 2) ^ SHL(q[1 + 9], 1) ^ ROTL64(q[1 + 9], 19) ^ ROTL64(q[1 + 9], 53)) + + (SHR(q[1 + 10], 2) ^ SHL(q[1 + 10], 2) ^ ROTL64(q[1 + 10], 28) ^ ROTL64(q[1 + 10], 59)) + + (SHR(q[1 + 11], 1) ^ SHL(q[1 + 11], 3) ^ ROTL64(q[1 + 11], 4) ^ ROTL64(q[1 + 11], 37)) + + (SHR(q[1 + 12], 1) ^ SHL(q[1 + 12], 2) ^ ROTL64(q[1 + 12], 13) ^ ROTL64(q[1 + 12], 43)) + + (SHR(q[1 + 13], 2) ^ SHL(q[1 + 13], 1) ^ ROTL64(q[1 + 13], 19) ^ ROTL64(q[1 + 13], 53)) + + (SHR(q[1 + 14], 2) ^ SHL(q[1 + 14], 2) ^ ROTL64(q[1 + 14], 28) ^ ROTL64(q[1 + 14], 59)) + + (SHR(q[1 + 15], 1) ^ SHL(q[1 + 15], 3) ^ ROTL64(q[1 + 15], 4) ^ ROTL64(q[1 + 15], 37)) + + ((precalcf[1] + ROTL64(msg2[1], 1 + 1) + + ROTL64(msg2[1 + 3], 1 + 4)) ^ hash[1 + 7]); + + uint2 pre1 = q[2 + 0] + q[2 + 2] + q[2 + 4] + q[2 + 6] + q[2 + 8] + q[2 + 10] + q[2 + 12]; + uint2 pre2 = q[3 + 0] + q[3 + 2] + q[3 + 4] + q[3 + 6] + q[3 + 8] + q[3 + 10] + q[3 + 12]; + + q[2 + 16] = pre1 + CONST_EXP3(2) + + ((precalcf[2] + ROTL64(msg2[2], 2 + 1) + + ROTL64(msg2[2 + 3], 2 + 4)) ^ hash[2 + 7]); + q[3 + 16] = pre2 + CONST_EXP3(3) + + 
((precalcf[3] + ROTL64(msg2[3], 3 + 1) + + ROTL64(msg2[3 + 3], 3 + 4)) ^ hash[3 + 7]); + pre1 = pre1 - q[2 + 0] + q[2 + 14]; + pre2 = pre2 - q[3 + 0] + q[3 + 14]; + + q[4 + 16] = pre1 + CONST_EXP3(4) + + ((precalcf[4] + ROTL64(msg2[4], 4 + 1) + + ROL8(msg2[4 + 3])) ^ hash[4 + 7]); + q[5 + 16] = pre2 + CONST_EXP3(5) + + ((precalcf[5] + ROTL64(msg2[5], 5 + 1)) + ^ hash[5 + 7]); + + pre1 = pre1 - q[4 + 0] + q[4 + 14]; + pre2 = pre2 - q[5 + 0] + q[5 + 14]; + + + q[6 + 16] = pre1 + CONST_EXP3(6) + + ((vectorize((6 + 16)*(0x0555555555555555ull)) + ROTL64(msg2[6], 6 + 1) - + ROTL64(msg2[6 - 6], (6 - 6) + 1)) ^ hash[13]); + q[7 + 16] = pre2 + CONST_EXP3(7) + + ((vectorize((7 + 16)*(0x0555555555555555ull)) + ROTL64(msg2[7], 7 + 1) - + ROTL64(msg2[7 - 6], (7 - 6) + 1)) ^ hash[14]); + + pre1 = pre1 - q[6 + 0] + q[6 + 14]; + pre2 = pre2 - q[7 + 0] + q[7 + 14]; + + q[8 + 16] = pre1 + CONST_EXP3(8) + + ((vectorize((8 + 16)*(0x0555555555555555ull) + 0x10000) - + ROTL64(msg2[8 - 6], (8 - 6) + 1)) ^ hash[15]); + q[25] = pre2 + CONST_EXP3(9) + + ((vectorize((25)*(0x0555555555555555ull)) - ROTL64(msg2[3], 4)) ^ hash[0]); + + pre1 = pre1 - q[8 + 0] + q[8 + 14]; + pre2 = pre2 - q[9 + 0] + q[9 + 14]; + + q[26] = pre1 + CONST_EXP3(10) + + ((vectorize((26)*(0x0555555555555555ull)) - ROTL64(msg2[4], 5)) ^ hash[1]); + q[27] = pre2 + CONST_EXP3(11) + + ((vectorize((27)*(0x0555555555555555ull)) - ROTL64(msg2[5], 6)) ^ hash[2]); + + pre1 = pre1 - q[10 + 0] + q[10 + 14]; + pre2 = pre2 - q[11 + 0] + q[11 + 14]; + + q[28] = pre1 + CONST_EXP3(12) + + ((vectorize(0x955555555755554C) - ROTL64(msg2[6], 7)) ^ hash[3]); + q[13 + 16] = pre2 + CONST_EXP3(13) + + ((precalcf[6] + + ROTL64(msg2[13 - 13], (13 - 13) + 1) - ROL8(msg2[13 - 6])) ^ hash[13 - 9]); + + pre1 = pre1 - q[12 + 0] + q[12 + 14]; + pre2 = pre2 - q[13 + 0] + q[13 + 14]; + + q[14 + 16] = pre1 + CONST_EXP3(14) + + ((precalcf[7] + + ROTL64(msg2[14 - 13], (14 - 13) + 1)) ^ hash[14 - 9]); + q[15 + 16] = pre2 + CONST_EXP3(15) + + ((precalcf[8] + 
+ ROTL64(msg2[15 - 13], (15 - 13) + 1)) ^ hash[15 - 9]); + + + uint2 XL64 = q[16] ^ q[17] ^ q[18] ^ q[19] ^ q[20] ^ q[21] ^ q[22] ^ q[23]; + uint2 XH64 = XL64^q[24] ^ q[25] ^ q[26] ^ q[27] ^ q[28] ^ q[29] ^ q[30] ^ q[31]; + + h[0] = (SHL(XH64, 5) ^ SHR(q[16], 5) ^ msg2[0]) + (XL64 ^ q[24] ^ q[0]); + h[1] = (SHR(XH64, 7) ^ SHL(q[17], 8) ^ msg2[1]) + (XL64 ^ q[25] ^ q[1]); + h[2] = (SHR(XH64, 5) ^ SHL(q[18], 5) ^ msg2[2]) + (XL64 ^ q[26] ^ q[2]); + h[3] = (SHR(XH64, 1) ^ SHL(q[19], 5) ^ msg2[3]) + (XL64 ^ q[27] ^ q[3]); + h[4] = (SHR(XH64, 3) ^ q[20] ^ msg2[4]) + (XL64 ^ q[28] ^ q[4]); + h[5] = (SHL(XH64, 6) ^ SHR(q[21], 6) ^ msg2[5]) + (XL64 ^ q[29] ^ q[5]); + h[6] = (SHR(XH64, 4) ^ SHL(q[22], 6) ^ msg2[6]) + (XL64 ^ q[30] ^ q[6]); + h[7] = (SHR(XH64, 11) ^ SHL(q[23], 2) ^ msg2[7]) + (XL64 ^ q[31] ^ q[7]); + + h[8] = ROTL64(h[4], 9) + (XH64 ^ q[24] ^ 0x80) + (SHL(XL64, 8) ^ q[23] ^ q[8]); + h[9] = ROTL64(h[5], 10) + (XH64 ^ q[25]) + (SHR(XL64, 6) ^ q[16] ^ q[9]); + h[10] = ROTL64(h[6], 11) + (XH64 ^ q[26]) + (SHL(XL64, 6) ^ q[17] ^ q[10]); + h[11] = ROTL64(h[7], 12) + (XH64 ^ q[27]) + (SHL(XL64, 4) ^ q[18] ^ q[11]); + h[12] = ROTL64(h[0], 13) + (XH64 ^ q[28]) + (SHR(XL64, 3) ^ q[19] ^ q[12]); + h[13] = ROTL64(h[1], 14) + (XH64 ^ q[29]) + (SHR(XL64, 4) ^ q[20] ^ q[13]); + h[14] = ROTL64(h[2], 15) + (XH64 ^ q[30]) + (SHR(XL64, 7) ^ q[21] ^ q[14]); + h[15] = ROL16(h[3]) + (XH64 ^ q[31] ^ (512)) + (SHR(XL64, 2) ^ q[22] ^ q[15]); + + const uint2 cmsg[16] = + { + 0xaaaaaaa0, 0xaaaaaaaa, + 0xaaaaaaa1, 0xaaaaaaaa, + 0xaaaaaaa2, 0xaaaaaaaa, + 0xaaaaaaa3, 0xaaaaaaaa, + 0xaaaaaaa4, 0xaaaaaaaa, + 0xaaaaaaa5, 0xaaaaaaaa, + 0xaaaaaaa6, 0xaaaaaaaa, + 0xaaaaaaa7, 0xaaaaaaaa, + 0xaaaaaaa8, 0xaaaaaaaa, + 0xaaaaaaa9, 0xaaaaaaaa, + 0xaaaaaaaa, 0xaaaaaaaa, + 0xaaaaaaab, 0xaaaaaaaa, + 0xaaaaaaac, 0xaaaaaaaa, + 0xaaaaaaad, 0xaaaaaaaa, + 0xaaaaaaae, 0xaaaaaaaa, + 0xaaaaaaaf, 0xaaaaaaaa + }; - // Final #pragma unroll 16 - for(int i=0;i<16;i++) + for (int i = 0; i < 16; i++) { - message[i].y 
= 0xaaaaaaaa; - message[i].x = 0xaaaaaaa0ul + (uint32_t)i; + msg[i] = devectorize(cmsg[i] ^ h[i]); } - Compression512(h, message); - // fertig - uint64_t *outpHash = &g_hash[8 * hashPosition]; -#pragma unroll 8 - for(int i=0;i<8;i++) - outpHash[i] = devectorize(message[i+8]); - } + const uint2 precalc[16] = + { + { 0x55555550, 0x55555555 }, + { 0xAAAAAAA5, 0x5AAAAAAA }, + { 0xFFFFFFFA, 0x5FFFFFFF }, + { 0x5555554F, 0x65555555 }, + { 0xAAAAAAA4, 0x6AAAAAAA }, + { 0xFFFFFFF9, 0x6FFFFFFF }, + { 0x5555554E, 0x75555555 }, + { 0xAAAAAAA3, 0x7AAAAAAA }, + { 0xFFFFFFF8, 0x7FFFFFFF }, + { 0x5555554D, 0x85555555 }, + { 0xAAAAAAA2, 0x8AAAAAAA }, + { 0xFFFFFFF7, 0x8FFFFFFF }, + { 0x5555554C, 0x95555555 }, + { 0xAAAAAAA1, 0x9AAAAAAA }, + { 0xFFFFFFF6, 0x9FFFFFFF }, + { 0x5555554B, 0xA5555555 }, + }; + + const uint64_t p2 = msg[15] - msg[12]; + const uint64_t p3 = msg[14] - msg[7]; + const uint64_t p4 = msg[6] + msg[9]; + const uint64_t p5 = msg[8] - msg[5]; + const uint64_t p6 = msg[1] - msg[14]; + const uint64_t p7 = msg[8] - msg[1]; + const uint64_t p8 = msg[3] + msg[10]; + + + tmp = vectorize((msg[5]) + (msg[10]) + (msg[13]) + p3); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + cmsg[1]; + tmp = vectorize((msg[6]) - (msg[8]) + (msg[11]) + (msg[14]) - (msg[15])); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + cmsg[2]; + tmp = vectorize((msg[0]) + (msg[7]) + (msg[9]) + p2); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + cmsg[3]; + tmp = vectorize((msg[0]) + p7 - (msg[10]) + (msg[13])); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + cmsg[4]; + tmp = vectorize((msg[2]) + (msg[9]) - (msg[11]) + p6); + q[4] = (SHR(tmp, 1) ^ tmp) + cmsg[5]; + tmp = vectorize(p8 + p2 - (msg[2])); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + cmsg[6]; + tmp = vectorize((msg[4]) - (msg[0]) - (msg[3]) - (msg[11]) + (msg[13])); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ 
ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + cmsg[7]; + tmp = vectorize(p6 - (msg[4]) - (msg[5]) - (msg[12])); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + cmsg[8]; + tmp = vectorize((msg[2]) - (msg[5]) - (msg[6]) + (msg[13]) - (msg[15])); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + cmsg[9]; + tmp = vectorize((msg[0]) - (msg[3]) + (msg[6]) + p3); + q[9] = (SHR(tmp, 1) ^ tmp) + cmsg[10]; + tmp = vectorize(p7 - (msg[4]) - (msg[7]) + (msg[15])); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + cmsg[11]; + tmp = vectorize(p5 - (msg[0]) - (msg[2]) + (msg[9])); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + cmsg[12]; + tmp = vectorize(p8+msg[1] - p4 ); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + cmsg[13]; + tmp = vectorize((msg[2]) + (msg[4]) + (msg[7]) + (msg[10]) + (msg[11])); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + cmsg[14]; + tmp = vectorize((msg[3]) + p5 - (msg[11]) - (msg[12])); + q[14] = (SHR(tmp, 1) ^ tmp) + cmsg[15]; + tmp = vectorize((msg[12]) - (msg[4]) - p4 + (msg[13])); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + cmsg[0]; + + q[0 + 16] = + (SHR(q[0], 1) ^ SHL(q[0], 2) ^ ROTL64(q[0], 13) ^ ROTL64(q[0], 43)) + + (SHR(q[0 + 1], 2) ^ SHL(q[0 + 1], 1) ^ ROTL64(q[0 + 1], 19) ^ ROTL64(q[0 + 1], 53)) + + (SHR(q[0 + 2], 2) ^ SHL(q[0 + 2], 2) ^ ROTL64(q[0 + 2], 28) ^ ROTL64(q[0 + 2], 59)) + + (SHR(q[0 + 3], 1) ^ SHL(q[0 + 3], 3) ^ ROTL64(q[0 + 3], 4) ^ ROTL64(q[0 + 3], 37)) + + (SHR(q[0 + 4], 1) ^ SHL(q[0 + 4], 2) ^ ROTL64(q[0 + 4], 13) ^ ROTL64(q[0 + 4], 43)) + + (SHR(q[0 + 5], 2) ^ SHL(q[0 + 5], 1) ^ ROTL64(q[0 + 5], 19) ^ ROTL64(q[0 + 5], 53)) + + (SHR(q[0 + 6], 2) ^ SHL(q[0 + 6], 2) ^ ROTL64(q[0 + 6], 28) ^ ROTL64(q[0 + 6], 59)) + + (SHR(q[0 + 7], 1) ^ SHL(q[0 + 7], 3) ^ ROTL64(q[0 + 7], 4) ^ ROTL64(q[0 + 7], 37)) + + (SHR(q[0 + 8], 1) ^ SHL(q[0 + 8], 2) ^ 
ROTL64(q[0 + 8], 13) ^ ROTL64(q[0 + 8], 43)) + + (SHR(q[0 + 9], 2) ^ SHL(q[0 + 9], 1) ^ ROTL64(q[0 + 9], 19) ^ ROTL64(q[0 + 9], 53)) + + (SHR(q[0 + 10], 2) ^ SHL(q[0 + 10], 2) ^ ROTL64(q[0 + 10], 28) ^ ROTL64(q[0 + 10], 59)) + + (SHR(q[0 + 11], 1) ^ SHL(q[0 + 11], 3) ^ ROTL64(q[0 + 11], 4) ^ ROTL64(q[0 + 11], 37)) + + (SHR(q[0 + 12], 1) ^ SHL(q[0 + 12], 2) ^ ROTL64(q[0 + 12], 13) ^ ROTL64(q[0 + 12], 43)) + + (SHR(q[0 + 13], 2) ^ SHL(q[0 + 13], 1) ^ ROTL64(q[0 + 13], 19) ^ ROTL64(q[0 + 13], 53)) + + (SHR(q[0 + 14], 2) ^ SHL(q[0 + 14], 2) ^ ROTL64(q[0 + 14], 28) ^ ROTL64(q[0 + 14], 59)) + + (SHR(q[0 + 15], 1) ^ SHL(q[0 + 15], 3) ^ ROTL64(q[0 + 15], 4) ^ ROTL64(q[0 + 15], 37)) + + ((precalc[0] + ROTL64(h[0], 0 + 1) + + ROTL64(h[0 + 3], 0 + 4) - ROTL64(h[0 + 10], 0 + 11)) ^ cmsg[0 + 7]); + q[1 + 16] = + (SHR(q[1], 1) ^ SHL(q[1], 2) ^ ROTL64(q[1], 13) ^ ROTL64(q[1], 43)) + + (SHR(q[1 + 1], 2) ^ SHL(q[1 + 1], 1) ^ ROTL64(q[1 + 1], 19) ^ ROTL64(q[1 + 1], 53)) + + (SHR(q[1 + 2], 2) ^ SHL(q[1 + 2], 2) ^ ROTL64(q[1 + 2], 28) ^ ROTL64(q[1 + 2], 59)) + + (SHR(q[1 + 3], 1) ^ SHL(q[1 + 3], 3) ^ ROTL64(q[1 + 3], 4) ^ ROTL64(q[1 + 3], 37)) + + (SHR(q[1 + 4], 1) ^ SHL(q[1 + 4], 2) ^ ROTL64(q[1 + 4], 13) ^ ROTL64(q[1 + 4], 43)) + + (SHR(q[1 + 5], 2) ^ SHL(q[1 + 5], 1) ^ ROTL64(q[1 + 5], 19) ^ ROTL64(q[1 + 5], 53)) + + (SHR(q[1 + 6], 2) ^ SHL(q[1 + 6], 2) ^ ROTL64(q[1 + 6], 28) ^ ROTL64(q[1 + 6], 59)) + + (SHR(q[1 + 7], 1) ^ SHL(q[1 + 7], 3) ^ ROTL64(q[1 + 7], 4) ^ ROTL64(q[1 + 7], 37)) + + (SHR(q[1 + 8], 1) ^ SHL(q[1 + 8], 2) ^ ROTL64(q[1 + 8], 13) ^ ROTL64(q[1 + 8], 43)) + + (SHR(q[1 + 9], 2) ^ SHL(q[1 + 9], 1) ^ ROTL64(q[1 + 9], 19) ^ ROTL64(q[1 + 9], 53)) + + (SHR(q[1 + 10], 2) ^ SHL(q[1 + 10], 2) ^ ROTL64(q[1 + 10], 28) ^ ROTL64(q[1 + 10], 59)) + + (SHR(q[1 + 11], 1) ^ SHL(q[1 + 11], 3) ^ ROTL64(q[1 + 11], 4) ^ ROTL64(q[1 + 11], 37)) + + (SHR(q[1 + 12], 1) ^ SHL(q[1 + 12], 2) ^ ROTL64(q[1 + 12], 13) ^ ROTL64(q[1 + 12], 43)) + + (SHR(q[1 + 13], 2) ^ SHL(q[1 + 13], 1) ^ ROTL64(q[1 
+ 13], 19) ^ ROTL64(q[1 + 13], 53)) + + (SHR(q[1 + 14], 2) ^ SHL(q[1 + 14], 2) ^ ROTL64(q[1 + 14], 28) ^ ROTL64(q[1 + 14], 59)) + + (SHR(q[1 + 15], 1) ^ SHL(q[1 + 15], 3) ^ ROTL64(q[1 + 15], 4) ^ ROTL64(q[1 + 15], 37)) + + ((precalc[1] + ROTL64(h[1], 1 + 1) + + ROTL64(h[1 + 3], 1 + 4) - ROTL64(h[1 + 10], 1 + 11)) ^ cmsg[1 + 7]); + + pre1 = q[2] + q[4] + q[6] + q[8] + q[10] + q[12] + q[14]; + pre2 = q[3] + q[5] + q[7] + q[9] + q[11] + q[13] + q[15]; + + q[2 + 16] = pre1 + CONST_EXP3(2) + + ((precalc[2] + ROTL64(h[2], 2 + 1) + + ROTL64(h[2 + 3], 2 + 4) - ROTL64(h[2 + 10], 2 + 11)) ^ cmsg[2 + 7]); + q[3 + 16] = pre2 + CONST_EXP3(3) + + ((precalc[3] + ROTL64(h[3], 3 + 1) + + ROTL64(h[3 + 3], 3 + 4) - ROTL64(h[3 + 10], 3 + 11)) ^ cmsg[3 + 7]); + + pre1 = pre1 - q[2 + 0] + q[2 + 14]; + pre2 = pre2 - q[3 + 0] + q[3 + 14]; + + q[4 + 16] = pre1 + CONST_EXP3(4) + + ((precalc[4] + ROTL64(h[4], 4 + 1) + + ROL8(h[4 + 3]) - ROTL64(h[4 + 10], 4 + 11)) ^ cmsg[4 + 7]); + q[5 + 16] = pre2 + CONST_EXP3(5) + + ((precalc[5] + ROTL64(h[5], 5 + 1) + + ROTL64(h[5 + 3], 5 + 4) - ROL16(h[5 + 10])) ^ cmsg[5 + 7]); + + pre1 = pre1 - q[4 + 0] + q[4 + 14]; + pre2 = pre2 - q[5 + 0] + q[5 + 14]; + + q[6 + 16] = pre1 + CONST_EXP3(6) + + ((precalc[6] + ROTL64(h[6], 6 + 1) + + ROTL64(h[6 + 3], 6 + 4) - ROTL64(h[6 - 6], (6 - 6) + 1)) ^ cmsg[6 + 7]); + q[7 + 16] = pre2 + CONST_EXP3(7) + + ((precalc[7] + ROL8(h[7]) + + ROTL64(h[7 + 3], 7 + 4) - ROTL64(h[7 - 6], (7 - 6) + 1)) ^ cmsg[7 + 7]); + + pre1 = pre1 - q[6 + 0] + q[6 + 14]; + pre2 = pre2 - q[7 + 0] + q[7 + 14]; + + q[8 + 16] = pre1 + CONST_EXP3(8) + + ((precalc[8] + ROTL64(h[8], 8 + 1) + + ROTL64(h[8 + 3], 8 + 4) - ROTL64(h[8 - 6], (8 - 6) + 1)) ^ cmsg[8 + 7]); + q[9 + 16] = pre2 + CONST_EXP3(9) + + ((precalc[9] + ROTL64(h[9], 9 + 1) + + ROTL64(h[9 + 3], 9 + 4) - ROTL64(h[9 - 6], (9 - 6) + 1)) ^ cmsg[9 - 9]); + + pre1 = pre1 - q[8 + 0] + q[8 + 14]; + pre2 = pre2 - q[9 + 0] + q[9 + 14]; + + q[10 + 16] = pre1 + CONST_EXP3(10) + + ((precalc[10] + 
ROTL64(h[10], 10 + 1) + + ROTL64(h[10 + 3], 10 + 4) - ROTL64(h[10 - 6], (10 - 6) + 1)) ^ cmsg[10 - 9]); + q[11 + 16] = pre2 + CONST_EXP3(11) + + ((precalc[11] + ROTL64(h[11], 11 + 1) + + ROTL64(h[11 + 3], 11 + 4) - ROTL64(h[11 - 6], (11 - 6) + 1)) ^ cmsg[11 - 9]); + + pre1 = pre1 - q[10 + 0] + q[10 + 14]; + pre2 = pre2 - q[11 + 0] + q[11 + 14]; + + q[12 + 16] = pre1 + CONST_EXP3(12) + + ((precalc[12] + ROTL64(h[12], 12 + 1) + + ROL16(h[12 + 3]) - ROTL64(h[12 - 6], (12 - 6) + 1)) ^ cmsg[12 - 9]); + q[13 + 16] = pre2 + CONST_EXP3(13) + + ((precalc[13] + ROTL64(h[13], 13 + 1) + + ROTL64(h[13 - 13], (13 - 13) + 1) - ROL8(h[13 - 6])) ^ cmsg[13 - 9]); + + pre1 = pre1 - q[12 + 0] + q[12 + 14]; + pre2 = pre2 - q[13 + 0] + q[13 + 14]; + + q[14 + 16] = pre1 + CONST_EXP3(14) + + ((precalc[14] + ROTL64(h[14], 14 + 1) + + ROTL64(h[14 - 13], (14 - 13) + 1) - ROTL64(h[14 - 6], (14 - 6) + 1)) ^ cmsg[14 - 9]); + q[15 + 16] = pre2 + CONST_EXP3(15) + + ((precalc[15] + ROL16(h[15]) + + ROTL64(h[15 - 13], (15 - 13) + 1) - ROTL64(h[15 - 6], (15 - 6) + 1)) ^ cmsg[15 - 9]); + + XL64 = q[16] ^ q[17] ^ q[18] ^ q[19] ^ q[20] ^ q[21] ^ q[22] ^ q[23]; + XH64 = XL64^q[24] ^ q[25] ^ q[26] ^ q[27] ^ q[28] ^ q[29] ^ q[30] ^ q[31]; + + msg2[0] = (SHL(XH64, 5) ^ SHR(q[16], 5) ^ h[0]) + (XL64 ^ q[24] ^ q[0]); + msg2[1] = (SHR(XH64, 7) ^ SHL(q[17], 8) ^ h[1]) + (XL64 ^ q[25] ^ q[1]); + msg2[2] = (SHR(XH64, 5) ^ SHL(q[18], 5) ^ h[2]) + (XL64 ^ q[26] ^ q[2]); + msg2[3] = (SHR(XH64, 1) ^ SHL(q[19], 5) ^ h[3]) + (XL64 ^ q[27] ^ q[3]); + msg2[4] = (SHR(XH64, 3) ^ q[20] ^ h[4]) + (XL64 ^ q[28] ^ q[4]); + msg2[5] = (SHL(XH64, 6) ^ SHR(q[21], 6) ^ h[5]) + (XL64 ^ q[29] ^ q[5]); + msg2[6] = (SHR(XH64, 4) ^ SHL(q[22], 6) ^ h[6]) + (XL64 ^ q[30] ^ q[6]); + msg2[7] = (SHR(XH64, 11) ^ SHL(q[23], 2) ^ h[7]) + (XL64 ^ q[31] ^ q[7]); + msg2[8] = ROTL64(msg2[4], 9) + (XH64 ^ q[24] ^ h[8]) + (SHL(XL64, 8) ^ q[23] ^ q[8]); + + msg2[9] = ROTL64(msg2[5], 10) + (XH64 ^ q[25] ^ h[9]) + (SHR(XL64, 6) ^ q[16] ^ q[9]); + 
msg2[10] = ROTL64(msg2[6], 11) + (XH64 ^ q[26] ^ h[10]) + (SHL(XL64, 6) ^ q[17] ^ q[10]); + msg2[11] = ROTL64(msg2[7], 12) + (XH64 ^ q[27] ^ h[11]) + (SHL(XL64, 4) ^ q[18] ^ q[11]); + uint28 *phash2 = (uint28*)inpHash; + phash2[0] = make_uint28(msg2[8], msg2[9], msg2[10], msg2[11]); + + msg2[12] = ROTL64(msg2[0], 13) + (XH64 ^ q[28] ^ h[12]) + (SHR(XL64, 3) ^ q[19] ^ q[12]); + msg2[13] = ROTL64(msg2[1], 14) + (XH64 ^ q[29] ^ h[13]) + (SHR(XL64, 4) ^ q[20] ^ q[13]); + msg2[14] = ROTL64(msg2[2], 15) + (XH64 ^ q[30] ^ h[14]) + (SHR(XL64, 7) ^ q[21] ^ q[14]); + msg2[15] = ROL16(msg2[3]) + (XH64 ^ q[31] ^ h[15]) + (SHR(XL64, 2) ^ q[22] ^ q[15]); + + phash2[1] = make_uint28(msg2[8], msg2[9], msg2[10], msg2[11]); + phash2[1] = make_uint28(msg2[12], msg2[13], msg2[14], msg2[15]); + + } } -__global__ __launch_bounds__(256, 2) -void quark_bmw512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) +__global__ __launch_bounds__(32, 16) +void quark_bmw512_gpu_hash_64_quark(uint32_t threads, uint32_t startNounce, uint64_t *const __restrict__ g_hash, uint32_t *const __restrict__ g_nonceVector) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = startNounce + thread; + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if(thread < threads) + { + const uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); + + const int hashPosition = nounce - startNounce; + uint64_t *const inpHash = &g_hash[8 * hashPosition]; - // Init - uint2 h[16] = { + const uint2 hash[16] = + { { 0x84858687UL, 0x80818283UL }, { 0x8C8D8E8FUL, 0x88898A8BUL }, { 0x94959697UL, 0x90919293UL }, @@ -398,8 +700,8 @@ void quark_bmw512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t * { 0xACADAEAFUL, 0xA8A9AAABUL }, { 0xB4B5B6B7UL, 0xB0B1B2B3UL }, { 0xBCBDBEBFUL, 0xB8B9BABBUL }, - { 0xC4C5C6C7UL, 0xC0C1C2C3UL, }, - { 0xCCCDCECFUL, 0xC8C9CACBUL, }, + { 0xC4C5C6C7UL, 0xC0C1C2C3UL }, + { 0xCCCDCECFUL, 0xC8C9CACBUL }, { 0xD4D5D6D7UL, 0xD0D1D2D3UL }, { 0xDCDDDEDFUL, 0xD8D9DADBUL }, { 0xE4E5E6E7UL, 0xE0E1E2E3UL }, @@ -407,32 +709,460 @@ void quark_bmw512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t * { 0xF4F5F6F7UL, 0xF0F1F2F3UL }, { 0xFCFDFEFFUL, 0xF8F9FAFBUL } }; - // Nachricht kopieren (Achtung, die Nachricht hat 64 Byte, - // BMW arbeitet mit 128 Byte!!! 
- uint2 message[16]; -#pragma unroll 16 - for(int i=0;i<16;i++) - message[i] = vectorize(c_PaddedMessage80[i]); - // die Nounce durch die thread-spezifische ersetzen - message[9].x = cuda_swab32(nounce); //REPLACE_HIWORD(message[9], cuda_swab32(nounce)); - // Compression 1 - Compression512(message, h); + const uint64_t hash2[16] = + { + 0x8081828384858687, + 0x88898A8B8C8D8E8F, + 0x9091929394959697, + 0x98999A9B9C9D9E9F, + 0xA0A1A2A3A4A5A6A7, + 0xA8A9AAABACADAEAF, + 0xB0B1B2B3B4B5B6B7, + 0xB8B9BABBBCBDBEBF, + 0xC0C1C2C3C4C5C6C7, + 0xC8C9CACBCCCDCECF, + 0xD0D1D2D3D4D5D6D7, + 0xD8D9DADBDCDDDEDF, + 0xE0E1E2E3E4E5E6E7, + 0xE8E9EAEBECEDEEEF, + 0xF0F1F2F3F4F5F6F7, + 0xF8F9FAFBFCFDFEFF + }; + + uint64_t msg[16]; + uint2 msg2[16]; + uint64_t mxh[8]; + uint2 h[16]; + + uint28 *phash = (uint28*)inpHash; + uint28 *outpt = (uint28*)msg2; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + +#pragma unroll 8 + for(int i = 0; i < 8; i++) + { + msg[i] = devectorize(msg2[i]); + } + + + mxh[0] = msg[0] ^ hash2[0]; + mxh[1] = msg[1] ^ hash2[1]; + mxh[2] = msg[2] ^ hash2[2]; + mxh[3] = msg[3] ^ hash2[3]; + mxh[4] = msg[4] ^ hash2[4]; + mxh[5] = msg[5] ^ hash2[5]; + mxh[6] = msg[6] ^ hash2[6]; + mxh[7] = msg[7] ^ hash2[7]; + + const uint2 precalcf[9] = + { + { 0x55555550ul, 0x55555555 }, + { 0xAAAAAAA5, 0x5AAAAAAA }, + { 0xFFFFFFFA, 0x5FFFFFFF }, + { 0x5555554F, 0x65555555 }, + { 0xAAAAAAA4, 0x6AAAAAAA }, + { 0xFE00FFF9, 0x6FFFFFFF }, + { 0xAAAAAAA1, 0x9AAAAAAA }, + { 0xFFFEFFF6, 0x9FFFFFFF }, + { 0x5755554B, 0xA5555555 }, + }; + + uint2 q[32]; + + uint2 tmp; + tmp = vectorize((mxh[5]) - (mxh[7]) + (hash2[10] + hash2[13] + hash2[14])); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[1]; + tmp = vectorize((mxh[6]) + (hash2[11] + hash2[14] - (512 ^ hash2[15]) - (0x80 ^ hash2[8]))); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[2]; + tmp = vectorize((mxh[0] + mxh[7]) + hash2[9] - hash2[12] + (512 ^ hash2[15])); + q[2] = 
(SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[3]; + tmp = vectorize((mxh[0] - mxh[1]) + (0x80 ^ hash2[8]) - hash2[10] + hash2[13]); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[4]; + tmp = vectorize((mxh[1] + mxh[2]) + hash2[9] - hash2[11] - hash2[14]); + q[4] = (SHR(tmp, 1) ^ tmp) + hash[5]; + tmp = vectorize((mxh[3] - mxh[2] + hash2[10] - hash2[12] + (512 ^ hash2[15]))); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[6]; + tmp = vectorize((mxh[4]) - (mxh[0]) - (mxh[3]) + hash2[13] - hash2[11]); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[7]; + tmp = vectorize((mxh[1]) - (mxh[4]) - (mxh[5]) - hash2[12] - hash2[14]); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[8]; + tmp = vectorize((mxh[2]) - (mxh[5]) - (mxh[6]) + hash2[13] - (512 ^ hash2[15])); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[9]; + tmp = vectorize((mxh[0]) - (mxh[3]) + (mxh[6]) - (mxh[7]) + (hash2[14])); + q[9] = (SHR(tmp, 1) ^ tmp) + hash[10]; + tmp = vectorize((512 ^ hash2[15]) + (0x80 ^ hash2[8]) - (mxh[1]) - (mxh[4]) - (mxh[7])); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[11]; + tmp = vectorize(hash2[9] + (0x80 ^ hash2[8]) - (mxh[0]) - (mxh[2]) - (mxh[5])); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[12]; + tmp = vectorize((mxh[1]) + (mxh[3]) - (mxh[6]) + hash2[10] - hash2[9]); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[13]; + tmp = vectorize((mxh[2]) + (mxh[4]) + (mxh[7]) + hash2[10] + hash2[11]); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[14]; + tmp = vectorize((mxh[3]) - (mxh[5]) + (0x80 ^ hash2[8]) - hash2[11] - hash2[12]); + q[14] = (SHR(tmp, 1) ^ tmp) + hash[15]; + tmp = vectorize(hash2[12] - hash2[9] + hash2[13] - (mxh[4]) - (mxh[6])); + q[15] = 
(SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[0]; + + q[0 + 16] = + (SHR(q[0], 1) ^ SHL(q[0], 2) ^ ROTL64(q[0], 13) ^ ROTL64(q[0], 43)) + + (SHR(q[0 + 1], 2) ^ SHL(q[0 + 1], 1) ^ ROTL64(q[0 + 1], 19) ^ ROTL64(q[0 + 1], 53)) + + (SHR(q[0 + 2], 2) ^ SHL(q[0 + 2], 2) ^ ROTL64(q[0 + 2], 28) ^ ROTL64(q[0 + 2], 59)) + + (SHR(q[0 + 3], 1) ^ SHL(q[0 + 3], 3) ^ ROTL64(q[0 + 3], 4) ^ ROTL64(q[0 + 3], 37)) + + (SHR(q[0 + 4], 1) ^ SHL(q[0 + 4], 2) ^ ROTL64(q[0 + 4], 13) ^ ROTL64(q[0 + 4], 43)) + + (SHR(q[0 + 5], 2) ^ SHL(q[0 + 5], 1) ^ ROTL64(q[0 + 5], 19) ^ ROTL64(q[0 + 5], 53)) + + (SHR(q[0 + 6], 2) ^ SHL(q[0 + 6], 2) ^ ROTL64(q[0 + 6], 28) ^ ROTL64(q[0 + 6], 59)) + + (SHR(q[0 + 7], 1) ^ SHL(q[0 + 7], 3) ^ ROTL64(q[0 + 7], 4) ^ ROTL64(q[0 + 7], 37)) + + (SHR(q[0 + 8], 1) ^ SHL(q[0 + 8], 2) ^ ROTL64(q[0 + 8], 13) ^ ROTL64(q[0 + 8], 43)) + + (SHR(q[0 + 9], 2) ^ SHL(q[0 + 9], 1) ^ ROTL64(q[0 + 9], 19) ^ ROTL64(q[0 + 9], 53)) + + (SHR(q[0 + 10], 2) ^ SHL(q[0 + 10], 2) ^ ROTL64(q[0 + 10], 28) ^ ROTL64(q[0 + 10], 59)) + + (SHR(q[0 + 11], 1) ^ SHL(q[0 + 11], 3) ^ ROTL64(q[0 + 11], 4) ^ ROTL64(q[0 + 11], 37)) + + (SHR(q[0 + 12], 1) ^ SHL(q[0 + 12], 2) ^ ROTL64(q[0 + 12], 13) ^ ROTL64(q[0 + 12], 43)) + + (SHR(q[0 + 13], 2) ^ SHL(q[0 + 13], 1) ^ ROTL64(q[0 + 13], 19) ^ ROTL64(q[0 + 13], 53)) + + (SHR(q[0 + 14], 2) ^ SHL(q[0 + 14], 2) ^ ROTL64(q[0 + 14], 28) ^ ROTL64(q[0 + 14], 59)) + + (SHR(q[0 + 15], 1) ^ SHL(q[0 + 15], 3) ^ ROTL64(q[0 + 15], 4) ^ ROTL64(q[0 + 15], 37)) + + ((precalcf[0] + ROTL64(msg2[0], 0 + 1) + + ROTL64(msg2[0 + 3], 0 + 4)) ^ hash[0 + 7]); + q[1 + 16] = + (SHR(q[1], 1) ^ SHL(q[1], 2) ^ ROTL64(q[1], 13) ^ ROTL64(q[1], 43)) + + (SHR(q[1 + 1], 2) ^ SHL(q[1 + 1], 1) ^ ROTL64(q[1 + 1], 19) ^ ROTL64(q[1 + 1], 53)) + + (SHR(q[1 + 2], 2) ^ SHL(q[1 + 2], 2) ^ ROTL64(q[1 + 2], 28) ^ ROTL64(q[1 + 2], 59)) + + (SHR(q[1 + 3], 1) ^ SHL(q[1 + 3], 3) ^ ROTL64(q[1 + 3], 4) ^ ROTL64(q[1 + 3], 37)) + + (SHR(q[1 + 4], 1) ^ SHL(q[1 + 4], 2) ^ ROTL64(q[1 + 4], 
13) ^ ROTL64(q[1 + 4], 43)) + + (SHR(q[1 + 5], 2) ^ SHL(q[1 + 5], 1) ^ ROTL64(q[1 + 5], 19) ^ ROTL64(q[1 + 5], 53)) + + (SHR(q[1 + 6], 2) ^ SHL(q[1 + 6], 2) ^ ROTL64(q[1 + 6], 28) ^ ROTL64(q[1 + 6], 59)) + + (SHR(q[1 + 7], 1) ^ SHL(q[1 + 7], 3) ^ ROTL64(q[1 + 7], 4) ^ ROTL64(q[1 + 7], 37)) + + (SHR(q[1 + 8], 1) ^ SHL(q[1 + 8], 2) ^ ROTL64(q[1 + 8], 13) ^ ROTL64(q[1 + 8], 43)) + + (SHR(q[1 + 9], 2) ^ SHL(q[1 + 9], 1) ^ ROTL64(q[1 + 9], 19) ^ ROTL64(q[1 + 9], 53)) + + (SHR(q[1 + 10], 2) ^ SHL(q[1 + 10], 2) ^ ROTL64(q[1 + 10], 28) ^ ROTL64(q[1 + 10], 59)) + + (SHR(q[1 + 11], 1) ^ SHL(q[1 + 11], 3) ^ ROTL64(q[1 + 11], 4) ^ ROTL64(q[1 + 11], 37)) + + (SHR(q[1 + 12], 1) ^ SHL(q[1 + 12], 2) ^ ROTL64(q[1 + 12], 13) ^ ROTL64(q[1 + 12], 43)) + + (SHR(q[1 + 13], 2) ^ SHL(q[1 + 13], 1) ^ ROTL64(q[1 + 13], 19) ^ ROTL64(q[1 + 13], 53)) + + (SHR(q[1 + 14], 2) ^ SHL(q[1 + 14], 2) ^ ROTL64(q[1 + 14], 28) ^ ROTL64(q[1 + 14], 59)) + + (SHR(q[1 + 15], 1) ^ SHL(q[1 + 15], 3) ^ ROTL64(q[1 + 15], 4) ^ ROTL64(q[1 + 15], 37)) + + ((precalcf[1] + ROTL64(msg2[1], 1 + 1) + + ROTL64(msg2[1 + 3], 1 + 4)) ^ hash[1 + 7]); + + uint2 pre1 = q[2 + 0] + q[2 + 2] + q[2 + 4] + q[2 + 6] + q[2 + 8] + q[2 + 10] + q[2 + 12]; + uint2 pre2 = q[3 + 0] + q[3 + 2] + q[3 + 4] + q[3 + 6] + q[3 + 8] + q[3 + 10] + q[3 + 12]; + + q[2 + 16] = pre1 + CONST_EXP3(2) + + ((precalcf[2] + ROTL64(msg2[2], 2 + 1) + + ROTL64(msg2[2 + 3], 2 + 4)) ^ hash[2 + 7]); + q[3 + 16] = pre2 + CONST_EXP3(3) + + ((precalcf[3] + ROTL64(msg2[3], 3 + 1) + + ROTL64(msg2[3 + 3], 3 + 4)) ^ hash[3 + 7]); + pre1 = pre1 - q[2 + 0] + q[2 + 14]; + pre2 = pre2 - q[3 + 0] + q[3 + 14]; + + q[4 + 16] = pre1 + CONST_EXP3(4) + + ((precalcf[4] + ROTL64(msg2[4], 4 + 1) + + ROL8(msg2[4 + 3])) ^ hash[4 + 7]); + q[5 + 16] = pre2 + CONST_EXP3(5) + + ((precalcf[5] + ROTL64(msg2[5], 5 + 1)) + ^ hash[5 + 7]); + + pre1 = pre1 - q[4 + 0] + q[4 + 14]; + pre2 = pre2 - q[5 + 0] + q[5 + 14]; + + + q[6 + 16] = pre1 + CONST_EXP3(6) + + ((vectorize((6 + 
16)*(0x0555555555555555ull)) + ROTL64(msg2[6], 6 + 1) - + ROTL64(msg2[6 - 6], (6 - 6) + 1)) ^ hash[6 + 7]); + q[7 + 16] = pre2 + CONST_EXP3(7) + + ((vectorize((7 + 16)*(0x0555555555555555ull)) + ROTL64(msg2[7], 7 + 1) - + ROTL64(msg2[7 - 6], (7 - 6) + 1)) ^ hash[7 + 7]); + + pre1 = pre1 - q[6 + 0] + q[6 + 14]; + pre2 = pre2 - q[7 + 0] + q[7 + 14]; + + q[8 + 16] = pre1 + CONST_EXP3(8) + + ((vectorize((8 + 16)*(0x0555555555555555ull) + 0x10000) - + ROTL64(msg2[8 - 6], (8 - 6) + 1)) ^ hash[8 + 7]); + q[25] = pre2 + CONST_EXP3(9) + + ((vectorize((25)*(0x0555555555555555ull)) - ROTL64(msg2[3], 4)) ^ hash[0]); + + pre1 = pre1 - q[8 + 0] + q[8 + 14]; + pre2 = pre2 - q[9 + 0] + q[9 + 14]; + + q[26] = pre1 + CONST_EXP3(10) + + ((vectorize((26)*(0x0555555555555555ull)) - ROTL64(msg2[4], 5)) ^ hash[1]); + q[27] = pre2 + CONST_EXP3(11) + + ((vectorize((27)*(0x0555555555555555ull)) - ROTL64(msg2[5], 6)) ^ hash[2]); + + pre1 = pre1 - q[10 + 0] + q[10 + 14]; + pre2 = pre2 - q[11 + 0] + q[11 + 14]; + + q[28] = pre1 + CONST_EXP3(12) + + ((vectorize(0x955555555755554C) - ROTL64(msg2[6], 7)) ^ hash[3]); + q[13 + 16] = pre2 + CONST_EXP3(13) + + ((precalcf[6] + + ROTL64(msg2[13 - 13], (13 - 13) + 1) - ROL8(msg2[13 - 6])) ^ hash[13 - 9]); + + pre1 = pre1 - q[12 + 0] + q[12 + 14]; + pre2 = pre2 - q[13 + 0] + q[13 + 14]; + + q[14 + 16] = pre1 + CONST_EXP3(14) + + ((precalcf[7] + + ROTL64(msg2[14 - 13], (14 - 13) + 1)) ^ hash[14 - 9]); + q[15 + 16] = pre2 + CONST_EXP3(15) + + ((precalcf[8] + + ROTL64(msg2[15 - 13], (15 - 13) + 1)) ^ hash[15 - 9]); + + + uint2 XL64 = q[16] ^ q[17] ^ q[18] ^ q[19] ^ q[20] ^ q[21] ^ q[22] ^ q[23]; + uint2 XH64 = XL64^q[24] ^ q[25] ^ q[26] ^ q[27] ^ q[28] ^ q[29] ^ q[30] ^ q[31]; + + h[0] = (SHL(XH64, 5) ^ SHR(q[16], 5) ^ msg2[0]) + (XL64 ^ q[24] ^ q[0]); + h[1] = (SHR(XH64, 7) ^ SHL(q[17], 8) ^ msg2[1]) + (XL64 ^ q[25] ^ q[1]); + h[2] = (SHR(XH64, 5) ^ SHL(q[18], 5) ^ msg2[2]) + (XL64 ^ q[26] ^ q[2]); + h[3] = (SHR(XH64, 1) ^ SHL(q[19], 5) ^ msg2[3]) + (XL64 
^ q[27] ^ q[3]); + h[4] = (SHR(XH64, 3) ^ q[20] ^ msg2[4]) + (XL64 ^ q[28] ^ q[4]); + h[5] = (SHL(XH64, 6) ^ SHR(q[21], 6) ^ msg2[5]) + (XL64 ^ q[29] ^ q[5]); + h[6] = (SHR(XH64, 4) ^ SHL(q[22], 6) ^ msg2[6]) + (XL64 ^ q[30] ^ q[6]); + h[7] = (SHR(XH64, 11) ^ SHL(q[23], 2) ^ msg2[7]) + (XL64 ^ q[31] ^ q[7]); + + h[8] = ROTL64(h[4], 9) + (XH64 ^ q[24] ^ 0x80) + (SHL(XL64, 8) ^ q[23] ^ q[8]); + h[9] = ROTL64(h[5], 10) + (XH64 ^ q[25]) + (SHR(XL64, 6) ^ q[16] ^ q[9]); + h[10] = ROTL64(h[6], 11) + (XH64 ^ q[26]) + (SHL(XL64, 6) ^ q[17] ^ q[10]); + h[11] = ROTL64(h[7], 12) + (XH64 ^ q[27]) + (SHL(XL64, 4) ^ q[18] ^ q[11]); + h[12] = ROTL64(h[0], 13) + (XH64 ^ q[28]) + (SHR(XL64, 3) ^ q[19] ^ q[12]); + h[13] = ROTL64(h[1], 14) + (XH64 ^ q[29]) + (SHR(XL64, 4) ^ q[20] ^ q[13]); + h[14] = ROTL64(h[2], 15) + (XH64 ^ q[30]) + (SHR(XL64, 7) ^ q[21] ^ q[14]); + h[15] = ROL16(h[3]) + (XH64 ^ q[31] ^ (512)) + (SHR(XL64, 2) ^ q[22] ^ q[15]); + + const uint2 cmsg[16] = + { + 0xaaaaaaa0, 0xaaaaaaaa, + 0xaaaaaaa1, 0xaaaaaaaa, + 0xaaaaaaa2, 0xaaaaaaaa, + 0xaaaaaaa3, 0xaaaaaaaa, + 0xaaaaaaa4, 0xaaaaaaaa, + 0xaaaaaaa5, 0xaaaaaaaa, + 0xaaaaaaa6, 0xaaaaaaaa, + 0xaaaaaaa7, 0xaaaaaaaa, + 0xaaaaaaa8, 0xaaaaaaaa, + 0xaaaaaaa9, 0xaaaaaaaa, + 0xaaaaaaaa, 0xaaaaaaaa, + 0xaaaaaaab, 0xaaaaaaaa, + 0xaaaaaaac, 0xaaaaaaaa, + 0xaaaaaaad, 0xaaaaaaaa, + 0xaaaaaaae, 0xaaaaaaaa, + 0xaaaaaaaf, 0xaaaaaaaa + }; #pragma unroll 16 - for(int i=0;i<16;i++) - message[i] = make_uint2(0xaaaaaaa0+i,0xaaaaaaaa); + for(int i = 0; i < 16; i++) + { + msg[i] = devectorize(cmsg[i] ^ h[i]); + } - Compression512(h, message); + const uint2 precalc[16] = + { + { 0x55555550, 0x55555555 }, + { 0xAAAAAAA5, 0x5AAAAAAA }, + { 0xFFFFFFFA, 0x5FFFFFFF }, + { 0x5555554F, 0x65555555 }, + { 0xAAAAAAA4, 0x6AAAAAAA }, + { 0xFFFFFFF9, 0x6FFFFFFF }, + { 0x5555554E, 0x75555555 }, + { 0xAAAAAAA3, 0x7AAAAAAA }, + { 0xFFFFFFF8, 0x7FFFFFFF }, + { 0x5555554D, 0x85555555 }, + { 0xAAAAAAA2, 0x8AAAAAAA }, + { 0xFFFFFFF7, 0x8FFFFFFF }, + { 
0x5555554C, 0x95555555 }, + { 0xAAAAAAA1, 0x9AAAAAAA }, + { 0xFFFFFFF6, 0x9FFFFFFF }, + { 0x5555554B, 0xA5555555 }, + }; - // fertig - uint64_t *outpHash = &g_hash[8 * thread]; + const uint64_t p2 = msg[15] - msg[12]; + const uint64_t p3 = msg[14] - msg[7]; + const uint64_t p4 = msg[6] + msg[9]; + const uint64_t p5 = msg[8] - msg[5]; + const uint64_t p6 = msg[1] - msg[14]; + const uint64_t p7 = msg[8] - msg[1]; + const uint64_t p8 = msg[3] + msg[10]; -#pragma unroll 8 - for(int i=0;i<8;i++) - outpHash[i] = devectorize(message[i+8]); - } + + tmp = vectorize((msg[5]) + (msg[10]) + (msg[13]) + p3); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + cmsg[1]; + tmp = vectorize((msg[6]) - (msg[8]) + (msg[11]) + (msg[14]) - (msg[15])); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + cmsg[2]; + tmp = vectorize((msg[0]) + (msg[7]) + (msg[9]) + p2); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + cmsg[3]; + tmp = vectorize((msg[0]) + p7 - (msg[10]) + (msg[13])); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + cmsg[4]; + tmp = vectorize((msg[2]) + (msg[9]) - (msg[11]) + p6); + q[4] = (SHR(tmp, 1) ^ tmp) + cmsg[5]; + tmp = vectorize(p8 + p2 - (msg[2])); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + cmsg[6]; + tmp = vectorize((msg[4]) - (msg[0]) - (msg[3]) - (msg[11]) + (msg[13])); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + cmsg[7]; + tmp = vectorize(p6 - (msg[4]) - (msg[5]) - (msg[12])); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + cmsg[8]; + tmp = vectorize((msg[2]) - (msg[5]) - (msg[6]) + (msg[13]) - (msg[15])); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + cmsg[9]; + tmp = vectorize((msg[0]) - (msg[3]) + (msg[6]) + p3); + q[9] = (SHR(tmp, 1) ^ tmp) + cmsg[10]; + tmp = vectorize(p7 - (msg[4]) - (msg[7]) + (msg[15])); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ 
ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + cmsg[11]; + tmp = vectorize(p5 - (msg[0]) - (msg[2]) + (msg[9])); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + cmsg[12]; + tmp = vectorize(p8 + msg[1] - p4); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + cmsg[13]; + tmp = vectorize((msg[2]) + (msg[4]) + (msg[7]) + (msg[10]) + (msg[11])); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + cmsg[14]; + tmp = vectorize((msg[3]) + p5 - (msg[11]) - (msg[12])); + q[14] = (SHR(tmp, 1) ^ tmp) + cmsg[15]; + tmp = vectorize((msg[12]) - (msg[4]) - p4 + (msg[13])); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + cmsg[0]; + + q[0 + 16] = + (SHR(q[0], 1) ^ SHL(q[0], 2) ^ ROTL64(q[0], 13) ^ ROTL64(q[0], 43)) + + (SHR(q[0 + 1], 2) ^ SHL(q[0 + 1], 1) ^ ROTL64(q[0 + 1], 19) ^ ROTL64(q[0 + 1], 53)) + + (SHR(q[0 + 2], 2) ^ SHL(q[0 + 2], 2) ^ ROTL64(q[0 + 2], 28) ^ ROTL64(q[0 + 2], 59)) + + (SHR(q[0 + 3], 1) ^ SHL(q[0 + 3], 3) ^ ROTL64(q[0 + 3], 4) ^ ROTL64(q[0 + 3], 37)) + + (SHR(q[0 + 4], 1) ^ SHL(q[0 + 4], 2) ^ ROTL64(q[0 + 4], 13) ^ ROTL64(q[0 + 4], 43)) + + (SHR(q[0 + 5], 2) ^ SHL(q[0 + 5], 1) ^ ROTL64(q[0 + 5], 19) ^ ROTL64(q[0 + 5], 53)) + + (SHR(q[0 + 6], 2) ^ SHL(q[0 + 6], 2) ^ ROTL64(q[0 + 6], 28) ^ ROTL64(q[0 + 6], 59)) + + (SHR(q[0 + 7], 1) ^ SHL(q[0 + 7], 3) ^ ROTL64(q[0 + 7], 4) ^ ROTL64(q[0 + 7], 37)) + + (SHR(q[0 + 8], 1) ^ SHL(q[0 + 8], 2) ^ ROTL64(q[0 + 8], 13) ^ ROTL64(q[0 + 8], 43)) + + (SHR(q[0 + 9], 2) ^ SHL(q[0 + 9], 1) ^ ROTL64(q[0 + 9], 19) ^ ROTL64(q[0 + 9], 53)) + + (SHR(q[0 + 10], 2) ^ SHL(q[0 + 10], 2) ^ ROTL64(q[0 + 10], 28) ^ ROTL64(q[0 + 10], 59)) + + (SHR(q[0 + 11], 1) ^ SHL(q[0 + 11], 3) ^ ROTL64(q[0 + 11], 4) ^ ROTL64(q[0 + 11], 37)) + + (SHR(q[0 + 12], 1) ^ SHL(q[0 + 12], 2) ^ ROTL64(q[0 + 12], 13) ^ ROTL64(q[0 + 12], 43)) + + (SHR(q[0 + 13], 2) ^ SHL(q[0 + 13], 1) ^ ROTL64(q[0 + 13], 19) ^ ROTL64(q[0 + 13], 53)) + + (SHR(q[0 + 14], 2) ^ SHL(q[0 + 14], 2) 
^ ROTL64(q[0 + 14], 28) ^ ROTL64(q[0 + 14], 59)) + + (SHR(q[0 + 15], 1) ^ SHL(q[0 + 15], 3) ^ ROTL64(q[0 + 15], 4) ^ ROTL64(q[0 + 15], 37)) + + ((precalc[0] + ROTL64(h[0], 0 + 1) + + ROTL64(h[0 + 3], 0 + 4) - ROTL64(h[0 + 10], 0 + 11)) ^ cmsg[0 + 7]); + q[1 + 16] = + (SHR(q[1], 1) ^ SHL(q[1], 2) ^ ROTL64(q[1], 13) ^ ROTL64(q[1], 43)) + + (SHR(q[1 + 1], 2) ^ SHL(q[1 + 1], 1) ^ ROTL64(q[1 + 1], 19) ^ ROTL64(q[1 + 1], 53)) + + (SHR(q[1 + 2], 2) ^ SHL(q[1 + 2], 2) ^ ROTL64(q[1 + 2], 28) ^ ROTL64(q[1 + 2], 59)) + + (SHR(q[1 + 3], 1) ^ SHL(q[1 + 3], 3) ^ ROTL64(q[1 + 3], 4) ^ ROTL64(q[1 + 3], 37)) + + (SHR(q[1 + 4], 1) ^ SHL(q[1 + 4], 2) ^ ROTL64(q[1 + 4], 13) ^ ROTL64(q[1 + 4], 43)) + + (SHR(q[1 + 5], 2) ^ SHL(q[1 + 5], 1) ^ ROTL64(q[1 + 5], 19) ^ ROTL64(q[1 + 5], 53)) + + (SHR(q[1 + 6], 2) ^ SHL(q[1 + 6], 2) ^ ROTL64(q[1 + 6], 28) ^ ROTL64(q[1 + 6], 59)) + + (SHR(q[1 + 7], 1) ^ SHL(q[1 + 7], 3) ^ ROTL64(q[1 + 7], 4) ^ ROTL64(q[1 + 7], 37)) + + (SHR(q[1 + 8], 1) ^ SHL(q[1 + 8], 2) ^ ROTL64(q[1 + 8], 13) ^ ROTL64(q[1 + 8], 43)) + + (SHR(q[1 + 9], 2) ^ SHL(q[1 + 9], 1) ^ ROTL64(q[1 + 9], 19) ^ ROTL64(q[1 + 9], 53)) + + (SHR(q[1 + 10], 2) ^ SHL(q[1 + 10], 2) ^ ROTL64(q[1 + 10], 28) ^ ROTL64(q[1 + 10], 59)) + + (SHR(q[1 + 11], 1) ^ SHL(q[1 + 11], 3) ^ ROTL64(q[1 + 11], 4) ^ ROTL64(q[1 + 11], 37)) + + (SHR(q[1 + 12], 1) ^ SHL(q[1 + 12], 2) ^ ROTL64(q[1 + 12], 13) ^ ROTL64(q[1 + 12], 43)) + + (SHR(q[1 + 13], 2) ^ SHL(q[1 + 13], 1) ^ ROTL64(q[1 + 13], 19) ^ ROTL64(q[1 + 13], 53)) + + (SHR(q[1 + 14], 2) ^ SHL(q[1 + 14], 2) ^ ROTL64(q[1 + 14], 28) ^ ROTL64(q[1 + 14], 59)) + + (SHR(q[1 + 15], 1) ^ SHL(q[1 + 15], 3) ^ ROTL64(q[1 + 15], 4) ^ ROTL64(q[1 + 15], 37)) + + ((precalc[1] + ROTL64(h[1], 1 + 1) + + ROTL64(h[1 + 3], 1 + 4) - ROTL64(h[1 + 10], 1 + 11)) ^ cmsg[1 + 7]); + + pre1 = q[2] + q[4] + q[6] + q[8] + q[10] + q[12] + q[14]; + pre2 = q[3] + q[5] + q[7] + q[9] + q[11] + q[13] + q[15]; + + q[2 + 16] = pre1 + CONST_EXP3(2) + + ((precalc[2] + ROTL64(h[2], 2 + 1) + + 
ROTL64(h[2 + 3], 2 + 4) - ROTL64(h[2 + 10], 2 + 11)) ^ cmsg[2 + 7]); + q[3 + 16] = pre2 + CONST_EXP3(3) + + ((precalc[3] + ROTL64(h[3], 3 + 1) + + ROTL64(h[3 + 3], 3 + 4) - ROTL64(h[3 + 10], 3 + 11)) ^ cmsg[3 + 7]); + + pre1 = pre1 - q[2 + 0] + q[2 + 14]; + pre2 = pre2 - q[3 + 0] + q[3 + 14]; + + q[4 + 16] = pre1 + CONST_EXP3(4) + + ((precalc[4] + ROTL64(h[4], 4 + 1) + + ROL8(h[4 + 3]) - ROTL64(h[4 + 10], 4 + 11)) ^ cmsg[4 + 7]); + q[5 + 16] = pre2 + CONST_EXP3(5) + + ((precalc[5] + ROTL64(h[5], 5 + 1) + + ROTL64(h[5 + 3], 5 + 4) - ROL16(h[5 + 10])) ^ cmsg[5 + 7]); + + pre1 = pre1 - q[4 + 0] + q[4 + 14]; + pre2 = pre2 - q[5 + 0] + q[5 + 14]; + + q[6 + 16] = pre1 + CONST_EXP3(6) + + ((precalc[6] + ROTL64(h[6], 6 + 1) + + ROTL64(h[6 + 3], 6 + 4) - ROTL64(h[6 - 6], (6 - 6) + 1)) ^ cmsg[6 + 7]); + q[7 + 16] = pre2 + CONST_EXP3(7) + + ((precalc[7] + ROL8(h[7]) + + ROTL64(h[7 + 3], 7 + 4) - ROTL64(h[7 - 6], (7 - 6) + 1)) ^ cmsg[7 + 7]); + + pre1 = pre1 - q[6 + 0] + q[6 + 14]; + pre2 = pre2 - q[7 + 0] + q[7 + 14]; + + q[8 + 16] = pre1 + CONST_EXP3(8) + + ((precalc[8] + ROTL64(h[8], 8 + 1) + + ROTL64(h[8 + 3], 8 + 4) - ROTL64(h[8 - 6], (8 - 6) + 1)) ^ cmsg[8 + 7]); + q[9 + 16] = pre2 + CONST_EXP3(9) + + ((precalc[9] + ROTL64(h[9], 9 + 1) + + ROTL64(h[9 + 3], 9 + 4) - ROTL64(h[9 - 6], (9 - 6) + 1)) ^ cmsg[9 - 9]); + + pre1 = pre1 - q[8 + 0] + q[8 + 14]; + pre2 = pre2 - q[9 + 0] + q[9 + 14]; + + q[10 + 16] = pre1 + CONST_EXP3(10) + + ((precalc[10] + ROTL64(h[10], 10 + 1) + + ROTL64(h[10 + 3], 10 + 4) - ROTL64(h[10 - 6], (10 - 6) + 1)) ^ cmsg[10 - 9]); + q[11 + 16] = pre2 + CONST_EXP3(11) + + ((precalc[11] + ROTL64(h[11], 11 + 1) + + ROTL64(h[11 + 3], 11 + 4) - ROTL64(h[11 - 6], (11 - 6) + 1)) ^ cmsg[11 - 9]); + + pre1 = pre1 - q[10 + 0] + q[10 + 14]; + pre2 = pre2 - q[11 + 0] + q[11 + 14]; + + q[12 + 16] = pre1 + CONST_EXP3(12) + + ((precalc[12] + ROTL64(h[12], 12 + 1) + + ROL16(h[12 + 3]) - ROTL64(h[12 - 6], (12 - 6) + 1)) ^ cmsg[12 - 9]); + q[13 + 16] = pre2 + 
CONST_EXP3(13) + + ((precalc[13] + ROTL64(h[13], 13 + 1) + + ROTL64(h[13 - 13], (13 - 13) + 1) - ROL8(h[13 - 6])) ^ cmsg[13 - 9]); + + pre1 = pre1 - q[12 + 0] + q[12 + 14]; + pre2 = pre2 - q[13 + 0] + q[13 + 14]; + + q[14 + 16] = pre1 + CONST_EXP3(14) + + ((precalc[14] + ROTL64(h[14], 14 + 1) + + ROTL64(h[14 - 13], (14 - 13) + 1) - ROTL64(h[14 - 6], (14 - 6) + 1)) ^ cmsg[14 - 9]); + q[15 + 16] = pre2 + CONST_EXP3(15) + + ((precalc[15] + ROL16(h[15]) + + ROTL64(h[15 - 13], (15 - 13) + 1) - ROTL64(h[15 - 6], (15 - 6) + 1)) ^ cmsg[15 - 9]); + + XL64 = q[16] ^ q[17] ^ q[18] ^ q[19] ^ q[20] ^ q[21] ^ q[22] ^ q[23]; + XH64 = XL64^q[24] ^ q[25] ^ q[26] ^ q[27] ^ q[28] ^ q[29] ^ q[30] ^ q[31]; + + msg2[4] = (SHR(XH64, 3) ^ q[20] ^ h[4]) + (XL64 ^ q[28] ^ q[4]); + msg2[8] = ROTL64(msg2[4], 9) + (XH64 ^ q[24] ^ h[8]) + (SHL(XL64, 8) ^ q[23] ^ q[8]); + + inpHash[0] = devectorize(msg2[8]); + + if(((msg2[8].x) & 0x8)) return; + { + + msg2[0] = (SHL(XH64, 5) ^ SHR(q[16], 5) ^ h[0]) + (XL64 ^ q[24] ^ q[0]); + msg2[1] = (SHR(XH64, 7) ^ SHL(q[17], 8) ^ h[1]) + (XL64 ^ q[25] ^ q[1]); + msg2[2] = (SHR(XH64, 5) ^ SHL(q[18], 5) ^ h[2]) + (XL64 ^ q[26] ^ q[2]); + msg2[3] = (SHR(XH64, 1) ^ SHL(q[19], 5) ^ h[3]) + (XL64 ^ q[27] ^ q[3]); + msg2[5] = (SHL(XH64, 6) ^ SHR(q[21], 6) ^ h[5]) + (XL64 ^ q[29] ^ q[5]); + msg2[6] = (SHR(XH64, 4) ^ SHL(q[22], 6) ^ h[6]) + (XL64 ^ q[30] ^ q[6]); + msg2[7] = (SHR(XH64, 11) ^ SHL(q[23], 2) ^ h[7]) + (XL64 ^ q[31] ^ q[7]); + + msg2[9] = ROTL64(msg2[5], 10) + (XH64 ^ q[25] ^ h[9]) + (SHR(XL64, 6) ^ q[16] ^ q[9]); + msg2[10] = ROTL64(msg2[6], 11) + (XH64 ^ q[26] ^ h[10]) + (SHL(XL64, 6) ^ q[17] ^ q[10]); + msg2[11] = ROTL64(msg2[7], 12) + (XH64 ^ q[27] ^ h[11]) + (SHL(XL64, 4) ^ q[18] ^ q[11]); + msg2[12] = ROTL64(msg2[0], 13) + (XH64 ^ q[28] ^ h[12]) + (SHR(XL64, 3) ^ q[19] ^ q[12]); + msg2[13] = ROTL64(msg2[1], 14) + (XH64 ^ q[29] ^ h[13]) + (SHR(XL64, 4) ^ q[20] ^ q[13]); + msg2[14] = ROTL64(msg2[2], 15) + (XH64 ^ q[30] ^ h[14]) + (SHR(XL64, 7) ^ q[21] 
^ q[14]); + msg2[15] = ROL16(msg2[3]) + (XH64 ^ q[31] ^ h[15]) + (SHR(XL64, 2) ^ q[22] ^ q[15]); + + uint28 *phash2 = (uint28*)inpHash; + phash2[0] = make_uint28(msg2[8], msg2[9], msg2[10], msg2[11]); + phash2[1] = make_uint28(msg2[12], msg2[13], msg2[14], msg2[15]); + + } + } } // Setup-Funktionen @@ -440,44 +1170,25 @@ __host__ void quark_bmw512_cpu_init(int thr_id, uint32_t threads) { } -// Bmw512 für 80 Byte grosse Eingangsdaten -__host__ void quark_bmw512_cpu_setBlock_80(void *pdata) -{ - // Message mit Padding bereitstellen - // lediglich die korrekte Nonce ist noch ab Byte 76 einzusetzen. - unsigned char PaddedMessage[128]; - memcpy(PaddedMessage, pdata, 80); - memset(PaddedMessage+80, 0, 48); - uint64_t *message = (uint64_t*)PaddedMessage; - // Padding einfügen (Byteorder?!?) - message[10] = SPH_C64(0x80); - // Länge (in Bits, d.h. 80 Byte * 8 = 640 Bits - message[15] = SPH_C64(640); - - // die Message zur Berechnung auf der GPU - cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); -} -__host__ void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash) { - const uint32_t threadsperblock = 32; + const uint32_t threadsperblock = 32; // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - quark_bmw512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); -// MyStreamSynchronize(NULL, order, thr_id); + quark_bmw512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + CUDA_SAFE_CALL(cudaGetLastError()); } - -__host__ void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order) +__host__ void quark_bmw512_cpu_hash_64_quark(int thr_id, 
uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash) { - const uint32_t threadsperblock = 128; + const uint32_t threadsperblock = 32; - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); - quark_bmw512_gpu_hash_80<<>>(threads, startNounce, (uint64_t*)d_hash); + quark_bmw512_gpu_hash_64_quark << > >(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); } - diff --git a/quark/cuda_jh512.cu b/quark/cuda_jh512.cu index 7809443d02..f0c26f2dc8 100644 --- a/quark/cuda_jh512.cu +++ b/quark/cuda_jh512.cu @@ -1,69 +1,124 @@ #include "cuda_helper.h" +#include "cuda_vector.h" +__constant__ static __align__(16) uint32_t c_E8_bslice32[42][8] = { + // Round 0 (Function0) + { 0xa2ded572, 0x90d6ab81, 0x67f815df, 0xf6875a4d, 0x0a15847b, 0xc54f9f4e, 0x571523b7, 0x402bd1c3 }, + { 0xe03a98ea, 0xb4960266, 0x9cfa455c, 0x8a53bbf2, 0x99d2c503, 0x1a1456b5, 0x9a99b266, 0x31a2db88 }, // 1 + { 0x5c5aa303, 0x8019051c, 0xdb0e199a, 0x1d959e84, 0x0ab23f40, 0xadeb336f, 0x1044c187, 0xdccde75e }, // 2 + { 0x9213ba10, 0x39812c0a, 0x416bbf02, 0x5078aa37, 0x156578dc, 0xd2bf1a3f, 0xd027bbf7, 0xd3910041 }, // 3 + { 0x0d5a2d42, 0x0ba75c18, 0x907eccf6, 0xac442bc7, 0x9c9f62dd, 0xd665dfd1, 0xce97c092, 0x23fcc663 }, // 4 + { 0x036c6e97, 0xbb03f1ee, 0x1ab8e09e, 0xfa618e5d, 0x7e450521, 0xb29796fd, 0xa8ec6c44, 0x97818394 }, // 5 + { 0x37858e4a, 0x8173fe8a, 0x2f3003db, 0x6c69b8f8, 0x2d8d672a, 0x4672c78a, 0x956a9ffb, 0x14427fc0 }, // 6 + // Round 7 (Function0) + { 0x8f15f4c5, 0xb775de52, 0xc45ec7bd, 0xbc88e4ae, 0xa76f4475, 0x1e00b882, 0x80bb118f, 0xf4a3a698 }, + { 0x338ff48e, 0x20edf1b6, 0x1563a3a9, 0xfde05a7c, 0x24565faa, 0x5ae9ca36, 0x89f9b7d5, 0x362c4206 }, + { 0x433529ce, 0x591ff5d0, 0x3d98fe4e, 0x86814e6f, 0x74f93a53, 0x81ad9d0e, 0xa74b9a73, 
0x9f5ad8af }, + { 0x670605a7, 0x26077447, 0x6a6234ee, 0x3f1080c6, 0xbe280b8b, 0x6f7ea0e0, 0x2717b96e, 0x7b487ec6 }, + { 0xa50a550d, 0x81727686, 0xc0a4f84a, 0xd48d6050, 0x9fe7e391, 0x415a9e7e, 0x9ef18e97, 0x62b0e5f3 }, + { 0xec1f9ffc, 0xf594d74f, 0x7a205440, 0xd895fa9d, 0x001ae4e3, 0x117e2e55, 0x84c9f4ce, 0xa554c324 }, + { 0x2872df5b, 0xef7c8905, 0x286efebd, 0x2ed349ee, 0xe27ff578, 0x85937e44, 0xb2c4a50f, 0x7f5928eb }, + // Round 14 (Function0) + { 0x37695f70, 0x04771bc7, 0x4a3124b3, 0xe720b951, 0xf128865e, 0xe843fe74, 0x65e4d61d, 0x8a87d423 }, + { 0xa3e8297d, 0xfb301b1d, 0xf2947692, 0xe01bdc5b, 0x097acbdd, 0x4f4924da, 0xc1d9309b, 0xbf829cf2 }, + { 0x31bae7a4, 0x32fcae3b, 0xffbf70b4, 0x39d3bb53, 0x0544320d, 0xc1c39f45, 0x48bcf8de, 0xa08b29e0 }, + { 0xfd05c9e5, 0x01b771a2, 0x0f09aef7, 0x95ed44e3, 0x12347094, 0x368e3be9, 0x34f19042, 0x4a982f4f }, + { 0x631d4088, 0xf14abb7e, 0x15f66ca0, 0x30c60ae2, 0x4b44c147, 0xc5b67046, 0xffaf5287, 0xe68c6ecc }, + { 0x56a4d5a4, 0x45ce5773, 0x00ca4fbd, 0xadd16430, 0x4b849dda, 0x68cea6e8, 0xae183ec8, 0x67255c14 }, + { 0xf28cdaa3, 0x20b2601f, 0x16e10ecb, 0x7b846fc2, 0x5806e933, 0x7facced1, 0x9a99949a, 0x1885d1a0 }, + // Round 21 (Function0) + { 0xa15b5932, 0x67633d9f, 0xd319dd8d, 0xba6b04e4, 0xc01c9a50, 0xab19caf6, 0x46b4a5aa, 0x7eee560b }, + { 0xea79b11f, 0x5aac571d, 0x742128a9, 0x76d35075, 0x35f7bde9, 0xfec2463a, 0xee51363b, 0x01707da3 }, + { 0xafc135f7, 0x15638341, 0x42d8a498, 0xa8db3aea, 0x20eced78, 0x4d3bc3fa, 0x79676b9e, 0x832c8332 }, + { 0x1f3b40a7, 0x6c4e3ee7, 0xf347271c, 0xfd4f21d2, 0x34f04059, 0x398dfdb8, 0x9a762db7, 0xef5957dc }, + { 0x490c9b8d, 0xd0ae3b7d, 0xdaeb492b, 0x84558d7a, 0x49d7a25b, 0xf0e9a5f5, 0x0d70f368, 0x658ef8e4 }, + { 0xf4a2b8a0, 0x92946891, 0x533b1036, 0x4f88e856, 0x9e07a80c, 0x555cb05b, 0x5aec3e75, 0x4cbcbaf8 }, + { 0x993bbbe3, 0x28acae64, 0x7b9487f3, 0x6db334dc, 0xd6f4da75, 0x50a5346c, 0x5d1c6b72, 0x71db28b8 }, + // Round 28 (Function0) + { 0xf2e261f8, 0xf1bcac1c, 0x2a518d10, 0xa23fce43, 0x3364dbe3, 
0x3cd1bb67, 0xfc75dd59, 0xb043e802 }, + { 0xca5b0a33, 0xc3943b92, 0x75a12988, 0x1e4d790e, 0x4d19347f, 0xd7757479, 0x5c5316b4, 0x3fafeeb6 }, + { 0xf7d4a8ea, 0x5324a326, 0x21391abe, 0xd23c32ba, 0x097ef45c, 0x4a17a344, 0x5127234c, 0xadd5a66d }, + { 0xa63e1db5, 0xa17cf84c, 0x08c9f2af, 0x4d608672, 0x983d5983, 0xcc3ee246, 0x563c6b91, 0xf6c76e08 }, + { 0xb333982f, 0xe8b6f406, 0x5e76bcb1, 0x36d4c1be, 0xa566d62b, 0x1582ee74, 0x2ae6c4ef, 0x6321efbc }, + { 0x0d4ec1fd, 0x1614c17e, 0x69c953f4, 0x16fae006, 0xc45a7da7, 0x3daf907e, 0x26585806, 0x3f9d6328 }, + { 0xe3f2c9d2, 0x16512a74, 0x0cd29b00, 0x9832e0f2, 0x30ceaa5f, 0xd830eb0d, 0x300cd4b7, 0x9af8cee3 }, + // Round 35 (Function0) + { 0x7b9ec54b, 0x574d239b, 0x9279f1b5, 0x316796e6, 0x6ee651ff, 0xf3a6e6cc, 0xd3688604, 0x05750a17 }, + { 0xd98176b1, 0xb3cb2bf4, 0xce6c3213, 0x47154778, 0x8452173c, 0x825446ff, 0x62a205f8, 0x486a9323 }, + { 0x0758df38, 0x442e7031, 0x65655e4e, 0x86ca0bd0, 0x897cfcf2, 0xa20940f0, 0x8e5086fc, 0x4e477830 }, + { 0x39eea065, 0x26b29721, 0x8338f7d1, 0x6ff81301, 0x37e95ef7, 0xd1ed44a3, 0xbd3a2ce4, 0xe7de9fef }, + { 0x15dfa08b, 0x7ceca7d8, 0xd9922576, 0x7eb027ab, 0xf6f7853c, 0xda7d8d53, 0xbe42dc12, 0xdea83eaa }, + { 0x93ce25aa, 0xdaef5fc0, 0xd86902bd, 0xa5194a17, 0xfd43f65a, 0x33664d97, 0xf908731a, 0x6a21fd4c }, + { 0x3198b435, 0xa163d09a, 0x701541db, 0x72409751, 0xbb0f1eea, 0xbf9d75f6, 0x9b54cded, 0xe26f4791 } + // 42 rounds... 
+}; + +static uint32_t *d_found[MAX_GPUS]; + +#ifndef NOASM +__device__ __forceinline__ +static void SWAP4(uint32_t *x) { +#pragma nounroll + // y is used as tmp register too + for (uint32_t y = 0; y<4; y++, ++x) { + asm("and.b32 %1, %0, 0xF0F0F0F0;" + "xor.b32 %0, %0, %1;" + "shr.b32 %1, %1, 4;" + "vshl.u32.u32.u32.clamp.add %0, %0, 4, %1;\n\t" + : "+r"(*x) : "r"(y)); + } +} -__constant__ unsigned char c_E8_bitslice_roundconstant[42][32] = { - { 0x72, 0xd5, 0xde, 0xa2, 0xdf, 0x15, 0xf8, 0x67, 0x7b, 0x84, 0x15, 0xa, 0xb7, 0x23, 0x15, 0x57, 0x81, 0xab, 0xd6, 0x90, 0x4d, 0x5a, 0x87, 0xf6, 0x4e, 0x9f, 0x4f, 0xc5, 0xc3, 0xd1, 0x2b, 0x40 }, - { 0xea, 0x98, 0x3a, 0xe0, 0x5c, 0x45, 0xfa, 0x9c, 0x3, 0xc5, 0xd2, 0x99, 0x66, 0xb2, 0x99, 0x9a, 0x66, 0x2, 0x96, 0xb4, 0xf2, 0xbb, 0x53, 0x8a, 0xb5, 0x56, 0x14, 0x1a, 0x88, 0xdb, 0xa2, 0x31 }, - { 0x3, 0xa3, 0x5a, 0x5c, 0x9a, 0x19, 0xe, 0xdb, 0x40, 0x3f, 0xb2, 0xa, 0x87, 0xc1, 0x44, 0x10, 0x1c, 0x5, 0x19, 0x80, 0x84, 0x9e, 0x95, 0x1d, 0x6f, 0x33, 0xeb, 0xad, 0x5e, 0xe7, 0xcd, 0xdc }, - { 0x10, 0xba, 0x13, 0x92, 0x2, 0xbf, 0x6b, 0x41, 0xdc, 0x78, 0x65, 0x15, 0xf7, 0xbb, 0x27, 0xd0, 0xa, 0x2c, 0x81, 0x39, 0x37, 0xaa, 0x78, 0x50, 0x3f, 0x1a, 0xbf, 0xd2, 0x41, 0x0, 0x91, 0xd3 }, - { 0x42, 0x2d, 0x5a, 0xd, 0xf6, 0xcc, 0x7e, 0x90, 0xdd, 0x62, 0x9f, 0x9c, 0x92, 0xc0, 0x97, 0xce, 0x18, 0x5c, 0xa7, 0xb, 0xc7, 0x2b, 0x44, 0xac, 0xd1, 0xdf, 0x65, 0xd6, 0x63, 0xc6, 0xfc, 0x23 }, - { 0x97, 0x6e, 0x6c, 0x3, 0x9e, 0xe0, 0xb8, 0x1a, 0x21, 0x5, 0x45, 0x7e, 0x44, 0x6c, 0xec, 0xa8, 0xee, 0xf1, 0x3, 0xbb, 0x5d, 0x8e, 0x61, 0xfa, 0xfd, 0x96, 0x97, 0xb2, 0x94, 0x83, 0x81, 0x97 }, - { 0x4a, 0x8e, 0x85, 0x37, 0xdb, 0x3, 0x30, 0x2f, 0x2a, 0x67, 0x8d, 0x2d, 0xfb, 0x9f, 0x6a, 0x95, 0x8a, 0xfe, 0x73, 0x81, 0xf8, 0xb8, 0x69, 0x6c, 0x8a, 0xc7, 0x72, 0x46, 0xc0, 0x7f, 0x42, 0x14 }, - { 0xc5, 0xf4, 0x15, 0x8f, 0xbd, 0xc7, 0x5e, 0xc4, 0x75, 0x44, 0x6f, 0xa7, 0x8f, 0x11, 0xbb, 0x80, 0x52, 0xde, 0x75, 0xb7, 0xae, 0xe4, 0x88, 0xbc, 0x82, 0xb8, 0x0, 0x1e, 0x98, 0xa6, 
0xa3, 0xf4 }, - { 0x8e, 0xf4, 0x8f, 0x33, 0xa9, 0xa3, 0x63, 0x15, 0xaa, 0x5f, 0x56, 0x24, 0xd5, 0xb7, 0xf9, 0x89, 0xb6, 0xf1, 0xed, 0x20, 0x7c, 0x5a, 0xe0, 0xfd, 0x36, 0xca, 0xe9, 0x5a, 0x6, 0x42, 0x2c, 0x36 }, - { 0xce, 0x29, 0x35, 0x43, 0x4e, 0xfe, 0x98, 0x3d, 0x53, 0x3a, 0xf9, 0x74, 0x73, 0x9a, 0x4b, 0xa7, 0xd0, 0xf5, 0x1f, 0x59, 0x6f, 0x4e, 0x81, 0x86, 0xe, 0x9d, 0xad, 0x81, 0xaf, 0xd8, 0x5a, 0x9f }, - { 0xa7, 0x5, 0x6, 0x67, 0xee, 0x34, 0x62, 0x6a, 0x8b, 0xb, 0x28, 0xbe, 0x6e, 0xb9, 0x17, 0x27, 0x47, 0x74, 0x7, 0x26, 0xc6, 0x80, 0x10, 0x3f, 0xe0, 0xa0, 0x7e, 0x6f, 0xc6, 0x7e, 0x48, 0x7b }, - { 0xd, 0x55, 0xa, 0xa5, 0x4a, 0xf8, 0xa4, 0xc0, 0x91, 0xe3, 0xe7, 0x9f, 0x97, 0x8e, 0xf1, 0x9e, 0x86, 0x76, 0x72, 0x81, 0x50, 0x60, 0x8d, 0xd4, 0x7e, 0x9e, 0x5a, 0x41, 0xf3, 0xe5, 0xb0, 0x62 }, - { 0xfc, 0x9f, 0x1f, 0xec, 0x40, 0x54, 0x20, 0x7a, 0xe3, 0xe4, 0x1a, 0x0, 0xce, 0xf4, 0xc9, 0x84, 0x4f, 0xd7, 0x94, 0xf5, 0x9d, 0xfa, 0x95, 0xd8, 0x55, 0x2e, 0x7e, 0x11, 0x24, 0xc3, 0x54, 0xa5 }, - { 0x5b, 0xdf, 0x72, 0x28, 0xbd, 0xfe, 0x6e, 0x28, 0x78, 0xf5, 0x7f, 0xe2, 0xf, 0xa5, 0xc4, 0xb2, 0x5, 0x89, 0x7c, 0xef, 0xee, 0x49, 0xd3, 0x2e, 0x44, 0x7e, 0x93, 0x85, 0xeb, 0x28, 0x59, 0x7f }, - { 0x70, 0x5f, 0x69, 0x37, 0xb3, 0x24, 0x31, 0x4a, 0x5e, 0x86, 0x28, 0xf1, 0x1d, 0xd6, 0xe4, 0x65, 0xc7, 0x1b, 0x77, 0x4, 0x51, 0xb9, 0x20, 0xe7, 0x74, 0xfe, 0x43, 0xe8, 0x23, 0xd4, 0x87, 0x8a }, - { 0x7d, 0x29, 0xe8, 0xa3, 0x92, 0x76, 0x94, 0xf2, 0xdd, 0xcb, 0x7a, 0x9, 0x9b, 0x30, 0xd9, 0xc1, 0x1d, 0x1b, 0x30, 0xfb, 0x5b, 0xdc, 0x1b, 0xe0, 0xda, 0x24, 0x49, 0x4f, 0xf2, 0x9c, 0x82, 0xbf }, - { 0xa4, 0xe7, 0xba, 0x31, 0xb4, 0x70, 0xbf, 0xff, 0xd, 0x32, 0x44, 0x5, 0xde, 0xf8, 0xbc, 0x48, 0x3b, 0xae, 0xfc, 0x32, 0x53, 0xbb, 0xd3, 0x39, 0x45, 0x9f, 0xc3, 0xc1, 0xe0, 0x29, 0x8b, 0xa0 }, - { 0xe5, 0xc9, 0x5, 0xfd, 0xf7, 0xae, 0x9, 0xf, 0x94, 0x70, 0x34, 0x12, 0x42, 0x90, 0xf1, 0x34, 0xa2, 0x71, 0xb7, 0x1, 0xe3, 0x44, 0xed, 0x95, 0xe9, 0x3b, 0x8e, 0x36, 0x4f, 0x2f, 0x98, 0x4a }, - { 0x88, 0x40, 0x1d, 
0x63, 0xa0, 0x6c, 0xf6, 0x15, 0x47, 0xc1, 0x44, 0x4b, 0x87, 0x52, 0xaf, 0xff, 0x7e, 0xbb, 0x4a, 0xf1, 0xe2, 0xa, 0xc6, 0x30, 0x46, 0x70, 0xb6, 0xc5, 0xcc, 0x6e, 0x8c, 0xe6 }, - { 0xa4, 0xd5, 0xa4, 0x56, 0xbd, 0x4f, 0xca, 0x0, 0xda, 0x9d, 0x84, 0x4b, 0xc8, 0x3e, 0x18, 0xae, 0x73, 0x57, 0xce, 0x45, 0x30, 0x64, 0xd1, 0xad, 0xe8, 0xa6, 0xce, 0x68, 0x14, 0x5c, 0x25, 0x67 }, - { 0xa3, 0xda, 0x8c, 0xf2, 0xcb, 0xe, 0xe1, 0x16, 0x33, 0xe9, 0x6, 0x58, 0x9a, 0x94, 0x99, 0x9a, 0x1f, 0x60, 0xb2, 0x20, 0xc2, 0x6f, 0x84, 0x7b, 0xd1, 0xce, 0xac, 0x7f, 0xa0, 0xd1, 0x85, 0x18 }, - { 0x32, 0x59, 0x5b, 0xa1, 0x8d, 0xdd, 0x19, 0xd3, 0x50, 0x9a, 0x1c, 0xc0, 0xaa, 0xa5, 0xb4, 0x46, 0x9f, 0x3d, 0x63, 0x67, 0xe4, 0x4, 0x6b, 0xba, 0xf6, 0xca, 0x19, 0xab, 0xb, 0x56, 0xee, 0x7e }, - { 0x1f, 0xb1, 0x79, 0xea, 0xa9, 0x28, 0x21, 0x74, 0xe9, 0xbd, 0xf7, 0x35, 0x3b, 0x36, 0x51, 0xee, 0x1d, 0x57, 0xac, 0x5a, 0x75, 0x50, 0xd3, 0x76, 0x3a, 0x46, 0xc2, 0xfe, 0xa3, 0x7d, 0x70, 0x1 }, - { 0xf7, 0x35, 0xc1, 0xaf, 0x98, 0xa4, 0xd8, 0x42, 0x78, 0xed, 0xec, 0x20, 0x9e, 0x6b, 0x67, 0x79, 0x41, 0x83, 0x63, 0x15, 0xea, 0x3a, 0xdb, 0xa8, 0xfa, 0xc3, 0x3b, 0x4d, 0x32, 0x83, 0x2c, 0x83 }, - { 0xa7, 0x40, 0x3b, 0x1f, 0x1c, 0x27, 0x47, 0xf3, 0x59, 0x40, 0xf0, 0x34, 0xb7, 0x2d, 0x76, 0x9a, 0xe7, 0x3e, 0x4e, 0x6c, 0xd2, 0x21, 0x4f, 0xfd, 0xb8, 0xfd, 0x8d, 0x39, 0xdc, 0x57, 0x59, 0xef }, - { 0x8d, 0x9b, 0xc, 0x49, 0x2b, 0x49, 0xeb, 0xda, 0x5b, 0xa2, 0xd7, 0x49, 0x68, 0xf3, 0x70, 0xd, 0x7d, 0x3b, 0xae, 0xd0, 0x7a, 0x8d, 0x55, 0x84, 0xf5, 0xa5, 0xe9, 0xf0, 0xe4, 0xf8, 0x8e, 0x65 }, - { 0xa0, 0xb8, 0xa2, 0xf4, 0x36, 0x10, 0x3b, 0x53, 0xc, 0xa8, 0x7, 0x9e, 0x75, 0x3e, 0xec, 0x5a, 0x91, 0x68, 0x94, 0x92, 0x56, 0xe8, 0x88, 0x4f, 0x5b, 0xb0, 0x5c, 0x55, 0xf8, 0xba, 0xbc, 0x4c }, - { 0xe3, 0xbb, 0x3b, 0x99, 0xf3, 0x87, 0x94, 0x7b, 0x75, 0xda, 0xf4, 0xd6, 0x72, 0x6b, 0x1c, 0x5d, 0x64, 0xae, 0xac, 0x28, 0xdc, 0x34, 0xb3, 0x6d, 0x6c, 0x34, 0xa5, 0x50, 0xb8, 0x28, 0xdb, 0x71 }, - { 0xf8, 0x61, 0xe2, 0xf2, 0x10, 0x8d, 0x51, 0x2a, 
0xe3, 0xdb, 0x64, 0x33, 0x59, 0xdd, 0x75, 0xfc, 0x1c, 0xac, 0xbc, 0xf1, 0x43, 0xce, 0x3f, 0xa2, 0x67, 0xbb, 0xd1, 0x3c, 0x2, 0xe8, 0x43, 0xb0 }, - { 0x33, 0xa, 0x5b, 0xca, 0x88, 0x29, 0xa1, 0x75, 0x7f, 0x34, 0x19, 0x4d, 0xb4, 0x16, 0x53, 0x5c, 0x92, 0x3b, 0x94, 0xc3, 0xe, 0x79, 0x4d, 0x1e, 0x79, 0x74, 0x75, 0xd7, 0xb6, 0xee, 0xaf, 0x3f }, - { 0xea, 0xa8, 0xd4, 0xf7, 0xbe, 0x1a, 0x39, 0x21, 0x5c, 0xf4, 0x7e, 0x9, 0x4c, 0x23, 0x27, 0x51, 0x26, 0xa3, 0x24, 0x53, 0xba, 0x32, 0x3c, 0xd2, 0x44, 0xa3, 0x17, 0x4a, 0x6d, 0xa6, 0xd5, 0xad }, - { 0xb5, 0x1d, 0x3e, 0xa6, 0xaf, 0xf2, 0xc9, 0x8, 0x83, 0x59, 0x3d, 0x98, 0x91, 0x6b, 0x3c, 0x56, 0x4c, 0xf8, 0x7c, 0xa1, 0x72, 0x86, 0x60, 0x4d, 0x46, 0xe2, 0x3e, 0xcc, 0x8, 0x6e, 0xc7, 0xf6 }, - { 0x2f, 0x98, 0x33, 0xb3, 0xb1, 0xbc, 0x76, 0x5e, 0x2b, 0xd6, 0x66, 0xa5, 0xef, 0xc4, 0xe6, 0x2a, 0x6, 0xf4, 0xb6, 0xe8, 0xbe, 0xc1, 0xd4, 0x36, 0x74, 0xee, 0x82, 0x15, 0xbc, 0xef, 0x21, 0x63 }, - { 0xfd, 0xc1, 0x4e, 0xd, 0xf4, 0x53, 0xc9, 0x69, 0xa7, 0x7d, 0x5a, 0xc4, 0x6, 0x58, 0x58, 0x26, 0x7e, 0xc1, 0x14, 0x16, 0x6, 0xe0, 0xfa, 0x16, 0x7e, 0x90, 0xaf, 0x3d, 0x28, 0x63, 0x9d, 0x3f }, - { 0xd2, 0xc9, 0xf2, 0xe3, 0x0, 0x9b, 0xd2, 0xc, 0x5f, 0xaa, 0xce, 0x30, 0xb7, 0xd4, 0xc, 0x30, 0x74, 0x2a, 0x51, 0x16, 0xf2, 0xe0, 0x32, 0x98, 0xd, 0xeb, 0x30, 0xd8, 0xe3, 0xce, 0xf8, 0x9a }, - { 0x4b, 0xc5, 0x9e, 0x7b, 0xb5, 0xf1, 0x79, 0x92, 0xff, 0x51, 0xe6, 0x6e, 0x4, 0x86, 0x68, 0xd3, 0x9b, 0x23, 0x4d, 0x57, 0xe6, 0x96, 0x67, 0x31, 0xcc, 0xe6, 0xa6, 0xf3, 0x17, 0xa, 0x75, 0x5 }, - { 0xb1, 0x76, 0x81, 0xd9, 0x13, 0x32, 0x6c, 0xce, 0x3c, 0x17, 0x52, 0x84, 0xf8, 0x5, 0xa2, 0x62, 0xf4, 0x2b, 0xcb, 0xb3, 0x78, 0x47, 0x15, 0x47, 0xff, 0x46, 0x54, 0x82, 0x23, 0x93, 0x6a, 0x48 }, - { 0x38, 0xdf, 0x58, 0x7, 0x4e, 0x5e, 0x65, 0x65, 0xf2, 0xfc, 0x7c, 0x89, 0xfc, 0x86, 0x50, 0x8e, 0x31, 0x70, 0x2e, 0x44, 0xd0, 0xb, 0xca, 0x86, 0xf0, 0x40, 0x9, 0xa2, 0x30, 0x78, 0x47, 0x4e }, - { 0x65, 0xa0, 0xee, 0x39, 0xd1, 0xf7, 0x38, 0x83, 0xf7, 0x5e, 0xe9, 0x37, 0xe4, 0x2c, 
0x3a, 0xbd, 0x21, 0x97, 0xb2, 0x26, 0x1, 0x13, 0xf8, 0x6f, 0xa3, 0x44, 0xed, 0xd1, 0xef, 0x9f, 0xde, 0xe7 }, - { 0x8b, 0xa0, 0xdf, 0x15, 0x76, 0x25, 0x92, 0xd9, 0x3c, 0x85, 0xf7, 0xf6, 0x12, 0xdc, 0x42, 0xbe, 0xd8, 0xa7, 0xec, 0x7c, 0xab, 0x27, 0xb0, 0x7e, 0x53, 0x8d, 0x7d, 0xda, 0xaa, 0x3e, 0xa8, 0xde }, - { 0xaa, 0x25, 0xce, 0x93, 0xbd, 0x2, 0x69, 0xd8, 0x5a, 0xf6, 0x43, 0xfd, 0x1a, 0x73, 0x8, 0xf9, 0xc0, 0x5f, 0xef, 0xda, 0x17, 0x4a, 0x19, 0xa5, 0x97, 0x4d, 0x66, 0x33, 0x4c, 0xfd, 0x21, 0x6a }, - { 0x35, 0xb4, 0x98, 0x31, 0xdb, 0x41, 0x15, 0x70, 0xea, 0x1e, 0xf, 0xbb, 0xed, 0xcd, 0x54, 0x9b, 0x9a, 0xd0, 0x63, 0xa1, 0x51, 0x97, 0x40, 0x72, 0xf6, 0x75, 0x9d, 0xbf, 0x91, 0x47, 0x6f, 0xe2 } }; - -#define SWAP4(x,y)\ - y = (x & 0xf0f0f0f0UL); \ - x = (x ^ y); \ - y = (y >> 4); \ - x = (x << 4); \ - x= x | y; - -#define SWAP2(x,y)\ - y = (x & 0xccccccccUL); \ - x = (x ^ y); \ - y = (y >> 2); \ - x = (x << 2); \ - x= x | y; - -#define SWAP1(x,y)\ - y = (x & 0xaaaaaaaaUL); \ - x = (x ^ y); \ - y = (y >> 1); \ - x = x + x; \ - x= x | y; +__device__ __forceinline__ +static void SWAP2(uint32_t *x) { +#pragma nounroll + // y is used as tmp register too + for (uint32_t y = 0; y<4; y++, ++x) { + asm("and.b32 %1, %0, 0xCCCCCCCC;" + "xor.b32 %0, %0, %1;" + "shr.b32 %1, %1, 2;" + "vshl.u32.u32.u32.clamp.add %0, %0, 2, %1;\n\t" + : "+r"(*x) : "r"(y)); + } +} + +__device__ __forceinline__ +static void SWAP1(uint32_t *x) { +#pragma nounroll + // y is used as tmp register too + for (uint32_t y = 0; y<4; y++, ++x) { + asm("and.b32 %1, %0, 0xAAAAAAAA;" + "xor.b32 %0, %0, %1;" + "shr.b32 %1, %1, 1;" + "vshl.u32.u32.u32.clamp.add %0, %0, 1, %1;\n\t" + : "+r"(*x) : "r"(y)); + } +} +#else +__device__ __forceinline__ +static void SWAP4(uint32_t *x) +{ + x[0] = ((x[0] & 0x0f0f0f0fu) << 4) | (x[0] & 0xF0F0F0F0u) >> 4; + x[1] = ((x[1] & 0x0f0f0f0fu) << 4) | (x[1] & 0xF0F0F0F0u) >> 4; + x[2] = ((x[2] & 0x0f0f0f0fu) << 4) | (x[2] & 0xF0F0F0F0u) >> 4; + x[3] = ((x[3] & 0x0f0f0f0fu) << 4) | (x[3] 
& 0xF0F0F0F0u) >> 4; +} +__device__ __forceinline__ +static void SWAP2(uint32_t *x) +{ + x[0] = ((x[0] & 0x33333333u) << 2) | (x[0] & 0xCCCCCCCCu) >> 2; + x[1] = ((x[1] & 0x33333333u) << 2) | (x[1] & 0xCCCCCCCCu) >> 2; + x[2] = ((x[2] & 0x33333333u) << 2) | (x[2] & 0xCCCCCCCCu) >> 2; + x[3] = ((x[3] & 0x33333333u) << 2) | (x[3] & 0xCCCCCCCCu) >> 2; +} +__device__ __forceinline__ +static void SWAP1(uint32_t *x) +{ + x[0] = ((x[0] & 0x55555555u) << 1) | (x[0] & 0xAAAAAAAAu) >> 1; + x[1] = ((x[1] & 0x55555555u) << 1) | (x[1] & 0xAAAAAAAAu) >> 1; + x[2] = ((x[2] & 0x55555555u) << 1) | (x[2] & 0xAAAAAAAAu) >> 1; + x[3] = ((x[3] & 0x55555555u) << 1) | (x[3] & 0xAAAAAAAAu) >> 1; +} +#endif /*swapping bits 16i||16i+1||......||16i+7 with bits 16i+8||16i+9||......||16i+15 of 32-bit x*/ //#define SWAP8(x) (x) = ((((x) & 0x00ff00ffUL) << 8) | (((x) & 0xff00ff00UL) >> 8)); #define SWAP8(x) (x) = __byte_perm(x, x, 0x2301); @@ -96,17 +151,17 @@ __constant__ unsigned char c_E8_bitslice_roundconstant[42][32] = { m1 ^= (temp0 & (m0)); \ m2 ^= temp0; -static __device__ __forceinline__ void Sbox_and_MDS_layer(uint32_t x[8][4], uint32_t roundnumber) +__device__ __forceinline__ +static void Sbox_and_MDS_layer(uint32_t x[8][4], const int rnd) { - uint32_t temp0; - uint32_t cc0, cc1; - //Sbox and MDS layer -#pragma unroll 4 - for (int i = 0; i < 4; i++) { - cc0 = ((uint32_t*)c_E8_bitslice_roundconstant[roundnumber])[i]; - cc1 = ((uint32_t*)c_E8_bitslice_roundconstant[roundnumber])[i + 4]; - Sbox(x[0][i], x[2][i], x[4][i], x[6][i], cc0); - Sbox(x[1][i], x[3][i], x[5][i], x[7][i], cc1); + uint2* cc = (uint2*)&c_E8_bslice32[rnd]; + + // Sbox and MDS layer +#pragma unroll + for (int i = 0; i < 4; i++, ++cc) { + uint32_t temp0; + Sbox(x[0][i], x[2][i], x[4][i], x[6][i], cc->x); + Sbox(x[1][i], x[3][i], x[5][i], x[7][i], cc->y); L(x[0][i], x[2][i], x[4][i], x[6][i], x[1][i], x[3][i], x[5][i], x[7][i]); } } @@ -118,11 +173,7 @@ static __device__ __forceinline__ void RoundFunction0(uint32_t 
x[8][4], uint32_t #pragma unroll 4 for (int j = 1; j < 8; j = j + 2) { - uint32_t y; - SWAP1(x[j][0], y); - SWAP1(x[j][1], y); - SWAP1(x[j][2], y); - SWAP1(x[j][3], y); + SWAP1(x[j]); } } @@ -133,11 +184,7 @@ static __device__ __forceinline__ void RoundFunction1(uint32_t x[8][4], uint32_t #pragma unroll 4 for (int j = 1; j < 8; j = j + 2) { - uint32_t y; - SWAP2(x[j][0], y); - SWAP2(x[j][1], y); - SWAP2(x[j][2], y); - SWAP2(x[j][3], y); + SWAP2(x[j]); } } @@ -148,11 +195,7 @@ static __device__ __forceinline__ void RoundFunction2(uint32_t x[8][4], uint32_t #pragma unroll 4 for (int j = 1; j < 8; j = j + 2) { - uint32_t y; - SWAP4(x[j][0], y); - SWAP4(x[j][1], y); - SWAP4(x[j][2], y); - SWAP4(x[j][3], y); + SWAP4(x[j]); } } @@ -245,14 +288,14 @@ static __device__ __forceinline__ void F8(uint32_t x[8][4], const uint32_t buffe // Die Hash-Funktion __global__ __launch_bounds__(256, 4) -void quark_jh512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector) +void quark_jh512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - uint32_t hashPosition = nounce - startNounce; - uint32_t *Hash = &g_hash[16 * hashPosition]; + const uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); + const uint32_t hashPosition = nounce - startNounce; + uint32_t *const Hash = &g_hash[16 * hashPosition]; uint32_t x[8][4] = { { 0x964bd16f, 0x17aa003e, 0x052e6a63, 0x43d5157a }, { 0x8d5e228a, 0x0bef970c, 0x591234e9, 0x61c3b3f2 }, @@ -263,11 +306,18 @@ void quark_jh512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g { 0xecf657cf, 0x56f8b19d, 0x7c8806a7, 0x56b11657 }, { 0xdffcc2e3, 0xfb1785e6, 0x78465a54, 0x4bdd8ccc } }; + uint32_t msg[16]; + + uint28 *phash = (uint28*)Hash; + uint28 *outpt = (uint28*)msg; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + #pragma unroll 16 - for (int i = 0; i < 16; i++) x[i >> 2][i & 3] ^= ((uint32_t*)Hash)[i]; + for (int i = 0; i < 16; i++) x[i >> 2][i & 3] ^= msg[i]; E8(x); #pragma unroll 16 - for (int i = 0; i < 16; i++) x[(16 + i) >> 2][(16 + i) & 3] ^= ((uint32_t*)Hash)[i]; + for (int i = 0; i < 16; i++) x[(16 + i) >> 2][(16 + i) & 3] ^= msg[i]; x[0 >> 2][0 & 3] ^= 0x80; x[15 >> 2][15 & 3] ^= 0x00020000; @@ -296,16 +346,25 @@ void quark_jh512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g // Die Hash-Funktion #define TPB2 256 -__global__ __launch_bounds__(TPB2, 4) -void quark_jh512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint64_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector) +__global__ __launch_bounds__(TPB2, 2) +void quark_jh512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint64_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector, uint32_t *const __restrict__ d_found, uint32_t target) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + const uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); + + const uint32_t hashPosition = nounce - startNounce; + uint32_t *const Hash = (uint32_t*)&g_hash[8 * hashPosition]; + + + uint32_t msg[16]; + + uint28 *phash = (uint28*)Hash; + uint28 *outpt = (uint28*)msg; + outpt[0] = phash[0]; + outpt[1] = phash[1]; - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; uint32_t x[8][4] = { { 0x964bd16f, 0x17aa003e, 0x052e6a63, 0x43d5157a }, @@ -317,12 +376,12 @@ void quark_jh512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint6 { 0xecf657cf, 0x56f8b19d, 0x7c8806a7, 0x56b11657 }, { 0xdffcc2e3, 0xfb1785e6, 0x78465a54, 0x4bdd8ccc } }; - F8(x, Hash); + F8(x, msg); x[0][0] ^= 0x80U; x[3][3] ^= 0x00020000U; - for (int i = 0; i < 42; i += 7) + for (int i = 0; i < 35; i += 7) { RoundFunction0(x, i); RoundFunction1(x, i + 1); @@ -332,31 +391,45 @@ void quark_jh512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint6 RoundFunction5(x, i + 5); RoundFunction6(x, i + 6); } - - Hash[7] = x[5][3]; + RoundFunction0(x, 35); + RoundFunction1(x, 35 + 1); + RoundFunction2(x, 35 + 2); + RoundFunction3(x, 35 + 3); + RoundFunction4(x, 35 + 4); + RoundFunction5(x, 35 + 5); + RoundFunction6(x, 35 + 6); + + if(x[5][3] <= target) + { + uint32_t tmp = atomicExch(&(d_found[0]), nounce); + if(tmp != 0xffffffff) + d_found[1] = tmp; + } } } -__host__ void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash) { const uint32_t threadsperblock = 32; // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - quark_jh512_gpu_hash_64<<>>(threads, startNounce, d_hash, d_nonceVector); + quark_jh512_gpu_hash_64<<>>(threads, startNounce, d_hash, d_nonceVector); } // 
Setup-Funktionen -__host__ void quark_jh512_cpu_init(int thr_id, uint32_t threads) +__host__ void quark_jh512_cpu_init(int thr_id) { + cudaMalloc(&(d_found[thr_id]), 2 * sizeof(uint32_t)); } -__host__ void quark_jh512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void quark_jh512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, uint32_t target, uint32_t *h_found) { dim3 grid((threads + TPB2 - 1) / TPB2); dim3 block(TPB2); - - quark_jh512_gpu_hash_64_final << > >(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + cudaMemsetAsync(d_found[thr_id], 0xff, 2 * sizeof(uint32_t), gpustream[thr_id]); + quark_jh512_gpu_hash_64_final << > >(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_found[thr_id], target); + cudaMemcpyAsync(h_found, d_found[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); } diff --git a/quark/cuda_jh512__4.cu b/quark/cuda_jh512__4.cu new file mode 100644 index 0000000000..b0290033e5 --- /dev/null +++ b/quark/cuda_jh512__4.cu @@ -0,0 +1,362 @@ +#include "cuda_helper.h" + +__constant__ unsigned char c_E8_bitslice_roundconstant[42][32] = { + { 0x72, 0xd5, 0xde, 0xa2, 0xdf, 0x15, 0xf8, 0x67, 0x7b, 0x84, 0x15, 0xa, 0xb7, 0x23, 0x15, 0x57, 0x81, 0xab, 0xd6, 0x90, 0x4d, 0x5a, 0x87, 0xf6, 0x4e, 0x9f, 0x4f, 0xc5, 0xc3, 0xd1, 0x2b, 0x40 }, + { 0xea, 0x98, 0x3a, 0xe0, 0x5c, 0x45, 0xfa, 0x9c, 0x3, 0xc5, 0xd2, 0x99, 0x66, 0xb2, 0x99, 0x9a, 0x66, 0x2, 0x96, 0xb4, 0xf2, 0xbb, 0x53, 0x8a, 0xb5, 0x56, 0x14, 0x1a, 0x88, 0xdb, 0xa2, 0x31 }, + { 0x3, 0xa3, 0x5a, 0x5c, 0x9a, 0x19, 0xe, 0xdb, 0x40, 0x3f, 0xb2, 0xa, 0x87, 0xc1, 0x44, 0x10, 0x1c, 0x5, 0x19, 0x80, 0x84, 0x9e, 0x95, 0x1d, 0x6f, 0x33, 0xeb, 0xad, 0x5e, 0xe7, 0xcd, 0xdc }, + { 0x10, 0xba, 0x13, 0x92, 0x2, 0xbf, 0x6b, 0x41, 0xdc, 0x78, 0x65, 0x15, 0xf7, 0xbb, 0x27, 0xd0, 0xa, 0x2c, 0x81, 0x39, 0x37, 0xaa, 0x78, 0x50, 0x3f, 
0x1a, 0xbf, 0xd2, 0x41, 0x0, 0x91, 0xd3 }, + { 0x42, 0x2d, 0x5a, 0xd, 0xf6, 0xcc, 0x7e, 0x90, 0xdd, 0x62, 0x9f, 0x9c, 0x92, 0xc0, 0x97, 0xce, 0x18, 0x5c, 0xa7, 0xb, 0xc7, 0x2b, 0x44, 0xac, 0xd1, 0xdf, 0x65, 0xd6, 0x63, 0xc6, 0xfc, 0x23 }, + { 0x97, 0x6e, 0x6c, 0x3, 0x9e, 0xe0, 0xb8, 0x1a, 0x21, 0x5, 0x45, 0x7e, 0x44, 0x6c, 0xec, 0xa8, 0xee, 0xf1, 0x3, 0xbb, 0x5d, 0x8e, 0x61, 0xfa, 0xfd, 0x96, 0x97, 0xb2, 0x94, 0x83, 0x81, 0x97 }, + { 0x4a, 0x8e, 0x85, 0x37, 0xdb, 0x3, 0x30, 0x2f, 0x2a, 0x67, 0x8d, 0x2d, 0xfb, 0x9f, 0x6a, 0x95, 0x8a, 0xfe, 0x73, 0x81, 0xf8, 0xb8, 0x69, 0x6c, 0x8a, 0xc7, 0x72, 0x46, 0xc0, 0x7f, 0x42, 0x14 }, + { 0xc5, 0xf4, 0x15, 0x8f, 0xbd, 0xc7, 0x5e, 0xc4, 0x75, 0x44, 0x6f, 0xa7, 0x8f, 0x11, 0xbb, 0x80, 0x52, 0xde, 0x75, 0xb7, 0xae, 0xe4, 0x88, 0xbc, 0x82, 0xb8, 0x0, 0x1e, 0x98, 0xa6, 0xa3, 0xf4 }, + { 0x8e, 0xf4, 0x8f, 0x33, 0xa9, 0xa3, 0x63, 0x15, 0xaa, 0x5f, 0x56, 0x24, 0xd5, 0xb7, 0xf9, 0x89, 0xb6, 0xf1, 0xed, 0x20, 0x7c, 0x5a, 0xe0, 0xfd, 0x36, 0xca, 0xe9, 0x5a, 0x6, 0x42, 0x2c, 0x36 }, + { 0xce, 0x29, 0x35, 0x43, 0x4e, 0xfe, 0x98, 0x3d, 0x53, 0x3a, 0xf9, 0x74, 0x73, 0x9a, 0x4b, 0xa7, 0xd0, 0xf5, 0x1f, 0x59, 0x6f, 0x4e, 0x81, 0x86, 0xe, 0x9d, 0xad, 0x81, 0xaf, 0xd8, 0x5a, 0x9f }, + { 0xa7, 0x5, 0x6, 0x67, 0xee, 0x34, 0x62, 0x6a, 0x8b, 0xb, 0x28, 0xbe, 0x6e, 0xb9, 0x17, 0x27, 0x47, 0x74, 0x7, 0x26, 0xc6, 0x80, 0x10, 0x3f, 0xe0, 0xa0, 0x7e, 0x6f, 0xc6, 0x7e, 0x48, 0x7b }, + { 0xd, 0x55, 0xa, 0xa5, 0x4a, 0xf8, 0xa4, 0xc0, 0x91, 0xe3, 0xe7, 0x9f, 0x97, 0x8e, 0xf1, 0x9e, 0x86, 0x76, 0x72, 0x81, 0x50, 0x60, 0x8d, 0xd4, 0x7e, 0x9e, 0x5a, 0x41, 0xf3, 0xe5, 0xb0, 0x62 }, + { 0xfc, 0x9f, 0x1f, 0xec, 0x40, 0x54, 0x20, 0x7a, 0xe3, 0xe4, 0x1a, 0x0, 0xce, 0xf4, 0xc9, 0x84, 0x4f, 0xd7, 0x94, 0xf5, 0x9d, 0xfa, 0x95, 0xd8, 0x55, 0x2e, 0x7e, 0x11, 0x24, 0xc3, 0x54, 0xa5 }, + { 0x5b, 0xdf, 0x72, 0x28, 0xbd, 0xfe, 0x6e, 0x28, 0x78, 0xf5, 0x7f, 0xe2, 0xf, 0xa5, 0xc4, 0xb2, 0x5, 0x89, 0x7c, 0xef, 0xee, 0x49, 0xd3, 0x2e, 0x44, 0x7e, 0x93, 0x85, 0xeb, 0x28, 0x59, 
0x7f }, + { 0x70, 0x5f, 0x69, 0x37, 0xb3, 0x24, 0x31, 0x4a, 0x5e, 0x86, 0x28, 0xf1, 0x1d, 0xd6, 0xe4, 0x65, 0xc7, 0x1b, 0x77, 0x4, 0x51, 0xb9, 0x20, 0xe7, 0x74, 0xfe, 0x43, 0xe8, 0x23, 0xd4, 0x87, 0x8a }, + { 0x7d, 0x29, 0xe8, 0xa3, 0x92, 0x76, 0x94, 0xf2, 0xdd, 0xcb, 0x7a, 0x9, 0x9b, 0x30, 0xd9, 0xc1, 0x1d, 0x1b, 0x30, 0xfb, 0x5b, 0xdc, 0x1b, 0xe0, 0xda, 0x24, 0x49, 0x4f, 0xf2, 0x9c, 0x82, 0xbf }, + { 0xa4, 0xe7, 0xba, 0x31, 0xb4, 0x70, 0xbf, 0xff, 0xd, 0x32, 0x44, 0x5, 0xde, 0xf8, 0xbc, 0x48, 0x3b, 0xae, 0xfc, 0x32, 0x53, 0xbb, 0xd3, 0x39, 0x45, 0x9f, 0xc3, 0xc1, 0xe0, 0x29, 0x8b, 0xa0 }, + { 0xe5, 0xc9, 0x5, 0xfd, 0xf7, 0xae, 0x9, 0xf, 0x94, 0x70, 0x34, 0x12, 0x42, 0x90, 0xf1, 0x34, 0xa2, 0x71, 0xb7, 0x1, 0xe3, 0x44, 0xed, 0x95, 0xe9, 0x3b, 0x8e, 0x36, 0x4f, 0x2f, 0x98, 0x4a }, + { 0x88, 0x40, 0x1d, 0x63, 0xa0, 0x6c, 0xf6, 0x15, 0x47, 0xc1, 0x44, 0x4b, 0x87, 0x52, 0xaf, 0xff, 0x7e, 0xbb, 0x4a, 0xf1, 0xe2, 0xa, 0xc6, 0x30, 0x46, 0x70, 0xb6, 0xc5, 0xcc, 0x6e, 0x8c, 0xe6 }, + { 0xa4, 0xd5, 0xa4, 0x56, 0xbd, 0x4f, 0xca, 0x0, 0xda, 0x9d, 0x84, 0x4b, 0xc8, 0x3e, 0x18, 0xae, 0x73, 0x57, 0xce, 0x45, 0x30, 0x64, 0xd1, 0xad, 0xe8, 0xa6, 0xce, 0x68, 0x14, 0x5c, 0x25, 0x67 }, + { 0xa3, 0xda, 0x8c, 0xf2, 0xcb, 0xe, 0xe1, 0x16, 0x33, 0xe9, 0x6, 0x58, 0x9a, 0x94, 0x99, 0x9a, 0x1f, 0x60, 0xb2, 0x20, 0xc2, 0x6f, 0x84, 0x7b, 0xd1, 0xce, 0xac, 0x7f, 0xa0, 0xd1, 0x85, 0x18 }, + { 0x32, 0x59, 0x5b, 0xa1, 0x8d, 0xdd, 0x19, 0xd3, 0x50, 0x9a, 0x1c, 0xc0, 0xaa, 0xa5, 0xb4, 0x46, 0x9f, 0x3d, 0x63, 0x67, 0xe4, 0x4, 0x6b, 0xba, 0xf6, 0xca, 0x19, 0xab, 0xb, 0x56, 0xee, 0x7e }, + { 0x1f, 0xb1, 0x79, 0xea, 0xa9, 0x28, 0x21, 0x74, 0xe9, 0xbd, 0xf7, 0x35, 0x3b, 0x36, 0x51, 0xee, 0x1d, 0x57, 0xac, 0x5a, 0x75, 0x50, 0xd3, 0x76, 0x3a, 0x46, 0xc2, 0xfe, 0xa3, 0x7d, 0x70, 0x1 }, + { 0xf7, 0x35, 0xc1, 0xaf, 0x98, 0xa4, 0xd8, 0x42, 0x78, 0xed, 0xec, 0x20, 0x9e, 0x6b, 0x67, 0x79, 0x41, 0x83, 0x63, 0x15, 0xea, 0x3a, 0xdb, 0xa8, 0xfa, 0xc3, 0x3b, 0x4d, 0x32, 0x83, 0x2c, 0x83 }, + { 0xa7, 0x40, 0x3b, 
0x1f, 0x1c, 0x27, 0x47, 0xf3, 0x59, 0x40, 0xf0, 0x34, 0xb7, 0x2d, 0x76, 0x9a, 0xe7, 0x3e, 0x4e, 0x6c, 0xd2, 0x21, 0x4f, 0xfd, 0xb8, 0xfd, 0x8d, 0x39, 0xdc, 0x57, 0x59, 0xef }, + { 0x8d, 0x9b, 0xc, 0x49, 0x2b, 0x49, 0xeb, 0xda, 0x5b, 0xa2, 0xd7, 0x49, 0x68, 0xf3, 0x70, 0xd, 0x7d, 0x3b, 0xae, 0xd0, 0x7a, 0x8d, 0x55, 0x84, 0xf5, 0xa5, 0xe9, 0xf0, 0xe4, 0xf8, 0x8e, 0x65 }, + { 0xa0, 0xb8, 0xa2, 0xf4, 0x36, 0x10, 0x3b, 0x53, 0xc, 0xa8, 0x7, 0x9e, 0x75, 0x3e, 0xec, 0x5a, 0x91, 0x68, 0x94, 0x92, 0x56, 0xe8, 0x88, 0x4f, 0x5b, 0xb0, 0x5c, 0x55, 0xf8, 0xba, 0xbc, 0x4c }, + { 0xe3, 0xbb, 0x3b, 0x99, 0xf3, 0x87, 0x94, 0x7b, 0x75, 0xda, 0xf4, 0xd6, 0x72, 0x6b, 0x1c, 0x5d, 0x64, 0xae, 0xac, 0x28, 0xdc, 0x34, 0xb3, 0x6d, 0x6c, 0x34, 0xa5, 0x50, 0xb8, 0x28, 0xdb, 0x71 }, + { 0xf8, 0x61, 0xe2, 0xf2, 0x10, 0x8d, 0x51, 0x2a, 0xe3, 0xdb, 0x64, 0x33, 0x59, 0xdd, 0x75, 0xfc, 0x1c, 0xac, 0xbc, 0xf1, 0x43, 0xce, 0x3f, 0xa2, 0x67, 0xbb, 0xd1, 0x3c, 0x2, 0xe8, 0x43, 0xb0 }, + { 0x33, 0xa, 0x5b, 0xca, 0x88, 0x29, 0xa1, 0x75, 0x7f, 0x34, 0x19, 0x4d, 0xb4, 0x16, 0x53, 0x5c, 0x92, 0x3b, 0x94, 0xc3, 0xe, 0x79, 0x4d, 0x1e, 0x79, 0x74, 0x75, 0xd7, 0xb6, 0xee, 0xaf, 0x3f }, + { 0xea, 0xa8, 0xd4, 0xf7, 0xbe, 0x1a, 0x39, 0x21, 0x5c, 0xf4, 0x7e, 0x9, 0x4c, 0x23, 0x27, 0x51, 0x26, 0xa3, 0x24, 0x53, 0xba, 0x32, 0x3c, 0xd2, 0x44, 0xa3, 0x17, 0x4a, 0x6d, 0xa6, 0xd5, 0xad }, + { 0xb5, 0x1d, 0x3e, 0xa6, 0xaf, 0xf2, 0xc9, 0x8, 0x83, 0x59, 0x3d, 0x98, 0x91, 0x6b, 0x3c, 0x56, 0x4c, 0xf8, 0x7c, 0xa1, 0x72, 0x86, 0x60, 0x4d, 0x46, 0xe2, 0x3e, 0xcc, 0x8, 0x6e, 0xc7, 0xf6 }, + { 0x2f, 0x98, 0x33, 0xb3, 0xb1, 0xbc, 0x76, 0x5e, 0x2b, 0xd6, 0x66, 0xa5, 0xef, 0xc4, 0xe6, 0x2a, 0x6, 0xf4, 0xb6, 0xe8, 0xbe, 0xc1, 0xd4, 0x36, 0x74, 0xee, 0x82, 0x15, 0xbc, 0xef, 0x21, 0x63 }, + { 0xfd, 0xc1, 0x4e, 0xd, 0xf4, 0x53, 0xc9, 0x69, 0xa7, 0x7d, 0x5a, 0xc4, 0x6, 0x58, 0x58, 0x26, 0x7e, 0xc1, 0x14, 0x16, 0x6, 0xe0, 0xfa, 0x16, 0x7e, 0x90, 0xaf, 0x3d, 0x28, 0x63, 0x9d, 0x3f }, + { 0xd2, 0xc9, 0xf2, 0xe3, 0x0, 0x9b, 0xd2, 0xc, 0x5f, 
0xaa, 0xce, 0x30, 0xb7, 0xd4, 0xc, 0x30, 0x74, 0x2a, 0x51, 0x16, 0xf2, 0xe0, 0x32, 0x98, 0xd, 0xeb, 0x30, 0xd8, 0xe3, 0xce, 0xf8, 0x9a }, + { 0x4b, 0xc5, 0x9e, 0x7b, 0xb5, 0xf1, 0x79, 0x92, 0xff, 0x51, 0xe6, 0x6e, 0x4, 0x86, 0x68, 0xd3, 0x9b, 0x23, 0x4d, 0x57, 0xe6, 0x96, 0x67, 0x31, 0xcc, 0xe6, 0xa6, 0xf3, 0x17, 0xa, 0x75, 0x5 }, + { 0xb1, 0x76, 0x81, 0xd9, 0x13, 0x32, 0x6c, 0xce, 0x3c, 0x17, 0x52, 0x84, 0xf8, 0x5, 0xa2, 0x62, 0xf4, 0x2b, 0xcb, 0xb3, 0x78, 0x47, 0x15, 0x47, 0xff, 0x46, 0x54, 0x82, 0x23, 0x93, 0x6a, 0x48 }, + { 0x38, 0xdf, 0x58, 0x7, 0x4e, 0x5e, 0x65, 0x65, 0xf2, 0xfc, 0x7c, 0x89, 0xfc, 0x86, 0x50, 0x8e, 0x31, 0x70, 0x2e, 0x44, 0xd0, 0xb, 0xca, 0x86, 0xf0, 0x40, 0x9, 0xa2, 0x30, 0x78, 0x47, 0x4e }, + { 0x65, 0xa0, 0xee, 0x39, 0xd1, 0xf7, 0x38, 0x83, 0xf7, 0x5e, 0xe9, 0x37, 0xe4, 0x2c, 0x3a, 0xbd, 0x21, 0x97, 0xb2, 0x26, 0x1, 0x13, 0xf8, 0x6f, 0xa3, 0x44, 0xed, 0xd1, 0xef, 0x9f, 0xde, 0xe7 }, + { 0x8b, 0xa0, 0xdf, 0x15, 0x76, 0x25, 0x92, 0xd9, 0x3c, 0x85, 0xf7, 0xf6, 0x12, 0xdc, 0x42, 0xbe, 0xd8, 0xa7, 0xec, 0x7c, 0xab, 0x27, 0xb0, 0x7e, 0x53, 0x8d, 0x7d, 0xda, 0xaa, 0x3e, 0xa8, 0xde }, + { 0xaa, 0x25, 0xce, 0x93, 0xbd, 0x2, 0x69, 0xd8, 0x5a, 0xf6, 0x43, 0xfd, 0x1a, 0x73, 0x8, 0xf9, 0xc0, 0x5f, 0xef, 0xda, 0x17, 0x4a, 0x19, 0xa5, 0x97, 0x4d, 0x66, 0x33, 0x4c, 0xfd, 0x21, 0x6a }, + { 0x35, 0xb4, 0x98, 0x31, 0xdb, 0x41, 0x15, 0x70, 0xea, 0x1e, 0xf, 0xbb, 0xed, 0xcd, 0x54, 0x9b, 0x9a, 0xd0, 0x63, 0xa1, 0x51, 0x97, 0x40, 0x72, 0xf6, 0x75, 0x9d, 0xbf, 0x91, 0x47, 0x6f, 0xe2 } }; + +#define SWAP4(x,y)\ + y = (x & 0xf0f0f0f0UL); \ + x = (x ^ y); \ + y = (y >> 4); \ + x = (x << 4); \ + x= x | y; + +#define SWAP2(x,y)\ + y = (x & 0xccccccccUL); \ + x = (x ^ y); \ + y = (y >> 2); \ + x = (x << 2); \ + x= x | y; + +#define SWAP1(x,y)\ + y = (x & 0xaaaaaaaaUL); \ + x = (x ^ y); \ + y = (y >> 1); \ + x = x + x; \ + x= x | y; +/*swapping bits 16i||16i+1||......||16i+7 with bits 16i+8||16i+9||......||16i+15 of 32-bit x*/ +//#define SWAP8(x) (x) = ((((x) & 
0x00ff00ffUL) << 8) | (((x) & 0xff00ff00UL) >> 8)); +#define SWAP8(x) (x) = __byte_perm(x, x, 0x2301); +/*swapping bits 32i||32i+1||......||32i+15 with bits 32i+16||32i+17||......||32i+31 of 32-bit x*/ +//#define SWAP16(x) (x) = ((((x) & 0x0000ffffUL) << 16) | (((x) & 0xffff0000UL) >> 16)); +#define SWAP16(x) (x) = __byte_perm(x, x, 0x1032); + +/*The MDS transform*/ +#define L(m0,m1,m2,m3,m4,m5,m6,m7) \ + (m4) ^= (m1); \ + (m5) ^= (m2); \ + (m6) ^= (m0) ^ (m3); \ + (m7) ^= (m0); \ + (m0) ^= (m5); \ + (m1) ^= (m6); \ + (m2) ^= (m4) ^ (m7); \ + (m3) ^= (m4); + +/*The Sbox*/ +#define Sbox(m0,m1,m2,m3,cc) \ + m3 = ~(m3); \ + m0 ^= ((~(m2)) & (cc)); \ + temp0 = (cc) ^ ((m0) & (m1));\ + m0 ^= ((m2) & (m3)); \ + m3 ^= ((~(m1)) & (m2)); \ + m1 ^= ((m0) & (m2)); \ + m2 ^= ((m0) & (~(m3))); \ + m0 ^= ((m1) | (m3)); \ + m3 ^= ((m1) & (m2)); \ + m1 ^= (temp0 & (m0)); \ + m2 ^= temp0; + +static __device__ __forceinline__ void Sbox_and_MDS_layer(uint32_t x[8][4], uint32_t roundnumber) +{ + uint32_t temp0; + uint32_t cc0, cc1; + //Sbox and MDS layer +#pragma unroll 4 + for (int i = 0; i < 4; i++) { + cc0 = c_E8_bitslice_roundconstant[roundnumber][i]; + cc1 = c_E8_bitslice_roundconstant[roundnumber][i + 4]; + Sbox(x[0][i], x[2][i], x[4][i], x[6][i], cc0); + Sbox(x[1][i], x[3][i], x[5][i], x[7][i], cc1); + L(x[0][i], x[2][i], x[4][i], x[6][i], x[1][i], x[3][i], x[5][i], x[7][i]); + } +} + +static __device__ __forceinline__ void RoundFunction0(uint32_t x[8][4], uint32_t roundnumber) +{ + Sbox_and_MDS_layer(x, roundnumber); + +#pragma unroll 4 + for (int j = 1; j < 8; j = j + 2) + { + uint32_t y; + SWAP1(x[j][0], y); + SWAP1(x[j][1], y); + SWAP1(x[j][2], y); + SWAP1(x[j][3], y); + } +} + +static __device__ __forceinline__ void RoundFunction1(uint32_t x[8][4], uint32_t roundnumber) +{ + Sbox_and_MDS_layer(x, roundnumber); + +#pragma unroll 4 + for (int j = 1; j < 8; j = j + 2) + { + uint32_t y; + SWAP2(x[j][0], y); + SWAP2(x[j][1], y); + SWAP2(x[j][2], y); + SWAP2(x[j][3], y); + } +} 
+ +static __device__ __forceinline__ void RoundFunction2(uint32_t x[8][4], uint32_t roundnumber) +{ + Sbox_and_MDS_layer(x, roundnumber); + +#pragma unroll 4 + for (int j = 1; j < 8; j = j + 2) + { + uint32_t y; + SWAP4(x[j][0], y); + SWAP4(x[j][1], y); + SWAP4(x[j][2], y); + SWAP4(x[j][3], y); + } +} + +static __device__ __forceinline__ void RoundFunction3(uint32_t x[8][4], uint32_t roundnumber) +{ + Sbox_and_MDS_layer(x, roundnumber); + +#pragma unroll 4 + for (int j = 1; j < 8; j = j + 2) + { +#pragma unroll 4 + for (int i = 0; i < 4; i++) SWAP8(x[j][i]); + } +} + +static __device__ __forceinline__ void RoundFunction4(uint32_t x[8][4], uint32_t roundnumber) +{ + Sbox_and_MDS_layer(x, roundnumber); + +#pragma unroll 4 + for (int j = 1; j < 8; j = j + 2) + { +#pragma unroll 4 + for (int i = 0; i < 4; i++) SWAP16(x[j][i]); + } +} + +static __device__ __forceinline__ void RoundFunction5(uint32_t x[8][4], uint32_t roundnumber) +{ + uint32_t temp0; + + Sbox_and_MDS_layer(x, roundnumber); + +#pragma unroll 4 + for (int j = 1; j < 8; j = j + 2) + { +#pragma unroll 2 + for (int i = 0; i < 4; i = i + 2) { + temp0 = x[j][i]; x[j][i] = x[j][i + 1]; x[j][i + 1] = temp0; + } + } +} + +static __device__ __forceinline__ void RoundFunction6(uint32_t x[8][4], uint32_t roundnumber) +{ + uint32_t temp0; + + Sbox_and_MDS_layer(x, roundnumber); + +#pragma unroll 4 + for (int j = 1; j < 8; j = j + 2) + { +#pragma unroll 2 + for (int i = 0; i < 2; i++) { + temp0 = x[j][i]; x[j][i] = x[j][i + 2]; x[j][i + 2] = temp0; + } + } +} + +/*The bijective function E8, in bitslice form */ +static __device__ __forceinline__ void E8(uint32_t x[8][4]) +{ + /*perform 6 rounds*/ + //#pragma unroll 6 + for (int i = 0; i < 42; i += 7) + { + RoundFunction0(x, i); + RoundFunction1(x, i + 1); + RoundFunction2(x, i + 2); + RoundFunction3(x, i + 3); + RoundFunction4(x, i + 4); + RoundFunction5(x, i + 5); + RoundFunction6(x, i + 6); + } +} + +static __device__ __forceinline__ void F8(uint32_t x[8][4], const 
uint32_t buffer[16]) +{ + /*xor the 512-bit message with the fist half of the 1024-bit hash state*/ +#pragma unroll 16 + for (int i = 0; i < 16; i++) x[i >> 2][i & 3] ^= ((uint32_t*)buffer)[i]; + + /*the bijective function E8 */ + E8(x); + + /*xor the 512-bit message with the second half of the 1024-bit hash state*/ +#pragma unroll 16 + for (int i = 0; i < 16; i++) x[(16 + i) >> 2][(16 + i) & 3] ^= ((uint32_t*)buffer)[i]; +} + +// Die Hash-Funktion +__global__ __launch_bounds__(256, 4) +void quark_jh512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + uint32_t hashPosition = nounce - startNounce; + uint32_t *Hash = &g_hash[16 * hashPosition]; + uint32_t x[8][4] = { + { 0x964bd16f, 0x17aa003e, 0x052e6a63, 0x43d5157a }, + { 0x8d5e228a, 0x0bef970c, 0x591234e9, 0x61c3b3f2 }, + { 0xc1a01d89, 0x1e806f53, 0x6b05a92a, 0x806d2bea }, + { 0xdbcc8e58, 0xa6ba7520, 0x763a0fa9, 0xf73bf8ba }, + { 0x05e66901, 0x694ae341, 0x8e8ab546, 0x5ae66f2e }, + { 0xd0a74710, 0x243c84c1, 0xb1716e3b, 0x99c15a2d }, + { 0xecf657cf, 0x56f8b19d, 0x7c8806a7, 0x56b11657 }, + { 0xdffcc2e3, 0xfb1785e6, 0x78465a54, 0x4bdd8ccc } }; + +#pragma unroll 16 + for (int i = 0; i < 16; i++) x[i >> 2][i & 3] ^= ((uint32_t*)Hash)[i]; + E8(x); +#pragma unroll 16 + for (int i = 0; i < 16; i++) x[(16 + i) >> 2][(16 + i) & 3] ^= ((uint32_t*)Hash)[i]; + + x[0 >> 2][0 & 3] ^= 0x80; + x[15 >> 2][15 & 3] ^= 0x00020000; + E8(x); + x[(16 + 0) >> 2][(16 + 0) & 3] ^= 0x80; + x[(16 + 15) >> 2][(16 + 15) & 3] ^= 0x00020000; + + Hash[0] = x[4][0]; + Hash[1] = x[4][1]; + Hash[2] = x[4][2]; + Hash[3] = x[4][3]; + Hash[4] = x[5][0]; + Hash[5] = x[5][1]; + Hash[6] = x[5][2]; + Hash[7] = x[5][3]; + Hash[8] = x[6][0]; + Hash[9] = x[6][1]; + Hash[10] = x[6][2]; + Hash[11] = x[6][3]; + Hash[12] = x[7][0]; + 
Hash[13] = x[7][1]; + Hash[14] = x[7][2]; + Hash[15] = x[7][3]; + } +} + +// Die Hash-Funktion +#define TPB2 256 +__global__ __launch_bounds__(TPB2, 4) +void quark_jh512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint64_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + + int hashPosition = nounce - startNounce; + uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; + + uint32_t x[8][4] = { + { 0x964bd16f, 0x17aa003e, 0x052e6a63, 0x43d5157a }, + { 0x8d5e228a, 0x0bef970c, 0x591234e9, 0x61c3b3f2 }, + { 0xc1a01d89, 0x1e806f53, 0x6b05a92a, 0x806d2bea }, + { 0xdbcc8e58, 0xa6ba7520, 0x763a0fa9, 0xf73bf8ba }, + { 0x05e66901, 0x694ae341, 0x8e8ab546, 0x5ae66f2e }, + { 0xd0a74710, 0x243c84c1, 0xb1716e3b, 0x99c15a2d }, + { 0xecf657cf, 0x56f8b19d, 0x7c8806a7, 0x56b11657 }, + { 0xdffcc2e3, 0xfb1785e6, 0x78465a54, 0x4bdd8ccc } }; + + F8(x, Hash); + + x[0][0] ^= 0x80U; + x[3][3] ^= 0x00020000U; + + for (int i = 0; i < 42; i += 7) + { + RoundFunction0(x, i); + RoundFunction1(x, i + 1); + RoundFunction2(x, i + 2); + RoundFunction3(x, i + 3); + RoundFunction4(x, i + 4); + RoundFunction5(x, i + 5); + RoundFunction6(x, i + 6); + } + + Hash[7] = x[5][3]; + } +} + + +__host__ void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash) +{ + const uint32_t threadsperblock = 32; + + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + quark_jh512_gpu_hash_64<<>>(threads, startNounce, d_hash, d_nonceVector); +} + +// Setup-Funktionen +__host__ void quark_jh512_cpu_init(int thr_id, uint32_t threads) +{ +} + +__host__ void quark_jh512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t 
*d_nonceVector, uint32_t *d_hash) +{ + dim3 grid((threads + TPB2 - 1) / TPB2); + dim3 block(TPB2); + + quark_jh512_gpu_hash_64_final << > >(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); +} diff --git a/quark/cuda_jh512keccak512.cu b/quark/cuda_jh512keccak512.cu index 14489f390e..a0663d3e83 100644 --- a/quark/cuda_jh512keccak512.cu +++ b/quark/cuda_jh512keccak512.cu @@ -1,5 +1,5 @@ #include "cuda_helper.h" - +#include "cuda_vector.h" #ifdef _MSC_VER #define UINT2(x,y) { x, y } @@ -7,81 +7,131 @@ #define UINT2(x,y) (uint2) { x, y } #endif -/*42 round constants, each round constant is 32-byte (256-bit)*/ -__constant__ uint32_t c_INIT_bitslice[8][4] = { - { 0x964bd16f, 0x17aa003e, 0x052e6a63, 0x43d5157a }, - { 0x8d5e228a, 0x0bef970c, 0x591234e9, 0x61c3b3f2 }, - { 0xc1a01d89, 0x1e806f53, 0x6b05a92a, 0x806d2bea }, - { 0xdbcc8e58, 0xa6ba7520, 0x763a0fa9, 0xf73bf8ba }, - { 0x05e66901, 0x694ae341, 0x8e8ab546, 0x5ae66f2e }, - { 0xd0a74710, 0x243c84c1, 0xb1716e3b, 0x99c15a2d }, - { 0xecf657cf, 0x56f8b19d, 0x7c8806a7, 0x56b11657 }, - { 0xdffcc2e3, 0xfb1785e6, 0x78465a54, 0x4bdd8ccc } }; - -__constant__ unsigned char c_E8_bitslice_roundconstant[42][32] = { - { 0x72, 0xd5, 0xde, 0xa2, 0xdf, 0x15, 0xf8, 0x67, 0x7b, 0x84, 0x15, 0xa, 0xb7, 0x23, 0x15, 0x57, 0x81, 0xab, 0xd6, 0x90, 0x4d, 0x5a, 0x87, 0xf6, 0x4e, 0x9f, 0x4f, 0xc5, 0xc3, 0xd1, 0x2b, 0x40 }, - { 0xea, 0x98, 0x3a, 0xe0, 0x5c, 0x45, 0xfa, 0x9c, 0x3, 0xc5, 0xd2, 0x99, 0x66, 0xb2, 0x99, 0x9a, 0x66, 0x2, 0x96, 0xb4, 0xf2, 0xbb, 0x53, 0x8a, 0xb5, 0x56, 0x14, 0x1a, 0x88, 0xdb, 0xa2, 0x31 }, - { 0x3, 0xa3, 0x5a, 0x5c, 0x9a, 0x19, 0xe, 0xdb, 0x40, 0x3f, 0xb2, 0xa, 0x87, 0xc1, 0x44, 0x10, 0x1c, 0x5, 0x19, 0x80, 0x84, 0x9e, 0x95, 0x1d, 0x6f, 0x33, 0xeb, 0xad, 0x5e, 0xe7, 0xcd, 0xdc }, - { 0x10, 0xba, 0x13, 0x92, 0x2, 0xbf, 0x6b, 0x41, 0xdc, 0x78, 0x65, 0x15, 0xf7, 0xbb, 0x27, 0xd0, 0xa, 0x2c, 0x81, 0x39, 0x37, 0xaa, 0x78, 0x50, 0x3f, 0x1a, 0xbf, 0xd2, 0x41, 0x0, 0x91, 0xd3 }, - { 0x42, 0x2d, 0x5a, 0xd, 0xf6, 0xcc, 0x7e, 
0x90, 0xdd, 0x62, 0x9f, 0x9c, 0x92, 0xc0, 0x97, 0xce, 0x18, 0x5c, 0xa7, 0xb, 0xc7, 0x2b, 0x44, 0xac, 0xd1, 0xdf, 0x65, 0xd6, 0x63, 0xc6, 0xfc, 0x23 }, - { 0x97, 0x6e, 0x6c, 0x3, 0x9e, 0xe0, 0xb8, 0x1a, 0x21, 0x5, 0x45, 0x7e, 0x44, 0x6c, 0xec, 0xa8, 0xee, 0xf1, 0x3, 0xbb, 0x5d, 0x8e, 0x61, 0xfa, 0xfd, 0x96, 0x97, 0xb2, 0x94, 0x83, 0x81, 0x97 }, - { 0x4a, 0x8e, 0x85, 0x37, 0xdb, 0x3, 0x30, 0x2f, 0x2a, 0x67, 0x8d, 0x2d, 0xfb, 0x9f, 0x6a, 0x95, 0x8a, 0xfe, 0x73, 0x81, 0xf8, 0xb8, 0x69, 0x6c, 0x8a, 0xc7, 0x72, 0x46, 0xc0, 0x7f, 0x42, 0x14 }, - { 0xc5, 0xf4, 0x15, 0x8f, 0xbd, 0xc7, 0x5e, 0xc4, 0x75, 0x44, 0x6f, 0xa7, 0x8f, 0x11, 0xbb, 0x80, 0x52, 0xde, 0x75, 0xb7, 0xae, 0xe4, 0x88, 0xbc, 0x82, 0xb8, 0x0, 0x1e, 0x98, 0xa6, 0xa3, 0xf4 }, - { 0x8e, 0xf4, 0x8f, 0x33, 0xa9, 0xa3, 0x63, 0x15, 0xaa, 0x5f, 0x56, 0x24, 0xd5, 0xb7, 0xf9, 0x89, 0xb6, 0xf1, 0xed, 0x20, 0x7c, 0x5a, 0xe0, 0xfd, 0x36, 0xca, 0xe9, 0x5a, 0x6, 0x42, 0x2c, 0x36 }, - { 0xce, 0x29, 0x35, 0x43, 0x4e, 0xfe, 0x98, 0x3d, 0x53, 0x3a, 0xf9, 0x74, 0x73, 0x9a, 0x4b, 0xa7, 0xd0, 0xf5, 0x1f, 0x59, 0x6f, 0x4e, 0x81, 0x86, 0xe, 0x9d, 0xad, 0x81, 0xaf, 0xd8, 0x5a, 0x9f }, - { 0xa7, 0x5, 0x6, 0x67, 0xee, 0x34, 0x62, 0x6a, 0x8b, 0xb, 0x28, 0xbe, 0x6e, 0xb9, 0x17, 0x27, 0x47, 0x74, 0x7, 0x26, 0xc6, 0x80, 0x10, 0x3f, 0xe0, 0xa0, 0x7e, 0x6f, 0xc6, 0x7e, 0x48, 0x7b }, - { 0xd, 0x55, 0xa, 0xa5, 0x4a, 0xf8, 0xa4, 0xc0, 0x91, 0xe3, 0xe7, 0x9f, 0x97, 0x8e, 0xf1, 0x9e, 0x86, 0x76, 0x72, 0x81, 0x50, 0x60, 0x8d, 0xd4, 0x7e, 0x9e, 0x5a, 0x41, 0xf3, 0xe5, 0xb0, 0x62 }, - { 0xfc, 0x9f, 0x1f, 0xec, 0x40, 0x54, 0x20, 0x7a, 0xe3, 0xe4, 0x1a, 0x0, 0xce, 0xf4, 0xc9, 0x84, 0x4f, 0xd7, 0x94, 0xf5, 0x9d, 0xfa, 0x95, 0xd8, 0x55, 0x2e, 0x7e, 0x11, 0x24, 0xc3, 0x54, 0xa5 }, - { 0x5b, 0xdf, 0x72, 0x28, 0xbd, 0xfe, 0x6e, 0x28, 0x78, 0xf5, 0x7f, 0xe2, 0xf, 0xa5, 0xc4, 0xb2, 0x5, 0x89, 0x7c, 0xef, 0xee, 0x49, 0xd3, 0x2e, 0x44, 0x7e, 0x93, 0x85, 0xeb, 0x28, 0x59, 0x7f }, - { 0x70, 0x5f, 0x69, 0x37, 0xb3, 0x24, 0x31, 0x4a, 0x5e, 0x86, 0x28, 0xf1, 0x1d, 
0xd6, 0xe4, 0x65, 0xc7, 0x1b, 0x77, 0x4, 0x51, 0xb9, 0x20, 0xe7, 0x74, 0xfe, 0x43, 0xe8, 0x23, 0xd4, 0x87, 0x8a }, - { 0x7d, 0x29, 0xe8, 0xa3, 0x92, 0x76, 0x94, 0xf2, 0xdd, 0xcb, 0x7a, 0x9, 0x9b, 0x30, 0xd9, 0xc1, 0x1d, 0x1b, 0x30, 0xfb, 0x5b, 0xdc, 0x1b, 0xe0, 0xda, 0x24, 0x49, 0x4f, 0xf2, 0x9c, 0x82, 0xbf }, - { 0xa4, 0xe7, 0xba, 0x31, 0xb4, 0x70, 0xbf, 0xff, 0xd, 0x32, 0x44, 0x5, 0xde, 0xf8, 0xbc, 0x48, 0x3b, 0xae, 0xfc, 0x32, 0x53, 0xbb, 0xd3, 0x39, 0x45, 0x9f, 0xc3, 0xc1, 0xe0, 0x29, 0x8b, 0xa0 }, - { 0xe5, 0xc9, 0x5, 0xfd, 0xf7, 0xae, 0x9, 0xf, 0x94, 0x70, 0x34, 0x12, 0x42, 0x90, 0xf1, 0x34, 0xa2, 0x71, 0xb7, 0x1, 0xe3, 0x44, 0xed, 0x95, 0xe9, 0x3b, 0x8e, 0x36, 0x4f, 0x2f, 0x98, 0x4a }, - { 0x88, 0x40, 0x1d, 0x63, 0xa0, 0x6c, 0xf6, 0x15, 0x47, 0xc1, 0x44, 0x4b, 0x87, 0x52, 0xaf, 0xff, 0x7e, 0xbb, 0x4a, 0xf1, 0xe2, 0xa, 0xc6, 0x30, 0x46, 0x70, 0xb6, 0xc5, 0xcc, 0x6e, 0x8c, 0xe6 }, - { 0xa4, 0xd5, 0xa4, 0x56, 0xbd, 0x4f, 0xca, 0x0, 0xda, 0x9d, 0x84, 0x4b, 0xc8, 0x3e, 0x18, 0xae, 0x73, 0x57, 0xce, 0x45, 0x30, 0x64, 0xd1, 0xad, 0xe8, 0xa6, 0xce, 0x68, 0x14, 0x5c, 0x25, 0x67 }, - { 0xa3, 0xda, 0x8c, 0xf2, 0xcb, 0xe, 0xe1, 0x16, 0x33, 0xe9, 0x6, 0x58, 0x9a, 0x94, 0x99, 0x9a, 0x1f, 0x60, 0xb2, 0x20, 0xc2, 0x6f, 0x84, 0x7b, 0xd1, 0xce, 0xac, 0x7f, 0xa0, 0xd1, 0x85, 0x18 }, - { 0x32, 0x59, 0x5b, 0xa1, 0x8d, 0xdd, 0x19, 0xd3, 0x50, 0x9a, 0x1c, 0xc0, 0xaa, 0xa5, 0xb4, 0x46, 0x9f, 0x3d, 0x63, 0x67, 0xe4, 0x4, 0x6b, 0xba, 0xf6, 0xca, 0x19, 0xab, 0xb, 0x56, 0xee, 0x7e }, - { 0x1f, 0xb1, 0x79, 0xea, 0xa9, 0x28, 0x21, 0x74, 0xe9, 0xbd, 0xf7, 0x35, 0x3b, 0x36, 0x51, 0xee, 0x1d, 0x57, 0xac, 0x5a, 0x75, 0x50, 0xd3, 0x76, 0x3a, 0x46, 0xc2, 0xfe, 0xa3, 0x7d, 0x70, 0x1 }, - { 0xf7, 0x35, 0xc1, 0xaf, 0x98, 0xa4, 0xd8, 0x42, 0x78, 0xed, 0xec, 0x20, 0x9e, 0x6b, 0x67, 0x79, 0x41, 0x83, 0x63, 0x15, 0xea, 0x3a, 0xdb, 0xa8, 0xfa, 0xc3, 0x3b, 0x4d, 0x32, 0x83, 0x2c, 0x83 }, - { 0xa7, 0x40, 0x3b, 0x1f, 0x1c, 0x27, 0x47, 0xf3, 0x59, 0x40, 0xf0, 0x34, 0xb7, 0x2d, 0x76, 0x9a, 0xe7, 0x3e, 
0x4e, 0x6c, 0xd2, 0x21, 0x4f, 0xfd, 0xb8, 0xfd, 0x8d, 0x39, 0xdc, 0x57, 0x59, 0xef }, - { 0x8d, 0x9b, 0xc, 0x49, 0x2b, 0x49, 0xeb, 0xda, 0x5b, 0xa2, 0xd7, 0x49, 0x68, 0xf3, 0x70, 0xd, 0x7d, 0x3b, 0xae, 0xd0, 0x7a, 0x8d, 0x55, 0x84, 0xf5, 0xa5, 0xe9, 0xf0, 0xe4, 0xf8, 0x8e, 0x65 }, - { 0xa0, 0xb8, 0xa2, 0xf4, 0x36, 0x10, 0x3b, 0x53, 0xc, 0xa8, 0x7, 0x9e, 0x75, 0x3e, 0xec, 0x5a, 0x91, 0x68, 0x94, 0x92, 0x56, 0xe8, 0x88, 0x4f, 0x5b, 0xb0, 0x5c, 0x55, 0xf8, 0xba, 0xbc, 0x4c }, - { 0xe3, 0xbb, 0x3b, 0x99, 0xf3, 0x87, 0x94, 0x7b, 0x75, 0xda, 0xf4, 0xd6, 0x72, 0x6b, 0x1c, 0x5d, 0x64, 0xae, 0xac, 0x28, 0xdc, 0x34, 0xb3, 0x6d, 0x6c, 0x34, 0xa5, 0x50, 0xb8, 0x28, 0xdb, 0x71 }, - { 0xf8, 0x61, 0xe2, 0xf2, 0x10, 0x8d, 0x51, 0x2a, 0xe3, 0xdb, 0x64, 0x33, 0x59, 0xdd, 0x75, 0xfc, 0x1c, 0xac, 0xbc, 0xf1, 0x43, 0xce, 0x3f, 0xa2, 0x67, 0xbb, 0xd1, 0x3c, 0x2, 0xe8, 0x43, 0xb0 }, - { 0x33, 0xa, 0x5b, 0xca, 0x88, 0x29, 0xa1, 0x75, 0x7f, 0x34, 0x19, 0x4d, 0xb4, 0x16, 0x53, 0x5c, 0x92, 0x3b, 0x94, 0xc3, 0xe, 0x79, 0x4d, 0x1e, 0x79, 0x74, 0x75, 0xd7, 0xb6, 0xee, 0xaf, 0x3f }, - { 0xea, 0xa8, 0xd4, 0xf7, 0xbe, 0x1a, 0x39, 0x21, 0x5c, 0xf4, 0x7e, 0x9, 0x4c, 0x23, 0x27, 0x51, 0x26, 0xa3, 0x24, 0x53, 0xba, 0x32, 0x3c, 0xd2, 0x44, 0xa3, 0x17, 0x4a, 0x6d, 0xa6, 0xd5, 0xad }, - { 0xb5, 0x1d, 0x3e, 0xa6, 0xaf, 0xf2, 0xc9, 0x8, 0x83, 0x59, 0x3d, 0x98, 0x91, 0x6b, 0x3c, 0x56, 0x4c, 0xf8, 0x7c, 0xa1, 0x72, 0x86, 0x60, 0x4d, 0x46, 0xe2, 0x3e, 0xcc, 0x8, 0x6e, 0xc7, 0xf6 }, - { 0x2f, 0x98, 0x33, 0xb3, 0xb1, 0xbc, 0x76, 0x5e, 0x2b, 0xd6, 0x66, 0xa5, 0xef, 0xc4, 0xe6, 0x2a, 0x6, 0xf4, 0xb6, 0xe8, 0xbe, 0xc1, 0xd4, 0x36, 0x74, 0xee, 0x82, 0x15, 0xbc, 0xef, 0x21, 0x63 }, - { 0xfd, 0xc1, 0x4e, 0xd, 0xf4, 0x53, 0xc9, 0x69, 0xa7, 0x7d, 0x5a, 0xc4, 0x6, 0x58, 0x58, 0x26, 0x7e, 0xc1, 0x14, 0x16, 0x6, 0xe0, 0xfa, 0x16, 0x7e, 0x90, 0xaf, 0x3d, 0x28, 0x63, 0x9d, 0x3f }, - { 0xd2, 0xc9, 0xf2, 0xe3, 0x0, 0x9b, 0xd2, 0xc, 0x5f, 0xaa, 0xce, 0x30, 0xb7, 0xd4, 0xc, 0x30, 0x74, 0x2a, 0x51, 0x16, 0xf2, 0xe0, 0x32, 0x98, 
0xd, 0xeb, 0x30, 0xd8, 0xe3, 0xce, 0xf8, 0x9a }, - { 0x4b, 0xc5, 0x9e, 0x7b, 0xb5, 0xf1, 0x79, 0x92, 0xff, 0x51, 0xe6, 0x6e, 0x4, 0x86, 0x68, 0xd3, 0x9b, 0x23, 0x4d, 0x57, 0xe6, 0x96, 0x67, 0x31, 0xcc, 0xe6, 0xa6, 0xf3, 0x17, 0xa, 0x75, 0x5 }, - { 0xb1, 0x76, 0x81, 0xd9, 0x13, 0x32, 0x6c, 0xce, 0x3c, 0x17, 0x52, 0x84, 0xf8, 0x5, 0xa2, 0x62, 0xf4, 0x2b, 0xcb, 0xb3, 0x78, 0x47, 0x15, 0x47, 0xff, 0x46, 0x54, 0x82, 0x23, 0x93, 0x6a, 0x48 }, - { 0x38, 0xdf, 0x58, 0x7, 0x4e, 0x5e, 0x65, 0x65, 0xf2, 0xfc, 0x7c, 0x89, 0xfc, 0x86, 0x50, 0x8e, 0x31, 0x70, 0x2e, 0x44, 0xd0, 0xb, 0xca, 0x86, 0xf0, 0x40, 0x9, 0xa2, 0x30, 0x78, 0x47, 0x4e }, - { 0x65, 0xa0, 0xee, 0x39, 0xd1, 0xf7, 0x38, 0x83, 0xf7, 0x5e, 0xe9, 0x37, 0xe4, 0x2c, 0x3a, 0xbd, 0x21, 0x97, 0xb2, 0x26, 0x1, 0x13, 0xf8, 0x6f, 0xa3, 0x44, 0xed, 0xd1, 0xef, 0x9f, 0xde, 0xe7 }, - { 0x8b, 0xa0, 0xdf, 0x15, 0x76, 0x25, 0x92, 0xd9, 0x3c, 0x85, 0xf7, 0xf6, 0x12, 0xdc, 0x42, 0xbe, 0xd8, 0xa7, 0xec, 0x7c, 0xab, 0x27, 0xb0, 0x7e, 0x53, 0x8d, 0x7d, 0xda, 0xaa, 0x3e, 0xa8, 0xde }, - { 0xaa, 0x25, 0xce, 0x93, 0xbd, 0x2, 0x69, 0xd8, 0x5a, 0xf6, 0x43, 0xfd, 0x1a, 0x73, 0x8, 0xf9, 0xc0, 0x5f, 0xef, 0xda, 0x17, 0x4a, 0x19, 0xa5, 0x97, 0x4d, 0x66, 0x33, 0x4c, 0xfd, 0x21, 0x6a }, - { 0x35, 0xb4, 0x98, 0x31, 0xdb, 0x41, 0x15, 0x70, 0xea, 0x1e, 0xf, 0xbb, 0xed, 0xcd, 0x54, 0x9b, 0x9a, 0xd0, 0x63, 0xa1, 0x51, 0x97, 0x40, 0x72, 0xf6, 0x75, 0x9d, 0xbf, 0x91, 0x47, 0x6f, 0xe2 } }; - -#define SWAP4(x,y)\ - y = (x & 0xf0f0f0f0UL); \ - x = (x ^ y); \ - y = (y >> 4); \ - x = (x << 4); \ - x= x | y; - -#define SWAP2(x,y)\ - y = (x & 0xccccccccUL); \ - x = (x ^ y); \ - y = (y >> 2); \ - x = (x << 2); \ - x= x | y; - -#define SWAP1(x,y)\ - y = (x & 0xaaaaaaaaUL); \ - x = (x ^ y); \ - y = (y >> 1); \ - x = x + x; \ - x= x | y; +__constant__ static __align__(16) uint32_t c_E8_bslice32[42][8] = { + // Round 0 (Function0) + { 0xa2ded572, 0x90d6ab81, 0x67f815df, 0xf6875a4d, 0x0a15847b, 0xc54f9f4e, 0x571523b7, 0x402bd1c3 }, + { 0xe03a98ea, 0xb4960266, 
0x9cfa455c, 0x8a53bbf2, 0x99d2c503, 0x1a1456b5, 0x9a99b266, 0x31a2db88 }, // 1 + { 0x5c5aa303, 0x8019051c, 0xdb0e199a, 0x1d959e84, 0x0ab23f40, 0xadeb336f, 0x1044c187, 0xdccde75e }, // 2 + { 0x9213ba10, 0x39812c0a, 0x416bbf02, 0x5078aa37, 0x156578dc, 0xd2bf1a3f, 0xd027bbf7, 0xd3910041 }, // 3 + { 0x0d5a2d42, 0x0ba75c18, 0x907eccf6, 0xac442bc7, 0x9c9f62dd, 0xd665dfd1, 0xce97c092, 0x23fcc663 }, // 4 + { 0x036c6e97, 0xbb03f1ee, 0x1ab8e09e, 0xfa618e5d, 0x7e450521, 0xb29796fd, 0xa8ec6c44, 0x97818394 }, // 5 + { 0x37858e4a, 0x8173fe8a, 0x2f3003db, 0x6c69b8f8, 0x2d8d672a, 0x4672c78a, 0x956a9ffb, 0x14427fc0 }, // 6 + // Round 7 (Function0) + { 0x8f15f4c5, 0xb775de52, 0xc45ec7bd, 0xbc88e4ae, 0xa76f4475, 0x1e00b882, 0x80bb118f, 0xf4a3a698 }, + { 0x338ff48e, 0x20edf1b6, 0x1563a3a9, 0xfde05a7c, 0x24565faa, 0x5ae9ca36, 0x89f9b7d5, 0x362c4206 }, + { 0x433529ce, 0x591ff5d0, 0x3d98fe4e, 0x86814e6f, 0x74f93a53, 0x81ad9d0e, 0xa74b9a73, 0x9f5ad8af }, + { 0x670605a7, 0x26077447, 0x6a6234ee, 0x3f1080c6, 0xbe280b8b, 0x6f7ea0e0, 0x2717b96e, 0x7b487ec6 }, + { 0xa50a550d, 0x81727686, 0xc0a4f84a, 0xd48d6050, 0x9fe7e391, 0x415a9e7e, 0x9ef18e97, 0x62b0e5f3 }, + { 0xec1f9ffc, 0xf594d74f, 0x7a205440, 0xd895fa9d, 0x001ae4e3, 0x117e2e55, 0x84c9f4ce, 0xa554c324 }, + { 0x2872df5b, 0xef7c8905, 0x286efebd, 0x2ed349ee, 0xe27ff578, 0x85937e44, 0xb2c4a50f, 0x7f5928eb }, + // Round 14 (Function0) + { 0x37695f70, 0x04771bc7, 0x4a3124b3, 0xe720b951, 0xf128865e, 0xe843fe74, 0x65e4d61d, 0x8a87d423 }, + { 0xa3e8297d, 0xfb301b1d, 0xf2947692, 0xe01bdc5b, 0x097acbdd, 0x4f4924da, 0xc1d9309b, 0xbf829cf2 }, + { 0x31bae7a4, 0x32fcae3b, 0xffbf70b4, 0x39d3bb53, 0x0544320d, 0xc1c39f45, 0x48bcf8de, 0xa08b29e0 }, + { 0xfd05c9e5, 0x01b771a2, 0x0f09aef7, 0x95ed44e3, 0x12347094, 0x368e3be9, 0x34f19042, 0x4a982f4f }, + { 0x631d4088, 0xf14abb7e, 0x15f66ca0, 0x30c60ae2, 0x4b44c147, 0xc5b67046, 0xffaf5287, 0xe68c6ecc }, + { 0x56a4d5a4, 0x45ce5773, 0x00ca4fbd, 0xadd16430, 0x4b849dda, 0x68cea6e8, 0xae183ec8, 0x67255c14 }, + { 
0xf28cdaa3, 0x20b2601f, 0x16e10ecb, 0x7b846fc2, 0x5806e933, 0x7facced1, 0x9a99949a, 0x1885d1a0 }, + // Round 21 (Function0) + { 0xa15b5932, 0x67633d9f, 0xd319dd8d, 0xba6b04e4, 0xc01c9a50, 0xab19caf6, 0x46b4a5aa, 0x7eee560b }, + { 0xea79b11f, 0x5aac571d, 0x742128a9, 0x76d35075, 0x35f7bde9, 0xfec2463a, 0xee51363b, 0x01707da3 }, + { 0xafc135f7, 0x15638341, 0x42d8a498, 0xa8db3aea, 0x20eced78, 0x4d3bc3fa, 0x79676b9e, 0x832c8332 }, + { 0x1f3b40a7, 0x6c4e3ee7, 0xf347271c, 0xfd4f21d2, 0x34f04059, 0x398dfdb8, 0x9a762db7, 0xef5957dc }, + { 0x490c9b8d, 0xd0ae3b7d, 0xdaeb492b, 0x84558d7a, 0x49d7a25b, 0xf0e9a5f5, 0x0d70f368, 0x658ef8e4 }, + { 0xf4a2b8a0, 0x92946891, 0x533b1036, 0x4f88e856, 0x9e07a80c, 0x555cb05b, 0x5aec3e75, 0x4cbcbaf8 }, + { 0x993bbbe3, 0x28acae64, 0x7b9487f3, 0x6db334dc, 0xd6f4da75, 0x50a5346c, 0x5d1c6b72, 0x71db28b8 }, + // Round 28 (Function0) + { 0xf2e261f8, 0xf1bcac1c, 0x2a518d10, 0xa23fce43, 0x3364dbe3, 0x3cd1bb67, 0xfc75dd59, 0xb043e802 }, + { 0xca5b0a33, 0xc3943b92, 0x75a12988, 0x1e4d790e, 0x4d19347f, 0xd7757479, 0x5c5316b4, 0x3fafeeb6 }, + { 0xf7d4a8ea, 0x5324a326, 0x21391abe, 0xd23c32ba, 0x097ef45c, 0x4a17a344, 0x5127234c, 0xadd5a66d }, + { 0xa63e1db5, 0xa17cf84c, 0x08c9f2af, 0x4d608672, 0x983d5983, 0xcc3ee246, 0x563c6b91, 0xf6c76e08 }, + { 0xb333982f, 0xe8b6f406, 0x5e76bcb1, 0x36d4c1be, 0xa566d62b, 0x1582ee74, 0x2ae6c4ef, 0x6321efbc }, + { 0x0d4ec1fd, 0x1614c17e, 0x69c953f4, 0x16fae006, 0xc45a7da7, 0x3daf907e, 0x26585806, 0x3f9d6328 }, + { 0xe3f2c9d2, 0x16512a74, 0x0cd29b00, 0x9832e0f2, 0x30ceaa5f, 0xd830eb0d, 0x300cd4b7, 0x9af8cee3 }, + // Round 35 (Function0) + { 0x7b9ec54b, 0x574d239b, 0x9279f1b5, 0x316796e6, 0x6ee651ff, 0xf3a6e6cc, 0xd3688604, 0x05750a17 }, + { 0xd98176b1, 0xb3cb2bf4, 0xce6c3213, 0x47154778, 0x8452173c, 0x825446ff, 0x62a205f8, 0x486a9323 }, + { 0x0758df38, 0x442e7031, 0x65655e4e, 0x86ca0bd0, 0x897cfcf2, 0xa20940f0, 0x8e5086fc, 0x4e477830 }, + { 0x39eea065, 0x26b29721, 0x8338f7d1, 0x6ff81301, 0x37e95ef7, 0xd1ed44a3, 0xbd3a2ce4, 
0xe7de9fef }, + { 0x15dfa08b, 0x7ceca7d8, 0xd9922576, 0x7eb027ab, 0xf6f7853c, 0xda7d8d53, 0xbe42dc12, 0xdea83eaa }, + { 0x93ce25aa, 0xdaef5fc0, 0xd86902bd, 0xa5194a17, 0xfd43f65a, 0x33664d97, 0xf908731a, 0x6a21fd4c }, + { 0x3198b435, 0xa163d09a, 0x701541db, 0x72409751, 0xbb0f1eea, 0xbf9d75f6, 0x9b54cded, 0xe26f4791 } + // 42 rounds... +}; + +#ifndef NOASM +__device__ __forceinline__ +static void SWAP4(uint32_t *x) +{ +#pragma nounroll + // y is used as tmp register too + for(uint32_t y = 0; y<4; y++, ++x) + { + asm("and.b32 %1, %0, 0xF0F0F0F0;" + "xor.b32 %0, %0, %1;" + "shr.b32 %1, %1, 4;" + "vshl.u32.u32.u32.clamp.add %0, %0, 4, %1;\n\t" + : "+r"(*x) : "r"(y)); + } +} + +__device__ __forceinline__ +static void SWAP2(uint32_t *x) +{ +#pragma nounroll + // y is used as tmp register too + for(uint32_t y = 0; y<4; y++, ++x) + { + asm("and.b32 %1, %0, 0xCCCCCCCC;" + "xor.b32 %0, %0, %1;" + "shr.b32 %1, %1, 2;" + "vshl.u32.u32.u32.clamp.add %0, %0, 2, %1;\n\t" + : "+r"(*x) : "r"(y)); + } +} + +__device__ __forceinline__ +static void SWAP1(uint32_t *x) +{ +#pragma nounroll + // y is used as tmp register too + for(uint32_t y = 0; y<4; y++, ++x) + { + asm("and.b32 %1, %0, 0xAAAAAAAA;" + "xor.b32 %0, %0, %1;" + "shr.b32 %1, %1, 1;" + "vshl.u32.u32.u32.clamp.add %0, %0, 1, %1;\n\t" + : "+r"(*x) : "r"(y)); + } +} +#else +__device__ __forceinline__ +static void SWAP4(uint32_t *x) +{ + x[0] = ((x[0] & 0x0f0f0f0fu) << 4) | (x[0] & 0xF0F0F0F0u) >> 4; + x[1] = ((x[1] & 0x0f0f0f0fu) << 4) | (x[1] & 0xF0F0F0F0u) >> 4; + x[2] = ((x[2] & 0x0f0f0f0fu) << 4) | (x[2] & 0xF0F0F0F0u) >> 4; + x[3] = ((x[3] & 0x0f0f0f0fu) << 4) | (x[3] & 0xF0F0F0F0u) >> 4; +} +__device__ __forceinline__ +static void SWAP2(uint32_t *x) +{ + x[0] = ((x[0] & 0x33333333u) << 2) | (x[0] & 0xCCCCCCCCu) >> 2; + x[1] = ((x[1] & 0x33333333u) << 2) | (x[1] & 0xCCCCCCCCu) >> 2; + x[2] = ((x[2] & 0x33333333u) << 2) | (x[2] & 0xCCCCCCCCu) >> 2; + x[3] = ((x[3] & 0x33333333u) << 2) | (x[3] & 0xCCCCCCCCu) >> 2; +} 
+__device__ __forceinline__ +static void SWAP1(uint32_t *x) +{ + x[0] = ((x[0] & 0x55555555u) << 1) | (x[0] & 0xAAAAAAAAu) >> 1; + x[1] = ((x[1] & 0x55555555u) << 1) | (x[1] & 0xAAAAAAAAu) >> 1; + x[2] = ((x[2] & 0x55555555u) << 1) | (x[2] & 0xAAAAAAAAu) >> 1; + x[3] = ((x[3] & 0x55555555u) << 1) | (x[3] & 0xAAAAAAAAu) >> 1; +} + +#endif + /*swapping bits 16i||16i+1||......||16i+7 with bits 16i+8||16i+9||......||16i+15 of 32-bit x*/ //#define SWAP8(x) (x) = ((((x) & 0x00ff00ffUL) << 8) | (((x) & 0xff00ff00UL) >> 8)); #define SWAP8(x) (x) = __byte_perm(x, x, 0x2301); @@ -114,19 +164,20 @@ __constant__ unsigned char c_E8_bitslice_roundconstant[42][32] = { m1 ^= (temp0 & (m0)); \ m2 ^= temp0; -static __device__ __forceinline__ void Sbox_and_MDS_layer(uint32_t x[8][4], uint32_t roundnumber) +__device__ __forceinline__ +static void Sbox_and_MDS_layer(uint32_t x[8][4], const int rnd) { - uint32_t temp0; - uint32_t cc0, cc1; - //Sbox and MDS layer -#pragma unroll 4 - for (int i = 0; i < 4; i++) { - cc0 = ((uint32_t*)c_E8_bitslice_roundconstant[roundnumber])[i]; - cc1 = ((uint32_t*)c_E8_bitslice_roundconstant[roundnumber])[i+4]; - Sbox(x[0][i],x[2][i], x[4][i], x[6][i], cc0); - Sbox(x[1][i],x[3][i], x[5][i], x[7][i], cc1); - L(x[0][i],x[2][i],x[4][i],x[6][i],x[1][i],x[3][i],x[5][i],x[7][i]); - } + uint2* cc = (uint2*)&c_E8_bslice32[rnd]; + + //Sbox and MDS layer +#pragma unroll + for(int i = 0; i < 4; i++, ++cc) + { + uint32_t temp0; + Sbox(x[0][i], x[2][i], x[4][i], x[6][i], cc->x); + Sbox(x[1][i], x[3][i], x[5][i], x[7][i], cc->y); + L(x[0][i], x[2][i], x[4][i], x[6][i], x[1][i], x[3][i], x[5][i], x[7][i]); + } } static __device__ __forceinline__ void RoundFunction0(uint32_t x[8][4], uint32_t roundnumber) @@ -134,13 +185,9 @@ static __device__ __forceinline__ void RoundFunction0(uint32_t x[8][4], uint32_t Sbox_and_MDS_layer(x, roundnumber); #pragma unroll 4 - for (int j = 1; j < 8; j = j+2) + for(int j = 1; j < 8; j = j + 2) { - uint32_t y; - SWAP1(x[j][0], y); - 
SWAP1(x[j][1], y); - SWAP1(x[j][2], y); - SWAP1(x[j][3], y); + SWAP1(x[j]); } } @@ -149,13 +196,9 @@ static __device__ __forceinline__ void RoundFunction1(uint32_t x[8][4], uint32_t Sbox_and_MDS_layer(x, roundnumber); #pragma unroll 4 - for (int j = 1; j < 8; j = j+2) + for(int j = 1; j < 8; j = j + 2) { - uint32_t y; - SWAP2(x[j][0], y); - SWAP2(x[j][1], y); - SWAP2(x[j][2], y); - SWAP2(x[j][3], y); + SWAP2(x[j]); } } @@ -164,13 +207,9 @@ static __device__ __forceinline__ void RoundFunction2(uint32_t x[8][4], uint32_t Sbox_and_MDS_layer(x, roundnumber); #pragma unroll 4 - for (int j = 1; j < 8; j = j+2) + for(int j = 1; j < 8; j = j + 2) { - uint32_t y; - SWAP4(x[j][0], y); - SWAP4(x[j][1], y); - SWAP4(x[j][2], y); - SWAP4(x[j][3], y); + SWAP4(x[j]); } } @@ -179,10 +218,10 @@ static __device__ __forceinline__ void RoundFunction3(uint32_t x[8][4], uint32_t Sbox_and_MDS_layer(x, roundnumber); #pragma unroll 4 - for (int j = 1; j < 8; j = j+2) + for(int j = 1; j < 8; j = j + 2) { #pragma unroll 4 - for (int i = 0; i < 4; i++) SWAP8(x[j][i]); + for(int i = 0; i < 4; i++) SWAP8(x[j][i]); } } @@ -191,10 +230,10 @@ static __device__ __forceinline__ void RoundFunction4(uint32_t x[8][4], uint32_t Sbox_and_MDS_layer(x, roundnumber); #pragma unroll 4 - for (int j = 1; j < 8; j = j+2) + for(int j = 1; j < 8; j = j + 2) { #pragma unroll 4 - for (int i = 0; i < 4; i++) SWAP16(x[j][i]); + for(int i = 0; i < 4; i++) SWAP16(x[j][i]); } } @@ -205,11 +244,12 @@ static __device__ __forceinline__ void RoundFunction5(uint32_t x[8][4], uint32_t Sbox_and_MDS_layer(x, roundnumber); #pragma unroll 4 - for (int j = 1; j < 8; j = j+2) + for(int j = 1; j < 8; j = j + 2) { #pragma unroll 2 - for (int i = 0; i < 4; i = i+2) { - temp0 = x[j][i]; x[j][i] = x[j][i+1]; x[j][i+1] = temp0; + for(int i = 0; i < 4; i = i + 2) + { + temp0 = x[j][i]; x[j][i] = x[j][i + 1]; x[j][i + 1] = temp0; } } } @@ -221,11 +261,12 @@ static __device__ __forceinline__ void RoundFunction6(uint32_t x[8][4], uint32_t 
Sbox_and_MDS_layer(x, roundnumber); #pragma unroll 4 - for (int j = 1; j < 8; j = j+2) + for(int j = 1; j < 8; j = j + 2) { #pragma unroll 2 - for (int i = 0; i < 2; i++) { - temp0 = x[j][i]; x[j][i] = x[j][i+2]; x[j][i+2] = temp0; + for(int i = 0; i < 2; i++) + { + temp0 = x[j][i]; x[j][i] = x[j][i + 2]; x[j][i + 2] = temp0; } } } @@ -233,9 +274,9 @@ static __device__ __forceinline__ void RoundFunction6(uint32_t x[8][4], uint32_t /*The bijective function E8, in bitslice form */ static __device__ __forceinline__ void E8(uint32_t x[8][4]) { - /*perform 6 rounds*/ -//#pragma unroll 6 - for (int i = 0; i < 42; i+=7) + /*perform 6 rounds*/ +#pragma unroll 1 + for(int i = 0; i < 42; i += 7) { RoundFunction0(x, i); RoundFunction1(x, i + 1); @@ -252,221 +293,213 @@ static __device__ __forceinline__ void E8(uint32_t x[8][4]) #define U64TO32_LE(p, v) \ *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32); - + __constant__ uint2 c_keccak_round_constants[24] = { - { 0x00000001ul, 0x00000000 }, { 0x00008082ul, 0x00000000 }, - { 0x0000808aul, 0x80000000 }, { 0x80008000ul, 0x80000000 }, - { 0x0000808bul, 0x00000000 }, { 0x80000001ul, 0x00000000 }, - { 0x80008081ul, 0x80000000 }, { 0x00008009ul, 0x80000000 }, - { 0x0000008aul, 0x00000000 }, { 0x00000088ul, 0x00000000 }, - { 0x80008009ul, 0x00000000 }, { 0x8000000aul, 0x00000000 }, - { 0x8000808bul, 0x00000000 }, { 0x0000008bul, 0x80000000 }, - { 0x00008089ul, 0x80000000 }, { 0x00008003ul, 0x80000000 }, - { 0x00008002ul, 0x80000000 }, { 0x00000080ul, 0x80000000 }, - { 0x0000800aul, 0x00000000 }, { 0x8000000aul, 0x80000000 }, - { 0x80008081ul, 0x80000000 }, { 0x00008080ul, 0x80000000 }, - { 0x80000001ul, 0x00000000 }, { 0x80008008ul, 0x80000000 } + { 0x00000001ul, 0x00000000 }, { 0x00008082ul, 0x00000000 }, + { 0x0000808aul, 0x80000000 }, { 0x80008000ul, 0x80000000 }, + { 0x0000808bul, 0x00000000 }, { 0x80000001ul, 0x00000000 }, + { 0x80008081ul, 0x80000000 }, { 0x00008009ul, 0x80000000 }, + { 0x0000008aul, 0x00000000 }, { 
0x00000088ul, 0x00000000 }, + { 0x80008009ul, 0x00000000 }, { 0x8000000aul, 0x00000000 }, + { 0x8000808bul, 0x00000000 }, { 0x0000008bul, 0x80000000 }, + { 0x00008089ul, 0x80000000 }, { 0x00008003ul, 0x80000000 }, + { 0x00008002ul, 0x80000000 }, { 0x00000080ul, 0x80000000 }, + { 0x0000800aul, 0x00000000 }, { 0x8000000aul, 0x80000000 }, + { 0x80008081ul, 0x80000000 }, { 0x00008080ul, 0x80000000 }, + { 0x80000001ul, 0x00000000 }, { 0x80008008ul, 0x80000000 } }; -static __device__ __forceinline__ void -keccak_block(uint2 *s) { - int i; - uint2 t[5], u[5], v, w; - - /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ - t[0] = s[0] ^ s[5]; - t[1] = s[1] ^ s[6]; - t[2] = s[2] ^ s[7]; - t[3] = s[3] ^ s[8]; - t[4] = s[4]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROL2(t[1], 1); - u[1] = t[0] ^ ROL2(t[2], 1); - u[2] = t[1] ^ ROL2(t[3], 1); - u[3] = t[2] ^ ROL2(t[4], 1); - u[4] = t[3] ^ ROL2(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] = u[0]; s[15] = u[0]; s[20] = u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] = u[1]; s[16] = u[1]; s[21] = u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] = u[2]; s[17] = u[2]; s[22] = u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] = u[3]; s[18] = u[3]; s[23] = u[3]; - s[4] ^= u[4]; s[9] = u[4]; s[14] = u[4]; s[19] = u[4]; s[24] = u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) 
*/ - v = s[1]; - s[1] = ROL2(s[6], 44); - s[6] = ROL2(u[4], 20); - s[9] = ROL2(u[2], 61); - s[22] = ROL2(u[4], 39); - s[14] = ROL2(u[0], 18); - s[20] = ROL2(s[2], 62); - s[2] = ROL2(u[2], 43); - s[12] = ROL2(u[3], 25); - s[13] = ROL2(u[4], 8); - s[19] = ROL2(u[3], 56); - s[23] = ROL2(u[0], 41); - s[15] = ROL2(s[4], 27); - s[4] = ROL2(u[4], 14); - s[24] = ROL2(u[1], 2); - s[21] = ROL2(s[8], 55); - s[8] = ROL2(u[1], 45); - s[16] = ROL2(s[5], 36); - s[5] = ROL2(s[3], 28); - s[3] = ROL2(u[3], 21); - s[18] = ROL2(u[2], 15); - s[17] = ROL2(u[1], 10); - s[11] = ROL2(s[7], 6); - s[7] = ROL2(u[0], 3); - s[10] = ROL2(v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; - v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] = s[0]^1;//vectorize(c_keccak_round_constants[0]); - - for (i = 1; i < 24; i++) { - /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROL2(t[1], 1); - u[1] = t[0] ^ ROL2(t[2], 1); - u[2] = t[1] ^ ROL2(t[3], 1); - u[3] = t[2] ^ ROL2(t[4], 1); - u[4] = t[3] ^ ROL2(t[0], 1); - - /* theta: a[0,i], a[1,i], .. 
a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[1]; - s[1] = ROL2(s[6], 44); - s[6] = ROL2(s[9], 20); - s[9] = ROL2(s[22], 61); - s[22] = ROL2(s[14], 39); - s[14] = ROL2(s[20], 18); - s[20] = ROL2(s[2], 62); - s[2] = ROL2(s[12], 43); - s[12] = ROL2(s[13], 25); - s[13] = ROL2(s[19], 8); - s[19] = ROL2(s[23], 56); - s[23] = ROL2(s[15], 41); - s[15] = ROL2(s[4], 27); - s[4] = ROL2(s[24], 14); - s[24] = ROL2(s[21], 2); - s[21] = ROL2(s[8], 55); - s[8] = ROL2(s[16], 45); - s[16] = ROL2(s[5], 36); - s[5] = ROL2(s[3], 28); - s[3] = ROL2(s[18], 21); - s[18] = ROL2(s[17], 15); - s[17] = ROL2(s[11], 10); - s[11] = ROL2(s[7], 6); - s[7] = ROL2(s[10], 3); - s[10] = ROL2(v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; - v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] ^= c_keccak_round_constants[i]; - } -} +#define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a)))) -__global__ __launch_bounds__(256,3) -void 
quark_jh512Keccak512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector) +__global__ __launch_bounds__(256, 3) +void quark_jh512Keccak512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_hash) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if(thread < threads) + { + const uint32_t nounce = (startNounce + thread); - int hashPosition = nounce - startNounce; - uint32_t *Hash = &g_hash[16 * hashPosition]; + const uint32_t hashPosition = nounce - startNounce; + uint32_t *Hash = &g_hash[16 * hashPosition]; uint32_t x[8][4] = { - { 0x964bd16f, 0x17aa003e, 0x052e6a63, 0x43d5157a }, - { 0x8d5e228a, 0x0bef970c, 0x591234e9, 0x61c3b3f2 }, - { 0xc1a01d89, 0x1e806f53, 0x6b05a92a, 0x806d2bea }, - { 0xdbcc8e58, 0xa6ba7520, 0x763a0fa9, 0xf73bf8ba }, - { 0x05e66901, 0x694ae341, 0x8e8ab546, 0x5ae66f2e }, - { 0xd0a74710, 0x243c84c1, 0xb1716e3b, 0x99c15a2d }, - { 0xecf657cf, 0x56f8b19d, 0x7c8806a7, 0x56b11657 }, - { 0xdffcc2e3, 0xfb1785e6, 0x78465a54, 0x4bdd8ccc } }; + { 0x964bd16f, 0x17aa003e, 0x052e6a63, 0x43d5157a }, + { 0x8d5e228a, 0x0bef970c, 0x591234e9, 0x61c3b3f2 }, + { 0xc1a01d89, 0x1e806f53, 0x6b05a92a, 0x806d2bea }, + { 0xdbcc8e58, 0xa6ba7520, 0x763a0fa9, 0xf73bf8ba }, + { 0x05e66901, 0x694ae341, 0x8e8ab546, 0x5ae66f2e }, + { 0xd0a74710, 0x243c84c1, 0xb1716e3b, 0x99c15a2d }, + { 0xecf657cf, 0x56f8b19d, 0x7c8806a7, 0x56b11657 }, + { 0xdffcc2e3, 0xfb1785e6, 0x78465a54, 0x4bdd8ccc } }; + + + uint32_t msg[16]; + + uint28 *phash = (uint28*)Hash; + uint28 *outpt = (uint28*)msg; + outpt[0] = phash[0]; + outpt[1] = phash[1]; #pragma unroll 16 - for (int i = 0; i < 16; i++) x[i >> 2][i & 3] ^= ((uint32_t*)Hash)[i]; + for (int i = 0; i < 16; i++) x[i >> 2][i & 3] ^= (msg)[i]; E8(x); #pragma unroll 16 - for (int i = 0; i < 
16; i++) x[(16 + i) >> 2][(16 + i) & 3] ^= ((uint32_t*)Hash)[i]; + for (int i = 0; i < 16; i++) x[(16 + i) >> 2][(16 + i) & 3] ^= (msg)[i]; - x[0 >> 2][0 & 3] ^= 0x80; - x[15 >> 2][15 & 3] ^= 0x00020000; + x[0][0] ^= 0x80; + x[3][3] ^= 0x00020000; E8(x); - x[(16 + 0) >> 2][(16 + 0) & 3] ^= 0x80; - x[(16 + 15) >> 2][(16 + 15) & 3] ^= 0x00020000; - - uint2 keccak_gpu_state[25]; - - keccak_gpu_state[0].x = x[4][0]; - keccak_gpu_state[0].y = x[4][1]; - keccak_gpu_state[1].x = x[4][2]; - keccak_gpu_state[1].y = x[4][3]; - keccak_gpu_state[2].x = x[5][0]; - keccak_gpu_state[2].y = x[5][1]; - keccak_gpu_state[3].x = x[5][2]; - keccak_gpu_state[3].y = x[5][3]; - keccak_gpu_state[4].x = x[6][0]; - keccak_gpu_state[4].y = x[6][1]; - keccak_gpu_state[5].x = x[6][2]; - keccak_gpu_state[5].y = x[6][3]; - keccak_gpu_state[6].x = x[7][0]; - keccak_gpu_state[6].y = x[7][1]; - keccak_gpu_state[7].x = x[7][2]; - keccak_gpu_state[7].y = x[7][3]; - keccak_gpu_state[8] = make_uint2(0x00000001, 0x80000000); -#pragma unroll - for (int i = 9; i<25; i++) + x[4][0] ^= 0x80; + x[7][3] ^= 0x00020000; + + uint2 s[25] = + { + { x[4][0], x[4][1] }, { x[4][2], x[4][3] }, { x[5][0], x[5][1] }, { x[5][2], x[5][3] }, + { x[6][0], x[6][1] }, { x[6][2], x[6][3] }, { x[7][0], x[7][1] }, { x[7][2], x[7][3] }, + { 1, 0x80000000 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 } + }; + uint2 bc[5], tmpxor[5], tmp1, tmp2; + + tmpxor[0] = s[0] ^ s[5]; + tmpxor[1] = s[1] ^ s[6]; + tmpxor[2] = s[2] ^ s[7]; + tmpxor[3] = s[3] ^ s[8]; + tmpxor[4] = s[4]; + + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); + + tmp1 = s[1] ^ bc[0]; + + s[0] = s[0] ^ bc[4]; + s[1] = ROL2(s[6] ^ bc[0], 44); + s[6] = ROL2(bc[3], 20); + s[9] = ROL2(bc[1], 61); + 
s[22] = ROL2(bc[3], 39); + s[14] = ROL2(bc[4], 18); + s[20] = ROL2(s[2] ^ bc[1], 62); + s[2] = ROL2(bc[1], 43); + s[12] = ROL2(bc[2], 25); + s[13] = ROL8(bc[3]); + s[19] = ROR8(bc[2]); + s[23] = ROL2(bc[4], 41); + s[15] = ROL2(s[4] ^ bc[3], 27); + s[4] = ROL2(bc[3], 14); + s[24] = ROL2(bc[0], 2); + s[21] = ROL2(s[8] ^ bc[2], 55); + s[8] = ROL2(bc[0], 45); + s[16] = ROL2(s[5] ^ bc[4], 36); + s[5] = ROL2(s[3] ^ bc[2], 28); + s[3] = ROL2(bc[2], 21); + s[18] = ROL2(bc[1], 15); + s[17] = ROL2(bc[0], 10); + s[11] = ROL2(s[7] ^ bc[1], 6); + s[7] = ROL2(bc[4], 3); + s[10] = ROL2(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0].x ^= 1; +#pragma unroll 1 + for(int i = 1; i < 23; ++i) { - keccak_gpu_state[i] = make_uint2(0, 0); - } - 
keccak_block(keccak_gpu_state); - uint64_t *outputhash = (uint64_t *)Hash; -#pragma unroll 16 - for (int i = 0; i<8; i++) - outputhash[i] = devectorize(keccak_gpu_state[i]); +#pragma unroll + for(int x = 0; x < 5; x++) + tmpxor[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20]; + + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = ROL2(s[6] ^ bc[0], 44); + s[6] = ROL2(s[9] ^ bc[3], 20); + s[9] = ROL2(s[22] ^ bc[1], 61); + s[22] = ROL2(s[14] ^ bc[3], 39); + s[14] = ROL2(s[20] ^ bc[4], 18); + s[20] = ROL2(s[2] ^ bc[1], 62); + s[2] = ROL2(s[12] ^ bc[1], 43); + s[12] = ROL2(s[13] ^ bc[2], 25); + s[13] = ROL8(s[19] ^ bc[3]); + s[19] = ROR8(s[23] ^ bc[2]); + s[23] = ROL2(s[15] ^ bc[4], 41); + s[15] = ROL2(s[4] ^ bc[3], 27); + s[4] = ROL2(s[24] ^ bc[3], 14); + s[24] = ROL2(s[21] ^ bc[0], 2); + s[21] = ROL2(s[8] ^ bc[2], 55); + s[8] = ROL2(s[16] ^ bc[0], 45); + s[16] = ROL2(s[5] ^ bc[4], 36); + s[5] = ROL2(s[3] ^ bc[2], 28); + s[3] = ROL2(s[18] ^ bc[2], 21); + s[18] = ROL2(s[17] ^ bc[1], 15); + s[17] = ROL2(s[11] ^ bc[0], 10); + s[11] = ROL2(s[7] ^ bc[1], 6); + s[7] = ROL2(s[10] ^ bc[4], 3); + s[10] = ROL2(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], 
s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0] ^= c_keccak_round_constants[i]; + } +#pragma unroll + for(int x = 0; x < 5; x++) + tmpxor[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20]; + + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); + + s[0] ^= bc[4]; + s[1] = ROL2(s[6] ^ bc[0], 44); + s[6] = ROL2(s[9] ^ bc[3], 20); + s[9] = ROL2(s[22] ^ bc[1], 61); + s[2] = ROL2(s[12] ^ bc[1], 43); + s[4] = ROL2(s[24] ^ bc[3], 14); + s[8] = ROL2(s[16] ^ bc[0], 45); + s[5] = ROL2(s[3] ^ bc[2], 28); + s[3] = ROL2(s[18] ^ bc[2], 21); + s[7] = ROL2(s[10] ^ bc[4], 3); + + uint2 *outputhash = (uint2 *)Hash; + + outputhash[0] = bitselect(s[0] ^ s[2], s[0], s[1]) ^ c_keccak_round_constants[23]; + outputhash[1] = bitselect(s[1] ^ s[3], s[1], s[2]); + outputhash[2] = bitselect(s[2] ^ s[4], s[2], s[3]); + outputhash[3] = bitselect(s[3] ^ s[0], s[3], s[4]); + outputhash[4] = bitselect(s[4] ^ s[1], s[4], s[0]); + outputhash[5] = bitselect(s[5] ^ s[7], s[5], s[6]); + outputhash[6] = bitselect(s[6] ^ s[8], s[6], s[7]); + outputhash[7] = bitselect(s[7] ^ s[9], s[7], s[8]); } } -__host__ void cuda_jh512Keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void 
cuda_jh512Keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) { - const uint32_t threadsperblock = 256; + const uint32_t threadsperblock = 256; - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); - quark_jh512Keccak512_gpu_hash_64 << > >(threads, startNounce, d_hash, d_nonceVector); -// MyStreamSynchronize(NULL, order, thr_id); + quark_jh512Keccak512_gpu_hash_64 << > >(threads, startNounce, d_hash); + CUDA_SAFE_CALL(cudaGetLastError()); } diff --git a/quark/cuda_quark_blake512.cu b/quark/cuda_quark_blake512.cu index 534280dd83..bc05bfe7ed 100644 --- a/quark/cuda_quark_blake512.cu +++ b/quark/cuda_quark_blake512.cu @@ -2,89 +2,213 @@ #include #include "cuda_helper.h" +#include "cuda_vector.h" #define ROTR(x,n) ROTR64(x,n) #define USE_SHUFFLE 0 -// die Message it Padding zur Berechnung auf der GPU -__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) +static uint2* c_PaddedMessage80[MAX_GPUS]; // padded message (80 bytes + padding) +__constant__ uint2 __align__(16) c_PaddedM[10]; +__constant__ uint28 Hostprecalc[4]; +__constant__ uint2 __align__(16) pre[224]; + + +__constant__ uint2 c_u512[16] = +{ + {0x85a308d3UL, 0x243f6a88}, {0x03707344UL, 0x13198a2e}, + {0x299f31d0UL, 0xa4093822}, {0xec4e6c89UL, 0x082efa98}, + {0x38d01377UL, 0x452821e6}, {0x34e90c6cUL, 0xbe5466cf}, + {0xc97c50ddUL, 0xc0ac29b7}, {0xb5470917UL, 0x3f84d5b5}, + {0x8979fb1bUL, 0x9216d5d9}, {0x98dfb5acUL, 0xd1310ba6}, + {0xd01adfb7UL, 0x2ffd72db}, {0x6a267e96UL, 0xb8e1afed}, + {0xf12c7f99UL, 0xba7c9045}, {0xb3916cf7UL, 0x24a19947}, + {0x858efc16UL, 0x0801f2e2}, {0x71574e69UL, 0x636920d8} +}; // ---------------------------- BEGIN CUDA quark_blake512 functions ------------------------------------ 
+#define GSPREC_SP(a,b,c,d) { \ + v[a] += (pre[i++]) + v[b]; \ + v[d] = eorswap32( v[d] , v[a]); \ + v[c] += v[d]; \ + v[b] = ROR2(v[b] ^ v[c], 25); \ + v[a] += (pre[i++]) + v[b]; \ + v[d] = ROR16(v[d] ^ v[a]); \ + v[c] += v[d]; \ + v[b] = ROR2(v[b] ^ v[c], 11); \ + } + +#define GSPREC_SP_HI(a,b,c,d,idx1,idx2) { \ + v[a] += (block[idx2] ^ u512[idx1]) + v[b]; \ + v[d] = eorswap32( v[d] , v[a]); \ + v[c] += v[d]; \ + v[b] = ROR2(v[b] ^ v[c], 25); \ + v[a] += (pre[i++]) + v[b]; \ + v[d] = ROR16(v[d] ^ v[a]); \ + v[c] += v[d]; \ + v[b] = ROR2(v[b] ^ v[c], 11); \ + } + +#define GSPREC_SP_LO(a,b,c,d,idx1,idx2) { \ + v[a] += (pre[i++]) + v[b]; \ + v[d] = eorswap32( v[d] , v[a]); \ + v[c] += v[d]; \ + v[b] = ROR2(v[b] ^ v[c], 25); \ + v[a] += (block[idx1] ^ u512[idx2]) + v[b]; \ + v[d] = ROR16(v[d] ^ v[a]); \ + v[c] += v[d]; \ + v[b] = ROR2(v[b] ^ v[c], 11); \ + } #define Gprecalc(a,b,c,d,idx1,idx2) { \ v[a] += (block[idx2] ^ u512[idx1]) + v[b]; \ - v[d] = SWAPDWORDS2( v[d] ^ v[a]); \ + v[d] = eorswap32( v[d] , v[a]); \ v[c] += v[d]; \ v[b] = ROR2(v[b] ^ v[c], 25); \ v[a] += (block[idx1] ^ u512[idx2]) + v[b]; \ - v[d] = ROR2(v[d] ^ v[a],16); \ + v[d] = ROR16(v[d] ^ v[a]); \ v[c] += v[d]; \ v[b] = ROR2(v[b] ^ v[c], 11); \ } -__global__ +#define RSPRECHOST(idx1,idx2) { \ + prehost[i++] = (block[idx2] ^ u512[idx1]); \ + prehost[i++] = (block[idx1] ^ u512[idx2]); \ + } + +#define RSPRECHOSTLO(idx1,idx2) { \ + prehost[i++] = (block[idx2] ^ u512[idx1]); \ + } +#define RSPRECHOSTHI(idx1,idx2) { \ + prehost[i++] = (block[idx1] ^ u512[idx2]); \ + } + + +#define GprecalcHost(a,b,c,d,idx1,idx2) { \ + v[a] += (block[idx2] ^ u512[idx1]) + v[b]; \ + v[d] = ROTR64( v[d] ^ v[a],32); \ + v[c] += v[d]; \ + v[b] = ROTR64(v[b] ^ v[c], 25); \ + v[a] += (block[idx1] ^ u512[idx2]) + v[b]; \ + v[d] = ROTR64(v[d] ^ v[a],16); \ + v[c] += v[d]; \ + v[b] = ROTR64(v[b] ^ v[c], 11); \ + } + +__constant__ uint8_t c_sigma[16][16] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 
8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, + {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, + {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, + {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, + {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, + {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, + {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, + {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, + {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}}; + + +#define G(a,b,c,d,x) { \ + uint32_t idx1 = c_sigma[i][x]; \ + uint32_t idx2 = c_sigma[i][x+1]; \ + v[a] += (block[idx1] ^ c_u512[idx2]) + v[b]; \ + v[d] = eorswap32(v[d] , v[a]); \ + v[c] += v[d]; \ + v[b] = ROR2( v[b] ^ v[c], 25); \ + v[a] += (block[idx2] ^ c_u512[idx1]) + v[b]; \ + v[d] = ROR16( v[d] ^ v[a]); \ + v[c] += v[d]; \ + v[b] = ROR2( v[b] ^ v[c], 11); \ +} + +__global__ #if __CUDA_ARCH__ > 500 - __launch_bounds__(256, 1) +__launch_bounds__(256, 1) #else - __launch_bounds__(256, 2) +__launch_bounds__(256, 2) #endif -void quark_blake512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ g_nonceVector, uint64_t *const __restrict__ g_hash) +void quark_blake512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ g_nonceVector, uint2 *const __restrict__ g_hash) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); #if USE_SHUFFLE - const int warpID = threadIdx.x & 0x0F; // 16 warps - const int warpBlockID = (thread + 15)>>4; // aufrunden auf volle Warp-Blöcke - const int 
maxHashPosition = thread<<3; + // const int warpID = threadIdx.x & 0x02F; // 16 warps + const int warpBlockID = (thread + 15) >> 5; // aufrunden auf volle Warp-Blöcke + // const int maxHashPosition = thread<<3; #endif #if USE_SHUFFLE - if (warpBlockID < ( (threads+15)>>4 )) + if(warpBlockID < ((threads + 15) >> 5)) #else - if (thread < threads) + if(thread < threads) #endif { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + const uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; + const int hashPosition = nounce - startNounce; + + uint2 block[16]; + uint2 msg[16]; + + uint28 *phash = (uint28*)&g_hash[hashPosition * 8]; + uint28 *outpt = (uint28*)msg; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + block[0].x = cuda_swab32(msg[0].y); + block[0].y = cuda_swab32(msg[0].x); + block[1].x = cuda_swab32(msg[1].y); + block[1].y = cuda_swab32(msg[1].x); + block[2].x = cuda_swab32(msg[2].y); + block[2].y = cuda_swab32(msg[2].x); + block[3].x = cuda_swab32(msg[3].y); + block[3].y = cuda_swab32(msg[3].x); + block[4].x = cuda_swab32(msg[4].y); + block[4].y = cuda_swab32(msg[4].x); + block[5].x = cuda_swab32(msg[5].y); + block[5].y = cuda_swab32(msg[5].x); + block[6].x = cuda_swab32(msg[6].y); + block[6].y = cuda_swab32(msg[6].x); + block[7].x = cuda_swab32(msg[7].y); + block[7].y = cuda_swab32(msg[7].x); + + + block[8] = vectorizehigh(0x80000000); + block[9] = vectorizelow(0x0); + block[10] = vectorizelow(0x0); + block[11] = vectorizelow(0x0); + block[12] = vectorizelow(0x0); + block[13] = vectorizelow(0x1); + block[14] = vectorizelow(0x0); + block[15] = vectorizelow(0x200); - uint64_t *inpHash = &g_hash[hashPosition*8]; - uint2 block[16] = - { - vectorizeswap(inpHash[0]), vectorizeswap(inpHash[1]), vectorizeswap(inpHash[2]), vectorizeswap(inpHash[3]), - vectorizeswap(inpHash[4]), vectorizeswap(inpHash[5]), vectorizeswap(inpHash[6]), 
vectorizeswap(inpHash[7]) - }; - block[8] = make_uint2(0, 0x80000000UL); - block[9] = make_uint2(0,0); - block[10] = make_uint2(0,0); - block[11] = make_uint2(0,0); - block[12] = make_uint2(0,0); - block[13] = make_uint2(1,0); - block[14] = make_uint2(0,0); - block[15] = make_uint2(0x200,0); const uint2 h[8] = { - { 0xf3bcc908UL, 0x6a09e667UL }, - { 0x84caa73bUL, 0xbb67ae85UL }, - { 0xfe94f82bUL, 0x3c6ef372UL }, - { 0x5f1d36f1UL, 0xa54ff53aUL }, - { 0xade682d1UL, 0x510e527fUL }, - { 0x2b3e6c1fUL, 0x9b05688cUL }, - { 0xfb41bd6bUL, 0x1f83d9abUL }, - { 0x137e2179UL, 0x5be0cd19UL } + {0xf3bcc908UL, 0x6a09e667UL}, + {0x84caa73bUL, 0xbb67ae85UL}, + {0xfe94f82bUL, 0x3c6ef372UL}, + {0x5f1d36f1UL, 0xa54ff53aUL}, + {0xade682d1UL, 0x510e527fUL}, + {0x2b3e6c1fUL, 0x9b05688cUL}, + {0xfb41bd6bUL, 0x1f83d9abUL}, + {0x137e2179UL, 0x5be0cd19UL} }; const uint2 u512[16] = { - { 0x85a308d3UL, 0x243f6a88 }, { 0x03707344UL, 0x13198a2e }, - { 0x299f31d0UL, 0xa4093822 }, { 0xec4e6c89UL, 0x082efa98 }, - { 0x38d01377UL, 0x452821e6 }, { 0x34e90c6cUL, 0xbe5466cf }, - { 0xc97c50ddUL, 0xc0ac29b7 }, { 0xb5470917UL, 0x3f84d5b5 }, - { 0x8979fb1bUL, 0x9216d5d9 }, { 0x98dfb5acUL, 0xd1310ba6 }, - { 0xd01adfb7UL, 0x2ffd72db }, { 0x6a267e96UL, 0xb8e1afed }, - { 0xf12c7f99UL, 0xba7c9045 }, { 0xb3916cf7UL, 0x24a19947 }, - { 0x858efc16UL, 0x0801f2e2 }, { 0x71574e69UL, 0x636920d8 } + {0x85a308d3UL, 0x243f6a88}, {0x03707344UL, 0x13198a2e}, + {0x299f31d0UL, 0xa4093822}, {0xec4e6c89UL, 0x082efa98}, + {0x38d01377UL, 0x452821e6}, {0x34e90c6cUL, 0xbe5466cf}, + {0xc97c50ddUL, 0xc0ac29b7}, {0xb5470917UL, 0x3f84d5b5}, + {0x8979fb1bUL, 0x9216d5d9}, {0x98dfb5acUL, 0xd1310ba6}, + {0xd01adfb7UL, 0x2ffd72db}, {0x6a267e96UL, 0xb8e1afed}, + {0xf12c7f99UL, 0xba7c9045}, {0xb3916cf7UL, 0x24a19947}, + {0x858efc16UL, 0x0801f2e2}, {0x71574e69UL, 0x636920d8} }; uint2 v[16] = @@ -94,412 +218,803 @@ void quark_blake512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t }; Gprecalc(0, 4, 8, 12, 0x1, 0x0) - Gprecalc(1, 5, 
9, 13, 0x3, 0x2) - Gprecalc(2, 6, 10, 14, 0x5, 0x4) - Gprecalc(3, 7, 11, 15, 0x7, 0x6) - Gprecalc(0, 5, 10, 15, 0x9, 0x8) - Gprecalc(1, 6, 11, 12, 0xb, 0xa) - Gprecalc(2, 7, 8, 13, 0xd, 0xc) - Gprecalc(3, 4, 9, 14, 0xf, 0xe) - - Gprecalc(0, 4, 8, 12, 0xa, 0xe) - Gprecalc(1, 5, 9, 13, 0x8, 0x4) - Gprecalc(2, 6, 10, 14, 0xf, 0x9) - Gprecalc(3, 7, 11, 15, 0x6, 0xd) - Gprecalc(0, 5, 10, 15, 0xc, 0x1) - Gprecalc(1, 6, 11, 12, 0x2, 0x0) - Gprecalc(2, 7, 8, 13, 0x7, 0xb) - Gprecalc(3, 4, 9, 14, 0x3, 0x5) - - Gprecalc(0, 4, 8, 12, 0x8, 0xb) - Gprecalc(1, 5, 9, 13, 0x0, 0xc) - Gprecalc(2, 6, 10, 14, 0x2, 0x5) - Gprecalc(3, 7, 11, 15, 0xd, 0xf) - Gprecalc(0, 5, 10, 15, 0xe, 0xa) - Gprecalc(1, 6, 11, 12, 0x6, 0x3) - Gprecalc(2, 7, 8, 13, 0x1, 0x7) - Gprecalc(3, 4, 9, 14, 0x4, 0x9) - - Gprecalc(0, 4, 8, 12, 0x9, 0x7) - Gprecalc(1, 5, 9, 13, 0x1, 0x3) - Gprecalc(2, 6, 10, 14, 0xc, 0xd) - Gprecalc(3, 7, 11, 15, 0xe, 0xb) - Gprecalc(0, 5, 10, 15, 0x6, 0x2) - Gprecalc(1, 6, 11, 12, 0xa, 0x5) - Gprecalc(2, 7, 8, 13, 0x0, 0x4) - Gprecalc(3, 4, 9, 14, 0x8, 0xf) - - Gprecalc(0, 4, 8, 12, 0x0, 0x9) - Gprecalc(1, 5, 9, 13, 0x7, 0x5) - Gprecalc(2, 6, 10, 14, 0x4, 0x2) - Gprecalc(3, 7, 11, 15, 0xf, 0xa) - Gprecalc(0, 5, 10, 15, 0x1, 0xe) - Gprecalc(1, 6, 11, 12, 0xc, 0xb) - Gprecalc(2, 7, 8, 13, 0x8, 0x6) - Gprecalc(3, 4, 9, 14, 0xd, 0x3) - - Gprecalc(0, 4, 8, 12, 0xc, 0x2) - Gprecalc(1, 5, 9, 13, 0xa, 0x6) - Gprecalc(2, 6, 10, 14, 0xb, 0x0) - Gprecalc(3, 7, 11, 15, 0x3, 0x8) - Gprecalc(0, 5, 10, 15, 0xd, 0x4) - Gprecalc(1, 6, 11, 12, 0x5, 0x7) - Gprecalc(2, 7, 8, 13, 0xe, 0xf) - Gprecalc(3, 4, 9, 14, 0x9, 0x1) - - Gprecalc(0, 4, 8, 12, 0x5, 0xc) - Gprecalc(1, 5, 9, 13, 0xf, 0x1) - Gprecalc(2, 6, 10, 14, 0xd, 0xe) - Gprecalc(3, 7, 11, 15, 0xa, 0x4) - Gprecalc(0, 5, 10, 15, 0x7, 0x0) - Gprecalc(1, 6, 11, 12, 0x3, 0x6) - Gprecalc(2, 7, 8, 13, 0x2, 0x9) - Gprecalc(3, 4, 9, 14, 0xb, 0x8) - - Gprecalc(0, 4, 8, 12, 0xb, 0xd) - Gprecalc(1, 5, 9, 13, 0xe, 0x7) - Gprecalc(2, 6, 10, 14, 0x1, 0xc) - 
Gprecalc(3, 7, 11, 15, 0x9, 0x3) - Gprecalc(0, 5, 10, 15, 0x0, 0x5) - Gprecalc(1, 6, 11, 12, 0x4, 0xf) - Gprecalc(2, 7, 8, 13, 0x6, 0x8) - Gprecalc(3, 4, 9, 14, 0xa, 0x2) - - Gprecalc(0, 4, 8, 12, 0xf, 0x6) - Gprecalc(1, 5, 9, 13, 0x9, 0xe) - Gprecalc(2, 6, 10, 14, 0x3, 0xb) - Gprecalc(3, 7, 11, 15, 0x8, 0x0) - Gprecalc(0, 5, 10, 15, 0x2, 0xc) - Gprecalc(1, 6, 11, 12, 0x7, 0xd) - Gprecalc(2, 7, 8, 13, 0x4, 0x1) - Gprecalc(3, 4, 9, 14, 0x5, 0xa) - - Gprecalc(0, 4, 8, 12, 0x2, 0xa) - Gprecalc(1, 5, 9, 13, 0x4, 0x8) - Gprecalc(2, 6, 10, 14, 0x6, 0x7) - Gprecalc(3, 7, 11, 15, 0x5, 0x1) - Gprecalc(0, 5, 10, 15, 0xb, 0xf) - Gprecalc(1, 6, 11, 12, 0xe, 0x9) - Gprecalc(2, 7, 8, 13, 0xc, 0x3) - Gprecalc(3, 4, 9, 14, 0x0, 0xd) + Gprecalc(1, 5, 9, 13, 0x3, 0x2) + Gprecalc(2, 6, 10, 14, 0x5, 0x4) + Gprecalc(3, 7, 11, 15, 0x7, 0x6) + Gprecalc(0, 5, 10, 15, 0x9, 0x8) + Gprecalc(1, 6, 11, 12, 0xb, 0xa) + Gprecalc(2, 7, 8, 13, 0xd, 0xc) + Gprecalc(3, 4, 9, 14, 0xf, 0xe) + + Gprecalc(0, 4, 8, 12, 0xa, 0xe) + Gprecalc(1, 5, 9, 13, 0x8, 0x4) + Gprecalc(2, 6, 10, 14, 0xf, 0x9) + Gprecalc(3, 7, 11, 15, 0x6, 0xd) + Gprecalc(0, 5, 10, 15, 0xc, 0x1) + Gprecalc(1, 6, 11, 12, 0x2, 0x0) + Gprecalc(2, 7, 8, 13, 0x7, 0xb) + Gprecalc(3, 4, 9, 14, 0x3, 0x5) + + Gprecalc(0, 4, 8, 12, 0x8, 0xb) + Gprecalc(1, 5, 9, 13, 0x0, 0xc) + Gprecalc(2, 6, 10, 14, 0x2, 0x5) + Gprecalc(3, 7, 11, 15, 0xd, 0xf) + Gprecalc(0, 5, 10, 15, 0xe, 0xa) + Gprecalc(1, 6, 11, 12, 0x6, 0x3) + Gprecalc(2, 7, 8, 13, 0x1, 0x7) + Gprecalc(3, 4, 9, 14, 0x4, 0x9) + + Gprecalc(0, 4, 8, 12, 0x9, 0x7) + Gprecalc(1, 5, 9, 13, 0x1, 0x3) + Gprecalc(2, 6, 10, 14, 0xc, 0xd) + Gprecalc(3, 7, 11, 15, 0xe, 0xb) + Gprecalc(0, 5, 10, 15, 0x6, 0x2) + Gprecalc(1, 6, 11, 12, 0xa, 0x5) + Gprecalc(2, 7, 8, 13, 0x0, 0x4) + Gprecalc(3, 4, 9, 14, 0x8, 0xf) + + Gprecalc(0, 4, 8, 12, 0x0, 0x9) + Gprecalc(1, 5, 9, 13, 0x7, 0x5) + Gprecalc(2, 6, 10, 14, 0x4, 0x2) + Gprecalc(3, 7, 11, 15, 0xf, 0xa) + Gprecalc(0, 5, 10, 15, 0x1, 0xe) + Gprecalc(1, 6, 11, 
12, 0xc, 0xb) + Gprecalc(2, 7, 8, 13, 0x8, 0x6) + Gprecalc(3, 4, 9, 14, 0xd, 0x3) + + Gprecalc(0, 4, 8, 12, 0xc, 0x2) + Gprecalc(1, 5, 9, 13, 0xa, 0x6) + Gprecalc(2, 6, 10, 14, 0xb, 0x0) + Gprecalc(3, 7, 11, 15, 0x3, 0x8) + Gprecalc(0, 5, 10, 15, 0xd, 0x4) + Gprecalc(1, 6, 11, 12, 0x5, 0x7) + Gprecalc(2, 7, 8, 13, 0xe, 0xf) + Gprecalc(3, 4, 9, 14, 0x9, 0x1) + + Gprecalc(0, 4, 8, 12, 0x5, 0xc) + Gprecalc(1, 5, 9, 13, 0xf, 0x1) + Gprecalc(2, 6, 10, 14, 0xd, 0xe) + Gprecalc(3, 7, 11, 15, 0xa, 0x4) + Gprecalc(0, 5, 10, 15, 0x7, 0x0) + Gprecalc(1, 6, 11, 12, 0x3, 0x6) + Gprecalc(2, 7, 8, 13, 0x2, 0x9) + Gprecalc(3, 4, 9, 14, 0xb, 0x8) + + Gprecalc(0, 4, 8, 12, 0xb, 0xd) + Gprecalc(1, 5, 9, 13, 0xe, 0x7) + Gprecalc(2, 6, 10, 14, 0x1, 0xc) + Gprecalc(3, 7, 11, 15, 0x9, 0x3) + Gprecalc(0, 5, 10, 15, 0x0, 0x5) + Gprecalc(1, 6, 11, 12, 0x4, 0xf) + Gprecalc(2, 7, 8, 13, 0x6, 0x8) + Gprecalc(3, 4, 9, 14, 0xa, 0x2) + + Gprecalc(0, 4, 8, 12, 0xf, 0x6) + Gprecalc(1, 5, 9, 13, 0x9, 0xe) + Gprecalc(2, 6, 10, 14, 0x3, 0xb) + Gprecalc(3, 7, 11, 15, 0x8, 0x0) + Gprecalc(0, 5, 10, 15, 0x2, 0xc) + Gprecalc(1, 6, 11, 12, 0x7, 0xd) + Gprecalc(2, 7, 8, 13, 0x4, 0x1) + Gprecalc(3, 4, 9, 14, 0x5, 0xa) + + Gprecalc(0, 4, 8, 12, 0x2, 0xa) + Gprecalc(1, 5, 9, 13, 0x4, 0x8) + Gprecalc(2, 6, 10, 14, 0x6, 0x7) + Gprecalc(3, 7, 11, 15, 0x5, 0x1) + Gprecalc(0, 5, 10, 15, 0xb, 0xf) + Gprecalc(1, 6, 11, 12, 0xe, 0x9) + Gprecalc(2, 7, 8, 13, 0xc, 0x3) + Gprecalc(3, 4, 9, 14, 0x0, 0xd) + +#if __CUDA_ARCH__ == 500 + + Gprecalc(0, 4, 8, 12, 0x1, 0x0) + Gprecalc(1, 5, 9, 13, 0x3, 0x2) + Gprecalc(2, 6, 10, 14, 0x5, 0x4) + Gprecalc(3, 7, 11, 15, 0x7, 0x6) + Gprecalc(0, 5, 10, 15, 0x9, 0x8) + Gprecalc(1, 6, 11, 12, 0xb, 0xa) + Gprecalc(2, 7, 8, 13, 0xd, 0xc) + Gprecalc(3, 4, 9, 14, 0xf, 0xe) + + Gprecalc(0, 4, 8, 12, 0xa, 0xe) + Gprecalc(1, 5, 9, 13, 0x8, 0x4) + Gprecalc(2, 6, 10, 14, 0xf, 0x9) + Gprecalc(3, 7, 11, 15, 0x6, 0xd) + Gprecalc(0, 5, 10, 15, 0xc, 0x1) + Gprecalc(1, 6, 11, 12, 0x2, 0x0) + 
Gprecalc(2, 7, 8, 13, 0x7, 0xb) + Gprecalc(3, 4, 9, 14, 0x3, 0x5) + + Gprecalc(0, 4, 8, 12, 0x8, 0xb) + Gprecalc(1, 5, 9, 13, 0x0, 0xc) + Gprecalc(2, 6, 10, 14, 0x2, 0x5) + Gprecalc(3, 7, 11, 15, 0xd, 0xf) + Gprecalc(0, 5, 10, 15, 0xe, 0xa) + Gprecalc(1, 6, 11, 12, 0x6, 0x3) + Gprecalc(2, 7, 8, 13, 0x1, 0x7) + Gprecalc(3, 4, 9, 14, 0x4, 0x9) + + Gprecalc(0, 4, 8, 12, 0x9, 0x7) + Gprecalc(1, 5, 9, 13, 0x1, 0x3) + Gprecalc(2, 6, 10, 14, 0xc, 0xd) + Gprecalc(3, 7, 11, 15, 0xe, 0xb) + Gprecalc(0, 5, 10, 15, 0x6, 0x2) + Gprecalc(1, 6, 11, 12, 0xa, 0x5) + Gprecalc(2, 7, 8, 13, 0x0, 0x4) + Gprecalc(3, 4, 9, 14, 0x8, 0xf) + + Gprecalc(0, 4, 8, 12, 0x0, 0x9) + Gprecalc(1, 5, 9, 13, 0x7, 0x5) + Gprecalc(2, 6, 10, 14, 0x4, 0x2) + Gprecalc(3, 7, 11, 15, 0xf, 0xa) + Gprecalc(0, 5, 10, 15, 0x1, 0xe) + Gprecalc(1, 6, 11, 12, 0xc, 0xb) + Gprecalc(2, 7, 8, 13, 0x8, 0x6) + Gprecalc(3, 4, 9, 14, 0xd, 0x3) + + Gprecalc(0, 4, 8, 12, 0xc, 0x2) + Gprecalc(1, 5, 9, 13, 0xa, 0x6) + Gprecalc(2, 6, 10, 14, 0xb, 0x0) + Gprecalc(3, 7, 11, 15, 0x3, 0x8) + Gprecalc(0, 5, 10, 15, 0xd, 0x4) + Gprecalc(1, 6, 11, 12, 0x5, 0x7) + Gprecalc(2, 7, 8, 13, 0xe, 0xf) + Gprecalc(3, 4, 9, 14, 0x9, 0x1) - Gprecalc(0, 4, 8, 12, 0x1, 0x0) - Gprecalc(1, 5, 9, 13, 0x3, 0x2) - Gprecalc(2, 6, 10, 14, 0x5, 0x4) - Gprecalc(3, 7, 11, 15, 0x7, 0x6) - Gprecalc(0, 5, 10, 15, 0x9, 0x8) - Gprecalc(1, 6, 11, 12, 0xb, 0xa) - Gprecalc(2, 7, 8, 13, 0xd, 0xc) - Gprecalc(3, 4, 9, 14, 0xf, 0xe) - - Gprecalc(0, 4, 8, 12, 0xa, 0xe) - Gprecalc(1, 5, 9, 13, 0x8, 0x4) - Gprecalc(2, 6, 10, 14, 0xf, 0x9) - Gprecalc(3, 7, 11, 15, 0x6, 0xd) - Gprecalc(0, 5, 10, 15, 0xc, 0x1) - Gprecalc(1, 6, 11, 12, 0x2, 0x0) - Gprecalc(2, 7, 8, 13, 0x7, 0xb) - Gprecalc(3, 4, 9, 14, 0x3, 0x5) - - Gprecalc(0, 4, 8, 12, 0x8, 0xb) - Gprecalc(1, 5, 9, 13, 0x0, 0xc) - Gprecalc(2, 6, 10, 14, 0x2, 0x5) - Gprecalc(3, 7, 11, 15, 0xd, 0xf) - Gprecalc(0, 5, 10, 15, 0xe, 0xa) - Gprecalc(1, 6, 11, 12, 0x6, 0x3) - Gprecalc(2, 7, 8, 13, 0x1, 0x7) - Gprecalc(3, 4, 9, 14, 
0x4, 0x9) - - Gprecalc(0, 4, 8, 12, 0x9, 0x7) - Gprecalc(1, 5, 9, 13, 0x1, 0x3) - Gprecalc(2, 6, 10, 14, 0xc, 0xd) - Gprecalc(3, 7, 11, 15, 0xe, 0xb) - Gprecalc(0, 5, 10, 15, 0x6, 0x2) - Gprecalc(1, 6, 11, 12, 0xa, 0x5) - Gprecalc(2, 7, 8, 13, 0x0, 0x4) - Gprecalc(3, 4, 9, 14, 0x8, 0xf) - - Gprecalc(0, 4, 8, 12, 0x0, 0x9) - Gprecalc(1, 5, 9, 13, 0x7, 0x5) - Gprecalc(2, 6, 10, 14, 0x4, 0x2) - Gprecalc(3, 7, 11, 15, 0xf, 0xa) - Gprecalc(0, 5, 10, 15, 0x1, 0xe) - Gprecalc(1, 6, 11, 12, 0xc, 0xb) - Gprecalc(2, 7, 8, 13, 0x8, 0x6) - Gprecalc(3, 4, 9, 14, 0xd, 0x3) - - Gprecalc(0, 4, 8, 12, 0xc, 0x2) - Gprecalc(1, 5, 9, 13, 0xa, 0x6) - Gprecalc(2, 6, 10, 14, 0xb, 0x0) - Gprecalc(3, 7, 11, 15, 0x3, 0x8) - Gprecalc(0, 5, 10, 15, 0xd, 0x4) - Gprecalc(1, 6, 11, 12, 0x5, 0x7) - Gprecalc(2, 7, 8, 13, 0xe, 0xf) - Gprecalc(3, 4, 9, 14, 0x9, 0x1) - - uint64_t *outHash = &g_hash[8 * hashPosition]; - - outHash[0] = devectorizeswap(h[0] ^ v[0] ^ v[8]); - outHash[1] = devectorizeswap(h[1] ^ v[1] ^ v[9]); - outHash[2] = devectorizeswap(h[2] ^ v[2] ^ v[10]); - outHash[3] = devectorizeswap(h[3] ^ v[3] ^ v[11]); - outHash[4] = devectorizeswap(h[4] ^ v[4] ^ v[12]); - outHash[5] = devectorizeswap(h[5] ^ v[5] ^ v[13]); - outHash[6] = devectorizeswap(h[6] ^ v[6] ^ v[14]); - outHash[7] = devectorizeswap(h[7] ^ v[7] ^ v[15]); +#else + + for(int i = 10; i < 16; i++) + { + /* column step */ + G(0, 4, 8, 12, 0); + G(1, 5, 9, 13, 2); + G(2, 6, 10, 14, 4); + G(3, 7, 11, 15, 6); + /* diagonal step */ + G(0, 5, 10, 15, 8); + G(1, 6, 11, 12, 10); + G(2, 7, 8, 13, 12); + G(3, 4, 9, 14, 14); + } +#endif + + v[0] = cuda_swap(h[0] ^ v[0] ^ v[8]); + v[1] = cuda_swap(h[1] ^ v[1] ^ v[9]); + v[2] = cuda_swap(h[2] ^ v[2] ^ v[10]); + v[3] = cuda_swap(h[3] ^ v[3] ^ v[11]); + v[4] = cuda_swap(h[4] ^ v[4] ^ v[12]); + v[5] = cuda_swap(h[5] ^ v[5] ^ v[13]); + v[6] = cuda_swap(h[6] ^ v[6] ^ v[14]); + v[7] = cuda_swap(h[7] ^ v[7] ^ v[15]); + + phash = (uint28*)v; + outpt = (uint28*)&g_hash[hashPosition * 8]; + 
outpt[0] = phash[0]; + outpt[1] = phash[1]; } } -__global__ -#if __CUDA_ARCH__ > 500 -__launch_bounds__(256, 4) -#else -__launch_bounds__(32, 32) -#endif -void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *outputHash) +__global__ +void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint2 *outputHash) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if(thread < threads) { - uint32_t nounce = startNounce + thread; - + const uint32_t nounce = startNounce + thread; uint2 block[16]; - // Message für die erste Runde in Register holen -#pragma unroll 16 - for (int i = 0; i < 16; ++i) - block[i] = vectorize(c_PaddedMessage80[i]); - // The test Nonce - // ((uint32_t*)block)[18] = nounce; + block[0] = c_PaddedM[0]; + block[1] = c_PaddedM[1]; + block[2] = c_PaddedM[2]; + block[3] = c_PaddedM[3]; + block[4] = c_PaddedM[4]; + block[5] = c_PaddedM[5]; + block[6] = c_PaddedM[6]; + block[7] = c_PaddedM[7]; + block[8] = c_PaddedM[8]; + block[9].y = c_PaddedM[9].y; + block[10] = vectorizehigh(0x80000000); + block[11] = vectorizelow(0); + block[12] = vectorizelow(0); + block[13] = vectorizelow(0x1); + block[14] = vectorizelow(0); + block[15] = vectorizelow(0x280); block[9].x = nounce; -// ((uint32_t*)block)[18] = nounce; const uint2 u512[16] = { - { 0x85a308d3UL, 0x243f6a88 }, { 0x03707344UL, 0x13198a2e }, - { 0x299f31d0UL, 0xa4093822 }, { 0xec4e6c89UL, 0x082efa98 }, - { 0x38d01377UL, 0x452821e6 }, { 0x34e90c6cUL, 0xbe5466cf }, - { 0xc97c50ddUL, 0xc0ac29b7 }, { 0xb5470917UL, 0x3f84d5b5 }, - { 0x8979fb1bUL, 0x9216d5d9 }, { 0x98dfb5acUL, 0xd1310ba6 }, - { 0xd01adfb7UL, 0x2ffd72db }, { 0x6a267e96UL, 0xb8e1afed }, - { 0xf12c7f99UL, 0xba7c9045 }, { 0xb3916cf7UL, 0x24a19947 }, - { 0x858efc16UL, 0x0801f2e2 }, { 0x71574e69UL, 0x636920d8 } + {0x85a308d3UL, 0x243f6a88}, {0x03707344UL, 0x13198a2e}, + {0x299f31d0UL, 0xa4093822}, 
{0xec4e6c89UL, 0x082efa98}, + {0x38d01377UL, 0x452821e6}, {0x34e90c6cUL, 0xbe5466cf}, + {0xc97c50ddUL, 0xc0ac29b7}, {0xb5470917UL, 0x3f84d5b5}, + {0x8979fb1bUL, 0x9216d5d9}, {0x98dfb5acUL, 0xd1310ba6}, + {0xd01adfb7UL, 0x2ffd72db}, {0x6a267e96UL, 0xb8e1afed}, + {0xf12c7f99UL, 0xba7c9045}, {0xb3916cf7UL, 0x24a19947}, + {0x858efc16UL, 0x0801f2e2}, {0x71574e69UL, 0x636920d8} }; - const uint2 h[8] = { - { 0xf3bcc908UL,0x6a09e667UL }, - { 0x84caa73bUL ,0xbb67ae85UL }, - { 0xfe94f82bUL,0x3c6ef372UL }, - { 0x5f1d36f1UL,0xa54ff53aUL }, - { 0xade682d1UL,0x510e527fUL }, - { 0x2b3e6c1fUL,0x9b05688cUL }, - { 0xfb41bd6bUL,0x1f83d9abUL }, - { 0x137e2179UL,0x5be0cd19UL } - }; - - uint2 v[16] = + /* const uint2 u512[16] = { - h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7], - u512[0], u512[1], u512[2], u512[3], u512[4] ^ 640, u512[5] ^ 640, u512[6], u512[7] + { 0x85a308d3UL, 0x243f6a88 }, { 0x03707344UL, 0x13198a2e }, + { 0x299f31d0UL, 0xa4093822 }, { 0xec4e6c89UL, 0x082efa98 }, + { 0x38d01377UL, 0x452821e6 }, { 0x34e90c6cUL, 0xbe5466cf }, + { 0xc97c50ddUL, 0xc0ac29b7 }, { 0xb5470917UL, 0x3f84d5b5 }, + { 0x8979fb1bUL, 0x9216d5d9 }, { 0x98dfb5acUL, 0xd1310ba6 }, + { 0xd01adfb7UL, 0x2ffd72db }, { 0x6a267e96UL, 0xb8e1afed }, + { 0xf12c7f99UL, 0xba7c9045 }, { 0xb3916cf7UL, 0x24a19947 }, + { 0x858efc16UL, 0x0801f2e2 }, { 0x71574e69UL, 0x636920d8 } }; - - Gprecalc(0, 4, 8, 12, 0x1, 0x0) - Gprecalc(1, 5, 9, 13, 0x3, 0x2) - Gprecalc(2, 6, 10, 14, 0x5, 0x4) - Gprecalc(3, 7, 11, 15, 0x7, 0x6) - Gprecalc(0, 5, 10, 15, 0x9, 0x8) - Gprecalc(1, 6, 11, 12, 0xb, 0xa) - Gprecalc(2, 7, 8, 13, 0xd, 0xc) - Gprecalc(3, 4, 9, 14, 0xf, 0xe) - - Gprecalc(0, 4, 8, 12, 0xa, 0xe) - Gprecalc(1, 5, 9, 13, 0x8, 0x4) - Gprecalc(2, 6, 10, 14, 0xf, 0x9) - Gprecalc(3, 7, 11, 15, 0x6, 0xd) - Gprecalc(0, 5, 10, 15, 0xc, 0x1) - Gprecalc(1, 6, 11, 12, 0x2, 0x0) - Gprecalc(2, 7, 8, 13, 0x7, 0xb) - Gprecalc(3, 4, 9, 14, 0x3, 0x5) - - Gprecalc(0, 4, 8, 12, 0x8, 0xb) - Gprecalc(1, 5, 9, 13, 0x0, 0xc) - Gprecalc(2, 6, 10, 
14, 0x2, 0x5) - Gprecalc(3, 7, 11, 15, 0xd, 0xf) - Gprecalc(0, 5, 10, 15, 0xe, 0xa) - Gprecalc(1, 6, 11, 12, 0x6, 0x3) - Gprecalc(2, 7, 8, 13, 0x1, 0x7) - Gprecalc(3, 4, 9, 14, 0x4, 0x9) - - Gprecalc(0, 4, 8, 12, 0x9, 0x7) - Gprecalc(1, 5, 9, 13, 0x1, 0x3) - Gprecalc(2, 6, 10, 14, 0xc, 0xd) - Gprecalc(3, 7, 11, 15, 0xe, 0xb) - Gprecalc(0, 5, 10, 15, 0x6, 0x2) - Gprecalc(1, 6, 11, 12, 0xa, 0x5) - Gprecalc(2, 7, 8, 13, 0x0, 0x4) - Gprecalc(3, 4, 9, 14, 0x8, 0xf) - - Gprecalc(0, 4, 8, 12, 0x0, 0x9) - Gprecalc(1, 5, 9, 13, 0x7, 0x5) - Gprecalc(2, 6, 10, 14, 0x4, 0x2) - Gprecalc(3, 7, 11, 15, 0xf, 0xa) - Gprecalc(0, 5, 10, 15, 0x1, 0xe) - Gprecalc(1, 6, 11, 12, 0xc, 0xb) - Gprecalc(2, 7, 8, 13, 0x8, 0x6) - Gprecalc(3, 4, 9, 14, 0xd, 0x3) - - Gprecalc(0, 4, 8, 12, 0xc, 0x2) - Gprecalc(1, 5, 9, 13, 0xa, 0x6) - Gprecalc(2, 6, 10, 14, 0xb, 0x0) - Gprecalc(3, 7, 11, 15, 0x3, 0x8) - Gprecalc(0, 5, 10, 15, 0xd, 0x4) - Gprecalc(1, 6, 11, 12, 0x5, 0x7) - Gprecalc(2, 7, 8, 13, 0xe, 0xf) - Gprecalc(3, 4, 9, 14, 0x9, 0x1) - - Gprecalc(0, 4, 8, 12, 0x5, 0xc) - Gprecalc(1, 5, 9, 13, 0xf, 0x1) - Gprecalc(2, 6, 10, 14, 0xd, 0xe) - Gprecalc(3, 7, 11, 15, 0xa, 0x4) - Gprecalc(0, 5, 10, 15, 0x7, 0x0) - Gprecalc(1, 6, 11, 12, 0x3, 0x6) - Gprecalc(2, 7, 8, 13, 0x2, 0x9) - Gprecalc(3, 4, 9, 14, 0xb, 0x8) - - Gprecalc(0, 4, 8, 12, 0xb, 0xd) - Gprecalc(1, 5, 9, 13, 0xe, 0x7) - Gprecalc(2, 6, 10, 14, 0x1, 0xc) - Gprecalc(3, 7, 11, 15, 0x9, 0x3) - Gprecalc(0, 5, 10, 15, 0x0, 0x5) - Gprecalc(1, 6, 11, 12, 0x4, 0xf) - Gprecalc(2, 7, 8, 13, 0x6, 0x8) - Gprecalc(3, 4, 9, 14, 0xa, 0x2) - - Gprecalc(0, 4, 8, 12, 0xf, 0x6) - Gprecalc(1, 5, 9, 13, 0x9, 0xe) - Gprecalc(2, 6, 10, 14, 0x3, 0xb) - Gprecalc(3, 7, 11, 15, 0x8, 0x0) - Gprecalc(0, 5, 10, 15, 0x2, 0xc) - Gprecalc(1, 6, 11, 12, 0x7, 0xd) - Gprecalc(2, 7, 8, 13, 0x4, 0x1) - Gprecalc(3, 4, 9, 14, 0x5, 0xa) - - Gprecalc(0, 4, 8, 12, 0x2, 0xa) - Gprecalc(1, 5, 9, 13, 0x4, 0x8) - Gprecalc(2, 6, 10, 14, 0x6, 0x7) - Gprecalc(3, 7, 11, 15, 0x5, 0x1) - 
Gprecalc(0, 5, 10, 15, 0xb, 0xf) - Gprecalc(1, 6, 11, 12, 0xe, 0x9) - Gprecalc(2, 7, 8, 13, 0xc, 0x3) - Gprecalc(3, 4, 9, 14, 0x0, 0xd) - - Gprecalc(0, 4, 8, 12, 0x1, 0x0) - Gprecalc(1, 5, 9, 13, 0x3, 0x2) - Gprecalc(2, 6, 10, 14, 0x5, 0x4) - Gprecalc(3, 7, 11, 15, 0x7, 0x6) - Gprecalc(0, 5, 10, 15, 0x9, 0x8) - Gprecalc(1, 6, 11, 12, 0xb, 0xa) - Gprecalc(2, 7, 8, 13, 0xd, 0xc) - Gprecalc(3, 4, 9, 14, 0xf, 0xe) - - Gprecalc(0, 4, 8, 12, 0xa, 0xe) - Gprecalc(1, 5, 9, 13, 0x8, 0x4) - Gprecalc(2, 6, 10, 14, 0xf, 0x9) - Gprecalc(3, 7, 11, 15, 0x6, 0xd) - Gprecalc(0, 5, 10, 15, 0xc, 0x1) - Gprecalc(1, 6, 11, 12, 0x2, 0x0) - Gprecalc(2, 7, 8, 13, 0x7, 0xb) - Gprecalc(3, 4, 9, 14, 0x3, 0x5) - - Gprecalc(0, 4, 8, 12, 0x8, 0xb) - Gprecalc(1, 5, 9, 13, 0x0, 0xc) - Gprecalc(2, 6, 10, 14, 0x2, 0x5) - Gprecalc(3, 7, 11, 15, 0xd, 0xf) - Gprecalc(0, 5, 10, 15, 0xe, 0xa) - Gprecalc(1, 6, 11, 12, 0x6, 0x3) - Gprecalc(2, 7, 8, 13, 0x1, 0x7) - Gprecalc(3, 4, 9, 14, 0x4, 0x9) - - Gprecalc(0, 4, 8, 12, 0x9, 0x7) - Gprecalc(1, 5, 9, 13, 0x1, 0x3) - Gprecalc(2, 6, 10, 14, 0xc, 0xd) - Gprecalc(3, 7, 11, 15, 0xe, 0xb) - Gprecalc(0, 5, 10, 15, 0x6, 0x2) - Gprecalc(1, 6, 11, 12, 0xa, 0x5) - Gprecalc(2, 7, 8, 13, 0x0, 0x4) - Gprecalc(3, 4, 9, 14, 0x8, 0xf) - - Gprecalc(0, 4, 8, 12, 0x0, 0x9) - Gprecalc(1, 5, 9, 13, 0x7, 0x5) - Gprecalc(2, 6, 10, 14, 0x4, 0x2) - Gprecalc(3, 7, 11, 15, 0xf, 0xa) - Gprecalc(0, 5, 10, 15, 0x1, 0xe) - Gprecalc(1, 6, 11, 12, 0xc, 0xb) - Gprecalc(2, 7, 8, 13, 0x8, 0x6) - Gprecalc(3, 4, 9, 14, 0xd, 0x3) - - Gprecalc(0, 4, 8, 12, 0xc, 0x2) - Gprecalc(1, 5, 9, 13, 0xa, 0x6) - Gprecalc(2, 6, 10, 14, 0xb, 0x0) - Gprecalc(3, 7, 11, 15, 0x3, 0x8) - Gprecalc(0, 5, 10, 15, 0xd, 0x4) - Gprecalc(1, 6, 11, 12, 0x5, 0x7) - Gprecalc(2, 7, 8, 13, 0xe, 0xf) - Gprecalc(3, 4, 9, 14, 0x9, 0x1) - - uint64_t *outHash = (uint64_t *)outputHash + 8 * thread; - outHash[0] = devectorizeswap(h[0] ^ v[0] ^ v[8]); - outHash[1] = devectorizeswap(h[1] ^ v[1] ^ v[9]); - outHash[2] = 
devectorizeswap(h[2] ^ v[2] ^ v[10]); - outHash[3] = devectorizeswap(h[3] ^ v[3] ^ v[11]); - outHash[4] = devectorizeswap(h[4] ^ v[4] ^ v[12]); - outHash[5] = devectorizeswap(h[5] ^ v[5] ^ v[13]); - outHash[6] = devectorizeswap(h[6] ^ v[6] ^ v[14]); - outHash[7] = devectorizeswap(h[7] ^ v[7] ^ v[15]); + */ + const uint2 h[8] = { + {0xf3bcc908UL, 0x6a09e667UL}, + {0x84caa73bUL, 0xbb67ae85UL}, + {0xfe94f82bUL, 0x3c6ef372UL}, + {0x5f1d36f1UL, 0xa54ff53aUL}, + {0xade682d1UL, 0x510e527fUL}, + {0x2b3e6c1fUL, 0x9b05688cUL}, + {0xfb41bd6bUL, 0x1f83d9abUL}, + {0x137e2179UL, 0x5be0cd19UL} + }; + uint2 v[16]; + uint28 *outpt = (uint28*)v; + outpt[0] = Hostprecalc[0]; + outpt[1] = Hostprecalc[1]; + outpt[2] = Hostprecalc[2]; + outpt[3] = Hostprecalc[3]; + + int i = 0; + + v[0] += (block[9] ^ c_u512[8]); + v[15] = ROR16(v[15] ^ v[0]); + v[10] += v[15]; + v[5] = ROR2(v[5] ^ v[10], 11); + + GSPREC_SP(0, 4, 8, 12) + + // Gprecalc(1, 5, 9, 13, 0x8, 0x4) + v[1] += v[5]; + v[13] = eorswap32(v[13], v[1]); + v[9] += v[13]; + + v[5] = ROR2(v[5] ^ v[9], 25); + v[1] += (pre[i++]) + v[5]; + v[13] = ROR16(v[13] ^ v[1]); + v[9] += v[13]; + v[5] = ROR2(v[5] ^ v[9], 11); + + // Gprecalc(2, 6, 10, 14, 0xf, 0x9) + v[2] += (block[9] ^ c_u512[0xf]); + v[14] = eorswap32(v[14], v[2]); + v[10] += v[14]; + v[6] = ROR2(v[6] ^ v[10], 25); + v[2] += pre[i++] + v[6]; + v[14] = ROR16(v[14] ^ v[2]); + v[10] += v[14]; + v[6] = ROR2(v[6] ^ v[10], 11); + + // Gprecalc(3, 7, 11, 15, 0x6, 0xd) + v[15] = eorswap32(v[15], v[3]); + v[11] += v[15]; + v[7] = ROR2(v[7] ^ v[11], 25); + v[3] += pre[i++] + v[7]; + v[15] = ROR16(v[15] ^ v[3]); + v[11] += v[15]; + v[7] = ROR2(v[7] ^ v[11], 11); + + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP(3, 4, 9, 14) + + GSPREC_SP(0, 4, 8, 12) + GSPREC_SP(1, 5, 9, 13) + GSPREC_SP(2, 6, 10, 14) + GSPREC_SP(3, 7, 11, 15) + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP_HI(3, 4, 9, 14, 0x4, 0x9) + + 
GSPREC_SP_LO(0, 4, 8, 12, 0x9, 0x7) + GSPREC_SP(1, 5, 9, 13) + GSPREC_SP(2, 6, 10, 14) + GSPREC_SP(3, 7, 11, 15) + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP(3, 4, 9, 14) + + + GSPREC_SP_HI(0, 4, 8, 12, 0x0, 0x9) + GSPREC_SP(1, 5, 9, 13) + GSPREC_SP(2, 6, 10, 14) + GSPREC_SP(3, 7, 11, 15) + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP(3, 4, 9, 14) + + GSPREC_SP(0, 4, 8, 12) + GSPREC_SP(1, 5, 9, 13) + GSPREC_SP(2, 6, 10, 14) + GSPREC_SP(3, 7, 11, 15) + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP_LO(3, 4, 9, 14, 0x9, 0x1) + + GSPREC_SP(0, 4, 8, 12) + GSPREC_SP(1, 5, 9, 13) + GSPREC_SP(2, 6, 10, 14) + GSPREC_SP(3, 7, 11, 15) + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP_HI(2, 7, 8, 13, 0x2, 0x9) + GSPREC_SP(3, 4, 9, 14) + + + GSPREC_SP(0, 4, 8, 12) + GSPREC_SP(1, 5, 9, 13) + GSPREC_SP(2, 6, 10, 14) + GSPREC_SP_LO(3, 7, 11, 15, 0x9, 0x3) + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP(3, 4, 9, 14) + + GSPREC_SP(0, 4, 8, 12) + GSPREC_SP_LO(1, 5, 9, 13, 0x9, 0xe) + GSPREC_SP(2, 6, 10, 14) + GSPREC_SP(3, 7, 11, 15) + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP(3, 4, 9, 14) + + GSPREC_SP(0, 4, 8, 12) + GSPREC_SP(1, 5, 9, 13) + GSPREC_SP(2, 6, 10, 14) + GSPREC_SP(3, 7, 11, 15) + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP_HI(1, 6, 11, 12, 0xe, 0x9) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP(3, 4, 9, 14) + + GSPREC_SP(0, 4, 8, 12) + GSPREC_SP(1, 5, 9, 13) + GSPREC_SP(2, 6, 10, 14) + GSPREC_SP(3, 7, 11, 15) + GSPREC_SP_LO(0, 5, 10, 15, 0x9, 0x8) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP(3, 4, 9, 14) + + GSPREC_SP(0, 4, 8, 12) + GSPREC_SP(1, 5, 9, 13) + GSPREC_SP_HI(2, 6, 10, 14, 0xf, 0x9) + GSPREC_SP(3, 7, 11, 15) + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP(3, 4, 9, 14) + + GSPREC_SP(0, 4, 8, 12) + 
GSPREC_SP(1, 5, 9, 13) + GSPREC_SP(2, 6, 10, 14) + GSPREC_SP(3, 7, 11, 15) + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP_HI(3, 4, 9, 14, 0x4, 0x9) + + GSPREC_SP_LO(0, 4, 8, 12, 0x9, 0x7) + GSPREC_SP(1, 5, 9, 13) + GSPREC_SP(2, 6, 10, 14) + GSPREC_SP(3, 7, 11, 15) + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP(3, 4, 9, 14) + + GSPREC_SP_HI(0, 4, 8, 12, 0x0, 0x9) + GSPREC_SP(1, 5, 9, 13) + GSPREC_SP(2, 6, 10, 14) + GSPREC_SP(3, 7, 11, 15) + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP(3, 4, 9, 14) + + GSPREC_SP(0, 4, 8, 12) + GSPREC_SP(1, 5, 9, 13) + GSPREC_SP(2, 6, 10, 14) + GSPREC_SP(3, 7, 11, 15) + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP_LO(3, 4, 9, 14, 0x9, 0x1) + + + /* Gprecalc(0, 5, 10, 15, 0xc, 0x1) + Gprecalc(1, 6, 11, 12, 0x2, 0x0) + Gprecalc(2, 7, 8, 13, 0x7, 0xb) + Gprecalc(3, 4, 9, 14, 0x3, 0x5) + Gprecalc(0, 4, 8, 12, 0x8, 0xb) + Gprecalc(1, 5, 9, 13, 0x0, 0xc) + Gprecalc(2, 6, 10, 14, 0x2, 0x5) + Gprecalc(3, 7, 11, 15, 0xd, 0xf) + Gprecalc(0, 5, 10, 15, 0xe, 0xa) + Gprecalc(1, 6, 11, 12, 0x6, 0x3) + Gprecalc(2, 7, 8, 13, 0x1, 0x7) + Gprecalc(3, 4, 9, 14, 0x4, 0x9) + Gprecalc(0, 4, 8, 12, 0x9, 0x7) + Gprecalc(1, 5, 9, 13, 0x1, 0x3) + Gprecalc(2, 6, 10, 14, 0xc, 0xd) + Gprecalc(3, 7, 11, 15, 0xe, 0xb) + Gprecalc(0, 5, 10, 15, 0x6, 0x2) + Gprecalc(1, 6, 11, 12, 0xa, 0x5) + Gprecalc(2, 7, 8, 13, 0x0, 0x4) + Gprecalc(3, 4, 9, 14, 0x8, 0xf) + Gprecalc(0, 4, 8, 12, 0x0, 0x9) + Gprecalc(1, 5, 9, 13, 0x7, 0x5) + Gprecalc(2, 6, 10, 14, 0x4, 0x2) + Gprecalc(3, 7, 11, 15, 0xf, 0xa) + Gprecalc(0, 5, 10, 15, 0x1, 0xe) + Gprecalc(1, 6, 11, 12, 0xc, 0xb) + Gprecalc(2, 7, 8, 13, 0x8, 0x6) + Gprecalc(3, 4, 9, 14, 0xd, 0x3) + Gprecalc(0, 4, 8, 12, 0xc, 0x2) + Gprecalc(1, 5, 9, 13, 0xa, 0x6) + Gprecalc(2, 6, 10, 14, 0xb, 0x0) + Gprecalc(3, 7, 11, 15, 0x3, 0x8) + Gprecalc(0, 5, 10, 15, 0xd, 0x4) + 
Gprecalc(1, 6, 11, 12, 0x5, 0x7) + Gprecalc(2, 7, 8, 13, 0xe, 0xf) + Gprecalc(3, 4, 9, 14, 0x9, 0x1) + Gprecalc(0, 4, 8, 12, 0x5, 0xc) + Gprecalc(1, 5, 9, 13, 0xf, 0x1) + Gprecalc(2, 6, 10, 14, 0xd, 0xe) + Gprecalc(3, 7, 11, 15, 0xa, 0x4) + Gprecalc(0, 5, 10, 15, 0x7, 0x0) + Gprecalc(1, 6, 11, 12, 0x3, 0x6) + Gprecalc(2, 7, 8, 13, 0x2, 0x9) + Gprecalc(3, 4, 9, 14, 0xb, 0x8) + Gprecalc(0, 4, 8, 12, 0xb, 0xd) + Gprecalc(1, 5, 9, 13, 0xe, 0x7) + Gprecalc(2, 6, 10, 14, 0x1, 0xc) + Gprecalc(3, 7, 11, 15, 0x9, 0x3) + Gprecalc(0, 5, 10, 15, 0x0, 0x5) + Gprecalc(1, 6, 11, 12, 0x4, 0xf) + Gprecalc(2, 7, 8, 13, 0x6, 0x8) + Gprecalc(3, 4, 9, 14, 0xa, 0x2) + Gprecalc(0, 4, 8, 12, 0xf, 0x6) + Gprecalc(1, 5, 9, 13, 0x9, 0xe) + Gprecalc(2, 6, 10, 14, 0x3, 0xb) + Gprecalc(3, 7, 11, 15, 0x8, 0x0) + Gprecalc(0, 5, 10, 15, 0x2, 0xc) + Gprecalc(1, 6, 11, 12, 0x7, 0xd) + Gprecalc(2, 7, 8, 13, 0x4, 0x1) + Gprecalc(3, 4, 9, 14, 0x5, 0xa) + Gprecalc(0, 4, 8, 12, 0x2, 0xa) + Gprecalc(1, 5, 9, 13, 0x4, 0x8) + Gprecalc(2, 6, 10, 14, 0x6, 0x7) + Gprecalc(3, 7, 11, 15, 0x5, 0x1) + Gprecalc(0, 5, 10, 15, 0xb, 0xf) + Gprecalc(1, 6, 11, 12, 0xe, 0x9) + Gprecalc(2, 7, 8, 13, 0xc, 0x3) + Gprecalc(3, 4, 9, 14, 0x0, 0xd) + Gprecalc(0, 4, 8, 12, 0x1, 0x0) + Gprecalc(1, 5, 9, 13, 0x3, 0x2) + Gprecalc(2, 6, 10, 14, 0x5, 0x4) + Gprecalc(3, 7, 11, 15, 0x7, 0x6) + Gprecalc(0, 5, 10, 15, 0x9, 0x8) + Gprecalc(1, 6, 11, 12, 0xb, 0xa) + Gprecalc(2, 7, 8, 13, 0xd, 0xc) + Gprecalc(3, 4, 9, 14, 0xf, 0xe) + Gprecalc(0, 4, 8, 12, 0xa, 0xe) + Gprecalc(1, 5, 9, 13, 0x8, 0x4) + Gprecalc(2, 6, 10, 14, 0xf, 0x9) + Gprecalc(3, 7, 11, 15, 0x6, 0xd) + Gprecalc(0, 5, 10, 15, 0xc, 0x1) + Gprecalc(1, 6, 11, 12, 0x2, 0x0) + Gprecalc(2, 7, 8, 13, 0x7, 0xb) + Gprecalc(3, 4, 9, 14, 0x3, 0x5) + Gprecalc(0, 4, 8, 12, 0x8, 0xb) + Gprecalc(1, 5, 9, 13, 0x0, 0xc) + Gprecalc(2, 6, 10, 14, 0x2, 0x5) + Gprecalc(3, 7, 11, 15, 0xd, 0xf) + Gprecalc(0, 5, 10, 15, 0xe, 0xa) + Gprecalc(1, 6, 11, 12, 0x6, 0x3) + Gprecalc(2, 7, 8, 13, 0x1, 0x7) 
+ Gprecalc(3, 4, 9, 14, 0x4, 0x9) + Gprecalc(0, 4, 8, 12, 0x9, 0x7) + Gprecalc(1, 5, 9, 13, 0x1, 0x3) + Gprecalc(2, 6, 10, 14, 0xc, 0xd) + Gprecalc(3, 7, 11, 15, 0xe, 0xb) + Gprecalc(0, 5, 10, 15, 0x6, 0x2) + Gprecalc(1, 6, 11, 12, 0xa, 0x5) + Gprecalc(2, 7, 8, 13, 0x0, 0x4) + Gprecalc(3, 4, 9, 14, 0x8, 0xf) + Gprecalc(0, 4, 8, 12, 0x0, 0x9) + Gprecalc(1, 5, 9, 13, 0x7, 0x5) + Gprecalc(2, 6, 10, 14, 0x4, 0x2) + Gprecalc(3, 7, 11, 15, 0xf, 0xa) + Gprecalc(0, 5, 10, 15, 0x1, 0xe) + Gprecalc(1, 6, 11, 12, 0xc, 0xb) + Gprecalc(2, 7, 8, 13, 0x8, 0x6) + Gprecalc(3, 4, 9, 14, 0xd, 0x3) + Gprecalc(0, 4, 8, 12, 0xc, 0x2) + Gprecalc(1, 5, 9, 13, 0xa, 0x6) + Gprecalc(2, 6, 10, 14, 0xb, 0x0) + Gprecalc(3, 7, 11, 15, 0x3, 0x8) + Gprecalc(0, 5, 10, 15, 0xd, 0x4) + Gprecalc(1, 6, 11, 12, 0x5, 0x7) + Gprecalc(2, 7, 8, 13, 0xe, 0xf) + Gprecalc(3, 4, 9, 14, 0x9, 0x1) + */ + v[0] = cuda_swap(h[0] ^ v[0] ^ v[8]); + v[1] = cuda_swap(h[1] ^ v[1] ^ v[9]); + v[2] = cuda_swap(h[2] ^ v[2] ^ v[10]); + v[3] = cuda_swap(h[3] ^ v[3] ^ v[11]); + v[4] = cuda_swap(h[4] ^ v[4] ^ v[12]); + v[5] = cuda_swap(h[5] ^ v[5] ^ v[13]); + v[6] = cuda_swap(h[6] ^ v[6] ^ v[14]); + v[7] = cuda_swap(h[7] ^ v[7] ^ v[15]); + + uint28 *phash = (uint28*)v; + outpt = (uint28*)&outputHash[8 * thread]; + outpt[0] = phash[0]; + outpt[1] = phash[1]; } } + // ---------------------------- END CUDA quark_blake512 functions ------------------------------------ +__host__ void quark_blake512_cpu_init(int thr_id) +{ + CUDA_SAFE_CALL(cudaMalloc(&c_PaddedMessage80[thr_id], 10 * sizeof(uint2))); +} -// Blake512 für 80 Byte grosse Eingangsdaten -__host__ void quark_blake512_cpu_setBlock_80(void *pdata) +__host__ void quark_blake512_cpu_setBlock_80_multi(uint32_t thr_id, uint64_t *pdata) { - // Message mit Padding bereitstellen - // lediglich die korrekte Nonce ist noch ab Byte 76 einzusetzen. 
- unsigned char PaddedMessage[128]; - memcpy(PaddedMessage, pdata, 80); - memset(PaddedMessage+80, 0, 48); - PaddedMessage[80] = 0x80; - PaddedMessage[111] = 1; - PaddedMessage[126] = 0x02; - PaddedMessage[127] = 0x80; - for (int i = 0; i < 16; i++) - ((uint64_t*)PaddedMessage)[i] = cuda_swab64(((uint64_t*)PaddedMessage)[i]); - CUDA_SAFE_CALL( - cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice) - ); + uint64_t PaddedMessage[10]; + for(int i = 0; i < 10; i++) + PaddedMessage[i] = cuda_swab64(pdata[i]); + CUDA_SAFE_CALL(cudaMemcpy(c_PaddedMessage80[thr_id], PaddedMessage, 10 * sizeof(uint64_t), cudaMemcpyHostToDevice)); } +__host__ void quark_blake512_cpu_setBlock_80(int thr_id, uint64_t *pdata) +{ + const uint64_t u512[16] = + { + 0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, + 0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL, + 0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, + 0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL, + 0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL, + 0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL, + 0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, + 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL + }; + + uint64_t h[8] = { + 0x6a09e667f3bcc908ULL, + 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, + 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, + 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, + 0x5be0cd19137e2179ULL + }; + + uint64_t v[16] = + { + h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7], + u512[0], u512[1], u512[2], u512[3], u512[4] ^ 640, u512[5] ^ 640, u512[6], u512[7] + }; + + uint64_t PaddedMessage[10]; + uint64_t block[16]; + uint64_t prehost[224]; + + for(int i = 0; i < 10; i++) + { + PaddedMessage[i] = cuda_swab64(pdata[i]); + block[i] = PaddedMessage[i]; + } + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(c_PaddedM, PaddedMessage, 10 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); + + block[10] = 0x8000000000000000; + block[11] = 0; + block[12] = 0; + block[13] = 1; + block[14] 
= 0; + block[15] = 0x280; + + GprecalcHost(0, 4, 8, 12, 0x1, 0x0) + GprecalcHost(1, 5, 9, 13, 0x3, 0x2) + GprecalcHost(2, 6, 10, 14, 0x5, 0x4) + GprecalcHost(3, 7, 11, 15, 0x7, 0x6) + + GprecalcHost(1, 6, 11, 12, 0xb, 0xa) + GprecalcHost(2, 7, 8, 13, 0xd, 0xc) + + v[0] += (block[8] ^ u512[9]) + v[5]; + v[15] = ROTR64(v[15] ^ v[0], 32); + v[10] += v[15]; + v[5] = ROTR64(v[5] ^ v[10], 25); + v[0] += v[5]; + + GprecalcHost(3, 4, 9, 14, 0xf, 0xe); + + v[1] += (block[0x4] ^ u512[0x8]); + v[2] += v[6]; + v[3] += (block[0xd] ^ u512[6]) + v[7]; + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(Hostprecalc, v, 16 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); + + int i = 0; + RSPRECHOST(0xa, 0xe); + prehost[i++] = block[8] ^ u512[4]; + prehost[i++] = block[0xf] ^ u512[9]; + prehost[i++] = block[6] ^ u512[0xd]; + + + RSPRECHOST(0xc, 0x1) + RSPRECHOST(0x2, 0x0) + RSPRECHOST(0x7, 0xb) + RSPRECHOST(0x3, 0x5) + + RSPRECHOST(0x8, 0xb) + RSPRECHOST(0x0, 0xc) + RSPRECHOST(0x2, 0x5) + RSPRECHOST(0xd, 0xf) + RSPRECHOST(0xe, 0xa) + RSPRECHOST(0x6, 0x3) + RSPRECHOST(0x1, 0x7) + RSPRECHOSTHI(0x4, 0x9) + + RSPRECHOSTLO(0x9, 0x7) + RSPRECHOST(0x1, 0x3) + RSPRECHOST(0xc, 0xd) + RSPRECHOST(0xe, 0xb) + RSPRECHOST(0x6, 0x2) + RSPRECHOST(0xa, 0x5) + RSPRECHOST(0x0, 0x4) + RSPRECHOST(0x8, 0xf) + + RSPRECHOSTHI(0, 0x9) + RSPRECHOST(0x7, 0x5) + RSPRECHOST(0x4, 0x2) + RSPRECHOST(0xf, 0xa) + RSPRECHOST(0x1, 0xe) + RSPRECHOST(0xc, 0xb) + RSPRECHOST(0x8, 0x6) + RSPRECHOST(0xd, 0x3) + + RSPRECHOST(0xc, 0x2) + RSPRECHOST(0xa, 0x6) + RSPRECHOST(0xb, 0x0) + RSPRECHOST(0x3, 0x8) + RSPRECHOST(0xd, 0x4) + RSPRECHOST(0x5, 0x7) + RSPRECHOST(0xe, 0xf) + RSPRECHOSTLO(0x9, 0x1) + + RSPRECHOST(0x5, 0xc) + RSPRECHOST(0xf, 0x1) + RSPRECHOST(0xd, 0xe) + RSPRECHOST(0xa, 0x4) + RSPRECHOST(0x7, 0x0) + RSPRECHOST(0x3, 0x6) + RSPRECHOSTHI(0x2, 0x9) + RSPRECHOST(0xb, 0x8) + + RSPRECHOST(0xb, 0xd) + RSPRECHOST(0xe, 0x7) + RSPRECHOST(0x1, 0xc) + RSPRECHOSTLO(0x9, 0x3) + RSPRECHOST(0x0, 0x5) + RSPRECHOST(0x4, 
0xf) + RSPRECHOST(0x6, 0x8) + RSPRECHOST(0xa, 0x2) + + RSPRECHOST(0xf, 0x6) + RSPRECHOSTLO(0x9, 0xe) + RSPRECHOST(0x3, 0xb) + RSPRECHOST(0x8, 0x0) + RSPRECHOST(0x2, 0xc) + RSPRECHOST(0x7, 0xd) + RSPRECHOST(0x4, 0x1) + RSPRECHOST(0x5, 0xa) + + RSPRECHOST(0x2, 0xa) + RSPRECHOST(0x4, 0x8) + RSPRECHOST(0x6, 0x7) + RSPRECHOST(0x5, 0x1) + RSPRECHOST(0xb, 0xf) + RSPRECHOSTHI(0xe, 0x9) + RSPRECHOST(0xc, 0x3) + RSPRECHOST(0x0, 0xd) + + RSPRECHOST(0x1, 0x0) + RSPRECHOST(0x3, 0x2) + RSPRECHOST(0x5, 0x4) + RSPRECHOST(0x7, 0x6) + RSPRECHOSTLO(0x9, 0x8) + RSPRECHOST(0xb, 0xa) + RSPRECHOST(0xd, 0xc) + RSPRECHOST(0xf, 0xe) + + RSPRECHOST(0xa, 0xe) + RSPRECHOST(0x8, 0x4) + RSPRECHOSTHI(0xf, 0x9) + RSPRECHOST(0x6, 0xd) + RSPRECHOST(0xc, 0x1) + RSPRECHOST(0x2, 0x0) + RSPRECHOST(0x7, 0xb) + RSPRECHOST(0x3, 0x5) + + RSPRECHOST(0x8, 0xb) + RSPRECHOST(0x0, 0xc) + RSPRECHOST(0x2, 0x5) + RSPRECHOST(0xd, 0xf) + RSPRECHOST(0xe, 0xa) + RSPRECHOST(0x6, 0x3) + RSPRECHOST(0x1, 0x7) + RSPRECHOSTHI(0x4, 0x9) + + RSPRECHOSTLO(0x9, 0x7) + RSPRECHOST(0x1, 0x3) + RSPRECHOST(0xc, 0xd) + RSPRECHOST(0xe, 0xb) + RSPRECHOST(0x6, 0x2) + RSPRECHOST(0xa, 0x5) + RSPRECHOST(0x0, 0x4) + RSPRECHOST(0x8, 0xf) + + RSPRECHOSTHI(0x0, 0x9) + RSPRECHOST(0x7, 0x5) + RSPRECHOST(0x4, 0x2) + RSPRECHOST(0xf, 0xa) + RSPRECHOST(0x1, 0xe) + RSPRECHOST(0xc, 0xb) + RSPRECHOST(0x8, 0x6) + RSPRECHOST(0xd, 0x3) + + RSPRECHOST(0xc, 0x2) + RSPRECHOST(0xa, 0x6) + RSPRECHOST(0xb, 0x0) + RSPRECHOST(0x3, 0x8) + RSPRECHOST(0xd, 0x4) + RSPRECHOST(0x5, 0x7) + RSPRECHOST(0xe, 0xf) + RSPRECHOSTLO(0x9, 0x1) + + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(pre, prehost, 224 * 8, 0, cudaMemcpyHostToDevice, gpustream[thr_id])); +} -__host__ void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order) +__host__ void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash) { const uint32_t 
threadsperblock = 32; // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - quark_blake512_gpu_hash_64<<>>(threads, startNounce, d_nonceVector, (uint64_t*)d_outputHash); + quark_blake512_gpu_hash_64 << > >(threads, startNounce, d_nonceVector, (uint2 *)d_outputHash); } -__host__ void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order) +__host__ void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash) { const uint32_t threadsperblock = 32; // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - quark_blake512_gpu_hash_80<<>>(threads, startNounce, d_outputHash); -// MyStreamSynchronize(NULL, order, thr_id); + quark_blake512_gpu_hash_80 << > >(threads, startNounce, (uint2 *)d_outputHash); + CUDA_SAFE_CALL(cudaGetLastError()); } diff --git a/quark/cuda_quark_blake512.cu.orig b/quark/cuda_quark_blake512.cu.orig new file mode 100644 index 0000000000..1a16b9e65b --- /dev/null +++ b/quark/cuda_quark_blake512.cu.orig @@ -0,0 +1,804 @@ +#include +#include + +#include "cuda_helper.h" + +#define ROTR(x,n) ROTR64(x,n) + +#define USE_SHUFFLE 0 + +__constant__ uint2 c_PaddedM[16]; +__constant__ uint2 Hostprecalc[16]; + +// ---------------------------- BEGIN CUDA quark_blake512 functions ------------------------------------ + +#define Gprecalc(a,b,c,d,idx1,idx2) { \ + v[a] += (block[idx2] ^ u512[idx1]) + v[b]; \ + v[d] = SWAPDWORDS2( v[d] ^ v[a]); \ + v[c] += v[d]; \ + v[b] = ROR2(v[b] ^ v[c], 25); \ + v[a] += (block[idx1] ^ u512[idx2]) + v[b]; \ + v[d] = ROR16(v[d] ^ v[a]); \ + v[c] += v[d]; \ + v[b] = ROR2(v[b] ^ v[c], 11); \ + } + + +#define GprecalcHost(a,b,c,d,idx1,idx2) { 
\ + v[a] += (block[idx2] ^ u512[idx1]) + v[b]; \ + v[d] = ROTR64( v[d] ^ v[a],32); \ + v[c] += v[d]; \ + v[b] = ROTR64(v[b] ^ v[c], 25); \ + v[a] += (block[idx1] ^ u512[idx2]) + v[b]; \ + v[d] = ROTR64(v[d] ^ v[a],16); \ + v[c] += v[d]; \ + v[b] = ROTR64(v[b] ^ v[c], 11); \ + } + + +__global__ +#if __CUDA_ARCH__ > 500 + __launch_bounds__(256, 1) +#else + __launch_bounds__(256, 2) +#endif +void quark_blake512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ g_nonceVector, uint64_t *const __restrict__ g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + +#if USE_SHUFFLE + const int warpID = threadIdx.x & 0x0F; // 16 warps + const int warpBlockID = (thread + 15)>>4; // aufrunden auf volle Warp-Blöcke + const int maxhashPosition = thread<<3; +#endif + +#if USE_SHUFFLE + if (warpBlockID < ( (threads+15)>>4 )) +#else + if (thread < threads) +#endif + { + const uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + + const uint32_t hashPosition = nounce - startNounce; + + uint64_t *inpHash = &g_hash[hashPosition*8]; + uint2 block[16]; + block[0] = vectorizeswap(inpHash[0]); + block[1] = vectorizeswap(inpHash[1]); + block[2] = vectorizeswap(inpHash[2]); + block[3] = vectorizeswap(inpHash[3]); + block[4] = vectorizeswap(inpHash[4]); + block[5] = vectorizeswap(inpHash[5]); + block[6] = vectorizeswap(inpHash[6]); + block[7] = vectorizeswap(inpHash[7]); + block[8] = vectorizehigh(0x80000000); + block[9] = vectorizelow(0x0); + block[10] = vectorizelow(0x0); + block[11] = vectorizelow(0x0); + block[12] = vectorizelow(0x0); + block[13] = vectorizelow(0x1); + block[14] = vectorizelow(0x0); + block[15] = vectorizelow(0x200); + + const uint2 h[8] = + { + { 0xf3bcc908UL, 0x6a09e667UL }, + { 0x84caa73bUL, 0xbb67ae85UL }, + { 0xfe94f82bUL, 0x3c6ef372UL }, + { 0x5f1d36f1UL, 0xa54ff53aUL }, + { 0xade682d1UL, 0x510e527fUL }, + { 0x2b3e6c1fUL, 0x9b05688cUL }, + { 0xfb41bd6bUL, 0x1f83d9abUL }, 
+ { 0x137e2179UL, 0x5be0cd19UL } + }; + const uint2 u512[16] = + { + { 0x85a308d3UL, 0x243f6a88 }, { 0x03707344UL, 0x13198a2e }, + { 0x299f31d0UL, 0xa4093822 }, { 0xec4e6c89UL, 0x082efa98 }, + { 0x38d01377UL, 0x452821e6 }, { 0x34e90c6cUL, 0xbe5466cf }, + { 0xc97c50ddUL, 0xc0ac29b7 }, { 0xb5470917UL, 0x3f84d5b5 }, + { 0x8979fb1bUL, 0x9216d5d9 }, { 0x98dfb5acUL, 0xd1310ba6 }, + { 0xd01adfb7UL, 0x2ffd72db }, { 0x6a267e96UL, 0xb8e1afed }, + { 0xf12c7f99UL, 0xba7c9045 }, { 0xb3916cf7UL, 0x24a19947 }, + { 0x858efc16UL, 0x0801f2e2 }, { 0x71574e69UL, 0x636920d8 } + }; + + uint2 v[16] = + { + h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7], + u512[0], u512[1], u512[2], u512[3], u512[4] ^ 512, u512[5] ^ 512, u512[6], u512[7] + }; + + Gprecalc(0, 4, 8, 12, 0x1, 0x0) + Gprecalc(1, 5, 9, 13, 0x3, 0x2) + Gprecalc(2, 6, 10, 14, 0x5, 0x4) + Gprecalc(3, 7, 11, 15, 0x7, 0x6) + Gprecalc(0, 5, 10, 15, 0x9, 0x8) + Gprecalc(1, 6, 11, 12, 0xb, 0xa) + Gprecalc(2, 7, 8, 13, 0xd, 0xc) + Gprecalc(3, 4, 9, 14, 0xf, 0xe) + + Gprecalc(0, 4, 8, 12, 0xa, 0xe) + Gprecalc(1, 5, 9, 13, 0x8, 0x4) + Gprecalc(2, 6, 10, 14, 0xf, 0x9) + Gprecalc(3, 7, 11, 15, 0x6, 0xd) + Gprecalc(0, 5, 10, 15, 0xc, 0x1) + Gprecalc(1, 6, 11, 12, 0x2, 0x0) + Gprecalc(2, 7, 8, 13, 0x7, 0xb) + Gprecalc(3, 4, 9, 14, 0x3, 0x5) + + Gprecalc(0, 4, 8, 12, 0x8, 0xb) + Gprecalc(1, 5, 9, 13, 0x0, 0xc) + Gprecalc(2, 6, 10, 14, 0x2, 0x5) + Gprecalc(3, 7, 11, 15, 0xd, 0xf) + Gprecalc(0, 5, 10, 15, 0xe, 0xa) + Gprecalc(1, 6, 11, 12, 0x6, 0x3) + Gprecalc(2, 7, 8, 13, 0x1, 0x7) + Gprecalc(3, 4, 9, 14, 0x4, 0x9) + + Gprecalc(0, 4, 8, 12, 0x9, 0x7) + Gprecalc(1, 5, 9, 13, 0x1, 0x3) + Gprecalc(2, 6, 10, 14, 0xc, 0xd) + Gprecalc(3, 7, 11, 15, 0xe, 0xb) + Gprecalc(0, 5, 10, 15, 0x6, 0x2) + Gprecalc(1, 6, 11, 12, 0xa, 0x5) + Gprecalc(2, 7, 8, 13, 0x0, 0x4) + Gprecalc(3, 4, 9, 14, 0x8, 0xf) + + Gprecalc(0, 4, 8, 12, 0x0, 0x9) + Gprecalc(1, 5, 9, 13, 0x7, 0x5) + Gprecalc(2, 6, 10, 14, 0x4, 0x2) + Gprecalc(3, 7, 11, 15, 0xf, 0xa) + Gprecalc(0, 5, 
10, 15, 0x1, 0xe) + Gprecalc(1, 6, 11, 12, 0xc, 0xb) + Gprecalc(2, 7, 8, 13, 0x8, 0x6) + Gprecalc(3, 4, 9, 14, 0xd, 0x3) + + Gprecalc(0, 4, 8, 12, 0xc, 0x2) + Gprecalc(1, 5, 9, 13, 0xa, 0x6) + Gprecalc(2, 6, 10, 14, 0xb, 0x0) + Gprecalc(3, 7, 11, 15, 0x3, 0x8) + Gprecalc(0, 5, 10, 15, 0xd, 0x4) + Gprecalc(1, 6, 11, 12, 0x5, 0x7) + Gprecalc(2, 7, 8, 13, 0xe, 0xf) + Gprecalc(3, 4, 9, 14, 0x9, 0x1) + + Gprecalc(0, 4, 8, 12, 0x5, 0xc) + Gprecalc(1, 5, 9, 13, 0xf, 0x1) + Gprecalc(2, 6, 10, 14, 0xd, 0xe) + Gprecalc(3, 7, 11, 15, 0xa, 0x4) + Gprecalc(0, 5, 10, 15, 0x7, 0x0) + Gprecalc(1, 6, 11, 12, 0x3, 0x6) + Gprecalc(2, 7, 8, 13, 0x2, 0x9) + Gprecalc(3, 4, 9, 14, 0xb, 0x8) + + Gprecalc(0, 4, 8, 12, 0xb, 0xd) + Gprecalc(1, 5, 9, 13, 0xe, 0x7) + Gprecalc(2, 6, 10, 14, 0x1, 0xc) + Gprecalc(3, 7, 11, 15, 0x9, 0x3) + Gprecalc(0, 5, 10, 15, 0x0, 0x5) + Gprecalc(1, 6, 11, 12, 0x4, 0xf) + Gprecalc(2, 7, 8, 13, 0x6, 0x8) + Gprecalc(3, 4, 9, 14, 0xa, 0x2) + + Gprecalc(0, 4, 8, 12, 0xf, 0x6) + Gprecalc(1, 5, 9, 13, 0x9, 0xe) + Gprecalc(2, 6, 10, 14, 0x3, 0xb) + Gprecalc(3, 7, 11, 15, 0x8, 0x0) + Gprecalc(0, 5, 10, 15, 0x2, 0xc) + Gprecalc(1, 6, 11, 12, 0x7, 0xd) + Gprecalc(2, 7, 8, 13, 0x4, 0x1) + Gprecalc(3, 4, 9, 14, 0x5, 0xa) + + Gprecalc(0, 4, 8, 12, 0x2, 0xa) + Gprecalc(1, 5, 9, 13, 0x4, 0x8) + Gprecalc(2, 6, 10, 14, 0x6, 0x7) + Gprecalc(3, 7, 11, 15, 0x5, 0x1) + Gprecalc(0, 5, 10, 15, 0xb, 0xf) + Gprecalc(1, 6, 11, 12, 0xe, 0x9) + Gprecalc(2, 7, 8, 13, 0xc, 0x3) + Gprecalc(3, 4, 9, 14, 0x0, 0xd) + + Gprecalc(0, 4, 8, 12, 0x1, 0x0) + Gprecalc(1, 5, 9, 13, 0x3, 0x2) + Gprecalc(2, 6, 10, 14, 0x5, 0x4) + Gprecalc(3, 7, 11, 15, 0x7, 0x6) + Gprecalc(0, 5, 10, 15, 0x9, 0x8) + Gprecalc(1, 6, 11, 12, 0xb, 0xa) + Gprecalc(2, 7, 8, 13, 0xd, 0xc) + Gprecalc(3, 4, 9, 14, 0xf, 0xe) + + Gprecalc(0, 4, 8, 12, 0xa, 0xe) + Gprecalc(1, 5, 9, 13, 0x8, 0x4) + Gprecalc(2, 6, 10, 14, 0xf, 0x9) + Gprecalc(3, 7, 11, 15, 0x6, 0xd) + Gprecalc(0, 5, 10, 15, 0xc, 0x1) + Gprecalc(1, 6, 11, 12, 0x2, 0x0) 
+ Gprecalc(2, 7, 8, 13, 0x7, 0xb) + Gprecalc(3, 4, 9, 14, 0x3, 0x5) + + Gprecalc(0, 4, 8, 12, 0x8, 0xb) + Gprecalc(1, 5, 9, 13, 0x0, 0xc) + Gprecalc(2, 6, 10, 14, 0x2, 0x5) + Gprecalc(3, 7, 11, 15, 0xd, 0xf) + Gprecalc(0, 5, 10, 15, 0xe, 0xa) + Gprecalc(1, 6, 11, 12, 0x6, 0x3) + Gprecalc(2, 7, 8, 13, 0x1, 0x7) + Gprecalc(3, 4, 9, 14, 0x4, 0x9) + + Gprecalc(0, 4, 8, 12, 0x9, 0x7) + Gprecalc(1, 5, 9, 13, 0x1, 0x3) + Gprecalc(2, 6, 10, 14, 0xc, 0xd) + Gprecalc(3, 7, 11, 15, 0xe, 0xb) + Gprecalc(0, 5, 10, 15, 0x6, 0x2) + Gprecalc(1, 6, 11, 12, 0xa, 0x5) + Gprecalc(2, 7, 8, 13, 0x0, 0x4) + Gprecalc(3, 4, 9, 14, 0x8, 0xf) + + Gprecalc(0, 4, 8, 12, 0x0, 0x9) + Gprecalc(1, 5, 9, 13, 0x7, 0x5) + Gprecalc(2, 6, 10, 14, 0x4, 0x2) + Gprecalc(3, 7, 11, 15, 0xf, 0xa) + Gprecalc(0, 5, 10, 15, 0x1, 0xe) + Gprecalc(1, 6, 11, 12, 0xc, 0xb) + Gprecalc(2, 7, 8, 13, 0x8, 0x6) + Gprecalc(3, 4, 9, 14, 0xd, 0x3) + + Gprecalc(0, 4, 8, 12, 0xc, 0x2) + Gprecalc(1, 5, 9, 13, 0xa, 0x6) + Gprecalc(2, 6, 10, 14, 0xb, 0x0) + Gprecalc(3, 7, 11, 15, 0x3, 0x8) + Gprecalc(0, 5, 10, 15, 0xd, 0x4) + Gprecalc(1, 6, 11, 12, 0x5, 0x7) + Gprecalc(2, 7, 8, 13, 0xe, 0xf) + Gprecalc(3, 4, 9, 14, 0x9, 0x1) + + uint64_t *outHash = &g_hash[8 * hashPosition]; + + outHash[0] = devectorizeswap(h[0] ^ v[0] ^ v[8]); + outHash[1] = devectorizeswap(h[1] ^ v[1] ^ v[9]); + outHash[2] = devectorizeswap(h[2] ^ v[2] ^ v[10]); + outHash[3] = devectorizeswap(h[3] ^ v[3] ^ v[11]); + outHash[4] = devectorizeswap(h[4] ^ v[4] ^ v[12]); + outHash[5] = devectorizeswap(h[5] ^ v[5] ^ v[13]); + outHash[6] = devectorizeswap(h[6] ^ v[6] ^ v[14]); + outHash[7] = devectorizeswap(h[7] ^ v[7] ^ v[15]); + } +} + +__global__ +#if __CUDA_ARCH__ > 500 +__launch_bounds__(256, 4) +#else +__launch_bounds__(32, 32) +#endif +void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *outputHash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t nounce = 
startNounce + thread; + uint2 block[16]; + + block[0] = c_PaddedM[0]; + block[1] = c_PaddedM[1]; + block[2] = c_PaddedM[2]; + block[3] = c_PaddedM[3]; + block[4] = c_PaddedM[4]; + block[5] = c_PaddedM[5]; + block[6] = c_PaddedM[6]; + block[7] = c_PaddedM[7]; + block[8] = c_PaddedM[8]; + block[9] = c_PaddedM[9]; + block[10] = vectorizehigh(0x80000000); + block[11] = vectorize(0); + block[12] = vectorize(0); + block[13] = vectorize(0x0000000000000001); + block[14] = vectorize(0); + block[15] = vectorize(0x0000000000000280); + block[9].x = nounce; + const uint2 u512[16] = + { + { 0x85a308d3UL, 0x243f6a88 }, { 0x03707344UL, 0x13198a2e }, + { 0x299f31d0UL, 0xa4093822 }, { 0xec4e6c89UL, 0x082efa98 }, + { 0x38d01377UL, 0x452821e6 }, { 0x34e90c6cUL, 0xbe5466cf }, + { 0xc97c50ddUL, 0xc0ac29b7 }, { 0xb5470917UL, 0x3f84d5b5 }, + { 0x8979fb1bUL, 0x9216d5d9 }, { 0x98dfb5acUL, 0xd1310ba6 }, + { 0xd01adfb7UL, 0x2ffd72db }, { 0x6a267e96UL, 0xb8e1afed }, + { 0xf12c7f99UL, 0xba7c9045 }, { 0xb3916cf7UL, 0x24a19947 }, + { 0x858efc16UL, 0x0801f2e2 }, { 0x71574e69UL, 0x636920d8 } + }; + + const uint2 h[8] = { + { 0xf3bcc908UL, 0x6a09e667UL }, + { 0x84caa73bUL, 0xbb67ae85UL }, + { 0xfe94f82bUL, 0x3c6ef372UL }, + { 0x5f1d36f1UL, 0xa54ff53aUL }, + { 0xade682d1UL, 0x510e527fUL }, + { 0x2b3e6c1fUL, 0x9b05688cUL }, + { 0xfb41bd6bUL, 0x1f83d9abUL }, + { 0x137e2179UL, 0x5be0cd19UL } + }; + + uint2 v[16] = + { + h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7], + u512[0], u512[1], u512[2], u512[3], u512[4] ^ 640, u512[5] ^ 640, u512[6], u512[7] + }; + + Gprecalc(0, 4, 8, 12, 0x1, 0x0) + Gprecalc(1, 5, 9, 13, 0x3, 0x2) + Gprecalc(2, 6, 10, 14, 0x5, 0x4) + Gprecalc(3, 7, 11, 15, 0x7, 0x6) + Gprecalc(0, 5, 10, 15, 0x9, 0x8) + Gprecalc(1, 6, 11, 12, 0xb, 0xa) + Gprecalc(2, 7, 8, 13, 0xd, 0xc) + Gprecalc(3, 4, 9, 14, 0xf, 0xe) + + Gprecalc(0, 4, 8, 12, 0xa, 0xe) + Gprecalc(1, 5, 9, 13, 0x8, 0x4) + Gprecalc(2, 6, 10, 14, 0xf, 0x9) + Gprecalc(3, 7, 11, 15, 0x6, 0xd) + Gprecalc(0, 5, 10, 15, 0xc, 0x1) + 
Gprecalc(1, 6, 11, 12, 0x2, 0x0) + Gprecalc(2, 7, 8, 13, 0x7, 0xb) + Gprecalc(3, 4, 9, 14, 0x3, 0x5) + + Gprecalc(0, 4, 8, 12, 0x8, 0xb) + Gprecalc(1, 5, 9, 13, 0x0, 0xc) + Gprecalc(2, 6, 10, 14, 0x2, 0x5) + Gprecalc(3, 7, 11, 15, 0xd, 0xf) + Gprecalc(0, 5, 10, 15, 0xe, 0xa) + Gprecalc(1, 6, 11, 12, 0x6, 0x3) + Gprecalc(2, 7, 8, 13, 0x1, 0x7) + Gprecalc(3, 4, 9, 14, 0x4, 0x9) + + Gprecalc(0, 4, 8, 12, 0x9, 0x7) + Gprecalc(1, 5, 9, 13, 0x1, 0x3) + Gprecalc(2, 6, 10, 14, 0xc, 0xd) + Gprecalc(3, 7, 11, 15, 0xe, 0xb) + Gprecalc(0, 5, 10, 15, 0x6, 0x2) + Gprecalc(1, 6, 11, 12, 0xa, 0x5) + Gprecalc(2, 7, 8, 13, 0x0, 0x4) + Gprecalc(3, 4, 9, 14, 0x8, 0xf) + + Gprecalc(0, 4, 8, 12, 0x0, 0x9) + Gprecalc(1, 5, 9, 13, 0x7, 0x5) + Gprecalc(2, 6, 10, 14, 0x4, 0x2) + Gprecalc(3, 7, 11, 15, 0xf, 0xa) + Gprecalc(0, 5, 10, 15, 0x1, 0xe) + Gprecalc(1, 6, 11, 12, 0xc, 0xb) + Gprecalc(2, 7, 8, 13, 0x8, 0x6) + Gprecalc(3, 4, 9, 14, 0xd, 0x3) + + Gprecalc(0, 4, 8, 12, 0xc, 0x2) + Gprecalc(1, 5, 9, 13, 0xa, 0x6) + Gprecalc(2, 6, 10, 14, 0xb, 0x0) + Gprecalc(3, 7, 11, 15, 0x3, 0x8) + Gprecalc(0, 5, 10, 15, 0xd, 0x4) + Gprecalc(1, 6, 11, 12, 0x5, 0x7) + Gprecalc(2, 7, 8, 13, 0xe, 0xf) + Gprecalc(3, 4, 9, 14, 0x9, 0x1) + + Gprecalc(0, 4, 8, 12, 0x5, 0xc) + Gprecalc(1, 5, 9, 13, 0xf, 0x1) + Gprecalc(2, 6, 10, 14, 0xd, 0xe) + Gprecalc(3, 7, 11, 15, 0xa, 0x4) + Gprecalc(0, 5, 10, 15, 0x7, 0x0) + Gprecalc(1, 6, 11, 12, 0x3, 0x6) + Gprecalc(2, 7, 8, 13, 0x2, 0x9) + Gprecalc(3, 4, 9, 14, 0xb, 0x8) + + Gprecalc(0, 4, 8, 12, 0xb, 0xd) + Gprecalc(1, 5, 9, 13, 0xe, 0x7) + Gprecalc(2, 6, 10, 14, 0x1, 0xc) + Gprecalc(3, 7, 11, 15, 0x9, 0x3) + Gprecalc(0, 5, 10, 15, 0x0, 0x5) + Gprecalc(1, 6, 11, 12, 0x4, 0xf) + Gprecalc(2, 7, 8, 13, 0x6, 0x8) + Gprecalc(3, 4, 9, 14, 0xa, 0x2) + + Gprecalc(0, 4, 8, 12, 0xf, 0x6) + Gprecalc(1, 5, 9, 13, 0x9, 0xe) + Gprecalc(2, 6, 10, 14, 0x3, 0xb) + Gprecalc(3, 7, 11, 15, 0x8, 0x0) + Gprecalc(0, 5, 10, 15, 0x2, 0xc) + Gprecalc(1, 6, 11, 12, 0x7, 0xd) + Gprecalc(2, 7, 8, 
13, 0x4, 0x1) + Gprecalc(3, 4, 9, 14, 0x5, 0xa) + + Gprecalc(0, 4, 8, 12, 0x2, 0xa) + Gprecalc(1, 5, 9, 13, 0x4, 0x8) + Gprecalc(2, 6, 10, 14, 0x6, 0x7) + Gprecalc(3, 7, 11, 15, 0x5, 0x1) + Gprecalc(0, 5, 10, 15, 0xb, 0xf) + Gprecalc(1, 6, 11, 12, 0xe, 0x9) + Gprecalc(2, 7, 8, 13, 0xc, 0x3) + Gprecalc(3, 4, 9, 14, 0x0, 0xd) + + Gprecalc(0, 4, 8, 12, 0x1, 0x0) + Gprecalc(1, 5, 9, 13, 0x3, 0x2) + Gprecalc(2, 6, 10, 14, 0x5, 0x4) + Gprecalc(3, 7, 11, 15, 0x7, 0x6) + Gprecalc(0, 5, 10, 15, 0x9, 0x8) + Gprecalc(1, 6, 11, 12, 0xb, 0xa) + Gprecalc(2, 7, 8, 13, 0xd, 0xc) + Gprecalc(3, 4, 9, 14, 0xf, 0xe) + + Gprecalc(0, 4, 8, 12, 0xa, 0xe) + Gprecalc(1, 5, 9, 13, 0x8, 0x4) + Gprecalc(2, 6, 10, 14, 0xf, 0x9) + Gprecalc(3, 7, 11, 15, 0x6, 0xd) + Gprecalc(0, 5, 10, 15, 0xc, 0x1) + Gprecalc(1, 6, 11, 12, 0x2, 0x0) + Gprecalc(2, 7, 8, 13, 0x7, 0xb) + Gprecalc(3, 4, 9, 14, 0x3, 0x5) + + Gprecalc(0, 4, 8, 12, 0x8, 0xb) + Gprecalc(1, 5, 9, 13, 0x0, 0xc) + Gprecalc(2, 6, 10, 14, 0x2, 0x5) + Gprecalc(3, 7, 11, 15, 0xd, 0xf) + Gprecalc(0, 5, 10, 15, 0xe, 0xa) + Gprecalc(1, 6, 11, 12, 0x6, 0x3) + Gprecalc(2, 7, 8, 13, 0x1, 0x7) + Gprecalc(3, 4, 9, 14, 0x4, 0x9) + + Gprecalc(0, 4, 8, 12, 0x9, 0x7) + Gprecalc(1, 5, 9, 13, 0x1, 0x3) + Gprecalc(2, 6, 10, 14, 0xc, 0xd) + Gprecalc(3, 7, 11, 15, 0xe, 0xb) + Gprecalc(0, 5, 10, 15, 0x6, 0x2) + Gprecalc(1, 6, 11, 12, 0xa, 0x5) + Gprecalc(2, 7, 8, 13, 0x0, 0x4) + Gprecalc(3, 4, 9, 14, 0x8, 0xf) + + Gprecalc(0, 4, 8, 12, 0x0, 0x9) + Gprecalc(1, 5, 9, 13, 0x7, 0x5) + Gprecalc(2, 6, 10, 14, 0x4, 0x2) + Gprecalc(3, 7, 11, 15, 0xf, 0xa) + Gprecalc(0, 5, 10, 15, 0x1, 0xe) + Gprecalc(1, 6, 11, 12, 0xc, 0xb) + Gprecalc(2, 7, 8, 13, 0x8, 0x6) + Gprecalc(3, 4, 9, 14, 0xd, 0x3) + + Gprecalc(0, 4, 8, 12, 0xc, 0x2) + Gprecalc(1, 5, 9, 13, 0xa, 0x6) + Gprecalc(2, 6, 10, 14, 0xb, 0x0) + Gprecalc(3, 7, 11, 15, 0x3, 0x8) + Gprecalc(0, 5, 10, 15, 0xd, 0x4) + Gprecalc(1, 6, 11, 12, 0x5, 0x7) + Gprecalc(2, 7, 8, 13, 0xe, 0xf) + Gprecalc(3, 4, 9, 14, 0x9, 0x1) + + 
uint64_t *outHash = (uint64_t *)outputHash + 8 * thread; + outHash[0] = devectorizeswap(h[0] ^ v[0] ^ v[8]); + outHash[1] = devectorizeswap(h[1] ^ v[1] ^ v[9]); + outHash[2] = devectorizeswap(h[2] ^ v[2] ^ v[10]); + outHash[3] = devectorizeswap(h[3] ^ v[3] ^ v[11]); + outHash[4] = devectorizeswap(h[4] ^ v[4] ^ v[12]); + outHash[5] = devectorizeswap(h[5] ^ v[5] ^ v[13]); + outHash[6] = devectorizeswap(h[6] ^ v[6] ^ v[14]); + outHash[7] = devectorizeswap(h[7] ^ v[7] ^ v[15]); + } +} + +<<<<<<< HEAD +======= + + +__global__ +#if __CUDA_ARCH__ > 500 +__launch_bounds__(32) +#else +__launch_bounds__(32, 16) +#endif +void quark_blake512_gpu_hash_80_multi(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ outputHash, const uint2*const __restrict__ c_PaddedMessage) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint2 block[16]; + const uint32_t nounce = startNounce + thread; + + block[0] = c_PaddedMessage[0]; + block[1] = c_PaddedMessage[1]; + block[2] = c_PaddedMessage[2]; + block[3] = c_PaddedMessage[3]; + block[4] = c_PaddedMessage[4]; + block[5] = c_PaddedMessage[5]; + block[6] = c_PaddedMessage[6]; + block[7] = c_PaddedMessage[7]; + block[8] = c_PaddedMessage[8]; + block[9] = c_PaddedMessage[9]; + block[10] = vectorizehigh(0x80000000); + block[11] = vectorizelow(0); + block[12] = vectorizelow(0); + block[13] = vectorizelow(0x1); + block[14] = vectorizelow(0); + block[15] = vectorizelow(0x280); + block[9].x = nounce; + + const uint2 u512[16] = + { + { 0x85a308d3UL, 0x243f6a88 }, { 0x03707344UL, 0x13198a2e }, + { 0x299f31d0UL, 0xa4093822 }, { 0xec4e6c89UL, 0x082efa98 }, + { 0x38d01377UL, 0x452821e6 }, { 0x34e90c6cUL, 0xbe5466cf }, + { 0xc97c50ddUL, 0xc0ac29b7 }, { 0xb5470917UL, 0x3f84d5b5 }, + { 0x8979fb1bUL, 0x9216d5d9 }, { 0x98dfb5acUL, 0xd1310ba6 }, + { 0xd01adfb7UL, 0x2ffd72db }, { 0x6a267e96UL, 0xb8e1afed }, + { 0xf12c7f99UL, 0xba7c9045 }, { 0xb3916cf7UL, 0x24a19947 }, + { 0x858efc16UL, 
0x0801f2e2 }, { 0x71574e69UL, 0x636920d8 } + }; + + const uint2 h[8] = { + { 0xf3bcc908UL,0x6a09e667UL }, + { 0x84caa73bUL ,0xbb67ae85UL }, + { 0xfe94f82bUL,0x3c6ef372UL }, + { 0x5f1d36f1UL,0xa54ff53aUL }, + { 0xade682d1UL,0x510e527fUL }, + { 0x2b3e6c1fUL,0x9b05688cUL }, + { 0xfb41bd6bUL,0x1f83d9abUL }, + { 0x137e2179UL,0x5be0cd19UL } + }; + + uint2 v[16] = + { + Hostprecalc[0], Hostprecalc[1], Hostprecalc[2], Hostprecalc[3], Hostprecalc[4], Hostprecalc[5], + Hostprecalc[6], Hostprecalc[7], Hostprecalc[8], Hostprecalc[9], Hostprecalc[10], Hostprecalc[11], + Hostprecalc[12], Hostprecalc[13], Hostprecalc[14], Hostprecalc[15], + }; + +// Gprecalc(0, 4, 8, 12, 0x1, 0x0) +// Gprecalc(1, 5, 9, 13, 0x3, 0x2) +// Gprecalc(2, 6, 10, 14, 0x5, 0x4) +// Gprecalc(3, 7, 11, 15, 0x7, 0x6) + + Gprecalc(0, 5, 10, 15, 0x9, 0x8) + Gprecalc(1, 6, 11, 12, 0xb, 0xa) + Gprecalc(2, 7, 8, 13, 0xd, 0xc) + Gprecalc(3, 4, 9, 14, 0xf, 0xe) + + Gprecalc(0, 4, 8, 12, 0xa, 0xe) + Gprecalc(1, 5, 9, 13, 0x8, 0x4) + Gprecalc(2, 6, 10, 14, 0xf, 0x9) + Gprecalc(3, 7, 11, 15, 0x6, 0xd) + Gprecalc(0, 5, 10, 15, 0xc, 0x1) + Gprecalc(1, 6, 11, 12, 0x2, 0x0) + Gprecalc(2, 7, 8, 13, 0x7, 0xb) + Gprecalc(3, 4, 9, 14, 0x3, 0x5) + + Gprecalc(0, 4, 8, 12, 0x8, 0xb) + Gprecalc(1, 5, 9, 13, 0x0, 0xc) + Gprecalc(2, 6, 10, 14, 0x2, 0x5) + Gprecalc(3, 7, 11, 15, 0xd, 0xf) + Gprecalc(0, 5, 10, 15, 0xe, 0xa) + Gprecalc(1, 6, 11, 12, 0x6, 0x3) + Gprecalc(2, 7, 8, 13, 0x1, 0x7) + Gprecalc(3, 4, 9, 14, 0x4, 0x9) + + Gprecalc(0, 4, 8, 12, 0x9, 0x7) + Gprecalc(1, 5, 9, 13, 0x1, 0x3) + Gprecalc(2, 6, 10, 14, 0xc, 0xd) + Gprecalc(3, 7, 11, 15, 0xe, 0xb) + Gprecalc(0, 5, 10, 15, 0x6, 0x2) + Gprecalc(1, 6, 11, 12, 0xa, 0x5) + Gprecalc(2, 7, 8, 13, 0x0, 0x4) + Gprecalc(3, 4, 9, 14, 0x8, 0xf) + + Gprecalc(0, 4, 8, 12, 0x0, 0x9) + Gprecalc(1, 5, 9, 13, 0x7, 0x5) + Gprecalc(2, 6, 10, 14, 0x4, 0x2) + Gprecalc(3, 7, 11, 15, 0xf, 0xa) + Gprecalc(0, 5, 10, 15, 0x1, 0xe) + Gprecalc(1, 6, 11, 12, 0xc, 0xb) + Gprecalc(2, 7, 8, 13, 0x8, 
0x6) + Gprecalc(3, 4, 9, 14, 0xd, 0x3) + + Gprecalc(0, 4, 8, 12, 0xc, 0x2) + Gprecalc(1, 5, 9, 13, 0xa, 0x6) + Gprecalc(2, 6, 10, 14, 0xb, 0x0) + Gprecalc(3, 7, 11, 15, 0x3, 0x8) + Gprecalc(0, 5, 10, 15, 0xd, 0x4) + Gprecalc(1, 6, 11, 12, 0x5, 0x7) + Gprecalc(2, 7, 8, 13, 0xe, 0xf) + Gprecalc(3, 4, 9, 14, 0x9, 0x1) + + Gprecalc(0, 4, 8, 12, 0x5, 0xc) + Gprecalc(1, 5, 9, 13, 0xf, 0x1) + Gprecalc(2, 6, 10, 14, 0xd, 0xe) + Gprecalc(3, 7, 11, 15, 0xa, 0x4) + Gprecalc(0, 5, 10, 15, 0x7, 0x0) + Gprecalc(1, 6, 11, 12, 0x3, 0x6) + Gprecalc(2, 7, 8, 13, 0x2, 0x9) + Gprecalc(3, 4, 9, 14, 0xb, 0x8) + + Gprecalc(0, 4, 8, 12, 0xb, 0xd) + Gprecalc(1, 5, 9, 13, 0xe, 0x7) + Gprecalc(2, 6, 10, 14, 0x1, 0xc) + Gprecalc(3, 7, 11, 15, 0x9, 0x3) + Gprecalc(0, 5, 10, 15, 0x0, 0x5) + Gprecalc(1, 6, 11, 12, 0x4, 0xf) + Gprecalc(2, 7, 8, 13, 0x6, 0x8) + Gprecalc(3, 4, 9, 14, 0xa, 0x2) + + Gprecalc(0, 4, 8, 12, 0xf, 0x6) + Gprecalc(1, 5, 9, 13, 0x9, 0xe) + Gprecalc(2, 6, 10, 14, 0x3, 0xb) + Gprecalc(3, 7, 11, 15, 0x8, 0x0) + Gprecalc(0, 5, 10, 15, 0x2, 0xc) + Gprecalc(1, 6, 11, 12, 0x7, 0xd) + Gprecalc(2, 7, 8, 13, 0x4, 0x1) + Gprecalc(3, 4, 9, 14, 0x5, 0xa) + + Gprecalc(0, 4, 8, 12, 0x2, 0xa) + Gprecalc(1, 5, 9, 13, 0x4, 0x8) + Gprecalc(2, 6, 10, 14, 0x6, 0x7) + Gprecalc(3, 7, 11, 15, 0x5, 0x1) + Gprecalc(0, 5, 10, 15, 0xb, 0xf) + Gprecalc(1, 6, 11, 12, 0xe, 0x9) + Gprecalc(2, 7, 8, 13, 0xc, 0x3) + Gprecalc(3, 4, 9, 14, 0x0, 0xd) + + Gprecalc(0, 4, 8, 12, 0x1, 0x0) + Gprecalc(1, 5, 9, 13, 0x3, 0x2) + Gprecalc(2, 6, 10, 14, 0x5, 0x4) + Gprecalc(3, 7, 11, 15, 0x7, 0x6) + Gprecalc(0, 5, 10, 15, 0x9, 0x8) + Gprecalc(1, 6, 11, 12, 0xb, 0xa) + Gprecalc(2, 7, 8, 13, 0xd, 0xc) + Gprecalc(3, 4, 9, 14, 0xf, 0xe) + + Gprecalc(0, 4, 8, 12, 0xa, 0xe) + Gprecalc(1, 5, 9, 13, 0x8, 0x4) + Gprecalc(2, 6, 10, 14, 0xf, 0x9) + Gprecalc(3, 7, 11, 15, 0x6, 0xd) + Gprecalc(0, 5, 10, 15, 0xc, 0x1) + Gprecalc(1, 6, 11, 12, 0x2, 0x0) + Gprecalc(2, 7, 8, 13, 0x7, 0xb) + Gprecalc(3, 4, 9, 14, 0x3, 0x5) + + 
Gprecalc(0, 4, 8, 12, 0x8, 0xb) + Gprecalc(1, 5, 9, 13, 0x0, 0xc) + Gprecalc(2, 6, 10, 14, 0x2, 0x5) + Gprecalc(3, 7, 11, 15, 0xd, 0xf) + Gprecalc(0, 5, 10, 15, 0xe, 0xa) + Gprecalc(1, 6, 11, 12, 0x6, 0x3) + Gprecalc(2, 7, 8, 13, 0x1, 0x7) + Gprecalc(3, 4, 9, 14, 0x4, 0x9) + + Gprecalc(0, 4, 8, 12, 0x9, 0x7) + Gprecalc(1, 5, 9, 13, 0x1, 0x3) + Gprecalc(2, 6, 10, 14, 0xc, 0xd) + Gprecalc(3, 7, 11, 15, 0xe, 0xb) + Gprecalc(0, 5, 10, 15, 0x6, 0x2) + Gprecalc(1, 6, 11, 12, 0xa, 0x5) + Gprecalc(2, 7, 8, 13, 0x0, 0x4) + Gprecalc(3, 4, 9, 14, 0x8, 0xf) + + Gprecalc(0, 4, 8, 12, 0x0, 0x9) + Gprecalc(1, 5, 9, 13, 0x7, 0x5) + Gprecalc(2, 6, 10, 14, 0x4, 0x2) + Gprecalc(3, 7, 11, 15, 0xf, 0xa) + Gprecalc(0, 5, 10, 15, 0x1, 0xe) + Gprecalc(1, 6, 11, 12, 0xc, 0xb) + Gprecalc(2, 7, 8, 13, 0x8, 0x6) + Gprecalc(3, 4, 9, 14, 0xd, 0x3) + + Gprecalc(0, 4, 8, 12, 0xc, 0x2) + Gprecalc(1, 5, 9, 13, 0xa, 0x6) + Gprecalc(2, 6, 10, 14, 0xb, 0x0) + Gprecalc(3, 7, 11, 15, 0x3, 0x8) + Gprecalc(0, 5, 10, 15, 0xd, 0x4) + Gprecalc(1, 6, 11, 12, 0x5, 0x7) + Gprecalc(2, 7, 8, 13, 0xe, 0xf) + Gprecalc(3, 4, 9, 14, 0x9, 0x1) + + uint64_t *outHash = (uint64_t *)outputHash + 8 * thread; + outHash[0] = devectorizeswap(h[0] ^ v[0] ^ v[8]); + outHash[1] = devectorizeswap(h[1] ^ v[1] ^ v[9]); + outHash[2] = devectorizeswap(h[2] ^ v[2] ^ v[10]); + outHash[3] = devectorizeswap(h[3] ^ v[3] ^ v[11]); + outHash[4] = devectorizeswap(h[4] ^ v[4] ^ v[12]); + outHash[5] = devectorizeswap(h[5] ^ v[5] ^ v[13]); + outHash[6] = devectorizeswap(h[6] ^ v[6] ^ v[14]); + outHash[7] = devectorizeswap(h[7] ^ v[7] ^ v[15]); + } +} + + +>>>>>>> 4221eab... Faster quark/x11 Precalculated 1/32 of blake with the cpu. 
+// ---------------------------- END CUDA quark_blake512 functions ------------------------------------ + +__host__ void quark_blake512_cpu_init(int thr_id) +{ +} + +__host__ void quark_blake512_cpu_setBlock_80(int thr_id, uint64_t *pdata) +{ + uint64_t PaddedMessage[10]; + for (int i = 0; i < 10; i++) + PaddedMessage[i] = cuda_swab64(pdata[i]); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(c_PaddedM, PaddedMessage, 10 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); + + uint64_t block[16]; + + uint64_t *peker = (uint64_t *)&PaddedMessage[0]; + + block[0] = peker[0]; + block[1] = peker[1]; + block[2] = peker[2]; + block[3] = peker[3]; + block[4] = peker[4]; + block[5] = peker[5]; + block[6] = peker[6]; + block[7] = peker[7]; + block[8] = peker[8]; + block[9] = peker[9]; + block[10] = 0x8000000000000000; + block[11] = 0; + block[12] = 0; + block[13] = 1; + block[14] = 0; + block[15] = 280; + + const uint64_t u512[16] = + { + 0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, + 0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL, + 0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, + 0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL, + 0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL, + 0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL, + 0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, + 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL + }; + + uint64_t h[8] = { + 0x6a09e667f3bcc908ULL, + 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, + 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, + 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, + 0x5be0cd19137e2179ULL + }; + + uint64_t v[16] = + { + h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7], + u512[0], u512[1], u512[2], u512[3], u512[4] ^ 640, u512[5] ^ 640, u512[6], u512[7] + }; + + GprecalcHost(0, 4, 8, 12, 0x1, 0x0) + GprecalcHost(1, 5, 9, 13, 0x3, 0x2) + GprecalcHost(2, 6, 10, 14, 0x5, 0x4) + GprecalcHost(3, 7, 11, 15, 0x7, 0x6) + + CUDA_SAFE_CALL(cudaMemcpyToSymbol(Hostprecalc, &v[0], 10 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice)); + + 
+} + + +__host__ void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash) +{ + const uint32_t threadsperblock = 32; + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + quark_blake512_gpu_hash_64<<>>(threads, startNounce, d_nonceVector, (uint64_t*)d_outputHash); +} + +__host__ void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash) +{ + + const uint32_t threadsperblock = 32; + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + quark_blake512_gpu_hash_80 << >>(threads, startNounce, d_outputHash); + // MyStreamSynchronize(NULL, order, thr_id); +} diff --git a/quark/cuda_quark_compactionTest.cu b/quark/cuda_quark_compactionTest.cu index acf7534c99..e89b058cf0 100644 --- a/quark/cuda_quark_compactionTest.cu +++ b/quark/cuda_quark_compactionTest.cu @@ -12,14 +12,14 @@ static uint32_t *d_partSum[2][MAX_GPUS]; // fuer bis zu vier partielle Summen // True/False tester -typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash); +typedef uint32_t(*cuda_compactTestFunction_t)(const uint32_t *inpHash); -__device__ uint32_t QuarkTrueTest(uint32_t *inpHash) +__device__ __forceinline__ uint32_t QuarkTrueTest(const uint32_t *inpHash) { return ((inpHash[0] & 0x08) == 0x08); } -__device__ uint32_t QuarkFalseTest(uint32_t *inpHash) +__device__ __forceinline__ uint32_t QuarkFalseTest(const uint32_t *inpHash) { return ((inpHash[0] & 0x08) == 0); } @@ -31,31 +31,23 @@ cuda_compactTestFunction_t h_QuarkTrueFunction[MAX_GPUS], h_QuarkFalseFunction[M // Setup-Funktionen __host__ void quark_compactTest_cpu_init(int thr_id, uint32_t threads) { - cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t)); - 
cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t)); + CUDA_SAFE_CALL(cudaMemcpyFromSymbolAsync(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t), 0, cudaMemcpyDeviceToHost, gpustream[thr_id])); + CUDA_SAFE_CALL(cudaMemcpyFromSymbolAsync(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t), 0, cudaMemcpyDeviceToHost, gpustream[thr_id])); // wir brauchen auch Speicherplatz auf dem Device - cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2); - cudaMalloc(&d_numValid[thr_id], 2*sizeof(uint32_t)); - cudaMallocHost(&h_numValid[thr_id], 2*sizeof(uint32_t)); + CUDA_SAFE_CALL(cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2)); + CUDA_SAFE_CALL(cudaMalloc(&d_numValid[thr_id], 2 * sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMallocHost(&h_numValid[thr_id], 2 * sizeof(uint32_t))); uint32_t s1; s1 = (threads / 256) * 2; - cudaMalloc(&d_partSum[0][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block) - cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block) + CUDA_SAFE_CALL(cudaMalloc(&d_partSum[0][thr_id], sizeof(uint32_t) * s1)); // BLOCKSIZE (Threads/Block) + CUDA_SAFE_CALL(cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1)); // BLOCKSIZE (Threads/Block) } -#if __CUDA_ARCH__ < 300 -/** - * __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1 - */ -#undef __shfl_up -#define __shfl_up(var, delta, width) (0) -#endif - // Die Summenfunktion (vom NVIDIA SDK) -__global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) +__global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, 
cuda_compactTestFunction_t testFunc=NULL, uint32_t threads=0, uint32_t startNounce=0, const uint32_t *inpHashes=NULL, const uint32_t *d_validNonceTable=NULL) { __shared__ uint32_t sums[32]; int id = ((blockIdx.x * blockDim.x) + threadIdx.x); @@ -75,18 +67,16 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t * { if (id < threads) { - uint32_t *inpHash; if(d_validNonceTable == NULL) { // keine Nonce-Liste - inpHash = &inpHashes[id<<4]; + value = (*testFunc)(&inpHashes[id << 4]); }else { // Nonce-Liste verfügbar int nonce = d_validNonceTable[id] - startNounce; - inpHash = &inpHashes[nonce<<4]; + value = (*testFunc)(&inpHashes[nonce << 4]); } - value = (*testFunc)(inpHash); }else { value = 0; @@ -167,7 +157,7 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t * } // Uniform add: add partial sums array -__global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len) +__global__ void quark_compactTest_gpu_ADD(uint32_t *data, const uint32_t *partial_sums, int len) { __shared__ uint32_t buf; int id = ((blockIdx.x * blockDim.x) + threadIdx.x); @@ -184,28 +174,26 @@ __global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums } // Der Scatter -__global__ void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) +__global__ void quark_compactTest_gpu_SCATTER(const uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, uint32_t threads=0, uint32_t startNounce=0, const uint32_t *inpHashes=NULL, const uint32_t *d_validNonceTable=NULL) { int id = ((blockIdx.x * blockDim.x) + threadIdx.x); uint32_t actNounce = id; uint32_t value; if (id < threads) { -// uint32_t nounce = startNounce + id; - uint32_t *inpHash; +// const uint32_t nounce = startNounce + id; if(d_validNonceTable == NULL) { // keine Nonce-Liste - 
inpHash = &inpHashes[id<<4]; + value = (*testFunc)(&inpHashes[id << 4]); }else { // Nonce-Liste verfügbar int nonce = d_validNonceTable[id] - startNounce; actNounce = nonce; - inpHash = &inpHashes[nonce<<4]; + value = (*testFunc)(&inpHashes[nonce << 4]); } - value = (*testFunc)(inpHash); }else { value = 0; @@ -235,7 +223,7 @@ __host__ static uint32_t quark_compactTest_roundUpExp(uint32_t val) __host__ void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32_t *nrm, uint32_t *d_nonces1, cuda_compactTestFunction_t function, - uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) + uint32_t startNounce, const uint32_t *inpHashes, const uint32_t *d_validNonceTable) { int orgThreads = threads; threads = (int)quark_compactTest_roundUpExp((uint32_t)threads); @@ -251,50 +239,62 @@ __host__ void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t thread bool callThrid = (thr2 > 0) ? true : false; // Erster Initialscan - quark_compactTest_gpu_SCAN<<>>( - d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable); + quark_compactTest_gpu_SCAN<<>>(d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable); + CUDA_SAFE_CALL(cudaGetLastError()); // weitere Scans if(callThrid) { - quark_compactTest_gpu_SCAN<<>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]); - quark_compactTest_gpu_SCAN<<<1, thr2>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2); - }else + quark_compactTest_gpu_SCAN<<>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]); + CUDA_SAFE_CALL(cudaGetLastError()); + quark_compactTest_gpu_SCAN << <1, thr2, 0, gpustream[thr_id] >> >(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2); + CUDA_SAFE_CALL(cudaGetLastError()); + } + else { - quark_compactTest_gpu_SCAN<<>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2); + quark_compactTest_gpu_SCAN<<>>(d_partSum[0][thr_id], (blockSize2>32) ? 
32 : blockSize2); + CUDA_SAFE_CALL(cudaGetLastError()); } if(callThrid) - cudaMemcpy(nrm, &(d_partSum[1][thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); + { + cudaMemcpyAsync(nrm, &(d_partSum[1][thr_id])[thr2 - 1], sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); + CUDA_SAFE_CALL(cudaGetLastError()); + } else - cudaMemcpy(nrm, &(d_partSum[0][thr_id])[nSummen-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); - + { + cudaMemcpyAsync(nrm, &(d_partSum[0][thr_id])[nSummen - 1], sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); + CUDA_SAFE_CALL(cudaGetLastError()); + } // Addieren if(callThrid) { - quark_compactTest_gpu_ADD<<>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2); + quark_compactTest_gpu_ADD<<>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2); + CUDA_SAFE_CALL(cudaGetLastError()); } - quark_compactTest_gpu_ADD<<>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads); - + quark_compactTest_gpu_ADD<<>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads); + CUDA_SAFE_CALL(cudaGetLastError()); + // Scatter - quark_compactTest_gpu_SCATTER<<>>(d_tempBranch1Nonces[thr_id], d_nonces1, + quark_compactTest_gpu_SCATTER<<>>(d_tempBranch1Nonces[thr_id], d_nonces1, function, orgThreads, startNounce, inpHashes, d_validNonceTable); + CUDA_SAFE_CALL(cudaGetLastError()); + cudaStreamSynchronize(gpustream[thr_id]); } ////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048) __host__ void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads, uint32_t *nrm, uint32_t *d_nonces1, uint32_t *d_nonces2, - uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) + uint32_t startNounce, const uint32_t *inpHashes, const uint32_t *d_validNonceTable) { quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable); 
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable); } -__host__ void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, +__host__ void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *inpHashes, const uint32_t *d_validNonceTable, uint32_t *d_nonces1, uint32_t *nrm1, - uint32_t *d_nonces2, uint32_t *nrm2, - int order) + uint32_t *d_nonces2, uint32_t *nrm2) { // Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind // "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen! @@ -308,8 +308,7 @@ __host__ void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32 } __host__ void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, - uint32_t *d_nonces1, uint32_t *nrm1, - int order) + uint32_t *d_nonces1, uint32_t *nrm1) { // Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind // "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen! 
diff --git a/quark/cuda_quark_groestl512.cu b/quark/cuda_quark_groestl512.cu index b0d50f731d..a433236d4f 100644 --- a/quark/cuda_quark_groestl512.cu +++ b/quark/cuda_quark_groestl512.cu @@ -4,8 +4,9 @@ #include #include "cuda_helper.h" +#include "cuda_vector.h" -#define TPB 256 +#define TPB 512 #define THF 4 // aus cpu-miner.c @@ -19,20 +20,20 @@ #include "groestl_functions_quad.cu" #include "bitslice_transformations_quad.cu" -__global__ __launch_bounds__(TPB, THF) +__global__ __launch_bounds__(TPB, 2) void quark_groestl512_gpu_hash_64_quad(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector) { - uint32_t msgBitsliced[8]; - uint32_t state[8]; - uint32_t hash[16]; + uint32_t __align__(16) msgBitsliced[8]; + uint32_t __align__(16) state[8]; + uint32_t __align__(16) hash[16]; // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2; + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2; if (thread < threads) { // GROESTL - uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread); - uint32_t hashPosition = nounce - startNounce; - uint32_t *inpHash = &g_hash[hashPosition * 16]; + const uint32_t nounce = g_nonceVector ? 
g_nonceVector[thread] : (startNounce + thread); + const uint32_t hashPosition = nounce - startNounce; + uint32_t *const inpHash = &g_hash[hashPosition * 16]; const uint32_t thr = threadIdx.x & (THF-1); @@ -48,103 +49,33 @@ void quark_groestl512_gpu_hash_64_quad(uint32_t threads, uint32_t startNounce, u groestl512_progressMessage_quad(state, msgBitsliced); from_bitslice_quad(state, hash); + if (thr == 0) { - #pragma unroll - for (int k = 0; k < 16; k++) inpHash[k] = hash[k]; + uint28 *phash = (uint28*)hash; + uint28 *outpt = (uint28*)inpHash; /* var kept for hash align */ + outpt[0] = phash[0]; + outpt[1] = phash[1]; +// outpt[2] = phash[2]; +// outpt[3] = phash[3]; } } } -__global__ void __launch_bounds__(TPB, THF) -quark_doublegroestl512_gpu_hash_64_quad(uint32_t threads, uint32_t startNounce, uint32_t * __restrict__ g_hash, uint32_t * __restrict__ g_nonceVector) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x)>>2; - if (thread < threads) - { - // GROESTL - uint32_t message[8]; - uint32_t state[8]; - - uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint32_t * inpHash = &g_hash[hashPosition<<4]; - const uint16_t thr = threadIdx.x & (THF-1); - - #pragma unroll - for(int k=0;k<4;k++) message[k] = inpHash[(k * THF) + thr]; - - #pragma unroll - for(int k=4;k<8;k++) message[k] = 0; - - if (thr == 0) message[4] = 0x80; - if (thr == 3) message[7] = 0x01000000; - - uint32_t msgBitsliced[8]; - to_bitslice_quad(message, msgBitsliced); - - for (int round=0; round<2; round++) - { - groestl512_progressMessage_quad(state, msgBitsliced); - - if (round < 1) - { - // Verkettung zweier Runden inclusive Padding. 
- msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + (((threadIdx.x&3)==3)<<13)); - msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341); - msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341); - msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341); - msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341); - msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341); - msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341); - msgBitsliced[ 7] = __byte_perm(state[ 7], 0x00800100, 0x4341 + (((threadIdx.x&3)==0)<<4)); - } - } - - // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash - uint32_t *outpHash = inpHash; - uint32_t hash[16]; - from_bitslice_quad(state, hash); - - if (thr != 0) return; - - #pragma unroll - for(int k=0;k<16;k++) outpHash[k] = hash[k]; - } -} -// Setup-Funktionen __host__ void quark_groestl512_cpu_init(int thr_id, uint32_t threads) { // cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]); } -__host__ void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash) { - // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle - // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl - const int factor = THF; // berechne wie viele Thread Blocks wir brauchen - dim3 grid(factor*((threads + TPB - 1) / TPB)); + dim3 grid(THF*((threads + TPB - 1) / TPB)); dim3 block(TPB); - quark_groestl512_gpu_hash_64_quad<<>>(threads, startNounce, d_hash, d_nonceVector); - - // Strategisches Sleep Kommando zur Senkung der CPU Last - //MyStreamSynchronize(NULL, order, thr_id); + quark_groestl512_gpu_hash_64_quad<<>>(threads, startNounce, d_hash, d_nonceVector); + CUDA_SAFE_CALL(cudaGetLastError()); } -__host__ void 
quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) -{ - // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle - // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl - const int factor = THF; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid(factor*((threads + TPB-1)/TPB)); - dim3 block(TPB); - - quark_doublegroestl512_gpu_hash_64_quad<<>>(threads, startNounce, d_hash, d_nonceVector); -} diff --git a/quark/cuda_quark_keccak512.cu b/quark/cuda_quark_keccak512.cu index 6ae3a9acab..875f48e75b 100644 --- a/quark/cuda_quark_keccak512.cu +++ b/quark/cuda_quark_keccak512.cu @@ -2,6 +2,7 @@ #include #include "cuda_helper.h" +#include "cuda_vector.h" #ifdef _MSC_VER #define UINT2(x,y) { x, y } @@ -9,6 +10,8 @@ #define UINT2(x,y) (uint2) { x, y } #endif +static uint32_t *d_found[MAX_GPUS]; + __constant__ uint2 c_keccak_round_constants35[24] = { { 0x00000001ul, 0x00000000 }, { 0x00008082ul, 0x00000000 }, { 0x0000808aul, 0x80000000 }, { 0x80008000ul, 0x80000000 }, @@ -23,344 +26,1923 @@ __constant__ uint2 c_keccak_round_constants35[24] = { { 0x80008081ul, 0x80000000 }, { 0x00008080ul, 0x80000000 }, { 0x80000001ul, 0x00000000 }, { 0x80008008ul, 0x80000000 } }; +#define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a)))) -static __device__ __forceinline__ void -keccak_block_35(uint2 *s) { - int i = 0; - uint2 t[5], u[5], v, w; - - t[0] = s[0] ^ s[5]; - t[1] = s[1] ^ s[6]; - t[2] = s[2] ^ s[7]; - t[3] = s[3] ^ s[8]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = s[4] ^ ROL2(t[1], 1); - u[1] = t[0] ^ ROL2(t[2], 1); - u[2] = t[1] ^ ROL2(t[3], 1); - u[3] = t[2] ^ ROL2(s[4], 1); - u[4] = t[3] ^ ROL2(t[0], 1); - - /* theta: a[0,i], a[1,i], .. 
a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[1]; - s[1] = ROL2(s[6], 44); - s[6] = ROL2(s[9], 20); - s[9] = ROL2(s[22], 61); - s[22] = ROL2(s[14], 39); - s[14] = ROL2(s[20], 18); - s[20] = ROL2(s[2], 62); - s[2] = ROL2(s[12], 43); - s[12] = ROL2(s[13], 25); - s[13] = ROL2(s[19], 8); - s[19] = ROL2(s[23], 56); - s[23] = ROL2(s[15], 41); - s[15] = ROL2(s[4], 27); - s[4] = ROL2(s[24], 14); - s[24] = ROL2(s[21], 2); - s[21] = ROL2(s[8], 55); - s[8] = ROL2(s[16], 45); - s[16] = ROL2(s[5], 36); - s[5] = ROL2(s[3], 28); - s[3] = ROL2(s[18], 21); - s[18] = ROL2(s[17], 15); - s[17] = ROL2(s[11], 10); - s[11] = ROL2(s[7], 6); - s[7] = ROL2(s[10], 3); - s[10] = ROL2(v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; - v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] = s[0] ^ 1; //c_keccak_round_constants[0]); - - for (i = 1; i < 24; i++) { - /* theta: c = a[0,i] ^ a[1,i] ^ .. 
a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; +__global__ __launch_bounds__(128, 4) +void quark_keccak512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint2 *const __restrict__ g_hash, uint32_t *const __restrict__ g_nonceVector) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) + { + const uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROL2(t[1], 1); - u[1] = t[0] ^ ROL2(t[2], 1); - u[2] = t[1] ^ ROL2(t[3], 1); - u[3] = t[2] ^ ROL2(t[4], 1); - u[4] = t[3] ^ ROL2(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) 
*/ - v = s[1]; - s[1] = ROL2(s[6], 44); - s[6] = ROL2(s[9], 20); - s[9] = ROL2(s[22], 61); - s[22] = ROL2(s[14], 39); - s[14] = ROL2(s[20], 18); - s[20] = ROL2(s[2], 62); - s[2] = ROL2(s[12], 43); - s[12] = ROL2(s[13], 25); - s[13] = ROL2(s[19], 8); - s[19] = ROL2(s[23], 56); - s[23] = ROL2(s[15], 41); - s[15] = ROL2(s[4], 27); - s[4] = ROL2(s[24], 14); - s[24] = ROL2(s[21], 2); - s[21] = ROL2(s[8], 55); - s[8] = ROL2(s[16], 45); - s[16] = ROL2(s[5], 36); - s[5] = ROL2(s[3], 28); - s[3] = ROL2(s[18], 21); - s[18] = ROL2(s[17], 15); - s[17] = ROL2(s[11], 10); - s[11] = ROL2(s[7], 6); - s[7] = ROL2(s[10], 3); - s[10] = ROL2(v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; - v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] ^= c_keccak_round_constants35[i]; - } -} + const uint32_t hashPosition = nounce - startNounce; + uint2 *const inpHash = &g_hash[8 * hashPosition]; -static __device__ __forceinline__ void -keccak_block_35_final(uint2 *s) -{ - int i = 0; - uint2 t[5], u[5], v, w; - - t[0] = s[0] ^ s[5]; - t[1] = s[1] ^ s[6]; - t[2] = s[2] ^ s[7]; - t[3] = s[3] ^ s[8]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = s[4] ^ ROL2(t[1], 1); - u[1] = t[0] ^ ROL2(t[2], 1); - u[2] = t[1] ^ ROL2(t[3], 1); - u[3] = t[2] ^ ROL2(s[4], 1); - u[4] = t[3] ^ ROL2(t[0], 1); - - /* theta: a[0,i], 
a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] = u[0]; s[15] = u[0]; s[20] = u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] = u[1]; s[16] = u[1]; s[21] = u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] = u[2]; s[17] = u[2]; s[22] = u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] = u[3]; s[18] = u[3]; s[23] = u[3]; - s[4] ^= u[4]; s[9] = u[4]; s[14] = u[4]; s[19] = u[4]; s[24] = u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[1]; - s[1] = ROL2(s[6], 44); - s[6] = ROL2(s[9], 20); - s[9] = ROL2(s[22], 61); - s[22] = ROL2(s[14], 39); - s[14] = ROL2(s[20], 18); - s[20] = ROL2(s[2], 62); - s[2] = ROL2(s[12], 43); - s[12] = ROL2(s[13], 25); - s[13] = ROL2(s[19], 8); - s[19] = ROL2(s[23], 56); - s[23] = ROL2(s[15], 41); - s[15] = ROL2(s[4], 27); - s[4] = ROL2(s[24], 14); - s[24] = ROL2(s[21], 2); - s[21] = ROL2(s[8], 55); - s[8] = ROL2(s[16], 45); - s[16] = ROL2(s[5], 36); - s[5] = ROL2(s[3], 28); - s[3] = ROL2(s[18], 21); - s[18] = ROL2(s[17], 15); - s[17] = ROL2(s[11], 10); - s[11] = ROL2(s[7], 6); - s[7] = ROL2(s[10], 3); - s[10] = ROL2(v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; - v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] = s[0] ^ 1; //c_keccak_round_constants[0]); - - for (i = 1; i < 23; i++) - { - /* theta: c = a[0,i] ^ a[1,i] ^ .. 
a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROL2(t[1], 1); - u[1] = t[0] ^ ROL2(t[2], 1); - u[2] = t[1] ^ ROL2(t[3], 1); - u[3] = t[2] ^ ROL2(t[4], 1); - u[4] = t[3] ^ ROL2(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[1]; - s[1] = ROL2(s[6], 44); - s[6] = ROL2(s[9], 20); - s[9] = ROL2(s[22], 61); - s[22] = ROL2(s[14], 39); - s[14] = ROL2(s[20], 18); - s[20] = ROL2(s[2], 62); - s[2] = ROL2(s[12], 43); - s[12] = ROL2(s[13], 25); - s[13] = ROL2(s[19], 8); - s[19] = ROL2(s[23], 56); - s[23] = ROL2(s[15], 41); - s[15] = ROL2(s[4], 27); - s[4] = ROL2(s[24], 14); - s[24] = ROL2(s[21], 2); - s[21] = ROL2(s[8], 55); - s[8] = ROL2(s[16], 45); - s[16] = ROL2(s[5], 36); - s[5] = ROL2(s[3], 28); - s[3] = ROL2(s[18], 21); - s[18] = ROL2(s[17], 15); - s[17] = ROL2(s[11], 10); - s[11] = ROL2(s[7], 6); - s[7] = ROL2(s[10], 3); - s[10] = ROL2(v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; - v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = 
s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] ^= c_keccak_round_constants35[i]; - } - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + uint2 msg[8]; - s[0] ^= t[4] ^ ROL2(t[1], 1); - s[18] ^= t[2] ^ ROL2(t[4], 1); - s[24] ^= t[3] ^ ROL2(t[0], 1); + ((uint28*)msg)[0] = ((uint28*)inpHash)[0]; + ((uint28*)msg)[1] = ((uint28*)inpHash)[1]; - s[3] = ROL2(s[18], 21) ^ ((~ROL2(s[24], 14)) & s[0]); -} + uint2 s[25]; + uint2 bc[5], tmpxor[5], tmp1, tmp2; -__global__ __launch_bounds__(256, 2) -void quark_keccak512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); + tmpxor[0] = msg[0] ^ msg[5]; + tmpxor[1] = msg[1] ^ msg[6]; + tmpxor[2] = msg[2] ^ msg[7]; + tmpxor[3] = msg[3] ^ make_uint2(0x1, 0x80000000); + tmpxor[4] = msg[4]; - int hashPosition = nounce - startNounce; - uint64_t *inpHash = &g_hash[8 * hashPosition]; + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); - uint2 keccak_gpu_state[25]; -#pragma unroll - for (int i = 0; i<8; i++) + s[0] = msg[0] ^ bc[4]; + s[1] = ROL2(msg[6] ^ bc[0], 44); + s[6] = ROL2(bc[3], 20); + s[9] = ROL2(bc[1], 61); + s[22] = ROL2(bc[3], 39); + s[14] = ROL2(bc[4], 18); + s[20] = ROL2(msg[2] ^ bc[1], 62); + s[2] = ROL2(bc[1], 43); + s[12] = ROL2(bc[2], 25); + s[13] = ROL8(bc[3]); + s[19] = ROR8(bc[2]); + s[23] = ROL2(bc[4], 41); + s[15] = ROL2(msg[4] ^ bc[3], 27); + s[4] = ROL2(bc[3], 14); + s[24] = ROL2(bc[0], 2); + s[21] = ROL2(make_uint2(0x1, 0x80000000) ^ bc[2], 55); + s[8] = ROL2(bc[0], 45); + s[16] = ROL2(msg[5] ^ bc[4], 36); + s[5] = ROL2(msg[3] ^ bc[2], 28); + s[3] = ROL2(bc[2], 21); + s[18] = ROL2(bc[1], 15); + s[17] = ROL2(bc[0], 10); + s[11] = ROL2(msg[7] ^ bc[1], 6); + s[7] = ROL2(bc[4], 3); + s[10] = ROL2(msg[1] ^ bc[0], 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); 
s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0].x ^= 1; + +#pragma unroll 2 + for (int i = 1; i < 24; ++i) { - keccak_gpu_state[i] = vectorize(inpHash[i]); - } - keccak_gpu_state[8] = make_uint2(0x00000001UL, 0x80000000); //vectorize(0x8000000000000001ULL); #pragma unroll - for (int i=9; i<25; i++) - { - keccak_gpu_state[i] = make_uint2(0, 0); + for (int x = 0; x < 5; x++) + tmpxor[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20]; + + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = ROL2(s[6] ^ bc[0], 44); + s[6] = ROL2(s[9] ^ bc[3], 20); + s[9] = ROL2(s[22] ^ bc[1], 61); + s[22] = ROL2(s[14] ^ bc[3], 39); + s[14] = ROL2(s[20] ^ bc[4], 18); + s[20] = ROL2(s[2] ^ bc[1], 62); + s[2] = ROL2(s[12] ^ bc[1], 43); + s[12] = ROL2(s[13] ^ bc[2], 25); + s[13] = ROL8(s[19] ^ bc[3]); + s[19] = ROR8(s[23] ^ bc[2]); + s[23] = ROL2(s[15] ^ bc[4], 41); + s[15] = ROL2(s[4] ^ bc[3], 27); + s[4] = ROL2(s[24] ^ bc[3], 14); + s[24] = ROL2(s[21] ^ bc[0], 2); + s[21] = ROL2(s[8] ^ bc[2], 55); + s[8] = ROL2(s[16] ^ bc[0], 45); + s[16] = ROL2(s[5] ^ bc[4], 36); + s[5] = ROL2(s[3] ^ bc[2], 28); + s[3] = ROL2(s[18] ^ bc[2], 21); + s[18] = ROL2(s[17] ^ bc[1], 15); + s[17] = ROL2(s[11] ^ bc[0], 10); + 
s[11] = ROL2(s[7] ^ bc[1], 6); + s[7] = ROL2(s[10] ^ bc[4], 3); + s[10] = ROL2(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0] ^= c_keccak_round_constants35[i]; } - keccak_block_35(keccak_gpu_state); #pragma unroll for(int i=0;i<8;i++) - inpHash[i] = devectorize(keccak_gpu_state[i]); + inpHash[i] = s[i]; } } -__global__ __launch_bounds__(256, 2) -void quark_keccak512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) + +__global__ __launch_bounds__(128, 6) +void quark_keccakskein512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint2 *const __restrict__ g_hash, uint32_t *const __restrict__ g_nonceVector) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const 
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) + { + const uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + + const uint32_t hashPosition = nounce - startNounce; + uint2 *const inpHash = &g_hash[8 * hashPosition]; + + uint2 s[25]; + uint2 bc[5], tmpxor[5], tmp1, tmp2; + + uint2 msg[8]; + + ((uint28*)msg)[0] = ((uint28*)inpHash)[0]; + ((uint28*)msg)[1] = ((uint28*)inpHash)[1]; + + tmpxor[0] = msg[0] ^ msg[5]; + tmpxor[1] = msg[1] ^ msg[6]; + tmpxor[2] = msg[2] ^ msg[7]; + tmpxor[3] = msg[3] ^ make_uint2(0x1, 0x80000000); + tmpxor[4] = msg[4]; + + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); + + s[0] = inpHash[0] ^ bc[4]; + s[1] = ROL2(inpHash[6] ^ bc[0], 44); + s[6] = ROL2(bc[3], 20); + s[9] = ROL2(bc[1], 61); + s[22] = ROL2(bc[3], 39); + s[14] = ROL2(bc[4], 18); + s[20] = ROL2(inpHash[2] ^ bc[1], 62); + s[2] = ROL2(bc[1], 43); + s[12] = ROL2(bc[2], 25); + s[13] = ROL8(bc[3]); + s[19] = ROR8(bc[2]); + s[23] = ROL2(bc[4], 41); + s[15] = ROL2(inpHash[4] ^ bc[3], 27); + s[4] = ROL2(bc[3], 14); + s[24] = ROL2(bc[0], 2); + s[21] = ROL2(make_uint2(0x1, 0x80000000) ^ bc[2], 55); + s[8] = ROL2(bc[0], 45); + s[16] = ROL2(inpHash[5] ^ bc[4], 36); + s[5] = ROL2(inpHash[3] ^ bc[2], 28); + s[3] = ROL2(bc[2], 21); + s[18] = ROL2(bc[1], 15); + s[17] = ROL2(bc[0], 10); + s[11] = ROL2(inpHash[7] ^ bc[1], 6); + s[7] = ROL2(bc[4], 3); + s[10] = ROL2(inpHash[1] ^ bc[0], 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] 
^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0].x ^= 1; + +#pragma unroll 2 + for (int i = 1; i < 24; ++i) + { + +#pragma unroll + for (int x = 0; x < 5; x++) + tmpxor[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20]; + + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = ROL2(s[6] ^ bc[0], 44); + s[6] = ROL2(s[9] ^ bc[3], 20); + s[9] = ROL2(s[22] ^ bc[1], 61); + s[22] = ROL2(s[14] ^ bc[3], 39); + s[14] = ROL2(s[20] ^ bc[4], 18); + s[20] = ROL2(s[2] ^ bc[1], 62); + s[2] = ROL2(s[12] ^ bc[1], 43); + s[12] = ROL2(s[13] ^ bc[2], 25); + s[13] = ROL8(s[19] ^ bc[3]); + s[19] = ROR8(s[23] ^ bc[2]); + s[23] = ROL2(s[15] ^ bc[4], 41); + s[15] = ROL2(s[4] ^ bc[3], 27); + s[4] = ROL2(s[24] ^ bc[3], 14); + s[24] = ROL2(s[21] ^ bc[0], 2); + s[21] = ROL2(s[8] ^ bc[2], 55); + s[8] = ROL2(s[16] ^ bc[0], 45); + s[16] = ROL2(s[5] ^ bc[4], 36); + s[5] = ROL2(s[3] ^ bc[2], 28); + s[3] = ROL2(s[18] ^ bc[2], 21); + s[18] = 
ROL2(s[17] ^ bc[1], 15); + s[17] = ROL2(s[11] ^ bc[0], 10); + s[11] = ROL2(s[7] ^ bc[1], 6); + s[7] = ROL2(s[10] ^ bc[4], 3); + s[10] = ROL2(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0] ^= c_keccak_round_constants35[i]; + } + +//#pragma unroll +// for (int i = 0; i<8; i++) +// inpHash[i] = s[i]; + uint2 skein_p[8], h[9]; + + h[0] = skein_p[0] = (s[0]); + h[1] = skein_p[1] = (s[1]); + h[2] = skein_p[2] = (s[2]); + h[3] = skein_p[3] = (s[3]); + h[4] = skein_p[4] = (s[4]); + h[5] = skein_p[5] = (s[5]); + h[6] = skein_p[6] = (s[6]); + h[7] = skein_p[7] = (s[7]); + + skein_p[0] += vectorize(0x4903ADFF749C51CEULL); + skein_p[1] += vectorize(0x0D95DE399746DF03ULL); + skein_p[2] += vectorize(0x8FD1934127C79BCEULL); + skein_p[3] += 
vectorize(0x9A255629FF352CB1ULL); + skein_p[4] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[5] += vectorize(0xEABE394CA9D5C434ULL); + skein_p[6] += vectorize(0x891112C71A75B523ULL); + skein_p[7] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0x0D95DE399746DF03ULL); + skein_p[1] += vectorize(0x8FD1934127C79BCEULL); + skein_p[2] += vectorize(0x9A255629FF352CB1ULL); + skein_p[3] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[4] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[5] += vectorize(0x891112C71A75B523ULL); + skein_p[6] += vectorize(0x9E18A40B660FCC73ULL); + skein_p[7] += vectorize(0xcab2076d98173ec4ULL + 1); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] 
= ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0x8FD1934127C79BCEULL); + skein_p[1] += vectorize(0x9A255629FF352CB1ULL); + skein_p[2] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[3] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[4] += vectorize(0x991112C71A75B523ULL); + skein_p[5] += vectorize(0x9E18A40B660FCC73ULL); + skein_p[6] += vectorize(0xCAB2076D98173F04ULL); + skein_p[7] += vectorize(0x4903ADFF749C51D0ULL); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] 
+= skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0x9A255629FF352CB1ULL); + skein_p[1] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[2] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[3] += vectorize(0x991112C71A75B523ULL); + skein_p[4] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[5] += vectorize(0xcab2076d98173f04ULL); + skein_p[6] += vectorize(0x3903ADFF749C51CEULL); + skein_p[7] += vectorize(0x0D95DE399746DF03ULL + 3); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = 
ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[1] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[2] += vectorize(0x991112C71A75B523ULL); + skein_p[3] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[4] += vectorize(0xcab2076d98173ec4ULL); + skein_p[5] += vectorize(0x3903ADFF749C51CEULL); + skein_p[6] += vectorize(0xFD95DE399746DF43ULL); + skein_p[7] += vectorize(0x8FD1934127C79BD2ULL); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] 
+= skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[1] += vectorize(0x991112C71A75B523ULL); + skein_p[2] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[3] += vectorize(0xcab2076d98173ec4ULL); + skein_p[4] += vectorize(0x4903ADFF749C51CEULL); + skein_p[5] += vectorize(0x0D95DE399746DF03ULL + 0xf000000000000040ULL); + skein_p[6] += vectorize(0x8FD1934127C79BCEULL + 0x0000000000000040ULL); + skein_p[7] += vectorize(0x9A255629FF352CB1ULL + 5); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0x991112C71A75B523ULL); + skein_p[1] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[2] += vectorize(0xcab2076d98173ec4ULL); + skein_p[3] += vectorize(0x4903ADFF749C51CEULL); + skein_p[4] += vectorize(0x0D95DE399746DF03ULL); 
+ skein_p[5] += vectorize(0x8FD1934127C79BCEULL + 0x0000000000000040ULL); + skein_p[6] += vectorize(0x8A255629FF352CB1ULL); + skein_p[7] += vectorize(0x5DB62599DF6CA7B0ULL + 6); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[1] += vectorize(0xcab2076d98173ec4ULL); + skein_p[2] += vectorize(0x4903ADFF749C51CEULL); + skein_p[3] += vectorize(0x0D95DE399746DF03ULL); + skein_p[4] += vectorize(0x8FD1934127C79BCEULL); + skein_p[5] += vectorize(0x8A255629FF352CB1ULL); + skein_p[6] += vectorize(0x4DB62599DF6CA7F0ULL); + skein_p[7] += vectorize(0xEABE394CA9D5C3F4ULL + 7); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += 
skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0xcab2076d98173ec4ULL); + skein_p[1] += vectorize(0x4903ADFF749C51CEULL); + skein_p[2] += vectorize(0x0D95DE399746DF03ULL); + skein_p[3] += vectorize(0x8FD1934127C79BCEULL); + skein_p[4] += vectorize(0x9A255629FF352CB1ULL); + skein_p[5] += vectorize(0x4DB62599DF6CA7F0ULL); + skein_p[6] += vectorize(0xEABE394CA9D5C434ULL); + skein_p[7] += vectorize(0x991112C71A75B52BULL); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ 
skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0x4903ADFF749C51CEULL); + skein_p[1] += vectorize(0x0D95DE399746DF03ULL); + skein_p[2] += vectorize(0x8FD1934127C79BCEULL); + skein_p[3] += vectorize(0x9A255629FF352CB1ULL); + skein_p[4] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[5] += vectorize(0xEABE394CA9D5C434ULL); + skein_p[6] += vectorize(0x891112C71A75B523ULL); + skein_p[7] += vectorize(0xAE18A40B660FCC33ULL + 9); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + 
skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0x0D95DE399746DF03ULL); + skein_p[1] += vectorize(0x8FD1934127C79BCEULL); + skein_p[2] += vectorize(0x9A255629FF352CB1ULL); + skein_p[3] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[4] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[5] += vectorize(0x891112C71A75B523ULL); + skein_p[6] += vectorize(0x9E18A40B660FCC73ULL); + skein_p[7] += vectorize(0xcab2076d98173eceULL); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + 
skein_p[0] += vectorize(0x8FD1934127C79BCEULL); + skein_p[1] += vectorize(0x9A255629FF352CB1ULL); + skein_p[2] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[3] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[4] += vectorize(0x991112C71A75B523ULL); + skein_p[5] += vectorize(0x9E18A40B660FCC73ULL); + skein_p[6] += vectorize(0xcab2076d98173ec4ULL + 0x0000000000000040ULL); + skein_p[7] += vectorize(0x4903ADFF749C51CEULL + 11); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0x9A255629FF352CB1ULL); + skein_p[1] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[2] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[3] += vectorize(0x991112C71A75B523ULL); + skein_p[4] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[5] += vectorize(0xcab2076d98173ec4ULL + 0x0000000000000040ULL); + 
skein_p[6] += vectorize(0x3903ADFF749C51CEULL); + skein_p[7] += vectorize(0x0D95DE399746DF03ULL + 12); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[1] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[2] += vectorize(0x991112C71A75B523ULL); + skein_p[3] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[4] += vectorize(0xcab2076d98173ec4ULL); + skein_p[5] += vectorize(0x3903ADFF749C51CEULL); + skein_p[6] += vectorize(0x0D95DE399746DF03ULL + 0xf000000000000040ULL); + skein_p[7] += vectorize(0x8FD1934127C79BCEULL + 13); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ 
skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[1] += vectorize(0x991112C71A75B523ULL); + skein_p[2] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[3] += vectorize(0xcab2076d98173ec4ULL); + skein_p[4] += vectorize(0x4903ADFF749C51CEULL); + skein_p[5] += vectorize(0x0D95DE399746DF03ULL + 0xf000000000000040ULL); + skein_p[6] += vectorize(0x8FD1934127C79BCEULL + 0x0000000000000040ULL); + skein_p[7] += vectorize(0x9A255629FF352CB1ULL + 14); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) 
^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0x991112C71A75B523ULL); + skein_p[1] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[2] += vectorize(0xcab2076d98173ec4ULL); + skein_p[3] += vectorize(0x4903ADFF749C51CEULL); + skein_p[4] += vectorize(0x0D95DE399746DF03ULL); + skein_p[5] += vectorize(0x8FD1934127C79BCEULL + 0x0000000000000040ULL); + skein_p[6] += vectorize(0x8A255629FF352CB1ULL); + skein_p[7] += vectorize(0x5DB62599DF6CA7B0ULL + 15); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + 
skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[1] += vectorize(0xcab2076d98173ec4ULL); + skein_p[2] += vectorize(0x4903ADFF749C51CEULL); + skein_p[3] += vectorize(0x0D95DE399746DF03ULL); + skein_p[4] += vectorize(0x8FD1934127C79BCEULL); + skein_p[5] += vectorize(0x8A255629FF352CB1ULL); + skein_p[6] += vectorize(0x4DB62599DF6CA7F0ULL); + skein_p[7] += vectorize(0xEABE394CA9D5C3F4ULL + 16ULL); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = 
ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0xcab2076d98173ec4ULL); + skein_p[1] += vectorize(0x4903ADFF749C51CEULL); + skein_p[2] += vectorize(0x0D95DE399746DF03ULL); + skein_p[3] += vectorize(0x8FD1934127C79BCEULL); + skein_p[4] += vectorize(0x9A255629FF352CB1ULL); + skein_p[5] += vectorize(0x4DB62599DF6CA7F0ULL); + skein_p[6] += vectorize(0xEABE394CA9D5C3F4ULL + 0x0000000000000040ULL); + skein_p[7] += vectorize(0x991112C71A75B523ULL + 17); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0x4903ADFF749C51CEULL); + skein_p[1] += vectorize(0x0D95DE399746DF03ULL); + skein_p[2] += vectorize(0x8FD1934127C79BCEULL); + skein_p[3] += vectorize(0x9A255629FF352CB1ULL); + skein_p[4] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[5] += vectorize(0xEABE394CA9D5C3F4ULL + 
0x0000000000000040ULL); + skein_p[6] += vectorize(0x891112C71A75B523ULL); + skein_p[7] += vectorize(0xAE18A40B660FCC33ULL + 18); + +#define h0 skein_p[0] +#define h1 skein_p[1] +#define h2 skein_p[2] +#define h3 skein_p[3] +#define h4 skein_p[4] +#define h5 skein_p[5] +#define h6 skein_p[6] +#define h7 skein_p[7] + h0 ^= h[0]; + h1 ^= h[1]; + h2 ^= h[2]; + h3 ^= h[3]; + h4 ^= h[4]; + h5 ^= h[5]; + h6 ^= h[6]; + h7 ^= h[7]; + + const uint2 skein_h8 = h0 ^ h1 ^ h2 ^ h3 ^ h4 ^ h5 ^ h6 ^ h7 ^ vectorize(0x1BD11BDAA9FC1A22ULL); + + uint2 hash64[8]; + + hash64[0] = h0 + h1; + hash64[2] = h2 + h3; + hash64[5] = (h5 + vectorizelow(8ULL)); + + hash64[1] = ROL2(h1, 46) ^ hash64[0]; + hash64[3] = ROL2(h3, 36) ^ hash64[2]; + hash64[4] = h4 + hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] = (h6 + vectorizehigh(0xff000000UL)) + h7; + hash64[7] = ROL2(h7, 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h1); + hash64[1] = (hash64[1] + h2); + hash64[2] = (hash64[2] + h3); + hash64[3] = (hash64[3] + h4); + hash64[4] = (hash64[4] + h5); + hash64[5] = (hash64[5] + h6 + vectorizehigh(0xff000000UL)); + hash64[6] = 
(hash64[6] + h7 + vectorize(0xff00000000000008ULL)); + hash64[7] = (hash64[7] + skein_h8 + vectorizelow(1)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + h2); + hash64[1] = (hash64[1] + h3); + hash64[2] = (hash64[2] + h4); + hash64[3] = (hash64[3] + h5); + hash64[4] = (hash64[4] + h6); + hash64[5] = (hash64[5] + h7 + vectorize(0xff00000000000008ULL)); + hash64[6] = (hash64[6] + skein_h8 + vectorizelow(8ULL)); + hash64[7] = (hash64[7] + h0 + vectorize(2)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + 
hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h3); + hash64[1] = (hash64[1] + h4); + hash64[2] = (hash64[2] + h5); + hash64[3] = (hash64[3] + h6); + hash64[4] = (hash64[4] + h7); + hash64[5] = (hash64[5] + skein_h8 + vectorizelow(8)); + hash64[6] = (hash64[6] + h0 + vectorizehigh(0xff000000UL)); + hash64[7] = (hash64[7] + h1 + vectorizelow(3)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + 
hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + h4); + hash64[1] = (hash64[1] + h5); + hash64[2] = (hash64[2] + h6); + hash64[3] = (hash64[3] + h7); + hash64[4] = (hash64[4] + skein_h8); + hash64[5] = (hash64[5] + h0 + vectorizehigh(0xff000000UL)); + hash64[6] = (hash64[6] + h1 + vectorize(0xff00000000000008ULL)); + hash64[7] = (hash64[7] + h2 + vectorizelow(4)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h5); + hash64[1] = (hash64[1] + h6); + hash64[2] = (hash64[2] + h7); + hash64[3] = (hash64[3] + skein_h8); + hash64[4] = (hash64[4] + h0); + hash64[5] = (hash64[5] + h1 + vectorize(0xff00000000000008ULL)); + 
hash64[6] = (hash64[6] + h2 + vectorizelow(8ULL)); + hash64[7] = (hash64[7] + h3 + vectorizelow(5)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + h6); + hash64[1] = (hash64[1] + h7); + hash64[2] = (hash64[2] + skein_h8); + hash64[3] = (hash64[3] + h0); + hash64[4] = (hash64[4] + h1); + hash64[5] = (hash64[5] + h2 + vectorizelow(8ULL)); + hash64[6] = (hash64[6] + h3 + vectorizehigh(0xff000000UL)); + hash64[7] = (hash64[7] + h4 + vectorizelow(6)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = 
ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h7); + hash64[1] = (hash64[1] + skein_h8); + hash64[2] = (hash64[2] + h0); + hash64[3] = (hash64[3] + h1); + hash64[4] = (hash64[4] + h2); + hash64[5] = (hash64[5] + h3 + vectorizehigh(0xff000000UL)); + hash64[6] = (hash64[6] + h4 + vectorize(0xff00000000000008ULL)); + hash64[7] = (hash64[7] + h5 + vectorizelow(7)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += 
hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + skein_h8); + hash64[1] = (hash64[1] + h0); + hash64[2] = (hash64[2] + h1); + hash64[3] = (hash64[3] + h2); + hash64[4] = (hash64[4] + h3); + hash64[5] = (hash64[5] + h4 + vectorize(0xff00000000000008ULL)); + hash64[6] = (hash64[6] + h5 + vectorizelow(8)); + hash64[7] = (hash64[7] + h6 + vectorizelow(8)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h0); + hash64[1] = (hash64[1] + h1); + hash64[2] = (hash64[2] + h2); + hash64[3] = (hash64[3] + h3); + hash64[4] = (hash64[4] + h4); + hash64[5] = (hash64[5] + h5 + vectorizelow(8)); + hash64[6] = (hash64[6] 
+ h6 + vectorizehigh(0xff000000UL)); + hash64[7] = (hash64[7] + h7 + vectorizelow(9)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + + hash64[0] = (hash64[0] + h1); + hash64[1] = (hash64[1] + h2); + hash64[2] = (hash64[2] + h3); + hash64[3] = (hash64[3] + h4); + hash64[4] = (hash64[4] + h5); + hash64[5] = (hash64[5] + h6 + vectorizehigh(0xff000000UL)); + hash64[6] = (hash64[6] + h7 + vectorize(0xff00000000000008ULL)); + hash64[7] = (hash64[7] + skein_h8 + (vectorizelow(10))); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = 
ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h2); + hash64[1] = (hash64[1] + h3); + hash64[2] = (hash64[2] + h4); + hash64[3] = (hash64[3] + h5); + hash64[4] = (hash64[4] + h6); + hash64[5] = (hash64[5] + h7 + vectorize(0xff00000000000008ULL)); + hash64[6] = (hash64[6] + skein_h8 + vectorizelow(8ULL)); + hash64[7] = (hash64[7] + h0 + vectorizelow(11)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + 
hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + h3); + hash64[1] = (hash64[1] + h4); + hash64[2] = (hash64[2] + h5); + hash64[3] = (hash64[3] + h6); + hash64[4] = (hash64[4] + h7); + hash64[5] = (hash64[5] + skein_h8 + vectorizelow(8)); + hash64[6] = (hash64[6] + h0 + vectorizehigh(0xff000000UL)); + hash64[7] = (hash64[7] + h1 + vectorizelow(12)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h4); + hash64[1] = (hash64[1] + h5); + hash64[2] = (hash64[2] + h6); + hash64[3] = (hash64[3] + h7); + hash64[4] = (hash64[4] + skein_h8); + hash64[5] = (hash64[5] + h0 + vectorizehigh(0xff000000UL)); + hash64[6] = 
(hash64[6] + h1 + vectorize(0xff00000000000008ULL)); + hash64[7] = (hash64[7] + h2 + vectorizelow(13)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + h5); + hash64[1] = (hash64[1] + h6); + hash64[2] = (hash64[2] + h7); + hash64[3] = (hash64[3] + skein_h8); + hash64[4] = (hash64[4] + h0); + hash64[5] = (hash64[5] + h1 + vectorize(0xff00000000000008ULL)); + hash64[6] = (hash64[6] + h2 + vectorizelow(8ULL)); + hash64[7] = (hash64[7] + h3 + vectorizelow(14)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + 
hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h6); + hash64[1] = (hash64[1] + h7); + hash64[2] = (hash64[2] + skein_h8); + hash64[3] = (hash64[3] + h0); + hash64[4] = (hash64[4] + h1); + hash64[5] = (hash64[5] + h2 + vectorizelow(8ULL)); + hash64[6] = (hash64[6] + h3 + vectorizehigh(0xff000000UL)); + hash64[7] = (hash64[7] + h4 + vectorizelow(15)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += 
hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + h7); + hash64[1] = (hash64[1] + skein_h8); + hash64[2] = (hash64[2] + h0); + hash64[3] = (hash64[3] + h1); + hash64[4] = (hash64[4] + h2); + hash64[5] = (hash64[5] + h3 + vectorizehigh(0xff000000UL)); + hash64[6] = (hash64[6] + h4 + vectorize(0xff00000000000008ULL)); + hash64[7] = (hash64[7] + h5 + vectorizelow(16)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + skein_h8); + hash64[1] = (hash64[1] + h0); + hash64[2] = (hash64[2] + h1); + hash64[3] = (hash64[3] + h2); + hash64[4] = (hash64[4] + h3); + hash64[5] = (hash64[5] + h4 + 
vectorize(0xff00000000000008ULL)); + hash64[6] = (hash64[6] + h5 + vectorizelow(8ULL)); + hash64[7] = (hash64[7] + h6 + vectorizelow(17)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + + //#pragma unroll + // for (int i = 0; i<8; i++) + // inpHash[i] = s[i]; + uint64_t *const outHash = (uint64_t *)&g_hash[8 * hashPosition]; + + outHash[0] = devectorize(hash64[0] + h0); + outHash[1] = devectorize(hash64[1] + h1); + outHash[2] = devectorize(hash64[2] + h2); + outHash[3] = devectorize(hash64[3] + h3); + outHash[4] = devectorize(hash64[4] + h4); + outHash[5] = devectorize(hash64[5] + h5) + 8; + outHash[6] = devectorize(hash64[6] + h6) + 0xff00000000000000ULL; + outHash[7] = devectorize(hash64[7] + h7) + 18; + } + +#undef h0 +#undef h1 +#undef h2 +#undef h3 +#undef h4 +#undef h5 +#undef h6 +#undef h7 +} + +__global__ __launch_bounds__(192, 4) +void 
quark_keccak512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint2 *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector, uint32_t *const __restrict__ d_found, uint32_t target) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if(thread < threads) { uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); int hashPosition = nounce - startNounce; - uint64_t *inpHash = &g_hash[8 * hashPosition]; + const uint2 *inpHash = &g_hash[8 * hashPosition]; - uint2 keccak_gpu_state[25]; -#pragma unroll - for (int i = 0; i<8; i++) + uint2 msg[8]; + + uint28 *phash = (uint28*)inpHash; + uint28 *outpt = (uint28*)msg; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + + uint2 s[25]; + uint2 bc[5], tmpxor[5], tmp1, tmp2; + + tmpxor[0] = msg[0] ^ msg[5]; + tmpxor[1] = msg[1] ^ msg[6]; + tmpxor[2] = msg[2] ^ msg[7]; + tmpxor[3] = msg[3] ^ make_uint2(0x1, 0x80000000); + tmpxor[4] = msg[4]; + + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); + + s[0] = inpHash[0] ^ bc[4]; + s[1] = ROL2(inpHash[6] ^ bc[0], 44); + s[6] = ROL2(bc[3], 20); + s[9] = ROL2(bc[1], 61); + s[22] = ROL2(bc[3], 39); + s[14] = ROL2(bc[4], 18); + s[20] = ROL2(inpHash[2] ^ bc[1], 62); + s[2] = ROL2(bc[1], 43); + s[12] = ROL2(bc[2], 25); + s[13] = ROL8(bc[3]); + s[19] = ROR8(bc[2]); + s[23] = ROL2(bc[4], 41); + s[15] = ROL2(inpHash[4] ^ bc[3], 27); + s[4] = ROL2(bc[3], 14); + s[24] = ROL2(bc[0], 2); + s[21] = ROL2(make_uint2(0x1, 0x80000000) ^ bc[2], 55); + s[8] = ROL2(bc[0], 45); + s[16] = ROL2(inpHash[5] ^ bc[4], 36); + s[5] = ROL2(inpHash[3] ^ bc[2], 28); + s[3] = ROL2(bc[2], 21); + s[18] = ROL2(bc[1], 15); + s[17] = ROL2(bc[0], 10); + s[11] = ROL2(inpHash[7] ^ bc[1], 6); + s[7] = ROL2(bc[4], 3); + s[10] = ROL2(inpHash[1] ^ bc[0], 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = 
bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0].x ^= 1; + +#pragma nounroll + for(int i = 1; i < 23; i++) { - keccak_gpu_state[i] = vectorize(inpHash[i]); - } - keccak_gpu_state[8] = make_uint2(0x00000001UL, 0x80000000); //vectorize(0x8000000000000001ULL); #pragma unroll - for (int i = 9; i<25; i++) - { - keccak_gpu_state[i] = make_uint2(0, 0); + for(int x = 0; x < 5; x++) + tmpxor[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20]; + + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = ROL2(s[6] ^ bc[0], 44); + s[6] = ROL2(s[9] ^ bc[3], 20); + s[9] = ROL2(s[22] ^ 
bc[1], 61); + s[22] = ROL2(s[14] ^ bc[3], 39); + s[14] = ROL2(s[20] ^ bc[4], 18); + s[20] = ROL2(s[2] ^ bc[1], 62); + s[2] = ROL2(s[12] ^ bc[1], 43); + s[12] = ROL2(s[13] ^ bc[2], 25); + s[13] = ROL8(s[19] ^ bc[3]); + s[19] = ROR8(s[23] ^ bc[2]); + s[23] = ROL2(s[15] ^ bc[4], 41); + s[15] = ROL2(s[4] ^ bc[3], 27); + s[4] = ROL2(s[24] ^ bc[3], 14); + s[24] = ROL2(s[21] ^ bc[0], 2); + s[21] = ROL2(s[8] ^ bc[2], 55); + s[8] = ROL2(s[16] ^ bc[0], 45); + s[16] = ROL2(s[5] ^ bc[4], 36); + s[5] = ROL2(s[3] ^ bc[2], 28); + s[3] = ROL2(s[18] ^ bc[2], 21); + s[18] = ROL2(s[17] ^ bc[1], 15); + s[17] = ROL2(s[11] ^ bc[0], 10); + s[11] = ROL2(s[7] ^ bc[1], 6); + s[7] = ROL2(s[10] ^ bc[4], 3); + s[10] = ROL2(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); + s[0] ^= c_keccak_round_constants35[i]; + s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); 
s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); } - keccak_block_35_final(keccak_gpu_state); + uint2 t[5]; + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; - inpHash[3] = devectorize(keccak_gpu_state[3]); + s[0] ^= t[4] ^ ROL2(t[1], 1); + s[18] ^= t[2] ^ ROL2(t[4], 1); + s[24] ^= t[3] ^ ROL2(t[0], 1); + + s[3] = ROL2(s[18], 21) ^ ((~ROL2(s[24], 14)) & s[0]); + + if(s[3].y <= target) + { + uint32_t tmp = atomicCAS(d_found, 0xffffffff, nounce); + if(tmp != 0xffffffff) + d_found[1] = nounce; + } } } -__host__ void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash) { - const uint32_t threadsperblock = 32; + const uint32_t threadsperblock = 128; // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - quark_keccak512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + quark_keccak512_gpu_hash_64<<<grid, block, 0, gpustream[thr_id]>>>(threads, startNounce, (uint2 *)d_hash, d_nonceVector); +} + +__host__ void quark_keccak512_cpu_init(int thr_id) +{ + CUDA_SAFE_CALL(cudaMalloc(&(d_found[thr_id]), 2 * sizeof(uint32_t))); } -__host__ void quark_keccak512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void quark_keccak512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, uint32_t target, uint32_t *h_found) { const uint32_t threadsperblock = 32; + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 
block(threadsperblock); + cudaMemsetAsync(d_found[thr_id], 0xff, 2 * sizeof(uint32_t), gpustream[thr_id]); + quark_keccak512_gpu_hash_64_final <<<grid, block, 0, gpustream[thr_id]>>>(threads, startNounce, (uint2 *)d_hash, d_nonceVector, d_found[thr_id], target); + CUDA_SAFE_CALL(cudaMemcpyAsync(h_found, d_found[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id])); +} + +__host__ void quark_keccakskein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash) +{ + const uint32_t threadsperblock = 64; + // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - quark_keccak512_gpu_hash_64_final <<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + quark_keccakskein512_gpu_hash_64 <<<grid, block, 0, gpustream[thr_id]>>>(threads, startNounce, (uint2 *)d_hash, d_nonceVector); } + diff --git a/quark/cuda_skein512.cu b/quark/cuda_skein512.cu index ac0be31667..1d9e5ff69d 100644 --- a/quark/cuda_skein512.cu +++ b/quark/cuda_skein512.cu @@ -1,18 +1,25 @@ -#include -#include +#include +#include +using namespace std; #include +#include "miner.h" +#include "cuda_helper.h" +#include "cuda_vector.h" + -#include "cuda_helper.h" -#define TPB 128 #define TPBf 128 +#define TPB52 1024 +#define TPB50 256 + +static __constant__ uint64_t c_PaddedMessage80[2]; // padded message (80 bytes + padding) +__constant__ uint2 precalcvalues[9]; +static uint32_t *d_nonce[MAX_GPUS]; // Take a look at: https://www.schneier.com/skein1.3.pdf #define SHL(x, n) ((x) << (n)) #define SHR(x, n) ((x) >> (n)) -static uint32_t *d_nonce[MAX_GPUS]; - /* * M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7). 
*/ @@ -238,24 +245,23 @@ static uint32_t *d_nonce[MAX_GPUS]; #define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v)) #define TFBIG_KINIT(k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2) { \ - k8 = ((k0 ^ k1) ^ (k2 ^ k3)) ^ ((k4 ^ k5) ^ (k6 ^ k7)) \ - ^ make_uint2( 0xA9FC1A22UL,0x1BD11BDA); \ + k8 = k0 ^ k1 ^ k2 ^ k3 ^ k4 ^ k5 ^ k6 ^ k7 ^ make_uint2(0xA9FC1A22UL, 0x1BD11BDA); \ t2 = t0 ^ t1; \ } //vectorize(0x1BD11BDAA9FC1A22ULL); #define TFBIG_ADDKEY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \ - w0 = (w0 + SKBI(k, s, 0)); \ - w1 = (w1 + SKBI(k, s, 1)); \ - w2 = (w2 + SKBI(k, s, 2)); \ - w3 = (w3 + SKBI(k, s, 3)); \ - w4 = (w4 + SKBI(k, s, 4)); \ - w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ - w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ - w7 = (w7 + SKBI(k, s, 7) + vectorizelow(s)); \ + w0 += SKBI(k, s, 0); \ + w1 += SKBI(k, s, 1); \ + w2 += SKBI(k, s, 2); \ + w3 += SKBI(k, s, 3); \ + w4 += SKBI(k, s, 4); \ + w5 += SKBI(k, s, 5) + SKBT(t, s, 0); \ + w6 += SKBI(k, s, 6) + SKBT(t, s, 1); \ + w7 += SKBI(k, s, 7) + vectorizelow(s); \ } #define TFBIG_MIX(x0, x1, rc) { \ - x0 = x0 + x1; \ + x0 += x1; \ x1 = ROL2(x1, rc) ^ x0; \ } @@ -282,119 +288,1628 @@ static uint32_t *d_nonce[MAX_GPUS]; TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ } -__global__ -#if __CUDA_ARCH__ > 500 -__launch_bounds__(TPB, 2) -#else -__launch_bounds__(TPB, 1) -#endif -void quark_skein512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t * const __restrict__ g_hash, uint32_t *g_nonceVector) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - // Skein - uint2 p[8]; - uint2 h0, h1, h2, h3, h4, h5, h6, h7, h8; - uint2 t0, t1, t2; - - uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint64_t *inpHash = &g_hash[8 * hashPosition]; +/* uint2 variant for SM3.2+ */ - h0 = make_uint2(0x749C51CEull, 0x4903ADFF); - h1 = make_uint2(0x9746DF03ull, 0x0D95DE39); - h2 = make_uint2(0x27C79BCEull, 0x8FD19341); - h3 = make_uint2(0xFF352CB1ull, 0x9A255629); - h4 = make_uint2(0xDF6CA7B0ull, 0x5DB62599); - h5 = make_uint2(0xA9D5C3F4ull, 0xEABE394C); - h6 = make_uint2(0x1A75B523ull, 0x991112C7); - h7 = make_uint2(0x660FCC33ull, 0xAE18A40B); +#define TFBIG_KINIT_UI2(k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2) { \ + k8 = k0 ^ k1 ^ k2 ^ k3 ^ k4 ^ k5 ^ k6 ^ k7 ^ vectorize(SPH_C64(0x1BD11BDAA9FC1A22)); \ + t2 = t0 ^ t1; \ + } - // 1. Runde -> etype = 480, ptr = 64, bcount = 0, data = msg -#pragma unroll 8 - for(int i=0;i<8;i++) - p[i] = vectorize(inpHash[i]); +#define TFBIG_ADDKEY_UI2(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \ + w0 += SKBI(k, s, 0); \ + w1 += SKBI(k, s, 1); \ + w2 += SKBI(k, s, 2); \ + w3 += SKBI(k, s, 3); \ + w4 += SKBI(k, s, 4); \ + w5 += SKBI(k, s, 5) + SKBT(t, s, 0); \ + w6 += SKBI(k, s, 6) + SKBT(t, s, 1); \ + w7 += SKBI(k, s, 7) + vectorize(s); \ + } - t0 = vectorizelow(64); // ptr - t1 = vectorize(480ull << 55); // etype - TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); - TFBIG_4e(0); - TFBIG_4o(1); - TFBIG_4e(2); - TFBIG_4o(3); - TFBIG_4e(4); - TFBIG_4o(5); - TFBIG_4e(6); - TFBIG_4o(7); - TFBIG_4e(8); - TFBIG_4o(9); - TFBIG_4e(10); - TFBIG_4o(11); - TFBIG_4e(12); - TFBIG_4o(13); - TFBIG_4e(14); - TFBIG_4o(15); - TFBIG_4e(16); - TFBIG_4o(17); - TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); +#define TFBIG_ADDKEY_PRE(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \ + w0 += SKBI(k, s, 0); \ + w1 += SKBI(k, s, 1); \ + w2 += SKBI(k, s, 2); \ + w3 += SKBI(k, s, 3); \ + w4 += SKBI(k, s, 4); \ + w5 += SKBI(k, s, 5) + SKBT(t, s, 0); \ + w6 += SKBI(k, s, 6) + SKBT(t, s, 1); \ + w7 += SKBI(k, s, 7) + (s); \ + } + +#define 
TFBIG_MIX_UI2(x0, x1, rc) { \ + x0 += x1; \ + x1 = ROL2(x1, rc) ^ x0; \ + } - h0 = vectorize(inpHash[0]) ^ p[0]; - h1 = vectorize(inpHash[1]) ^ p[1]; - h2 = vectorize(inpHash[2]) ^ p[2]; - h3 = vectorize(inpHash[3]) ^ p[3]; - h4 = vectorize(inpHash[4]) ^ p[4]; - h5 = vectorize(inpHash[5]) ^ p[5]; - h6 = vectorize(inpHash[6]) ^ p[6]; - h7 = vectorize(inpHash[7]) ^ p[7]; +#define TFBIG_MIX_PRE(x0, x1, rc) { \ + x0 = x0 + x1; \ + x1 = ROTL64(x1, rc) ^ x0; \ + } + +#define TFBIG_MIX8_UI2(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \ + TFBIG_MIX_UI2(w0, w1, rc0); \ + TFBIG_MIX_UI2(w2, w3, rc1); \ + TFBIG_MIX_UI2(w4, w5, rc2); \ + TFBIG_MIX_UI2(w6, w7, rc3); \ + } - // 2. Runde -> etype = 510, ptr = 8, bcount = 0, data = 0 -#pragma unroll 8 - for(int i=0;i<8;i++) - p[i] = make_uint2(0,0); +#define TFBIG_MIX8_PRE(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \ + TFBIG_MIX_PRE(w0, w1, rc0); \ + TFBIG_MIX_PRE(w2, w3, rc1); \ + TFBIG_MIX_PRE(w4, w5, rc2); \ + TFBIG_MIX_PRE(w6, w7, rc3); \ + } + +#define TFBIG_4e_UI2(s) { \ + TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \ + TFBIG_MIX8_UI2(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \ + TFBIG_MIX8_UI2(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \ + TFBIG_MIX8_UI2(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \ + } - t0 = vectorizelow(8); // ptr - t1 = vectorize(510ull << 55); // etype - TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); - TFBIG_4e(0); - TFBIG_4o(1); - TFBIG_4e(2); - TFBIG_4o(3); - TFBIG_4e(4); - TFBIG_4o(5); - TFBIG_4e(6); - TFBIG_4o(7); - TFBIG_4e(8); - TFBIG_4o(9); - TFBIG_4e(10); - TFBIG_4o(11); - TFBIG_4e(12); - TFBIG_4o(13); - TFBIG_4e(14); - TFBIG_4o(15); - TFBIG_4e(16); - TFBIG_4o(17); - TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); +#define TFBIG_4e_PRE(s) { \ + 
TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \ + TFBIG_MIX8_PRE(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \ + TFBIG_MIX8_PRE(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \ + TFBIG_MIX8_PRE(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \ + } + +#define TFBIG_4o_UI2(s) { \ + TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \ + TFBIG_MIX8_UI2(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \ + TFBIG_MIX8_UI2(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \ + TFBIG_MIX8_UI2(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ + } - // fertig - uint64_t *outpHash = &g_hash[8 * hashPosition]; +#define TFBIG_4o_PRE(s) { \ + TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \ + TFBIG_MIX8_PRE(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \ + TFBIG_MIX8_PRE(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \ + TFBIG_MIX8_PRE(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ + } -#pragma unroll 8 - for(int i=0;i<8;i++) - outpHash[i] = devectorize(p[i]); +__global__ +#if __CUDA_ARCH__ > 500 +__launch_bounds__(448, 2) +#endif +void quark_skein512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t * const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + // Skein + uint2 skein_p[8], h[8]; + + const uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); + + const uint32_t hashPosition = nounce - startNounce; + uint64_t *Hash = &g_hash[8 * hashPosition]; + + + uint2 msg[8]; + + uint28 *phash = (uint28*)Hash; + uint28 *outpt = (uint28*)msg; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + + h[0] = skein_p[0] = (msg[0]); + h[1] = skein_p[1] = (msg[1]); + h[2] = skein_p[2] = (msg[2]); + h[3] = skein_p[3] = (msg[3]); + h[4] = skein_p[4] = (msg[4]); + h[5] = skein_p[5] = (msg[5]); + h[6] = skein_p[6] = (msg[6]); + h[7] = skein_p[7] = (msg[7]); + + skein_p[0] += vectorize(0x4903ADFF749C51CEULL); + skein_p[1] += vectorize(0x0D95DE399746DF03ULL); + skein_p[2] += vectorize(0x8FD1934127C79BCEULL); + skein_p[3] += vectorize(0x9A255629FF352CB1ULL); + skein_p[4] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[5] += vectorize(0xEABE394CA9D5C434ULL); + skein_p[6] += vectorize(0x891112C71A75B523ULL); + skein_p[7] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = 
ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0x0D95DE399746DF03ULL); + skein_p[1] += vectorize(0x8FD1934127C79BCEULL); + skein_p[2] += vectorize(0x9A255629FF352CB1ULL); + skein_p[3] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[4] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[5] += vectorize(0x891112C71A75B523ULL); + skein_p[6] += vectorize(0x9E18A40B660FCC73ULL); + skein_p[7] += vectorize(0xcab2076d98173ec4ULL+1); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0x8FD1934127C79BCEULL); + skein_p[1] += vectorize(0x9A255629FF352CB1ULL); + skein_p[2] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[3] += 
vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[4] += vectorize(0x991112C71A75B523ULL); + skein_p[5] += vectorize(0x9E18A40B660FCC73ULL); + skein_p[6] += vectorize(0xCAB2076D98173F04ULL); + skein_p[7] += vectorize(0x4903ADFF749C51D0ULL); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0x9A255629FF352CB1ULL); + skein_p[1] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[2] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[3] += vectorize(0x991112C71A75B523ULL); + skein_p[4] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[5] += vectorize(0xcab2076d98173f04ULL); + skein_p[6] += vectorize(0x3903ADFF749C51CEULL); + skein_p[7] += vectorize(0x0D95DE399746DF03ULL+3); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = 
ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[1] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[2] += vectorize(0x991112C71A75B523ULL); + skein_p[3] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[4] += vectorize(0xcab2076d98173ec4ULL); + skein_p[5] += vectorize(0x3903ADFF749C51CEULL); + skein_p[6] += vectorize(0xFD95DE399746DF43ULL); + skein_p[7] += vectorize(0x8FD1934127C79BD2ULL); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += 
skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[1] += vectorize(0x991112C71A75B523ULL); + skein_p[2] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[3] += vectorize(0xcab2076d98173ec4ULL); + skein_p[4] += vectorize(0x4903ADFF749C51CEULL); + skein_p[5] += vectorize(0x0D95DE399746DF03ULL + 0xf000000000000040ULL); + skein_p[6] += vectorize(0x8FD1934127C79BCEULL + 0x0000000000000040ULL); + skein_p[7] += vectorize(0x9A255629FF352CB1ULL + 5); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + 
skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0x991112C71A75B523ULL); + skein_p[1] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[2] += vectorize(0xcab2076d98173ec4ULL); + skein_p[3] += vectorize(0x4903ADFF749C51CEULL); + skein_p[4] += vectorize(0x0D95DE399746DF03ULL); + skein_p[5] += vectorize(0x8FD1934127C79BCEULL + 0x0000000000000040ULL); + skein_p[6] += vectorize(0x8A255629FF352CB1ULL); + skein_p[7] += vectorize(0x5DB62599DF6CA7B0ULL + 6); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += 
skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[1] += vectorize(0xcab2076d98173ec4ULL); + skein_p[2] += vectorize(0x4903ADFF749C51CEULL); + skein_p[3] += vectorize(0x0D95DE399746DF03ULL); + skein_p[4] += vectorize(0x8FD1934127C79BCEULL); + skein_p[5] += vectorize(0x8A255629FF352CB1ULL); + skein_p[6] += vectorize(0x4DB62599DF6CA7F0ULL); + skein_p[7] += vectorize(0xEABE394CA9D5C3F4ULL + 7); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0xcab2076d98173ec4ULL); + skein_p[1] += vectorize(0x4903ADFF749C51CEULL); + skein_p[2] += vectorize(0x0D95DE399746DF03ULL); + skein_p[3] += vectorize(0x8FD1934127C79BCEULL); + skein_p[4] += 
vectorize(0x9A255629FF352CB1ULL); + skein_p[5] += vectorize(0x4DB62599DF6CA7F0ULL); + skein_p[6] += vectorize(0xEABE394CA9D5C434ULL); + skein_p[7] += vectorize(0x991112C71A75B52BULL); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0x4903ADFF749C51CEULL); + skein_p[1] += vectorize(0x0D95DE399746DF03ULL); + skein_p[2] += vectorize(0x8FD1934127C79BCEULL); + skein_p[3] += vectorize(0x9A255629FF352CB1ULL); + skein_p[4] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[5] += vectorize(0xEABE394CA9D5C434ULL); + skein_p[6] += vectorize(0x891112C71A75B523ULL); + skein_p[7] += vectorize(0xAE18A40B660FCC33ULL + 9); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] 
+= skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0x0D95DE399746DF03ULL); + skein_p[1] += vectorize(0x8FD1934127C79BCEULL); + skein_p[2] += vectorize(0x9A255629FF352CB1ULL); + skein_p[3] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[4] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[5] += vectorize(0x891112C71A75B523ULL); + skein_p[6] += vectorize(0x9E18A40B660FCC73ULL); + skein_p[7] += vectorize(0xcab2076d98173eceULL); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) 
^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0x8FD1934127C79BCEULL); + skein_p[1] += vectorize(0x9A255629FF352CB1ULL); + skein_p[2] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[3] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[4] += vectorize(0x991112C71A75B523ULL); + skein_p[5] += vectorize(0x9E18A40B660FCC73ULL); + skein_p[6] += vectorize(0xcab2076d98173ec4ULL + 0x0000000000000040ULL); + skein_p[7] += vectorize(0x4903ADFF749C51CEULL + 11); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + 
skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0x9A255629FF352CB1ULL); + skein_p[1] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[2] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[3] += vectorize(0x991112C71A75B523ULL); + skein_p[4] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[5] += vectorize(0xcab2076d98173ec4ULL + 0x0000000000000040ULL); + skein_p[6] += vectorize(0x3903ADFF749C51CEULL); + skein_p[7] += vectorize(0x0D95DE399746DF03ULL + 12); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += 
skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[1] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[2] += vectorize(0x991112C71A75B523ULL); + skein_p[3] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[4] += vectorize(0xcab2076d98173ec4ULL); + skein_p[5] += vectorize(0x3903ADFF749C51CEULL); + skein_p[6] += vectorize(0x0D95DE399746DF03ULL + 0xf000000000000040ULL); + skein_p[7] += vectorize(0x8FD1934127C79BCEULL + 13); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[1] += vectorize(0x991112C71A75B523ULL); + skein_p[2] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[3] += vectorize(0xcab2076d98173ec4ULL); + skein_p[4] += vectorize(0x4903ADFF749C51CEULL); + skein_p[5] += 
vectorize(0x0D95DE399746DF03ULL + 0xf000000000000040ULL); + skein_p[6] += vectorize(0x8FD1934127C79BCEULL + 0x0000000000000040ULL); + skein_p[7] += vectorize(0x9A255629FF352CB1ULL + 14); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0x991112C71A75B523ULL); + skein_p[1] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[2] += vectorize(0xcab2076d98173ec4ULL); + skein_p[3] += vectorize(0x4903ADFF749C51CEULL); + skein_p[4] += vectorize(0x0D95DE399746DF03ULL); + skein_p[5] += vectorize(0x8FD1934127C79BCEULL + 0x0000000000000040ULL); + skein_p[6] += vectorize(0x8A255629FF352CB1ULL); + skein_p[7] += vectorize(0x5DB62599DF6CA7B0ULL + 15); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 
30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[1] += vectorize(0xcab2076d98173ec4ULL); + skein_p[2] += vectorize(0x4903ADFF749C51CEULL); + skein_p[3] += vectorize(0x0D95DE399746DF03ULL); + skein_p[4] += vectorize(0x8FD1934127C79BCEULL); + skein_p[5] += vectorize(0x8A255629FF352CB1ULL); + skein_p[6] += vectorize(0x4DB62599DF6CA7F0ULL); + skein_p[7] += vectorize(0xEABE394CA9D5C3F4ULL +16ULL); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += 
skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0xcab2076d98173ec4ULL); + skein_p[1] += vectorize(0x4903ADFF749C51CEULL); + skein_p[2] += vectorize(0x0D95DE399746DF03ULL); + skein_p[3] += vectorize(0x8FD1934127C79BCEULL); + skein_p[4] += vectorize(0x9A255629FF352CB1ULL); + skein_p[5] += vectorize(0x4DB62599DF6CA7F0ULL); + skein_p[6] += vectorize(0xEABE394CA9D5C3F4ULL + 0x0000000000000040ULL); + skein_p[7] += vectorize(0x991112C71A75B523ULL + 17); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + 
skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0x4903ADFF749C51CEULL); + skein_p[1] += vectorize(0x0D95DE399746DF03ULL); + skein_p[2] += vectorize(0x8FD1934127C79BCEULL); + skein_p[3] += vectorize(0x9A255629FF352CB1ULL); + skein_p[4] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[5] += vectorize(0xEABE394CA9D5C3F4ULL + 0x0000000000000040ULL); + skein_p[6] += vectorize(0x891112C71A75B523ULL); + skein_p[7] += vectorize(0xAE18A40B660FCC33ULL + 18); + +#define h0 skein_p[0] +#define h1 skein_p[1] +#define h2 skein_p[2] +#define h3 skein_p[3] +#define h4 skein_p[4] +#define h5 skein_p[5] +#define h6 skein_p[6] +#define h7 skein_p[7] + h0 ^= h[0]; + h1 ^= h[1]; + h2 ^= h[2]; + h3 ^= h[3]; + h4 ^= h[4]; + h5 ^= h[5]; + h6 ^= h[6]; + h7 ^= h[7]; + + uint2 skein_h8 = h0 ^ h1 ^ h2 ^ h3 ^ h4 ^ h5 ^ h6 ^ h7 ^ vectorize(0x1BD11BDAA9FC1A22ULL); + + uint2 hash64[8]; + + hash64[0] = (h0); +// hash64[1] = (h1); + hash64[2] = (h2); +// hash64[3] = (h3); + hash64[4] = (h4); + hash64[5] = (h5 + vectorizelow(8ULL)); + hash64[6] = (h6 + vectorizehigh(0xff000000UL)); +// hash64[7] = (h7); + + hash64[0] += h1; + hash64[1] = ROL2(h1, 46) ^ hash64[0]; + hash64[2] += h3; + hash64[3] = ROL2(h3, 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += h7; + hash64[7] = ROL2(h7, 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + 
hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h1); + hash64[1] = (hash64[1] + h2); + hash64[2] = (hash64[2] + h3); + hash64[3] = (hash64[3] + h4); + hash64[4] = (hash64[4] + h5); + hash64[5] = (hash64[5] + h6 + vectorizehigh(0xff000000UL)); + hash64[6] = (hash64[6] + h7 + vectorize(0xff00000000000008ULL)); + hash64[7] = (hash64[7] + skein_h8 + vectorizelow(1)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += 
hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + h2); + hash64[1] = (hash64[1] + h3); + hash64[2] = (hash64[2] + h4); + hash64[3] = (hash64[3] + h5); + hash64[4] = (hash64[4] + h6); + hash64[5] = (hash64[5] + h7 + vectorize(0xff00000000000008ULL)); + hash64[6] = (hash64[6] + skein_h8 + vectorizelow(8ULL)); + hash64[7] = (hash64[7] + h0 + vectorize(2)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h3); + hash64[1] = (hash64[1] + h4); + hash64[2] = (hash64[2] + h5); + hash64[3] = (hash64[3] + h6); + hash64[4] = (hash64[4] + h7); + hash64[5] = (hash64[5] + skein_h8 + vectorizelow(8)); + hash64[6] = (hash64[6] + h0 + vectorizehigh(0xff000000UL)); + hash64[7] = (hash64[7] + h1 + vectorizelow(3)); + hash64[0] += hash64[1]; + hash64[1] = 
ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + h4); + hash64[1] = (hash64[1] + h5); + hash64[2] = (hash64[2] + h6); + hash64[3] = (hash64[3] + h7); + hash64[4] = (hash64[4] + skein_h8); + hash64[5] = (hash64[5] + h0 + vectorizehigh(0xff000000UL)); + hash64[6] = (hash64[6] + h1 + vectorize(0xff00000000000008ULL)); + hash64[7] = (hash64[7] + h2 + vectorizelow(4)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; 
+ hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h5); + hash64[1] = (hash64[1] + h6); + hash64[2] = (hash64[2] + h7); + hash64[3] = (hash64[3] + skein_h8); + hash64[4] = (hash64[4] + h0); + hash64[5] = (hash64[5] + h1 + vectorize(0xff00000000000008ULL)); + hash64[6] = (hash64[6] + h2 + vectorizelow(8ULL)); + hash64[7] = (hash64[7] + h3 + vectorizelow(5)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += 
hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + h6); + hash64[1] = (hash64[1] + h7); + hash64[2] = (hash64[2] + skein_h8); + hash64[3] = (hash64[3] + h0); + hash64[4] = (hash64[4] + h1); + hash64[5] = (hash64[5] + h2 + vectorizelow(8ULL)); + hash64[6] = (hash64[6] + h3 + vectorizehigh(0xff000000UL)); + hash64[7] = (hash64[7] + h4 + vectorizelow(6)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h7); + hash64[1] = (hash64[1] + skein_h8); + hash64[2] = (hash64[2] + h0); + hash64[3] = (hash64[3] + h1); + hash64[4] = (hash64[4] + h2); + hash64[5] = (hash64[5] + h3 + vectorizehigh(0xff000000UL)); + hash64[6] = (hash64[6] + h4 + vectorize(0xff00000000000008ULL)); + hash64[7] = (hash64[7] + h5 + vectorizelow(7)); + hash64[0] += hash64[1]; + 
hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + skein_h8); + hash64[1] = (hash64[1] + h0); + hash64[2] = (hash64[2] + h1); + hash64[3] = (hash64[3] + h2); + hash64[4] = (hash64[4] + h3); + hash64[5] = (hash64[5] + h4 + vectorize(0xff00000000000008ULL)); + hash64[6] = (hash64[6] + h5 + vectorizelow(8)); + hash64[7] = (hash64[7] + h6 + vectorizelow(8)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; 
+ hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h0); + hash64[1] = (hash64[1] + h1); + hash64[2] = (hash64[2] + h2); + hash64[3] = (hash64[3] + h3); + hash64[4] = (hash64[4] + h4); + hash64[5] = (hash64[5] + h5 + vectorizelow(8)); + hash64[6] = (hash64[6] + h6 + vectorizehigh(0xff000000UL)); + hash64[7] = (hash64[7] + h7 + vectorizelow(9)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + 
hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + + hash64[0] = (hash64[0] + h1); + hash64[1] = (hash64[1] + h2); + hash64[2] = (hash64[2] + h3); + hash64[3] = (hash64[3] + h4); + hash64[4] = (hash64[4] + h5); + hash64[5] = (hash64[5] + h6 + vectorizehigh(0xff000000UL)); + hash64[6] = (hash64[6] + h7 + vectorize(0xff00000000000008ULL)); + hash64[7] = (hash64[7] + skein_h8 + (vectorizelow(10))); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h2); + hash64[1] = (hash64[1] + h3); + hash64[2] = (hash64[2] + h4); + hash64[3] = (hash64[3] + h5); + hash64[4] = (hash64[4] + h6); + hash64[5] = (hash64[5] + h7 + vectorize(0xff00000000000008ULL)); + hash64[6] = (hash64[6] + skein_h8 + vectorizelow(8ULL)); + hash64[7] = (hash64[7] + h0 + vectorizelow(11)); + hash64[0] += hash64[1]; + 
hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + h3); + hash64[1] = (hash64[1] + h4); + hash64[2] = (hash64[2] + h5); + hash64[3] = (hash64[3] + h6); + hash64[4] = (hash64[4] + h7); + hash64[5] = (hash64[5] + skein_h8 + vectorizelow(8)); + hash64[6] = (hash64[6] + h0 + vectorizehigh(0xff000000UL)); + hash64[7] = (hash64[7] + h1 + vectorizelow(12)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + 
hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h4); + hash64[1] = (hash64[1] + h5); + hash64[2] = (hash64[2] + h6); + hash64[3] = (hash64[3] + h7); + hash64[4] = (hash64[4] + skein_h8); + hash64[5] = (hash64[5] + h0 + vectorizehigh(0xff000000UL)); + hash64[6] = (hash64[6] + h1 + vectorize(0xff00000000000008ULL)); + hash64[7] = (hash64[7] + h2 + vectorizelow(13)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += 
hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + h5); + hash64[1] = (hash64[1] + h6); + hash64[2] = (hash64[2] + h7); + hash64[3] = (hash64[3] + skein_h8); + hash64[4] = (hash64[4] + h0); + hash64[5] = (hash64[5] + h1 + vectorize(0xff00000000000008ULL)); + hash64[6] = (hash64[6] + h2 + vectorizelow(8ULL)); + hash64[7] = (hash64[7] + h3 + vectorizelow(14)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h6); + hash64[1] = (hash64[1] + h7); + hash64[2] = (hash64[2] + skein_h8); + hash64[3] = (hash64[3] + h0); + hash64[4] = (hash64[4] + h1); + hash64[5] = (hash64[5] + h2 + vectorizelow(8ULL)); + hash64[6] = (hash64[6] + h3 + vectorizehigh(0xff000000UL)); + hash64[7] = (hash64[7] + h4 + vectorizelow(15)); + hash64[0] += hash64[1]; + hash64[1] = 
ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + h7); + hash64[1] = (hash64[1] + skein_h8); + hash64[2] = (hash64[2] + h0); + hash64[3] = (hash64[3] + h1); + hash64[4] = (hash64[4] + h2); + hash64[5] = (hash64[5] + h3 + vectorizehigh(0xff000000UL)); + hash64[6] = (hash64[6] + h4 + vectorize(0xff00000000000008ULL)); + hash64[7] = (hash64[7] + h5 + vectorizelow(16)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; 
+ hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + skein_h8); + hash64[1] = (hash64[1] + h0); + hash64[2] = (hash64[2] + h1); + hash64[3] = (hash64[3] + h2); + hash64[4] = (hash64[4] + h3); + hash64[5] = (hash64[5] + h4 + vectorize(0xff00000000000008ULL)); + hash64[6] = (hash64[6] + h5 + vectorizelow(8ULL)); + hash64[7] = (hash64[7] + h6 + vectorizelow(17)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += 
hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + + Hash[0] = devectorize(hash64[0] + h0); + Hash[1] = devectorize(hash64[1] + h1); + Hash[2] = devectorize(hash64[2] + h2); + Hash[3] = devectorize(hash64[3] + h3); + Hash[4] = devectorize(hash64[4] + h4); + Hash[5] = devectorize(hash64[5] + h5)+ 8; + Hash[6] = devectorize(hash64[6] + h6)+ 0xff00000000000000ULL; + Hash[7] = devectorize(hash64[7] + h7)+ 18; + +#undef h0 +#undef h1 +#undef h2 +#undef h3 +#undef h4 +#undef h5 +#undef h6 +#undef h7 } } +//#else +//__launch_bounds__(128, 10) +//#endif -__global__ +__global__ #if __CUDA_ARCH__ > 500 -__launch_bounds__(TPBf, 2) -#else -__launch_bounds__(TPBf, 1) +__launch_bounds__(448, 2) #endif void quark_skein512_gpu_hash_64_final(const uint32_t threads, const uint32_t startNounce, uint64_t * const __restrict__ g_hash, const uint32_t *g_nonceVector, uint32_t *d_nonce, uint32_t target) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { // Skein @@ -402,10 +1917,10 @@ void quark_skein512_gpu_hash_64_final(const uint32_t threads, const uint32_t sta uint2 h0, h1, h2, h3, h4, h5, h6, h7, h8; uint2 t0, t1, t2; - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + const uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; - uint64_t *inpHash = &g_hash[8 * hashPosition]; + const uint32_t hashPosition = nounce - startNounce; + const uint64_t *const inpHash = &g_hash[8 * hashPosition]; h0 = make_uint2(0x749C51CEull, 0x4903ADFF); h1 = make_uint2(0x9746DF03ull, 0x0D95DE39); @@ -416,7 +1931,7 @@ void quark_skein512_gpu_hash_64_final(const uint32_t threads, const uint32_t sta h6 = make_uint2(0x1A75B523ull, 0x991112C7); h7 = make_uint2(0x660FCC33ull, 0xAE18A40B); - // 1. 
Runde -> etype = 480, ptr = 64, bcount = 0, data = msg + // 1. Runde -> etype = 480, ptr = 64, bcount = 0, data = msg #pragma unroll 8 for (int i = 0; i<8; i++) p[i] = vectorize(inpHash[i]); @@ -507,7 +2022,7 @@ __host__ void quark_skein512_cpu_init(int thr_id) cudaMalloc(&d_nonce[thr_id], 2*sizeof(uint32_t)); } -__host__ void quark_skein512_setTarget(const void *ptarget) +__host__ void quark_skein512_setTarget(int thr_id, const void *ptarget) { } __host__ void quark_skein512_cpu_free(int32_t thr_id) @@ -515,27 +2030,784 @@ __host__ void quark_skein512_cpu_free(int32_t thr_id) cudaFreeHost(&d_nonce[thr_id]); } -__host__ -void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) + +/* Elementary functions used by SHA256 */ +#define SWAB32(x) cuda_swab32(x) + +#define R(x, n) ((x) >> (n)) +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22)) +#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25)) +#define s0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ R(x, 3)) +#define s1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ R(x, 10)) + +__constant__ uint32_t sha256_endingTable[] = { + 0x80000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000200, + 0x80000000, 0x01400000, 0x00205000, 0x00005088, 0x22000800, 0x22550014, 0x05089742, 0xa0000020, + 0x5a880000, 0x005c9400, 0x0016d49d, 0xfa801f00, 0xd33225d0, 0x11675959, 0xf6e6bfda, 0xb30c1549, + 0x08b2b050, 0x9d7c4c27, 0x0ce2a393, 0x88e6e1ea, 0xa52b4335, 0x67a16f49, 0xd732016f, 0x4eeb2e91, + 0x5dbf55e5, 0x8eee2335, 0xe2bc5ec2, 0xa83f4394, 0x45ad78f7, 0x36f3d0cd, 0xd99c05e8, 0xb0511dc7, + 0x69bc7ac4, 0xbd11375b, 0xe3ba71e5, 0x3b209ff2, 0x18feee17, 0xe25ad9e7, 0x13375046, 0x0515089d, + 0x4f0d0f04, 0x2627484e, 0x310128d2, 0xc668b434, 
0x420841cc, 0x62d311b8, 0xe59ba771, 0x85a7a484 + }; + +__constant__ uint32_t sha256_constantTable[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +__global__ +__launch_bounds__(TPB52) +void skein512_gpu_hash_80_52(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ d_nonce, uint64_t target, int thr_id) { - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + TPB-1)/TPB); - dim3 block(TPB); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) + { + uint2 h8; + uint2 p[8]; + + uint2 h0 = precalcvalues[0]; + uint2 h1 = precalcvalues[1]; + uint2 h2 = precalcvalues[2]; + uint2 h3 = precalcvalues[3]; + uint2 h4 = precalcvalues[4]; + uint2 h5 = precalcvalues[5]; + uint2 h6 = precalcvalues[6]; + uint2 h7 = precalcvalues[7]; + uint2 t2 = precalcvalues[8]; + + const uint2 nounce2 = make_uint2(_LOWORD(c_PaddedMessage80[1]), cuda_swab32(startNounce + thread)); + + uint2 t0 = vectorizelow(0x50ull); // SPH_T64(bcount << 6) + (sph_u64)(extra); + uint2 t1 = vectorizehigh(0xB0000000ul); // (bcount >> 58) + ((sph_u64)(etype) << 55); + h8 = h0 ^ h1 ^ h2 ^ h3 ^ h4 ^ h5 ^ h6 ^ h7 ^ make_uint2(0xA9FC1A22UL, 0x1BD11BDA); + t2 = t0 ^ t1; + + p[0] = h0 + 
vectorize(c_PaddedMessage80[0]); + p[1] = h1 + nounce2; + p[2] = h2; + p[3] = h3; + p[4] = h4; + p[5] = h5 + t0; + p[6] = h6 + t1; + p[7] = h7; + + TFBIG_MIX8_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); + TFBIG_MIX8_UI2(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); + TFBIG_MIX8_UI2(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); + TFBIG_MIX8_UI2(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); + + TFBIG_4o_UI2(1); + TFBIG_4e_UI2(2); + TFBIG_4o_UI2(3); + TFBIG_4e_UI2(4); + TFBIG_4o_UI2(5); + TFBIG_4e_UI2(6); + TFBIG_4o_UI2(7); + TFBIG_4e_UI2(8); + TFBIG_4o_UI2(9); + TFBIG_4e_UI2(10); + TFBIG_4o_UI2(11); + TFBIG_4e_UI2(12); + TFBIG_4o_UI2(13); + TFBIG_4e_UI2(14); + TFBIG_4o_UI2(15); + TFBIG_4e_UI2(16); + TFBIG_4o_UI2(17); + TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); + + + t0 = vectorizelow(8); // extra + t1 = vectorizehigh(0xFF000000ul); // etype + + h0 = vectorize(c_PaddedMessage80[0]) ^ p[0]; + h1 = nounce2 ^ p[1]; + h2 = p[2]; + h3 = p[3]; + h4 = p[4]; + h5 = p[5]; + h6 = p[6]; + h7 = p[7]; + + h8 = h0 ^ h1 ^ p[2] ^ p[3] ^ p[4] ^ p[5] ^ p[6] ^ p[7] ^ vectorize(0x1BD11BDAA9FC1A22); + t2 = vectorize(0xFF00000000000008ull); + + // p[8] = { 0 }; + #pragma unroll 8 + for (int i = 0; i<8; i++) + p[i] = make_uint2(0, 0); - quark_skein512_gpu_hash_64 << > >(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); -// MyStreamSynchronize(NULL, order, thr_id); + TFBIG_4e_UI2(0); + TFBIG_4o_UI2(1); + TFBIG_4e_UI2(2); + TFBIG_4o_UI2(3); + TFBIG_4e_UI2(4); + TFBIG_4o_UI2(5); + TFBIG_4e_UI2(6); + TFBIG_4o_UI2(7); + TFBIG_4e_UI2(8); + TFBIG_4o_UI2(9); + TFBIG_4e_UI2(10); + TFBIG_4o_UI2(11); + TFBIG_4e_UI2(12); + TFBIG_4o_UI2(13); + TFBIG_4e_UI2(14); + TFBIG_4o_UI2(15); + TFBIG_4e_UI2(16); + TFBIG_4o_UI2(17); + TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); + + uint32_t *message = (uint32_t *)p; + + uint32_t W1[16]; + uint32_t W2[16]; + + 
uint32_t regs[8] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + }; + uint32_t hash[8] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + }; + +#pragma unroll 16 + for (int k = 0; k<16; k++) + W1[k] = SWAB32(message[k]); + + // Progress W1 +#pragma unroll 16 + for (int j = 0; j<16; j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j] + W1[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; + } + + // Progress W2...W3 + + ////// PART 1 +#pragma unroll 2 + for (int j = 0; j<2; j++) + W2[j] = s1(W1[14 + j]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; +#pragma unroll 5 + for (int j = 2; j<7; j++) + W2[j] = s1(W2[j - 2]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; + +#pragma unroll 8 + for (int j = 7; j<15; j++) + W2[j] = s1(W2[j - 2]) + W2[j - 7] + s0(W1[1 + j]) + W1[j]; + + W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; + + // Round function +#pragma unroll 16 + for (int j = 0; j<16; j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 16] + W2[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; + } + + ////// PART 2 +#pragma unroll 2 + for (int j = 0; j<2; j++) + W1[j] = s1(W2[14 + j]) + W2[9 + j] + s0(W2[1 + j]) + W2[j]; + +#pragma unroll 5 + for (int j = 2; j<7; j++) + W1[j] = s1(W1[j - 2]) + W2[9 + j] + s0(W2[1 + j]) + W2[j]; + +#pragma unroll 8 + for (int j = 7; j<15; j++) + W1[j] = s1(W1[j - 2]) + W1[j - 7] + s0(W2[1 + j]) + W2[j]; + + W1[15] = s1(W1[13]) + W1[8] + s0(W1[0]) + W2[15]; + + // 
Round function +#pragma unroll 16 + for (int j = 0; j<16; j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 32] + W1[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; + } + + ////// PART 3 +#pragma unroll 2 + for (int j = 0; j<2; j++) + W2[j] = s1(W1[14 + j]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; + +#pragma unroll 5 + for (int j = 2; j<7; j++) + W2[j] = s1(W2[j - 2]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; + +#pragma unroll 8 + for (int j = 7; j<15; j++) + W2[j] = s1(W2[j - 2]) + W2[j - 7] + s0(W1[1 + j]) + W1[j]; + + W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; + + // Round function +#pragma unroll 16 + for (int j = 0; j<16; j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 48] + W2[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; + } + +#pragma unroll 8 + for (int k = 0; k<8; k++) + hash[k] += regs[k]; + + ///// + ///// Second Pass (ending) + ///// +#pragma unroll 8 + for (int k = 0; k<8; k++) + regs[k] = hash[k]; + + // Progress W1 + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[0] + sha256_endingTable[0]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; +#pragma unroll + for(int j = 1; j<15; j++) + { + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + 
regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; + } +#pragma unroll + for (int j = 15; j<56; j++) + { + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j] + sha256_endingTable[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; + } + + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[56] + sha256_endingTable[56]; + regs[7] = T1 + S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + regs[3] += T1; + + T1 = regs[6] + S1(regs[3]) + Ch(regs[3], regs[4], regs[5]) + sha256_constantTable[57] + sha256_endingTable[57]; + regs[6] = T1 + S0(regs[7]) + Maj(regs[7], regs[0], regs[1]); + regs[2] += T1; + //************ + regs[1] += regs[5] + S1(regs[2]) + Ch(regs[2], regs[3], regs[4]) + sha256_constantTable[58] + sha256_endingTable[58]; + regs[0] += regs[4] + S1(regs[1]) + Ch(regs[1], regs[2], regs[3]) + sha256_constantTable[59] + sha256_endingTable[59]; + regs[7] += regs[3] + S1(regs[0]) + Ch(regs[0], regs[1], regs[2]) + sha256_constantTable[60] + sha256_endingTable[60]; + regs[6] += regs[2] + S1(regs[7]) + Ch(regs[7], regs[0], regs[1]) + sha256_constantTable[61] + sha256_endingTable[61]; + + uint64_t test = SWAB32(hash[7] + regs[7]); + test <<= 32; + test |= SWAB32(hash[6] + regs[6]); + if (test <= target) + { + uint32_t tmp = atomicExch(&(d_nonce[0]), startNounce + thread); + if (tmp != 0xffffffff) + d_nonce[1] = tmp; + } + } +} + +__global__ +__launch_bounds__(TPB50) +void skein512_gpu_hash_80_50(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ d_nonce, uint64_t target, int thr_id) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + // if (thread < threads) + { + uint2 h8; + 
uint2 p[8]; + + uint2 h0 = precalcvalues[0]; + uint2 h1 = precalcvalues[1]; + uint2 h2 = precalcvalues[2]; + uint2 h3 = precalcvalues[3]; + uint2 h4 = precalcvalues[4]; + uint2 h5 = precalcvalues[5]; + uint2 h6 = precalcvalues[6]; + uint2 h7 = precalcvalues[7]; + uint2 t2 = precalcvalues[8]; + + const uint2 nounce2 = make_uint2(_LOWORD(c_PaddedMessage80[1]), cuda_swab32(startNounce + thread)); + + uint2 t0 = vectorizelow(0x50ull); // SPH_T64(bcount << 6) + (sph_u64)(extra); + uint2 t1 = vectorizehigh(0xB0000000ul); // (bcount >> 58) + ((sph_u64)(etype) << 55); + h8 = h0 ^ h1 ^ h2 ^ h3 ^ h4 ^ h5 ^ h6 ^ h7 ^ make_uint2(0xA9FC1A22UL, 0x1BD11BDA); + t2 = t0 ^ t1; + + p[0] = h0 + vectorize(c_PaddedMessage80[0]); + p[1] = h1 + nounce2; + p[2] = h2; + p[3] = h3; + p[4] = h4; + p[5] = h5 + t0; + p[6] = h6 + t1; + p[7] = h7; + + TFBIG_MIX8_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); + TFBIG_MIX8_UI2(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); + TFBIG_MIX8_UI2(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); + TFBIG_MIX8_UI2(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); + + TFBIG_4o_UI2(1); + TFBIG_4e_UI2(2); + TFBIG_4o_UI2(3); + TFBIG_4e_UI2(4); + TFBIG_4o_UI2(5); + TFBIG_4e_UI2(6); + TFBIG_4o_UI2(7); + TFBIG_4e_UI2(8); + TFBIG_4o_UI2(9); + TFBIG_4e_UI2(10); + TFBIG_4o_UI2(11); + TFBIG_4e_UI2(12); + TFBIG_4o_UI2(13); + TFBIG_4e_UI2(14); + TFBIG_4o_UI2(15); + TFBIG_4e_UI2(16); + TFBIG_4o_UI2(17); + TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); + + + t0 = vectorizelow(8); // extra + t1 = vectorizehigh(0xFF000000ul); // etype + + h0 = vectorize(c_PaddedMessage80[0]) ^ p[0]; + h1 = nounce2 ^ p[1]; + h2 = p[2]; + h3 = p[3]; + h4 = p[4]; + h5 = p[5]; + h6 = p[6]; + h7 = p[7]; + + h8 = h0 ^ h1 ^ p[2] ^ p[3] ^ p[4] ^ p[5] ^ p[6] ^ p[7] ^ vectorize(0x1BD11BDAA9FC1A22); + t2 = vectorize(0xFF00000000000008ull); + + // p[8] = { 0 }; +#pragma unroll 8 + for (int i = 0; i<8; i++) 
+ p[i] = make_uint2(0, 0); + + TFBIG_4e_UI2(0); + TFBIG_4o_UI2(1); + TFBIG_4e_UI2(2); + TFBIG_4o_UI2(3); + TFBIG_4e_UI2(4); + TFBIG_4o_UI2(5); + TFBIG_4e_UI2(6); + TFBIG_4o_UI2(7); + TFBIG_4e_UI2(8); + TFBIG_4o_UI2(9); + TFBIG_4e_UI2(10); + TFBIG_4o_UI2(11); + TFBIG_4e_UI2(12); + TFBIG_4o_UI2(13); + TFBIG_4e_UI2(14); + TFBIG_4o_UI2(15); + TFBIG_4e_UI2(16); + TFBIG_4o_UI2(17); + TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); + + uint32_t *message = (uint32_t *)p; + + uint32_t W1[16]; + uint32_t W2[16]; + + uint32_t regs[8] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + };; + uint32_t hash[8] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + }; + +#pragma unroll 16 + for (int k = 0; k<16; k++) + W1[k] = SWAB32(message[k]); + + // Progress W1 +#pragma unroll 16 + for (int j = 0; j<16; j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j] + W1[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; + } + + // Progress W2...W3 + + ////// PART 1 +#pragma unroll 2 + for (int j = 0; j<2; j++) + W2[j] = s1(W1[14 + j]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; +#pragma unroll 5 + for (int j = 2; j<7; j++) + W2[j] = s1(W2[j - 2]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; + +#pragma unroll 8 + for (int j = 7; j<15; j++) + W2[j] = s1(W2[j - 2]) + W2[j - 7] + s0(W1[1 + j]) + W1[j]; + + W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; + + // Round function +#pragma unroll 16 + for (int j = 0; j<16; j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 16] + W2[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + regs[7] = regs[6]; + regs[6] = regs[5]; 
+ regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; + } + + ////// PART 2 +#pragma unroll 2 + for (int j = 0; j<2; j++) + W1[j] = s1(W2[14 + j]) + W2[9 + j] + s0(W2[1 + j]) + W2[j]; + +#pragma unroll 5 + for (int j = 2; j<7; j++) + W1[j] = s1(W1[j - 2]) + W2[9 + j] + s0(W2[1 + j]) + W2[j]; + +#pragma unroll 8 + for (int j = 7; j<15; j++) + W1[j] = s1(W1[j - 2]) + W1[j - 7] + s0(W2[1 + j]) + W2[j]; + + W1[15] = s1(W1[13]) + W1[8] + s0(W1[0]) + W2[15]; + + // Round function +#pragma unroll 16 + for (int j = 0; j<16; j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 32] + W1[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; + } + + ////// PART 3 +#pragma unroll 2 + for (int j = 0; j<2; j++) + W2[j] = s1(W1[14 + j]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; + +#pragma unroll 5 + for (int j = 2; j<7; j++) + W2[j] = s1(W2[j - 2]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; + +#pragma unroll 8 + for (int j = 7; j<15; j++) + W2[j] = s1(W2[j - 2]) + W2[j - 7] + s0(W1[1 + j]) + W1[j]; + + W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; + + // Round function +#pragma unroll 16 + for (int j = 0; j<16; j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 48] + W2[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; + } + +#pragma unroll 8 + for (int k = 0; k<8; k++) + hash[k] += regs[k]; + + ///// + ///// Second Pass (ending) + ///// +#pragma unroll 8 + for (int k = 0; k<8; k++) + regs[k] = hash[k]; + + // Progress W1 + uint32_t 
T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[0] + sha256_endingTable[0]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; +#pragma unroll + for(int j = 1; j<15; j++) + { + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; + } +#pragma unroll + for(int j = 15; j<56; j++) + { + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j] + sha256_endingTable[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; + } + + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[56] + sha256_endingTable[56]; + regs[7] = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]) + T1; + regs[3] += T1; + + T1 = regs[6] + S1(regs[3]) + Ch(regs[3], regs[4], regs[5]) + sha256_constantTable[57] + sha256_endingTable[57]; + T2 = S0(regs[7]) + Maj(regs[7], regs[0], regs[1]); + regs[6] = S0(regs[7]) + Maj(regs[7], regs[0], regs[1]) + T1; + regs[2] += T1; + //************ + regs[1] += regs[5] + S1(regs[2]) + Ch(regs[2], regs[3], regs[4]) + sha256_constantTable[58] + sha256_endingTable[58]; + regs[0] += regs[4] + S1(regs[1]) + Ch(regs[1], regs[2], regs[3]) + sha256_constantTable[59] + sha256_endingTable[59]; + regs[7] += regs[3] + S1(regs[0]) + Ch(regs[0], regs[1], regs[2]) + sha256_constantTable[60] + sha256_endingTable[60]; + regs[6] += regs[2] + S1(regs[7]) + Ch(regs[7], regs[0], regs[1]) + 
sha256_constantTable[61] + sha256_endingTable[61]; + + uint64_t test = SWAB32(hash[7] + regs[7]); + test <<= 32; + test|= SWAB32(hash[6] + regs[6]); + if (test <= target) + { + uint32_t tmp = atomicExch(&(d_nonce[0]), startNounce + thread); + if (tmp != 0xffffffff) + d_nonce[1] = tmp; + } + } } +__host__ +void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash) +{ + int t = 128; + if (device_sm[device_map[thr_id]] > 500) + { + if (cuda_arch[thr_id]>500) + t = 448; + } + dim3 grid((threads + t - 1) / t); + dim3 block(t); + quark_skein512_gpu_hash_64 << >>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + CUDA_SAFE_CALL(cudaGetLastError()); +} __host__ -void quark_skein512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, uint32_t *h_nonce, uint32_t target, int order) +void quark_skein512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, uint32_t *h_nonce, uint32_t target) { - dim3 grid((threads + TPBf - 1) / TPBf); - dim3 block(TPBf); + const int tp = 128; + dim3 grid((threads + tp - 1) / tp); + dim3 block(tp); + + CUDA_SAFE_CALL(cudaMemsetAsync(d_nonce[thr_id], 0xff, 2 * sizeof(uint32_t), gpustream[thr_id])); - cudaMemset(d_nonce[thr_id], 0xff, 2*sizeof(uint32_t)); + quark_skein512_gpu_hash_64_final <<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_nonce[thr_id], target); - quark_skein512_gpu_hash_64_final<< > >(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_nonce[thr_id], target); - cudaMemcpy(h_nonce, d_nonce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost); + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + CUDA_SAFE_CALL(cudaMemcpy(h_nonce, d_nonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); } +static void precalc(int thr_id, uint64_t *PaddedMessage) +{ + uint64_t h0, h1, h2, h3, h4, h5, h6, h7, h8; + uint64_t t0, t1, t2; + + 
h0 = 0x4903ADFF749C51CEull; + h1 = 0x0D95DE399746DF03ull; + h2 = 0x8FD1934127C79BCEull; + h3 = 0x9A255629FF352CB1ull; + h4 = 0x5DB62599DF6CA7B0ull; + h5 = 0xEABE394CA9D5C3F4ull; + h6 = 0x991112C71A75B523ull; + h7 = 0xAE18A40B660FCC33ull; + h8 = h0 ^ h1 ^ h2 ^ h3 ^ h4 ^ h5 ^ h6 ^ h7 ^ SPH_C64(0x1BD11BDAA9FC1A22); + + t0 = 64; // ptr + t1 = 0x7000000000000000ull; + t2 = 0x7000000000000040ull; + + uint64_t p[8]; + for (int i = 0; i<8; i++) + p[i] = PaddedMessage[i]; + + TFBIG_4e_PRE(0); + TFBIG_4o_PRE(1); + TFBIG_4e_PRE(2); + TFBIG_4o_PRE(3); + TFBIG_4e_PRE(4); + TFBIG_4o_PRE(5); + TFBIG_4e_PRE(6); + TFBIG_4o_PRE(7); + TFBIG_4e_PRE(8); + TFBIG_4o_PRE(9); + TFBIG_4e_PRE(10); + TFBIG_4o_PRE(11); + TFBIG_4e_PRE(12); + TFBIG_4o_PRE(13); + TFBIG_4e_PRE(14); + TFBIG_4o_PRE(15); + TFBIG_4e_PRE(16); + TFBIG_4o_PRE(17); + TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); + + uint64_t buffer[9]; + + buffer[0] = PaddedMessage[0] ^ p[0]; + buffer[1] = PaddedMessage[1] ^ p[1]; + buffer[2] = PaddedMessage[2] ^ p[2]; + buffer[3] = PaddedMessage[3] ^ p[3]; + buffer[4] = PaddedMessage[4] ^ p[4]; + buffer[5] = PaddedMessage[5] ^ p[5]; + buffer[6] = PaddedMessage[6] ^ p[6]; + buffer[7] = PaddedMessage[7] ^ p[7]; + buffer[8] = t2; + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(precalcvalues, buffer, sizeof(buffer), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); +} + +__host__ +void skein512_cpu_setBlock_80(int thr_id, void *pdata) +{ + uint64_t *PaddedMessage = (uint64_t*)pdata; + CUDA_SAFE_CALL(cudaMalloc(&(d_nonce[thr_id]), 2 * sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(c_PaddedMessage80, &PaddedMessage[8], 8 * 2, 0, cudaMemcpyHostToDevice, gpustream[thr_id])); + CUDA_SAFE_CALL(cudaMemsetAsync(d_nonce[thr_id], 0xff, 2 * sizeof(uint32_t), gpustream[thr_id])); + precalc(thr_id, PaddedMessage); +} + +__host__ +void skein512_cpu_hash_80_52(int thr_id, uint32_t threads, uint32_t startNounce, int swapu,uint64_t target, uint32_t *h_found) +{ + dim3 
grid((threads + TPB52 - 1) / TPB52); + dim3 block(TPB52); + skein512_gpu_hash_80_52 << < grid, block, 0, gpustream[thr_id]>>> (threads, startNounce, d_nonce[thr_id], target, thr_id); + CUDA_SAFE_CALL(cudaMemcpyAsync(h_found, d_nonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id])); + CUDA_SAFE_CALL(cudaStreamSynchronize(gpustream[thr_id])); +} + +__host__ +void skein512_cpu_hash_80_50(int thr_id, uint32_t threads, uint32_t startNounce, int swapu, uint64_t target, uint32_t *h_found) +{ + dim3 grid((threads + TPB50 - 1) / TPB50); + dim3 block(TPB50); + skein512_gpu_hash_80_50 << < grid, block, 0, gpustream[thr_id]>>> (threads, startNounce, d_nonce[thr_id], target, thr_id); + CUDA_SAFE_CALL(cudaMemcpyAsync(h_found, d_nonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id])); + CUDA_SAFE_CALL(cudaStreamSynchronize(gpustream[thr_id])); +} diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu index 17e86d88be..c63d11fb70 100644 --- a/quark/quarkcoin.cu +++ b/quark/quarkcoin.cu @@ -9,52 +9,45 @@ extern "C" } #include "miner.h" - #include "cuda_helper.h" -static uint32_t *d_hash[MAX_GPUS]; - -// Speicher zur Generierung der Noncevektoren für die bedingten Hashes -static uint32_t *d_quarkNonces[MAX_GPUS]; -static uint32_t *d_branch1Nonces[MAX_GPUS]; -static uint32_t *d_branch2Nonces[MAX_GPUS]; -static uint32_t *d_branch3Nonces[MAX_GPUS]; - -extern void quark_blake512_cpu_setBlock_80(void *pdata); -extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); -extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_blake512_cpu_init(int thr_id); +extern void quark_blake512_cpu_setBlock_80(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_setBlock_80_multi(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t 
threads, uint32_t startNounce, uint32_t *d_hash); +extern void quark_blake512_cpu_hash_80_multi(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads); -extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); +extern void quark_bmw512_cpu_hash_64_quark(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); -extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_skein512_cpu_init(int thr_id); -extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_keccak512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, 
uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_jh512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_jh512_cpu_init(int thr_id, uint32_t threads); +extern void quark_keccak512_cpu_init(int thr_id); +extern void quark_keccakskein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); +extern void quark_keccak512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, uint32_t target, uint32_t *h_found); +extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); +extern void quark_jh512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, uint32_t target, uint32_t *h_found); +extern void quark_jh512_cpu_init(int thr_id); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads); -extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, +extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *inpHashes, const uint32_t *d_validNonceTable, uint32_t *d_nonces1, uint32_t *nrm1, - uint32_t *d_nonces2, uint32_t *nrm2, - int order); + uint32_t *d_nonces2, uint32_t *nrm2); extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, - uint32_t *d_nonces1, uint32_t *nrm1, - int order); + uint32_t *d_nonces1, uint32_t *nrm1); -extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); -extern void cuda_check_quarkcoin(int thr_id, uint32_t threads, uint32_t 
startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order, uint32_t *foundnonces); +extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash); +extern void cuda_check_quarkcoin(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, uint32_t *foundnonces); // Original Quarkhash Funktion aus einem miner Quelltext -extern "C" void quarkhash(void *state, const void *input) +void quarkhash(void *state, const void *input) { sph_blake512_context ctx_blake; sph_bmw512_context ctx_bmw; @@ -131,138 +124,172 @@ extern "C" void quarkhash(void *state, const void *input) memcpy(state, hash, 32); } -static bool init[MAX_GPUS] = { 0 }; - -extern "C" int scanhash_quark(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_quark(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { const uint32_t first_nonce = pdata[19]; - uint32_t throughput = device_intensity(thr_id, __func__, 1 << 20); // 256*4096 - throughput = min(throughput, max_nonce - first_nonce); + uint32_t intensity = 1 << 22; + intensity = intensity + ((1 << 22)*9/10); + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, intensity); // 256*4096 + uint32_t throughput = min(throughputmax, max_nonce - first_nonce); if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0xf; + ptarget[7] = 0x0000003f; + + static THREAD uint32_t *foundnonces = nullptr; + static THREAD uint32_t *d_hash = nullptr; + static THREAD uint32_t *d_branch1Nonces = nullptr; + static THREAD uint32_t *d_branch2Nonces = nullptr; + static THREAD uint32_t *d_branch3Nonces = nullptr; - if (!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); 
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + get_cuda_arch(&cuda_arch[thr_id]); +// } - // Konstanten kopieren, Speicher belegen - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput)); - - quark_groestl512_cpu_init(thr_id, throughput); - quark_skein512_cpu_init(thr_id); - quark_bmw512_cpu_init(thr_id, throughput); - cuda_check_cpu_init(thr_id, throughput); - quark_compactTest_cpu_init(thr_id, throughput); - - cudaMalloc(&d_quarkNonces[thr_id], sizeof(uint32_t)*throughput); - cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput); - cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput); - cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput); - - quark_jh512_cpu_init(thr_id, throughput); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (16 * sizeof(uint32_t))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif - init[thr_id] = true; + // Konstanten kopieren, Speicher belegen + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * sizeof(uint32_t) * throughputmax)); + CUDA_SAFE_CALL(cudaMallocHost(&foundnonces, 4 * 4)); +// CUDA_SAFE_CALL(cudaMalloc(&d_branch1Nonces, sizeof(uint32_t)*throughput)); +// CUDA_SAFE_CALL(cudaMalloc(&d_branch2Nonces, sizeof(uint32_t)*throughput)); + uint32_t noncebuffersize = throughputmax * 7 / 10; + uint32_t noncebuffersize2 = (throughputmax * 7 / 10)*7/10; + + CUDA_SAFE_CALL(cudaMalloc(&d_branch1Nonces, sizeof(uint32_t)*noncebuffersize2)); + CUDA_SAFE_CALL(cudaMalloc(&d_branch2Nonces, sizeof(uint32_t)*noncebuffersize2)); + CUDA_SAFE_CALL(cudaMalloc(&d_branch3Nonces, sizeof(uint32_t)*noncebuffersize)); + quark_blake512_cpu_init(thr_id); + quark_compactTest_cpu_init(thr_id, throughputmax); + quark_keccak512_cpu_init(thr_id); 
+ quark_jh512_cpu_init(thr_id); + CUDA_SAFE_CALL(cudaGetLastError()); + mining_has_stopped[thr_id] = false; + init = true; } uint32_t endiandata[20]; for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); - cuda_check_cpu_setTarget(ptarget); - quark_blake512_cpu_setBlock_80((void*)endiandata); + be32enc(&endiandata[k], pdata[k]); + quark_blake512_cpu_setBlock_80(thr_id, (uint64_t *)endiandata); do { - int order = 0; uint32_t nrm1 = 0, nrm2 = 0, nrm3 = 0; - // erstes Blake512 Hash mit CUDA - quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - - // das ist der unbedingte Branch für BMW512 - quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); + quark_bmw512_cpu_hash_64_quark(thr_id, throughput, pdata[19], NULL, d_hash); - quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL, - d_branch3Nonces[thr_id], &nrm3, - order++); + quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash, NULL, + d_branch3Nonces, &nrm3); // nur den Skein Branch weiterverfolgen - quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces, d_hash); // das ist der unbedingte Branch für Groestl512 - quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces, d_hash); // das ist der unbedingte Branch für JH512 - quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces, d_hash); // quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8) - quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], 
d_branch3Nonces[thr_id], - d_branch1Nonces[thr_id], &nrm1, - d_branch2Nonces[thr_id], &nrm2, - order++); + quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash, d_branch3Nonces, + d_branch1Nonces, &nrm1, + d_branch2Nonces, &nrm2); // das ist der bedingte Branch für Blake512 - quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); + quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces, d_hash); // das ist der bedingte Branch für Bmw512 - quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); + quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces, d_hash); - // das ist der unbedingte Branch für Keccak512 - quark_keccak512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Skein512 - quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + quark_keccakskein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces, d_hash); // quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8) - quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], - d_branch1Nonces[thr_id], &nrm1, - d_branch2Nonces[thr_id], &nrm2, - order++); + quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash, d_branch3Nonces, + d_branch1Nonces, &nrm1, + d_branch3Nonces, &nrm2); + + quark_keccak512_cpu_hash_64_final(thr_id, nrm1, pdata[19], d_branch1Nonces, d_hash, ptarget[7], foundnonces); + quark_jh512_cpu_hash_64_final(thr_id, nrm2, pdata[19], d_branch3Nonces, d_hash, ptarget[7], foundnonces+2); + CUDA_SAFE_CALL(cudaStreamSynchronize(gpustream[thr_id])); + if(foundnonces[0] == 0xffffffff) + { + foundnonces[0] = foundnonces[2]; + foundnonces[1] = foundnonces[3]; + } + else + { + if(foundnonces[1] == 0xffffffff) + foundnonces[1] = foundnonces[2]; + } - // das ist der bedingte Branch für 
Keccak512 - quark_keccak512_cpu_hash_64_final(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); - quark_jh512_cpu_hash_64_final(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); + if(stop_mining) + { + mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr); + } - uint32_t foundnonces[2]; - cuda_check_quarkcoin(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++, foundnonces); if (foundnonces[0] != 0xffffffff) { const uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; - be32enc(&endiandata[19], foundnonces[0]); + uint32_t vhash64[8]={0}; + if(opt_verify){ be32enc(&endiandata[19], foundnonces[0]); quarkhash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { int res = 1; *hashes_done = pdata[19] - first_nonce + throughput; + if(opt_benchmark) applog(LOG_INFO, "GPU #%d: Found nonce $%08X", device_map[thr_id], foundnonces[0]); // check if there was some other ones... 
if (foundnonces[1] != 0xffffffff) { - pdata[21] = foundnonces[1]; - res++; - if (opt_benchmark) applog(LOG_INFO, "GPU #%d: Found second nonce $%08X", thr_id, foundnonces[1]); + if(opt_verify){ be32enc(&endiandata[19], foundnonces[1]); + quarkhash(vhash64, endiandata); + + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + pdata[21] = foundnonces[1]; + res++; + if(opt_benchmark) applog(LOG_INFO, "GPU #%d: Found second nonce $%08X", device_map[thr_id], foundnonces[1]); + } + else + { + if (vhash64[7] != Htarg) // don't show message if it is equal but fails fulltest + applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", device_map[thr_id], foundnonces[1]); + } } pdata[19] = foundnonces[0]; - if (opt_benchmark) applog(LOG_INFO, "GPU #%d: Found nonce $%08X", thr_id, foundnonces[0]); + return res; } else { if (vhash64[7] != Htarg) // don't show message if it is equal but fails fulltest - applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundnonces[0]); + applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", device_map[thr_id], foundnonces[0]); } } - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/qubit/deep.cu b/qubit/deep.cu index bb0a2ad1fd..11d6ad5d03 100644 --- a/qubit/deep.cu +++ b/qubit/deep.cu @@ -14,20 +14,19 @@ extern "C" { #include "cuda_helper.h" -static uint32_t *d_hash[MAX_GPUS]; - extern void qubit_luffa512_cpu_init(int thr_id, uint32_t threads); -extern void qubit_luffa512_cpu_setBlock_80(void *pdata); -extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); -extern void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void 
*ptarget); -extern uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); +extern void qubit_luffa512_cpu_setBlock_80(int thr_id, void *pdata); +extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void qubit_luffa512_cpufinal_setBlock_80(int thr_id, void *pdata, const void *ptarget); +extern uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); -extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); -extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void x11_echo512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *d_hash, uint32_t target, uint32_t *h_found); -extern "C" void deephash(void *state, const void *input) +void deephash(void *state, const void *input) { // luffa1-cubehash2-shavite3-simd4-echo5 sph_luffa512_context ctx_luffa; @@ -51,77 +50,112 @@ extern "C" void deephash(void *state, const void *input) memcpy(state, hash, 32); } -static bool init[MAX_GPUS] = { 0 }; - -extern "C" int scanhash_deep(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_deep(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { + static THREAD uint32_t *d_hash = nullptr; + static THREAD uint32_t *h_found = nullptr; + const uint32_t first_nonce = pdata[19]; uint32_t 
endiandata[20]; - uint32_t throughput = device_intensity(thr_id, __func__, 1U << 19); // 256*256*8 - throughput = min(throughput, (max_nonce - first_nonce)); + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, 1U << 19); // 256*256*8 + uint32_t throughput = min(throughputmax, (max_nonce - first_nonce)) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000f; + ptarget[7] = 0x00ff; - if (!init[thr_id]) + static THREAD volatile bool init = false; + if (!init) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (16 * sizeof(uint32_t))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif - cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput); + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * sizeof(uint32_t) * throughputmax)); - qubit_luffa512_cpu_init(thr_id, throughput); - x11_echo512_cpu_init(thr_id, throughput); + qubit_luffa512_cpu_init(thr_id, throughputmax); + x11_echo512_cpu_init(thr_id, throughputmax); + CUDA_SAFE_CALL(cudaMallocHost(&(h_found), 4 * sizeof(uint32_t))); - cuda_check_cpu_init(thr_id, throughput); + cuda_check_cpu_init(thr_id, throughputmax); + mining_has_stopped[thr_id] = false; - init[thr_id] = true; + init = true; } for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); - qubit_luffa512_cpufinal_setBlock_80((void*)endiandata,ptarget); - cuda_check_cpu_setTarget(ptarget); + qubit_luffa512_cpufinal_setBlock_80(thr_id, (void*)endiandata,ptarget); + cuda_check_cpu_setTarget(ptarget, thr_id); do { - int order = 0; - 
qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); - if (foundNonce != UINT32_MAX) + qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); + x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_echo512_cpu_hash_64_final(thr_id, throughput, pdata[19], d_hash, ptarget[7], h_found); + cudaStreamSynchronize(gpustream[thr_id]); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(h_found[0] != 0xffffffff) { const uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; - be32enc(&endiandata[19], foundNonce); + uint32_t vhash64[8]={0}; + if(opt_verify){ be32enc(&endiandata[19], h_found[0]); deephash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { int res = 1; - uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonce); *hashes_done = pdata[19] - first_nonce + throughput; - if (secNonce != 0) { - pdata[21] = secNonce; - res++; + if (h_found[1] != 0xffffffff) + { + if(opt_verify){ be32enc(&endiandata[19], h_found[1]); + deephash(vhash64, endiandata); + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + + pdata[21] = h_found[1]; + res++; + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d Found second nounce %08x", device_map[thr_id], h_found[1]); + } + else + { + if (vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[1]); + } + } + } - pdata[19] = foundNonce; + pdata[19] = h_found[0]; + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d Found nounce %08x", 
device_map[thr_id], h_found[0]); return res; } - else { - applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce); + else + { + if (vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[0]); + } } } - - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/qubit/doom.cu b/qubit/doom.cu index 93f46ef0d8..479e7b36d4 100644 --- a/qubit/doom.cu +++ b/qubit/doom.cu @@ -10,13 +10,11 @@ extern "C" { #include "cuda_helper.h" -static uint32_t *d_hash[MAX_GPUS]; - extern void qubit_luffa512_cpu_init(int thr_id, uint32_t threads); -extern void qubit_luffa512_cpu_setBlock_80(void *pdata); -extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); -extern void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget); -extern uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); +extern void qubit_luffa512_cpu_setBlock_80(int thr_id, void *pdata); +extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void qubit_luffa512_cpufinal_setBlock_80(int thr_id, void *pdata, const void *ptarget); +extern uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void doomhash(void *state, const void *input) { @@ -32,63 +30,75 @@ extern void doomhash(void *state, const void *input) memcpy(state, hash, 32); } -static bool init[MAX_GPUS] = { 0 }; - -extern "C" int scanhash_doom(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - 
unsigned long *hashes_done) +extern int scanhash_doom(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { + static THREAD uint32_t *d_hash = nullptr; + const uint32_t first_nonce = pdata[19]; uint32_t endiandata[20]; - uint32_t throughput = device_intensity(thr_id, __func__, 1U << 22); // 256*256*8*8 - throughput = min(throughput, (max_nonce - first_nonce)); + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, 1U << 22); // 256*256*8*8 + uint32_t throughput = min(throughputmax, (max_nonce - first_nonce)) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000f; + ptarget[7] = 0x0000f; - if (!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (16 * sizeof(uint32_t))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput)); + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * sizeof(uint32_t) * throughputmax)); - qubit_luffa512_cpu_init(thr_id, (int) throughput); + qubit_luffa512_cpu_init(thr_id, (int) throughputmax); + mining_has_stopped[thr_id] = false; - init[thr_id] = true; + init = true; } for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); - qubit_luffa512_cpufinal_setBlock_80((void*)endiandata,ptarget); + qubit_luffa512_cpufinal_setBlock_80(thr_id, (void*)endiandata,ptarget); do { - int order = 0; - uint32_t foundNonce = qubit_luffa512_cpu_finalhash_80(thr_id, (int) 
throughput, pdata[19], d_hash[thr_id], order++); - if (foundNonce != UINT32_MAX) + uint32_t foundNonce = qubit_luffa512_cpu_finalhash_80(thr_id, (int) throughput, pdata[19], d_hash); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(foundNonce != UINT32_MAX) { const uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; - be32enc(&endiandata[19], foundNonce); + uint32_t vhash64[8]={0}; + if(opt_verify){ be32enc(&endiandata[19], foundNonce); doomhash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { *hashes_done = min(max_nonce - first_nonce, (uint64_t) pdata[19] - first_nonce + throughput); pdata[19] = foundNonce; return 1; } else { - applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce); + applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", device_map[thr_id], foundNonce); } } - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/qubit/qubit.cu b/qubit/qubit.cu index 538fabe3c3..0961eac3e3 100644 --- a/qubit/qubit.cu +++ b/qubit/qubit.cu @@ -11,33 +11,28 @@ extern "C" { } #include "miner.h" - #include "cuda_helper.h" -static uint32_t *d_hash[MAX_GPUS]; -static uint32_t *h_found[MAX_GPUS]; - extern void qubit_luffa512_cpu_init(int thr_id, uint32_t threads); -extern void qubit_luffa512_cpu_setBlock_80(void *pdata); -extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); +extern void qubit_luffa512_cpu_setBlock_80(int thr_id, void *pdata); +extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t 
startNounce, uint32_t *d_hash); -extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); -extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); -extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash,const uint32_t simdthreads); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); -extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void x11_echo512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, uint32_t target, uint32_t *h_found, int order); +//extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void x11_echo512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *d_hash, uint32_t target, uint32_t *h_found); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads); -extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, - uint32_t *d_noncesTrue, uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse, - int order); +extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *inpHashes, + const uint32_t *d_noncesTrue, 
uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse); -extern "C" void qubithash(void *state, const void *input) +void qubithash(void *state, const void *input) { // luffa1-cubehash2-shavite3-simd4-echo5 @@ -72,90 +67,148 @@ extern "C" void qubithash(void *state, const void *input) memcpy(state, hash, 32); } -static bool init[MAX_GPUS] = { 0 }; - -extern "C" int scanhash_qubit(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_qubit(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { + static THREAD uint32_t *d_hash = nullptr; + static THREAD uint32_t *h_found = nullptr; + uint32_t endiandata[20]; const uint32_t first_nonce = pdata[19]; - uint32_t throughput = device_intensity(thr_id, __func__, 1U << 19); // 256*256*8 - throughput = min(throughput, (max_nonce - first_nonce)); + + uint32_t intensity = 256 * 256 * 10; + uint32_t simdthreads = (device_sm[device_map[thr_id]] > 500) ? 
256 : 32; + + cudaDeviceProp props; + cudaGetDeviceProperties(&props, device_map[thr_id]); + if(strstr(props.name, "1080")) + { + intensity = 256 * 256 * 24; + } + else if(strstr(props.name, "1070")) + { + intensity = 256 * 256 * 24; + } + else if(strstr(props.name, "970")) + { + intensity = 256 * 256 * 16; + } + else if (strstr(props.name, "980")) + { + intensity = 256 * 256 * 24; + } + else if (strstr(props.name, "750 Ti")) + { + intensity = 256 * 256 * 12; + } + else if (strstr(props.name, "750")) + { + intensity = 256 * 256 * 10; + } + else if (strstr(props.name, "960")) + { + intensity = 256 * 256 * 16; + } + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, intensity); + uint32_t throughput = min(throughputmax, (max_nonce - first_nonce)) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; + ptarget[7] = 0x0000ff; - if (!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); - qubit_luffa512_cpu_init(thr_id, throughput); - x11_simd512_cpu_init(thr_id, throughput); - x11_echo512_cpu_init(thr_id, throughput); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (64 * sizeof(uint4))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0); - CUDA_CALL_OR_RET_X(cudaMallocHost(&(h_found[thr_id]), 4 * sizeof(uint32_t)), 0); + qubit_luffa512_cpu_init(thr_id, throughputmax); + x11_simd512_cpu_init(thr_id, throughputmax); + x11_echo512_cpu_init(thr_id, throughputmax); - cuda_check_cpu_init(thr_id, throughput); + 
CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * sizeof(uint32_t) * throughputmax)); + CUDA_SAFE_CALL(cudaMallocHost(&(h_found), 4 * sizeof(uint32_t))); + mining_has_stopped[thr_id] = false; - init[thr_id] = true; + init = true; } for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); - qubit_luffa512_cpu_setBlock_80((void*)endiandata); - cuda_check_cpu_setTarget(ptarget); + qubit_luffa512_cpu_setBlock_80(thr_id, (void*)endiandata); do { - int order = 0; // Hash with CUDA - qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_echo512_cpu_hash_64_final(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], ptarget[7], h_found[thr_id], order++); - if (h_found[thr_id][0] != 0xffffffff) + qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); + x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_simd512_cpu_hash_64(thr_id,throughput, pdata[19], d_hash,simdthreads); + x11_echo512_cpu_hash_64_final(thr_id, throughput, pdata[19], d_hash, ptarget[7], h_found); + cudaStreamSynchronize(gpustream[thr_id]); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(h_found[0] != 0xffffffff) { const uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; - be32enc(&endiandata[19], h_found[thr_id][0]); + uint32_t vhash64[8]={0}; + if(opt_verify){ be32enc(&endiandata[19], h_found[0]); qubithash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { int res = 1; - // check if there was 
some other ones... *hashes_done = pdata[19] - first_nonce + throughput; - if (h_found[thr_id][1] != 0xffffffff) + if (h_found[1] != 0xffffffff) { - pdata[21] = h_found[thr_id][1]; - res++; - if (opt_benchmark) - applog(LOG_INFO, "GPU #%d Found second nounce %08x", thr_id, h_found[thr_id][1], vhash64[7], Htarg); + if(opt_verify){ be32enc(&endiandata[19], h_found[1]); + qubithash(vhash64, endiandata); + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + + pdata[21] = h_found[1]; + res++; + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d Found second nounce %08x", device_map[thr_id], h_found[1]); + } + else + { + if (vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[1]); + } + } + } - pdata[19] = h_found[thr_id][0]; + pdata[19] = h_found[0]; if (opt_benchmark) - applog(LOG_INFO, "GPU #%d Found nounce %08x", thr_id, h_found[thr_id][0], vhash64[7], Htarg); + applog(LOG_INFO, "GPU #%d Found nounce %08x", device_map[thr_id], h_found[0]); return res; } else { if (vhash64[7] != Htarg) { - applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, h_found[thr_id][0]); + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[0]); } } } - - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/qubit/qubit_luffa512.cu b/qubit/qubit_luffa512.cu index 4a577a62c7..67b85c5fcd 100644 --- a/qubit/qubit_luffa512.cu +++ b/qubit/qubit_luffa512.cu @@ -17,20 +17,31 @@ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ -#include +#ifdef __cplusplus +#include +#include +using namespace std; +#else #include +#include +#endif #include - +#include "miner.h" #include "cuda_helper.h" + + #ifndef UINT32_MAX #define UINT32_MAX UINT_MAX #endif -typedef unsigned char BitSequence; - +static THREAD unsigned char PaddedMessage[128]; __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) __constant__ uint32_t c_Target[8]; +__constant__ uint32_t statebufferpre[8]; +__constant__ uint32_t statechainvpre[40]; + + static uint32_t *h_resNounce[MAX_GPUS]; static uint32_t *d_resNounce[MAX_GPUS]; @@ -40,11 +51,6 @@ static uint32_t *d_resNounce[MAX_GPUS]; static uint32_t extra_results[2] = { UINT32_MAX, UINT32_MAX }; #endif -typedef struct { - uint32_t buffer[8]; /* Buffer to be hashed */ - uint32_t chainv[40]; /* Chaining values */ -} hashState; - #define BYTES_SWAP32(x) cuda_swab32(x) #define MULT2(a,j)\ @@ -59,10 +65,10 @@ typedef struct { a[0+(8*j)] = tmp; #define TWEAK(a0,a1,a2,a3,j)\ - a0 = (a0<<(j))|(a0>>(32-j));\ - a1 = (a1<<(j))|(a1>>(32-j));\ - a2 = (a2<<(j))|(a2>>(32-j));\ - a3 = (a3<<(j))|(a3>>(32-j)); + a0 = ROTL32(a0,j);\ + a1 = ROTL32(a1,j);\ + a2 = ROTL32(a2,j);\ + a3 = ROTL32(a3,j); #define STEP(c0,c1)\ SUBCRUMB(chainv[0],chainv[1],chainv[2],chainv[3],tmp);\ @@ -107,18 +113,6 @@ typedef struct { b0 ^= c1; /* initial values of chaining variables */ -__constant__ uint32_t c_IV[40] = { - 0x6d251e69,0x44b051e0,0x4eaa6fb4,0xdbf78465, - 0x6e292011,0x90152df4,0xee058139,0xdef610bb, - 0xc3b44b95,0xd9d2f256,0x70eee9a0,0xde099fa3, - 0x5d9b0557,0x8fc944b3,0xcf1ccf0e,0x746cd581, - 0xf7efc89d,0x5dba5781,0x04016ce5,0xad659c05, - 0x0306194f,0x666d1836,0x24aa230a,0x8b264ae7, - 0x858075d5,0x36d79cce,0xe571f7d7,0x204b1f67, - 0x35870c6a,0x57e9e923,0x14bcb808,0x7cde72ce, - 0x6c68e9be,0x5ec41e22,0xc825b7c7,0xaffb4363, - 0xf5df3999,0x0fc688f1,0xb07224cc,0x03e86cea}; - __constant__ uint32_t c_CNS[80] = { 0x303994a6,0xe0337818,0xc0e65299,0x441ba90d, 
0x6cc33a12,0x7f34d442,0xdc56983e,0x9389217f, @@ -141,251 +135,722 @@ __constant__ uint32_t c_CNS[80] = { 0x78602649,0x29131ab6,0x8edae952,0x0fc053c3, 0x3b6ba548,0x3f014f0c,0xedae9520,0xfc053c31}; +static uint32_t h_CNS[80] = { + 0x303994a6, 0xe0337818, 0xc0e65299, 0x441ba90d, + 0x6cc33a12, 0x7f34d442, 0xdc56983e, 0x9389217f, + 0x1e00108f, 0xe5a8bce6, 0x7800423d, 0x5274baf4, + 0x8f5b7882, 0x26889ba7, 0x96e1db12, 0x9a226e9d, + 0xb6de10ed, 0x01685f3d, 0x70f47aae, 0x05a17cf4, + 0x0707a3d4, 0xbd09caca, 0x1c1e8f51, 0xf4272b28, + 0x707a3d45, 0x144ae5cc, 0xaeb28562, 0xfaa7ae2b, + 0xbaca1589, 0x2e48f1c1, 0x40a46f3e, 0xb923c704, + 0xfc20d9d2, 0xe25e72c1, 0x34552e25, 0xe623bb72, + 0x7ad8818f, 0x5c58a4a4, 0x8438764a, 0x1e38e2e7, + 0xbb6de032, 0x78e38b9d, 0xedb780c8, 0x27586719, + 0xd9847356, 0x36eda57f, 0xa2c78434, 0x703aace7, + 0xb213afa5, 0xe028c9bf, 0xc84ebe95, 0x44756f91, + 0x4e608a22, 0x7e8fce32, 0x56d858fe, 0x956548be, + 0x343b138f, 0xfe191be2, 0xd0ec4e3d, 0x3cb226e5, + 0x2ceb4882, 0x5944a28e, 0xb3ad2208, 0xa1c4c355, + 0xf0d2e9e3, 0x5090d577, 0xac11d7fa, 0x2d1925ab, + 0x1bcb66f2, 0xb46496ac, 0x6f2d9bc9, 0xd1925ab0, + 0x78602649, 0x29131ab6, 0x8edae952, 0x0fc053c3, + 0x3b6ba548, 0x3f014f0c, 0xedae9520, 0xfc053c31 }; + -/***************************************************/ __device__ __forceinline__ -void rnd512(hashState *state) +void rnd512(uint32_t *statebuffer, uint32_t *statechainv) { - int i,j; + int i, j; uint32_t t[40]; uint32_t chainv[8]; uint32_t tmp; #pragma unroll 8 - for(i=0;i<8;i++) { - t[i]=0; -#pragma unroll 5 - for(j=0;j<5;j++) { - t[i] ^= state->chainv[i+8*j]; + for (i = 0; i<8; i++) + { + t[i] = statechainv[i]; +#pragma unroll + for (j = 1; j<5; j++) + { + t[i] ^= statechainv[i + 8 * j]; } } MULT2(t, 0); #pragma unroll 5 - for(j=0;j<5;j++) { + for (j = 0; j<5; j++) { #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+8*j] ^= t[i]; + for (i = 0; i<8; i++) { + statechainv[i + 8 * j] ^= t[i]; } } #pragma unroll 5 - for(j=0;j<5;j++) { + for (j = 0; j<5; 
j++) { #pragma unroll 8 - for(i=0;i<8;i++) { - t[i+8*j] = state->chainv[i+8*j]; + for (i = 0; i<8; i++) { + t[i + 8 * j] = statechainv[i + 8 * j]; } } #pragma unroll 5 - for(j=0;j<5;j++) { - MULT2(state->chainv, j); + for (j = 0; j<5; j++) { + MULT2(statechainv, j); } #pragma unroll 5 - for(j=0;j<5;j++) { + for (j = 0; j<5; j++) { #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[8*j+i] ^= t[8*((j+1)%5)+i]; + for (i = 0; i<8; i++) { + statechainv[8 * j + i] ^= t[8 * ((j + 1) % 5) + i]; } } #pragma unroll 5 - for(j=0;j<5;j++) { + for (j = 0; j<5; j++) { #pragma unroll 8 - for(i=0;i<8;i++) { - t[i+8*j] = state->chainv[i+8*j]; + for (i = 0; i<8; i++) { + t[i + 8 * j] = statechainv[i + 8 * j]; } } #pragma unroll 5 - for(j=0;j<5;j++) { - MULT2(state->chainv, j); + for (j = 0; j<5; j++) { + MULT2(statechainv, j); } #pragma unroll 5 - for(j=0;j<5;j++) { + for (j = 0; j<5; j++) { #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[8*j+i] ^= t[8*((j+4)%5)+i]; + for (i = 0; i<8; i++) { + statechainv[8 * j + i] ^= t[8 * ((j + 4) % 5) + i]; } } #pragma unroll 5 - for(j=0;j<5;j++) { + for (j = 0; j<5; j++) { #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+8*j] ^= state->buffer[i]; + for (i = 0; i<8; i++) { + statechainv[i + 8 * j] ^= statebuffer[i]; } - MULT2(state->buffer, 0); + MULT2(statebuffer, 0); } #pragma unroll 8 - for(i=0;i<8;i++) { - chainv[i] = state->chainv[i]; + for (i = 0; i<8; i++) { + chainv[i] = statechainv[i]; + } + +#pragma unroll 1 + for (i = 0; i<8; i++) + { + STEP(c_CNS[(2 * i)], c_CNS[(2 * i) + 1]); } #pragma unroll 8 - for(i=0;i<8;i++) { - STEP(c_CNS[(2*i)],c_CNS[(2*i)+1]); + for (i = 0; i<8; i++) + { + statechainv[i] = chainv[i]; + chainv[i] = statechainv[i + 8]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1); + +#pragma unroll 1 + for (i = 0; i<8; i++) + { + STEP(c_CNS[(2 * i) + 16], c_CNS[(2 * i) + 16 + 1]); } #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i] = chainv[i]; - chainv[i] = state->chainv[i+8]; + for (i = 
0; i<8; i++) { + statechainv[i + 8] = chainv[i]; + chainv[i] = statechainv[i + 16]; } - TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],1); + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2); + +#pragma unroll 1 + for (i = 0; i<8; i++) + { + STEP(c_CNS[(2 * i) + 32], c_CNS[(2 * i) + 32 + 1]); + } #pragma unroll 8 - for(i=0;i<8;i++) { - STEP(c_CNS[(2*i)+16],c_CNS[(2*i)+16+1]); + for (i = 0; i<8; i++) { + statechainv[i + 16] = chainv[i]; + chainv[i] = statechainv[i + 24]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3); + +#pragma unroll 1 + for (i = 0; i<8; i++) + { + STEP(c_CNS[(2 * i) + 48], c_CNS[(2 * i) + 48 + 1]); } #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+8] = chainv[i]; - chainv[i] = state->chainv[i+16]; + for (i = 0; i<8; i++) + { + statechainv[i + 24] = chainv[i]; + chainv[i] = statechainv[i + 32]; } - TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],2); + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4); + +#pragma unroll 1 + for (i = 0; i<8; i++) + { + STEP(c_CNS[(2 * i) + 64], c_CNS[(2 * i) + 64 + 1]); + } #pragma unroll 8 - for(i=0;i<8;i++) { - STEP(c_CNS[(2*i)+32],c_CNS[(2*i)+32+1]); + for (i = 0; i<8; i++) { + statechainv[i + 32] = chainv[i]; } +} + + +__device__ __forceinline__ +void rnd512_first(uint32_t *statebuffer, uint32_t *statechainv) +{ + uint32_t chainv[8]; + uint32_t tmp; + int i; + + statechainv[0 + 8 * 0] ^= statebuffer[0]; + statechainv[1 + 8 * 0] ^= statebuffer[1]; + statechainv[2 + 8 * 0] ^= statebuffer[2]; + statechainv[3 + 8 * 0] ^= statebuffer[3]; + statechainv[4 + 8 * 0] ^= statebuffer[4]; + + + statechainv[1 + 8 * 1] ^= statebuffer[0]; + statechainv[2 + 8 * 1] ^= statebuffer[1]; + statechainv[3 + 8 * 1] ^= statebuffer[2]; + statechainv[4 + 8 * 1] ^= statebuffer[3]; + statechainv[5 + 8 * 1] ^= statebuffer[4]; + + + statechainv[2 + 8 * 2] ^= statebuffer[0]; + statechainv[3 + 8 * 2] ^= statebuffer[1]; + statechainv[4 + 8 * 2] ^= statebuffer[2]; + statechainv[5 + 8 * 2] ^= statebuffer[3]; + statechainv[6 
+ 8 * 2] ^= statebuffer[4]; + + + statechainv[3 + 8 * 3] ^= statebuffer[0]; + statechainv[4 + 8 * 3] ^= statebuffer[1]; + statechainv[5 + 8 * 3] ^= statebuffer[2]; + statechainv[6 + 8 * 3] ^= statebuffer[3]; + statechainv[7 + 8 * 3] ^= statebuffer[4]; + + statechainv[4 + 8 * 4] ^= statebuffer[0] ^ statebuffer[4]; + statechainv[5 + 8 * 4] ^= statebuffer[1]; + statechainv[6 + 8 * 4] ^= statebuffer[2]; + statechainv[7 + 8 * 4] ^= statebuffer[3]; + statechainv[0 + 8 * 4] ^= statebuffer[4]; + + statechainv[1 + 8 * 4] = (statechainv[1 + 8 * 4] ^ statebuffer[4]); + statechainv[3 + 8 * 4] = (statechainv[3 + 8 * 4] ^ statebuffer[4]); #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+16] = chainv[i]; - chainv[i] = state->chainv[i+24]; + for (i = 0; i<8; i++) { + chainv[i] = statechainv[i]; } - TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],3); +#pragma unroll 1 + for (i = 0; i<8; i++) + { + STEP(c_CNS[(2 * i)], c_CNS[(2 * i) + 1]); + } #pragma unroll 8 - for(i=0;i<8;i++) { - STEP(c_CNS[(2*i)+48],c_CNS[(2*i)+48+1]); + for (i = 0; i<8; i++) + { + statechainv[i] = chainv[i]; + chainv[i] = statechainv[i + 8]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1); + +#pragma unroll 1 + for (i = 0; i<8; i++) + { + STEP(c_CNS[(2 * i) + 16], c_CNS[(2 * i) + 16 + 1]); } #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+24] = chainv[i]; - chainv[i] = state->chainv[i+32]; + for (i = 0; i<8; i++) { + statechainv[i + 8] = chainv[i]; + chainv[i] = statechainv[i + 16]; } - TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],4); + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2); + +#pragma unroll 1 + for (i = 0; i<8; i++) + { + STEP(c_CNS[(2 * i) + 32], c_CNS[(2 * i) + 32 + 1]); + } #pragma unroll 8 - for(i=0;i<8;i++) { - STEP(c_CNS[(2*i)+64],c_CNS[(2*i)+64+1]); + for (i = 0; i<8; i++) { + statechainv[i + 16] = chainv[i]; + chainv[i] = statechainv[i + 24]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3); + +#pragma unroll 1 + for (i = 0; i<8; i++) + { + 
STEP(c_CNS[(2 * i) + 48], c_CNS[(2 * i) + 48 + 1]); } #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+32] = chainv[i]; + for (i = 0; i<8; i++) + { + statechainv[i + 24] = chainv[i]; + chainv[i] = statechainv[i + 32]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4); + +#pragma unroll 1 + for (i = 0; i<8; i++) + { + STEP(c_CNS[(2 * i) + 64], c_CNS[(2 * i) + 64 + 1]); + } + +#pragma unroll 8 + for (i = 0; i<8; i++) { + statechainv[i + 32] = chainv[i]; } } +void rnd512cpu(uint32_t *statebuffer, uint32_t *statechainv) +{ + int i, j; + uint32_t t[40]; + uint32_t chainv[8]; + uint32_t tmp; + + for (i = 0; i<8; i++) + { + t[i] = statechainv[i]; + for (j = 1; j<5; j++) + { + t[i] ^= statechainv[i + 8 * j]; + } + } + + MULT2(t, 0); + + for (j = 0; j<5; j++) + { + for (i = 0; i<8; i++) + { + statechainv[i + 8 * j] ^= t[i]; + } + } + + for (j = 0; j<5; j++) + { + for (i = 0; i<8; i++) + { + t[i + 8 * j] = statechainv[i + 8 * j]; + } + } + + for (j = 0; j<5; j++) + { + MULT2(statechainv, j); + } + + for (j = 0; j<5; j++) + { + for (i = 0; i<8; i++) + { + statechainv[8 * j + i] ^= t[8 * ((j + 1) % 5) + i]; + } + } + + for (j = 0; j<5; j++) + { + for (i = 0; i<8; i++) + { + t[i + 8 * j] = statechainv[i + 8 * j]; + } + } + + for (j = 0; j<5; j++) + { + MULT2(statechainv, j); + } + + for (j = 0; j<5; j++) + { + for (i = 0; i<8; i++) + { + statechainv[8 * j + i] ^= t[8 * ((j + 4) % 5) + i]; + } + } + + for (j = 0; j<5; j++) + { + for (i = 0; i<8; i++) + { + statechainv[i + 8 * j] ^= statebuffer[i]; + } + MULT2(statebuffer, 0); + } + + for (i = 0; i<8; i++) + { + chainv[i] = statechainv[i]; + } + + for (i = 0; i<8; i++) + { + STEP(h_CNS[(2 * i)], h_CNS[(2 * i) + 1]); + } + + for (i = 0; i<8; i++) + { + statechainv[i] = chainv[i]; + chainv[i] = statechainv[i + 8]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1); + + for (i = 0; i<8; i++) + { + STEP(h_CNS[(2 * i) + 16], h_CNS[(2 * i) + 16 + 1]); + } + + for (i = 0; i<8; i++) + { + statechainv[i + 8] = 
chainv[i]; + chainv[i] = statechainv[i + 16]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2); + + for (i = 0; i<8; i++) + { + STEP(h_CNS[(2 * i) + 32], h_CNS[(2 * i) + 32 + 1]); + } + + for (i = 0; i<8; i++) + { + statechainv[i + 16] = chainv[i]; + chainv[i] = statechainv[i + 24]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3); + + for (i = 0; i<8; i++) + { + STEP(h_CNS[(2 * i) + 48], h_CNS[(2 * i) + 48 + 1]); + } + + for (i = 0; i<8; i++) + { + statechainv[i + 24] = chainv[i]; + chainv[i] = statechainv[i + 32]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4); + + for (i = 0; i<8; i++) + { + STEP(h_CNS[(2 * i) + 64], h_CNS[(2 * i) + 64 + 1]); + } + + for (i = 0; i<8; i++) + { + statechainv[i + 32] = chainv[i]; + } +} + __device__ __forceinline__ -void Update512(hashState *const __restrict__ state, const BitSequence *const __restrict__ data) +void Update512(uint32_t *const __restrict__ statebuffer, uint32_t *const __restrict__ statechainv, const uint32_t *const __restrict__ data) { #pragma unroll 8 - for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)data)[i]); - rnd512(state); + for (int i = 0; i<8; i++) + statebuffer[i] = BYTES_SWAP32((data)[i]); + rnd512(statebuffer, statechainv); #pragma unroll 8 - for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)(data+32))[i]); - rnd512(state); + for(int i=0;i<8;i++) + statebuffer[i] = BYTES_SWAP32(((data))[i+8]); + rnd512(statebuffer, statechainv); #pragma unroll 4 - for(int i=0;i<4;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)(data+64))[i]); + for(int i=0;i<4;i++) + statebuffer[i] = BYTES_SWAP32(((data))[i+16]); } - /***************************************************/ __device__ __forceinline__ -void finalization512(hashState *const __restrict__ state, uint32_t *const __restrict__ b) +void rnd512_nullhash(uint32_t *state) +{ + int i, j; + uint32_t t[40]; + uint32_t chainv[8]; + uint32_t tmp; + +#pragma unroll 8 + for (i = 0; i<8; i++) { + t[i] = 
state[i + 8 * 0]; +#pragma unroll 4 + for (j = 1; j<5; j++) { + t[i] ^= state[i + 8 * j]; + } + } + + MULT2(t, 0); + +#pragma unroll 5 + for (j = 0; j<5; j++) { +#pragma unroll 8 + for (i = 0; i<8; i++) { + state[i + 8 * j] ^= t[i]; + } + } + +#pragma unroll 5 + for (j = 0; j<5; j++) { +#pragma unroll 8 + for (i = 0; i<8; i++) { + t[i + 8 * j] = state[i + 8 * j]; + } + } + +#pragma unroll 5 + for (j = 0; j<5; j++) { + MULT2(state, j); + } + +#pragma unroll 5 + for (j = 0; j<5; j++) { +#pragma unroll 8 + for (i = 0; i<8; i++) { + state[8 * j + i] ^= t[8 * ((j + 1) % 5) + i]; + } + } + +#pragma unroll 5 + for (j = 0; j<5; j++) { +#pragma unroll 8 + for (i = 0; i<8; i++) { + t[i + 8 * j] = state[i + 8 * j]; + } + } + +#pragma unroll 5 + for (j = 0; j<5; j++) { + MULT2(state, j); + } + +#pragma unroll 5 + for (j = 0; j<5; j++) { +#pragma unroll 8 + for (i = 0; i<8; i++) { + state[8 * j + i] ^= t[8 * ((j + 4) % 5) + i]; + } + } + +#pragma unroll 8 + for (i = 0; i<8; i++) { + chainv[i] = state[i]; + } + +#pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i)], c_CNS[(2 * i) + 1]); + } + +#pragma unroll 8 + for (i = 0; i<8; i++) { + state[i] = chainv[i]; + chainv[i] = state[i + 8]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1); + +#pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 16], c_CNS[(2 * i) + 16 + 1]); + } + +#pragma unroll 8 + for (i = 0; i<8; i++) { + state[i + 8] = chainv[i]; + chainv[i] = state[i + 16]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2); + +#pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 32], c_CNS[(2 * i) + 32 + 1]); + } + +#pragma unroll 8 + for (i = 0; i<8; i++) { + state[i + 16] = chainv[i]; + chainv[i] = state[i + 24]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3); + +#pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 48], c_CNS[(2 * i) + 48 + 1]); + } + +#pragma unroll 8 + for (i = 0; i<8; i++) { + state[i + 24] = chainv[i]; + chainv[i] = 
state[i + 32]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4); + +#pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 64], c_CNS[(2 * i) + 64 + 1]); + } + +#pragma unroll 8 + for (i = 0; i<8; i++) { + state[i + 32] = chainv[i]; + } +} + + +__device__ __forceinline__ +void finalization512(uint32_t *const __restrict__ statebuffer, uint32_t *const __restrict__ statechainv, uint32_t *const __restrict__ b) { int i,j; - state->buffer[4] = 0x80000000; + statebuffer[4] = 0x80000000; #pragma unroll 3 - for(int i=5;i<8;i++) state->buffer[i] = 0; - rnd512(state); + for(int i=5;i<8;i++) + statebuffer[i] = 0; + rnd512(statebuffer, statechainv); - /*---- blank round with m=0 ----*/ -#pragma unroll 8 - for(i=0;i<8;i++) state->buffer[i] =0; - rnd512(state); + rnd512_nullhash(statechainv); #pragma unroll 8 for(i=0;i<8;i++) { b[i] = 0; #pragma unroll 5 for(j=0;j<5;j++) { - b[i] ^= state->chainv[i+8*j]; + b[i] ^= statechainv[i+8*j]; } b[i] = BYTES_SWAP32((b[i])); } -#pragma unroll 8 - for(i=0;i<8;i++) state->buffer[i]=0; - rnd512(state); + rnd512_nullhash(statechainv); #pragma unroll 8 - for(i=0;i<8;i++) { + for(i=0;i<8;i++) + { b[8+i] = 0; #pragma unroll 5 - for(j=0;j<5;j++) { - b[8+i] ^= state->chainv[i+8*j]; + for(j=0;j<5;j++) + { + b[8+i] ^= statechainv[i+8*j]; } b[8+i] = BYTES_SWAP32((b[8+i])); } } - /***************************************************/ // Die Hash-Funktion __global__ -void qubit_luffa512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash) +#if __CUDA_ARCH__ == 500 +__launch_bounds__(256, 4) +#endif +void qubit_luffa512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *outputHash) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { - uint32_t nounce = startNounce + thread; - union { - uint64_t buf64[16]; - uint32_t buf32[32]; - } buff; + int i, j; + const uint32_t nounce = 
startNounce + thread; + uint64_t buff[16]; -#pragma unroll 16 - for (int i=0; i < 16; ++i) buff.buf64[i] = c_PaddedMessage80[i]; +#pragma unroll + for (int i=8; i < 16; ++i) + buff[i] = c_PaddedMessage80[i]; // die Nounce durch die thread-spezifische ersetzen - buff.buf64[9] = REPLACE_HIWORD(buff.buf64[9], cuda_swab32(nounce)); + buff[9] = REPLACE_HIWORD(buff[9], cuda_swab32(nounce)); + uint32_t statebuffer[8]; + uint32_t statechainv[40]; + +#pragma unroll 4 + for (int i = 0; i<4; i++) + statebuffer[i] = BYTES_SWAP32(((uint32_t*)buff)[i + 16]); +#pragma unroll 4 + for (int i = 4; i<8; i++) + statebuffer[i] = statebufferpre[i]; +#pragma unroll + for (int i = 0; i<40; i++) + statechainv[i] = statechainvpre[i]; + + uint32_t *outHash = outputHash + 16 * thread; + + statebuffer[4] = 0x80000000; + + rnd512_first(statebuffer, statechainv); + rnd512_nullhash(statechainv); + + + #pragma unroll + for (i = 0; i<8; i++) + { + buff[i] = statechainv[i]; + #pragma unroll + for (j = 1; j<5; j++) { + buff[i] ^= statechainv[i + 8 * j]; + } + outHash[i] = BYTES_SWAP32((buff[i])); + } + + rnd512_nullhash(statechainv); - hashState state; -#pragma unroll 40 - for(int i=0;i<40;i++) state.chainv[i] = c_IV[i]; #pragma unroll 8 - for(int i=0;i<8;i++) state.buffer[i] = 0; - Update512(&state, (BitSequence*)buff.buf32); - uint32_t *outHash = (uint32_t *)outputHash + 16 * thread; - finalization512(&state, (uint32_t*)outHash); + for (i = 0; i<8; i++) + { + buff[8 + i] = statechainv[i]; +#pragma unroll 5 + for (j = 1; j<5; j++) + { + buff[8 + i] ^= statechainv[i + 8 * j]; + } + outHash[8 + i] = BYTES_SWAP32((buff[8 + i])); + } + + } } -__global__ -void qubit_luffa512_gpu_finalhash_80(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce) +__global__ __launch_bounds__(256,4) +void qubit_luffa512_gpu_finalhash_80(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce, int thr_id) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if 
(thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { - uint32_t nounce = startNounce + thread; + const uint32_t nounce = startNounce + thread; union { uint64_t buf64[16]; uint32_t buf32[32]; @@ -393,20 +858,29 @@ void qubit_luffa512_gpu_finalhash_80(uint32_t threads, uint32_t startNounce, voi uint32_t Hash[16]; #pragma unroll 16 - for (int i=0; i < 16; ++i) buff.buf64[i] = c_PaddedMessage80[i]; + for (int i=0; i < 16; ++i) + buff.buf64[i] = c_PaddedMessage80[i]; // Tested nonce buff.buf64[9] = REPLACE_HIWORD(buff.buf64[9], cuda_swab32(nounce)); - hashState state; - #pragma unroll 40 - for(int i=0;i<40;i++) state.chainv[i] = c_IV[i]; - - #pragma unroll 8 - for(int i=0;i<8;i++) state.buffer[i] = 0; - - Update512(&state, (BitSequence*)buff.buf32); - finalization512(&state, Hash); + uint32_t statebuffer[8]; + uint32_t statechainv[40] = + { + 0x6d251e69, 0x44b051e0, 0x4eaa6fb4, 0xdbf78465, + 0x6e292011, 0x90152df4, 0xee058139, 0xdef610bb, + 0xc3b44b95, 0xd9d2f256, 0x70eee9a0, 0xde099fa3, + 0x5d9b0557, 0x8fc944b3, 0xcf1ccf0e, 0x746cd581, + 0xf7efc89d, 0x5dba5781, 0x04016ce5, 0xad659c05, + 0x0306194f, 0x666d1836, 0x24aa230a, 0x8b264ae7, + 0x858075d5, 0x36d79cce, 0xe571f7d7, 0x204b1f67, + 0x35870c6a, 0x57e9e923, 0x14bcb808, 0x7cde72ce, + 0x6c68e9be, 0x5ec41e22, 0xc825b7c7, 0xaffb4363, + 0xf5df3999, 0x0fc688f1, 0xb07224cc, 0x03e86cea + }; + + Update512(statebuffer, statechainv, buff.buf32); + finalization512(statebuffer, statechainv, Hash); /* dont ask me why not a simple if (Hash[i] > c_Target[i]) return; * we lose 20% in perfs without the position test */ @@ -450,54 +924,136 @@ void qubit_luffa512_cpu_init(int thr_id, uint32_t threads) } __host__ -uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash,int order) +uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash) { uint32_t result = 
UINT32_MAX; - cudaMemset(d_resNounce[thr_id], 0xff, NBN * sizeof(uint32_t)); + cudaMemsetAsync(d_resNounce[thr_id], 0xff, NBN * sizeof(uint32_t), gpustream[thr_id]); const uint32_t threadsperblock = 256; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - qubit_luffa512_gpu_finalhash_80 <<>> (threads, startNounce, d_outputHash, d_resNounce[thr_id]); + qubit_luffa512_gpu_finalhash_80 <<>> (threads, startNounce, d_outputHash, d_resNounce[thr_id], thr_id); //MyStreamSynchronize(NULL, order, thr_id); - if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], NBN * sizeof(uint32_t), cudaMemcpyDeviceToHost)) { - result = h_resNounce[thr_id][0]; + CUDA_SAFE_CALL(cudaMemcpyAsync(h_resNounce[thr_id], d_resNounce[thr_id], NBN * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id])); + cudaStreamSynchronize(gpustream[thr_id]); + result = h_resNounce[thr_id][0]; #if NBN > 1 - extra_results[0] = h_resNounce[thr_id][1]; + extra_results[0] = h_resNounce[thr_id][1]; #endif - } return result; } +__host__ void qubit_cpu_precalc(int thr_id) +{ + uint32_t tmp,i,j; + uint32_t statebuffer[8]; + uint32_t t[40]; + uint32_t statechainv[40] = + { + 0x6d251e69, 0x44b051e0, 0x4eaa6fb4, 0xdbf78465, + 0x6e292011, 0x90152df4, 0xee058139, 0xdef610bb, + 0xc3b44b95, 0xd9d2f256, 0x70eee9a0, 0xde099fa3, + 0x5d9b0557, 0x8fc944b3, 0xcf1ccf0e, 0x746cd581, + 0xf7efc89d, 0x5dba5781, 0x04016ce5, 0xad659c05, + 0x0306194f, 0x666d1836, 0x24aa230a, 0x8b264ae7, + 0x858075d5, 0x36d79cce, 0xe571f7d7, 0x204b1f67, + 0x35870c6a, 0x57e9e923, 0x14bcb808, 0x7cde72ce, + 0x6c68e9be, 0x5ec41e22, 0xc825b7c7, 0xaffb4363, + 0xf5df3999, 0x0fc688f1, 0xb07224cc, 0x03e86cea + }; + + for (int i = 0; i<8; i++) + statebuffer[i] = BYTES_SWAP32(*(((uint32_t*)PaddedMessage) + i)); + rnd512cpu(statebuffer, statechainv); + + for (int i = 0; i<8; i++) + statebuffer[i] = BYTES_SWAP32(*(((uint32_t*)PaddedMessage) + i + 8)); + + rnd512cpu(statebuffer, statechainv); + + + for (int i 
= 0; i<8; i++) + { + t[i] = statechainv[i]; + for (int j = 1; j<5; j++) + { + t[i] ^= statechainv[i + 8 * j]; + } + } + + MULT2(t, 0); + + for (int j = 0; j<5; j++) { + for (int i = 0; i<8; i++) { + statechainv[i + 8 * j] ^= t[i]; + } + } + for (j = 0; j<5; j++) { + for (i = 0; i<8; i++) { + t[i + 8 * j] = statechainv[i + 8 * j]; + } + } + + for (j = 0; j<5; j++) { + MULT2(statechainv, j); + } + + for (j = 0; j<5; j++) { + for (i = 0; i<8; i++) { + statechainv[8 * j + i] ^= t[8 * ((j + 1) % 5) + i]; + } + } + + for (j = 0; j<5; j++) { + for (i = 0; i<8; i++) { + t[i + 8 * j] = statechainv[i + 8 * j]; + } + } + + for (j = 0; j<5; j++) { + MULT2(statechainv, j); + } + + for (j = 0; j<5; j++) { + for (i = 0; i<8; i++) { + statechainv[8 * j + i] ^= t[8 * ((j + 4) % 5) + i]; + } + } + + + + cudaMemcpyToSymbolAsync(statebufferpre, statebuffer, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id]); + cudaMemcpyToSymbolAsync(statechainvpre, statechainv, 40 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id]); +} + __host__ -void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash,int order) +void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash) { const uint32_t threadsperblock = 256; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - qubit_luffa512_gpu_hash_80 <<>> (threads, startNounce, d_outputHash); + qubit_luffa512_gpu_hash_80 <<>> (threads, startNounce, d_outputHash); } __host__ -void qubit_luffa512_cpu_setBlock_80(void *pdata) +void qubit_luffa512_cpu_setBlock_80(int thr_id, void *pdata) { - unsigned char PaddedMessage[128]; - memcpy(PaddedMessage, pdata, 80); - memset(PaddedMessage+80, 0, 48); + memset(PaddedMessage + 80, 0, 48); PaddedMessage[80] = 0x80; PaddedMessage[111] = 1; PaddedMessage[126] = 0x02; PaddedMessage[127] = 0x80; - CUDA_SAFE_CALL(cudaMemcpyToSymbol( c_PaddedMessage80, 
PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(c_PaddedMessage80, PaddedMessage, 16 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); + qubit_cpu_precalc(thr_id); } __host__ -void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget) +void qubit_luffa512_cpufinal_setBlock_80(int thr_id, void *pdata, const void *ptarget) { unsigned char PaddedMessage[128]; @@ -508,6 +1064,6 @@ void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget) PaddedMessage[126] = 0x02; PaddedMessage[127] = 0x80; - CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_Target, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); - CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(c_Target, ptarget, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(c_PaddedMessage80, PaddedMessage, 16 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); } diff --git a/scrypt.c b/scrypt.c deleted file mode 100644 index c20c2e47d3..0000000000 --- a/scrypt.c +++ /dev/null @@ -1,756 +0,0 @@ -/* - * Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2013 pooler - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * This file was originally written by Colin Percival as part of the Tarsnap - * online backup system. - */ - -#include "cpuminer-config.h" -#include "miner.h" - -#include -#include -#include - -static const uint32_t keypad[12] = { - 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280 -}; -static const uint32_t innerpad[11] = { - 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0 -}; -static const uint32_t outerpad[8] = { - 0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300 -}; -static const uint32_t finalblk[16] = { - 0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620 -}; - -static inline void HMAC_SHA256_80_init(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) -{ - uint32_t ihash[8]; - uint32_t pad[16]; - int i; - - /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 16, 16); - memcpy(pad + 4, keypad, 48); - sha256_transform(tstate, pad, 0); - memcpy(ihash, tstate, 32); - - sha256_init(ostate); - for (i = 0; i < 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform(ostate, pad, 0); - - sha256_init(tstate); - for (i = 0; i < 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 16; i++) - 
pad[i] = 0x36363636; - sha256_transform(tstate, pad, 0); -} - -static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t istate[8], ostate2[8]; - uint32_t ibuf[16], obuf[16]; - int i, j; - - memcpy(istate, tstate, 32); - sha256_transform(istate, salt, 0); - - memcpy(ibuf, salt + 16, 16); - memcpy(ibuf + 5, innerpad, 44); - memcpy(obuf + 8, outerpad, 32); - - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 32); - ibuf[4] = i + 1; - sha256_transform(obuf, ibuf, 0); - - memcpy(ostate2, ostate, 32); - sha256_transform(ostate2, obuf, 0); - for (j = 0; j < 8; j++) - output[8 * i + j] = swab32(ostate2[j]); - } -} - -static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate, - const uint32_t *salt, uint32_t *output) -{ - uint32_t buf[16]; - int i; - - sha256_transform(tstate, salt, 1); - sha256_transform(tstate, salt + 16, 1); - sha256_transform(tstate, finalblk, 0); - memcpy(buf, tstate, 32); - memcpy(buf + 8, outerpad, 32); - - sha256_transform(ostate, buf, 0); - for (i = 0; i < 8; i++) - output[i] = swab32(ostate[i]); -} - - -#if HAVE_SHA256_4WAY - -static const uint32_t keypad_4way[4 * 12] = { - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000280, 0x00000280, 0x00000280, 0x00000280 -}; -static const uint32_t innerpad_4way[4 * 11] = { - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 
0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x000004a0, 0x000004a0, 0x000004a0, 0x000004a0 -}; -static const uint32_t outerpad_4way[4 * 8] = { - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000300, 0x00000300, 0x00000300, 0x00000300 -}; -static const uint32_t finalblk_4way[4 * 16] __attribute__((aligned(16))) = { - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000620, 0x00000620, 0x00000620, 0x00000620 -}; - -static inline void HMAC_SHA256_80_init_4way(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) -{ - uint32_t ihash[4 * 8] __attribute__((aligned(16))); - uint32_t pad[4 * 16] __attribute__((aligned(16))); - int i; - - /* tstate 
is assumed to contain the midstate of key */ - memcpy(pad, key + 4 * 16, 4 * 16); - memcpy(pad + 4 * 4, keypad_4way, 4 * 48); - sha256_transform_4way(tstate, pad, 0); - memcpy(ihash, tstate, 4 * 32); - - sha256_init_4way(ostate); - for (i = 0; i < 4 * 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 4 * 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform_4way(ostate, pad, 0); - - sha256_init_4way(tstate); - for (i = 0; i < 4 * 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 4 * 16; i++) - pad[i] = 0x36363636; - sha256_transform_4way(tstate, pad, 0); -} - -static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t istate[4 * 8] __attribute__((aligned(16))); - uint32_t ostate2[4 * 8] __attribute__((aligned(16))); - uint32_t ibuf[4 * 16] __attribute__((aligned(16))); - uint32_t obuf[4 * 16] __attribute__((aligned(16))); - int i, j; - - memcpy(istate, tstate, 4 * 32); - sha256_transform_4way(istate, salt, 0); - - memcpy(ibuf, salt + 4 * 16, 4 * 16); - memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44); - memcpy(obuf + 4 * 8, outerpad_4way, 4 * 32); - - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 4 * 32); - ibuf[4 * 4 + 0] = i + 1; - ibuf[4 * 4 + 1] = i + 1; - ibuf[4 * 4 + 2] = i + 1; - ibuf[4 * 4 + 3] = i + 1; - sha256_transform_4way(obuf, ibuf, 0); - - memcpy(ostate2, ostate, 4 * 32); - sha256_transform_4way(ostate2, obuf, 0); - for (j = 0; j < 4 * 8; j++) - output[4 * 8 * i + j] = swab32(ostate2[j]); - } -} - -static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate, - uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t buf[4 * 16] __attribute__((aligned(16))); - int i; - - sha256_transform_4way(tstate, salt, 1); - sha256_transform_4way(tstate, salt + 4 * 16, 1); - sha256_transform_4way(tstate, finalblk_4way, 0); - memcpy(buf, tstate, 4 * 32); - memcpy(buf + 4 * 8, outerpad_4way, 4 * 32); - - sha256_transform_4way(ostate, buf, 0); - for (i = 0; 
i < 4 * 8; i++) - output[i] = swab32(ostate[i]); -} - -#endif /* HAVE_SHA256_4WAY */ - - -#if HAVE_SHA256_8WAY - -static const uint32_t finalblk_8way[8 * 16] __attribute__((aligned(32))) = { - 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620 -}; - -static inline void HMAC_SHA256_80_init_8way(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) -{ - uint32_t ihash[8 * 8] __attribute__((aligned(32))); - uint32_t pad[8 * 16] __attribute__((aligned(32))); - int i; - - /* 
tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 8 * 16, 8 * 16); - for (i = 0; i < 8; i++) - pad[8 * 4 + i] = 0x80000000; - memset(pad + 8 * 5, 0x00, 8 * 40); - for (i = 0; i < 8; i++) - pad[8 * 15 + i] = 0x00000280; - sha256_transform_8way(tstate, pad, 0); - memcpy(ihash, tstate, 8 * 32); - - sha256_init_8way(ostate); - for (i = 0; i < 8 * 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 8 * 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform_8way(ostate, pad, 0); - - sha256_init_8way(tstate); - for (i = 0; i < 8 * 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 8 * 16; i++) - pad[i] = 0x36363636; - sha256_transform_8way(tstate, pad, 0); -} - -static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t istate[8 * 8] __attribute__((aligned(32))); - uint32_t ostate2[8 * 8] __attribute__((aligned(32))); - uint32_t ibuf[8 * 16] __attribute__((aligned(32))); - uint32_t obuf[8 * 16] __attribute__((aligned(32))); - int i, j; - - memcpy(istate, tstate, 8 * 32); - sha256_transform_8way(istate, salt, 0); - - memcpy(ibuf, salt + 8 * 16, 8 * 16); - for (i = 0; i < 8; i++) - ibuf[8 * 5 + i] = 0x80000000; - memset(ibuf + 8 * 6, 0x00, 8 * 36); - for (i = 0; i < 8; i++) - ibuf[8 * 15 + i] = 0x000004a0; - - for (i = 0; i < 8; i++) - obuf[8 * 8 + i] = 0x80000000; - memset(obuf + 8 * 9, 0x00, 8 * 24); - for (i = 0; i < 8; i++) - obuf[8 * 15 + i] = 0x00000300; - - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 8 * 32); - ibuf[8 * 4 + 0] = i + 1; - ibuf[8 * 4 + 1] = i + 1; - ibuf[8 * 4 + 2] = i + 1; - ibuf[8 * 4 + 3] = i + 1; - ibuf[8 * 4 + 4] = i + 1; - ibuf[8 * 4 + 5] = i + 1; - ibuf[8 * 4 + 6] = i + 1; - ibuf[8 * 4 + 7] = i + 1; - sha256_transform_8way(obuf, ibuf, 0); - - memcpy(ostate2, ostate, 8 * 32); - sha256_transform_8way(ostate2, obuf, 0); - for (j = 0; j < 8 * 8; j++) - output[8 * 8 * i + j] = swab32(ostate2[j]); - } -} - -static inline void 
PBKDF2_SHA256_128_32_8way(uint32_t *tstate, - uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t buf[8 * 16] __attribute__((aligned(32))); - int i; - - sha256_transform_8way(tstate, salt, 1); - sha256_transform_8way(tstate, salt + 8 * 16, 1); - sha256_transform_8way(tstate, finalblk_8way, 0); - - memcpy(buf, tstate, 8 * 32); - for (i = 0; i < 8; i++) - buf[8 * 8 + i] = 0x80000000; - memset(buf + 8 * 9, 0x00, 8 * 24); - for (i = 0; i < 8; i++) - buf[8 * 15 + i] = 0x00000300; - sha256_transform_8way(ostate, buf, 0); - - for (i = 0; i < 8 * 8; i++) - output[i] = swab32(ostate[i]); -} - -#endif /* HAVE_SHA256_8WAY */ - - -#if defined(__x86_64__) - -#define SCRYPT_MAX_WAYS 1 -#define HAVE_SCRYPT_3WAY 0 -#define scrypt_best_throughput() 1 -static void scrypt_core(uint32_t *X, uint32_t *V); -void scrypt_core_3way(uint32_t *X, uint32_t *V); -#if defined(USE_AVX2) -#undef SCRYPT_MAX_WAYS -#define SCRYPT_MAX_WAYS 21 -#define HAVE_SCRYPT_6WAY 0 -void scrypt_core_6way(uint32_t *X, uint32_t *V); -#endif - -#elif defined(__i386__) - -#define SCRYPT_MAX_WAYS 1 -#define scrypt_best_throughput() 1 -static void scrypt_core(uint32_t *X, uint32_t *V); - -#elif defined(__arm__) && defined(__APCS_32__) - -static void scrypt_core(uint32_t *X, uint32_t *V); -#if defined(__ARM_NEON__) -#undef HAVE_SHA256_4WAY -#define SCRYPT_MAX_WAYS 1 -#define HAVE_SCRYPT_3WAY 0 -#define scrypt_best_throughput() 1 -void scrypt_core_3way(uint32_t *X, uint32_t *V); -#endif - -#endif - -static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16]) -{ - uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15; - int i; - - x00 = (B[ 0] ^= Bx[ 0]); - x01 = (B[ 1] ^= Bx[ 1]); - x02 = (B[ 2] ^= Bx[ 2]); - x03 = (B[ 3] ^= Bx[ 3]); - x04 = (B[ 4] ^= Bx[ 4]); - x05 = (B[ 5] ^= Bx[ 5]); - x06 = (B[ 6] ^= Bx[ 6]); - x07 = (B[ 7] ^= Bx[ 7]); - x08 = (B[ 8] ^= Bx[ 8]); - x09 = (B[ 9] ^= Bx[ 9]); - x10 = (B[10] ^= Bx[10]); - x11 = (B[11] ^= Bx[11]); - x12 = (B[12] ^= Bx[12]); 
- x13 = (B[13] ^= Bx[13]); - x14 = (B[14] ^= Bx[14]); - x15 = (B[15] ^= Bx[15]); - for (i = 0; i < 8; i += 2) { -#define R(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) - /* Operate on columns. */ - x04 ^= R(x00+x12, 7); x09 ^= R(x05+x01, 7); - x14 ^= R(x10+x06, 7); x03 ^= R(x15+x11, 7); - - x08 ^= R(x04+x00, 9); x13 ^= R(x09+x05, 9); - x02 ^= R(x14+x10, 9); x07 ^= R(x03+x15, 9); - - x12 ^= R(x08+x04,13); x01 ^= R(x13+x09,13); - x06 ^= R(x02+x14,13); x11 ^= R(x07+x03,13); - - x00 ^= R(x12+x08,18); x05 ^= R(x01+x13,18); - x10 ^= R(x06+x02,18); x15 ^= R(x11+x07,18); - - /* Operate on rows. */ - x01 ^= R(x00+x03, 7); x06 ^= R(x05+x04, 7); - x11 ^= R(x10+x09, 7); x12 ^= R(x15+x14, 7); - - x02 ^= R(x01+x00, 9); x07 ^= R(x06+x05, 9); - x08 ^= R(x11+x10, 9); x13 ^= R(x12+x15, 9); - - x03 ^= R(x02+x01,13); x04 ^= R(x07+x06,13); - x09 ^= R(x08+x11,13); x14 ^= R(x13+x12,13); - - x00 ^= R(x03+x02,18); x05 ^= R(x04+x07,18); - x10 ^= R(x09+x08,18); x15 ^= R(x14+x13,18); -#undef R - } - B[ 0] += x00; - B[ 1] += x01; - B[ 2] += x02; - B[ 3] += x03; - B[ 4] += x04; - B[ 5] += x05; - B[ 6] += x06; - B[ 7] += x07; - B[ 8] += x08; - B[ 9] += x09; - B[10] += x10; - B[11] += x11; - B[12] += x12; - B[13] += x13; - B[14] += x14; - B[15] += x15; -} - -static inline void scrypt_core(uint32_t *X, uint32_t *V) -{ - uint32_t i, j, k; - - for (i = 0; i < 1024; i++) { - memcpy(&V[i * 32], X, 128); - xor_salsa8(&X[0], &X[16]); - xor_salsa8(&X[16], &X[0]); - } - for (i = 0; i < 1024; i++) { - j = 32 * (X[16] & 1023); - for (k = 0; k < 32; k++) - X[k] ^= V[j + k]; - xor_salsa8(&X[0], &X[16]); - xor_salsa8(&X[16], &X[0]); - } -} - -#ifndef SCRYPT_MAX_WAYS -#define SCRYPT_MAX_WAYS 1 -#define scrypt_best_throughput() 1 -#endif - -#define SCRYPT_BUFFER_SIZE (SCRYPT_MAX_WAYS * 131072 + 63) - -unsigned char *scrypt_buffer_alloc() -{ - return (unsigned char *)malloc(SCRYPT_BUFFER_SIZE); -} - -static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output, - uint32_t *midstate, unsigned char 
*scratchpad) -{ - uint32_t tstate[8], ostate[8]; - uint32_t X[32]; - uint32_t *V; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - memcpy(tstate, midstate, 32); - HMAC_SHA256_80_init(input, tstate, ostate); - PBKDF2_SHA256_80_128(tstate, ostate, input, X); - - scrypt_core(X, V); - - PBKDF2_SHA256_128_32(tstate, ostate, X, output); -} - -#if HAVE_SHA256_4WAY -static void scrypt_1024_1_1_256_4way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[4 * 8] __attribute__((aligned(128))); - uint32_t ostate[4 * 8] __attribute__((aligned(128))); - uint32_t W[4 * 32] __attribute__((aligned(128))); - uint32_t X[4 * 32] __attribute__((aligned(128))); - uint32_t *V; - int i, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - for (i = 0; i < 20; i++) - for (k = 0; k < 4; k++) - W[4 * i + k] = input[k * 20 + i]; - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - tstate[4 * i + k] = midstate[i]; - HMAC_SHA256_80_init_4way(W, tstate, ostate); - PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W); - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - X[k * 32 + i] = W[4 * i + k]; - scrypt_core(X + 0 * 32, V); - scrypt_core(X + 1 * 32, V); - scrypt_core(X + 2 * 32, V); - scrypt_core(X + 3 * 32, V); - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - W[4 * i + k] = X[k * 32 + i]; - PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W); - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - output[k * 8 + i] = W[4 * i + k]; -} -#endif /* HAVE_SHA256_4WAY */ - -#if HAVE_SCRYPT_3WAY - -static void scrypt_1024_1_1_256_3way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[3 * 8], ostate[3 * 8]; - uint32_t X[3 * 32] __attribute__((aligned(64))); - uint32_t *V; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - memcpy(tstate + 0, midstate, 32); - memcpy(tstate + 8, midstate, 32); - 
memcpy(tstate + 16, midstate, 32); - HMAC_SHA256_80_init(input + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init(input + 20, tstate + 8, ostate + 8); - HMAC_SHA256_80_init(input + 40, tstate + 16, ostate + 16); - PBKDF2_SHA256_80_128(tstate + 0, ostate + 0, input + 0, X + 0); - PBKDF2_SHA256_80_128(tstate + 8, ostate + 8, input + 20, X + 32); - PBKDF2_SHA256_80_128(tstate + 16, ostate + 16, input + 40, X + 64); - - scrypt_core_3way(X, V); - - PBKDF2_SHA256_128_32(tstate + 0, ostate + 0, X + 0, output + 0); - PBKDF2_SHA256_128_32(tstate + 8, ostate + 8, X + 32, output + 8); - PBKDF2_SHA256_128_32(tstate + 16, ostate + 16, X + 64, output + 16); -} - -#if HAVE_SHA256_4WAY -static void scrypt_1024_1_1_256_12way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[12 * 8] __attribute__((aligned(128))); - uint32_t ostate[12 * 8] __attribute__((aligned(128))); - uint32_t W[12 * 32] __attribute__((aligned(128))); - uint32_t X[12 * 32] __attribute__((aligned(128))); - uint32_t *V; - int i, j, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - for (j = 0; j < 3; j++) - for (i = 0; i < 20; i++) - for (k = 0; k < 4; k++) - W[128 * j + 4 * i + k] = input[80 * j + k * 20 + i]; - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - tstate[32 * j + 4 * i + k] = midstate[i]; - HMAC_SHA256_80_init_4way(W + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init_4way(W + 128, tstate + 32, ostate + 32); - HMAC_SHA256_80_init_4way(W + 256, tstate + 64, ostate + 64); - PBKDF2_SHA256_80_128_4way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_80_128_4way(tstate + 32, ostate + 32, W + 128, W + 128); - PBKDF2_SHA256_80_128_4way(tstate + 64, ostate + 64, W + 256, W + 256); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - X[128 * j + k * 32 + i] = W[128 * j + 4 * i + k]; - scrypt_core_3way(X + 0 * 96, V); - scrypt_core_3way(X + 1 * 96, V); - 
scrypt_core_3way(X + 2 * 96, V); - scrypt_core_3way(X + 3 * 96, V); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - W[128 * j + 4 * i + k] = X[128 * j + k * 32 + i]; - PBKDF2_SHA256_128_32_4way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_128_32_4way(tstate + 32, ostate + 32, W + 128, W + 128); - PBKDF2_SHA256_128_32_4way(tstate + 64, ostate + 64, W + 256, W + 256); - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - output[32 * j + k * 8 + i] = W[128 * j + 4 * i + k]; -} -#endif /* HAVE_SHA256_4WAY */ - -#endif /* HAVE_SCRYPT_3WAY */ - -#if HAVE_SCRYPT_6WAY -static void scrypt_1024_1_1_256_24way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[24 * 8] __attribute__((aligned(128))); - uint32_t ostate[24 * 8] __attribute__((aligned(128))); - uint32_t W[24 * 32] __attribute__((aligned(128))); - uint32_t X[24 * 32] __attribute__((aligned(128))); - uint32_t *V; - int i, j, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - for (j = 0; j < 3; j++) - for (i = 0; i < 20; i++) - for (k = 0; k < 8; k++) - W[8 * 32 * j + 8 * i + k] = input[8 * 20 * j + k * 20 + i]; - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 8; k++) - tstate[8 * 8 * j + 8 * i + k] = midstate[i]; - HMAC_SHA256_80_init_8way(W + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init_8way(W + 256, tstate + 64, ostate + 64); - HMAC_SHA256_80_init_8way(W + 512, tstate + 128, ostate + 128); - PBKDF2_SHA256_80_128_8way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_80_128_8way(tstate + 64, ostate + 64, W + 256, W + 256); - PBKDF2_SHA256_80_128_8way(tstate + 128, ostate + 128, W + 512, W + 512); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 8; k++) - X[8 * 32 * j + k * 32 + i] = W[8 * 32 * j + 8 * i + k]; - scrypt_core_6way(X + 0 * 32, V); - scrypt_core_6way(X + 6 * 32, V); - scrypt_core_6way(X + 12 * 
32, V); - scrypt_core_6way(X + 18 * 32, V); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 8; k++) - W[8 * 32 * j + 8 * i + k] = X[8 * 32 * j + k * 32 + i]; - PBKDF2_SHA256_128_32_8way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_128_32_8way(tstate + 64, ostate + 64, W + 256, W + 256); - PBKDF2_SHA256_128_32_8way(tstate + 128, ostate + 128, W + 512, W + 512); - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 8; k++) - output[8 * 8 * j + k * 8 + i] = W[8 * 32 * j + 8 * i + k]; -} -#endif /* HAVE_SCRYPT_6WAY */ - -int scanhash_scrypt(int thr_id, uint32_t *pdata, - unsigned char *scratchbuf, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) -{ - uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8]; - uint32_t midstate[8]; - uint32_t n = pdata[19] - 1; - const uint32_t Htarg = ptarget[7]; - uint32_t throughput = scrypt_best_throughput(); - int i; - -#if HAVE_SHA256_4WAY - if (sha256_use_4way()) - throughput *= 4; -#endif - - for (i = 0; i < throughput; i++) - memcpy(data + i * 20, pdata, 80); - - sha256_init(midstate); - sha256_transform(midstate, data, 0); - - do { - for (i = 0; i < throughput; i++) - data[i * 20 + 19] = ++n; - -#if HAVE_SHA256_4WAY - if (throughput == 4) - scrypt_1024_1_1_256_4way(data, hash, midstate, scratchbuf); - else -#endif -#if HAVE_SCRYPT_3WAY && HAVE_SHA256_4WAY - if (throughput == 12) - scrypt_1024_1_1_256_12way(data, hash, midstate, scratchbuf); - else -#endif -#if HAVE_SCRYPT_6WAY - if (throughput == 24) - scrypt_1024_1_1_256_24way(data, hash, midstate, scratchbuf); - else -#endif -#if HAVE_SCRYPT_3WAY - if (throughput == 3) - scrypt_1024_1_1_256_3way(data, hash, midstate, scratchbuf); - else -#endif - scrypt_1024_1_1_256(data, hash, midstate, scratchbuf); - - for (i = 0; i < throughput; i++) { - if (hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget)) { - *hashes_done = n - pdata[19] + 1; - pdata[19] = data[i * 20 + 19]; - return 1; - } - } - 
} while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - pdata[19] + 1; - pdata[19] = n; - return 0; -} diff --git a/skein.cu b/skein.cu new file mode 100644 index 0000000000..8336a91c14 --- /dev/null +++ b/skein.cu @@ -0,0 +1,138 @@ +/** +* SKEIN512 80 + SHA256 64 +* by tpruvot@github - 2015 +* Optimized by sp-hash@github - 2015 +*/ + +extern "C" { +#include "sph/sph_skein.h" +} + +#include "miner.h" +#include "cuda_helper.h" +#include + +extern void skein512_cpu_setBlock_80(int thr_id,void *pdata); +extern void skein512_cpu_hash_80_50(int thr_id, uint32_t threads, uint32_t startNounce, int swapu, uint64_t target, uint32_t *h_found); +extern void skein512_cpu_hash_80_52(int thr_id, uint32_t threads, uint32_t startNounce, int swapu, uint64_t target, uint32_t *h_found); + +void skeincoinhash(void *output, const void *input) +{ + sph_skein512_context ctx_skein; + SHA256_CTX sha256; + + uint32_t hash[16]; + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, input, 80); + sph_skein512_close(&ctx_skein, hash); + + SHA256_Init(&sha256); + SHA256_Update(&sha256, (unsigned char *)hash, 64); + SHA256_Final((unsigned char *)hash, &sha256); + + memcpy(output, hash, 32); +} + +static __inline uint32_t swab32_if(uint32_t val, bool iftrue) +{ + return iftrue ? swab32(val) : val; +} + +int scanhash_skeincoin(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) +{ + static THREAD uint32_t *foundnonces = nullptr; + + const uint32_t first_nonce = pdata[19]; + const int swap = 1; + + uint32_t intensity = (device_sm[device_map[thr_id]] > 500) ? 
1 << 28 : 1 << 27;; + uint32_t throughput = device_intensity(device_map[thr_id], __func__, intensity); // 256*4096 + throughput = min(throughput, max_nonce - first_nonce) & 0xfffffc00; + + if (opt_benchmark) + { + ((uint64_t*)ptarget)[3] = 0x3000f0000; + } + uint64_t target = ((uint64_t*)ptarget)[3]; + + static THREAD volatile bool init = false; + if(!init) + { + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + CUDA_SAFE_CALL(cudaMallocHost(&foundnonces, 2 * 4)); + mining_has_stopped[thr_id] = false; + init = true; + } + + uint32_t endiandata[20]; + for (int k = 0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + skein512_cpu_setBlock_80(thr_id, (void*)endiandata); + do + { + *hashes_done = pdata[19] - first_nonce + throughput; + + if (device_sm[device_map[thr_id]] > 500) + skein512_cpu_hash_80_52(thr_id, throughput, pdata[19], swap, target, foundnonces); + else + skein512_cpu_hash_80_50(thr_id, throughput, pdata[19], swap, target, foundnonces); + + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(foundnonces[0] != 0xffffffff) + { + uint32_t vhash64[8]={0}; + + endiandata[19] = swab32_if(foundnonces[0], swap); + + skeincoinhash(vhash64, endiandata); + + uint64_t test = ((uint64_t*)vhash64)[3]; + if (test <= target && fulltest(vhash64, ptarget)) + { + int res = 1; + if (opt_debug || opt_benchmark) + applog(LOG_INFO, "GPU #%d: found nonce $%08X", device_map[thr_id], foundnonces[0]); + if (foundnonces[1] != 0xffffffff) + { + endiandata[19] = swab32_if(foundnonces[1], swap); + skeincoinhash(vhash64, endiandata); + uint64_t test2 = ((uint64_t*)vhash64)[3]; + if (test2 <= target && fulltest(vhash64, ptarget)) + { + if (opt_debug || opt_benchmark) + applog(LOG_INFO, "GPU #%d: found nonce $%08X", device_map[thr_id], foundnonces[1]); + 
pdata[19 + res] = swab32_if(foundnonces[1], !swap); + res++; + } + else + { + if (test2 != target) applog(LOG_WARNING, "GPU #%d: result for nonce $%08X does not validate on CPU!", device_map[thr_id], foundnonces[1]); + } + } + pdata[19] = swab32_if(foundnonces[0], !swap); + return res; + } + else + { + if (test != target) + applog(LOG_WARNING, "GPU #%d: result for nonce $%08X does not validate on CPU!", device_map[thr_id], foundnonces[0]); + else + applog(LOG_WARNING, "Lost work: #%d", test); + + } + } + + pdata[19] += throughput; + + } while(!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); + + *hashes_done = pdata[19] - first_nonce ; + return 0; +} diff --git a/sph/neoscrypt.cpp b/sph/neoscrypt.cpp new file mode 100644 index 0000000000..ebbb2074ed --- /dev/null +++ b/sph/neoscrypt.cpp @@ -0,0 +1,994 @@ +/* + * Copyright (c) 2009 Colin Percival, 2011 ArtForz + * Copyright (c) 2012 Andrew Moon (floodyberry) + * Copyright (c) 2012 Samuel Neves + * Copyright (c) 2014 John Doering + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +#include +#include +#include + +#include "neoscrypt.h" + +extern void proper_exit(int reason); +enum +{ + LOG_ERR, + LOG_WARNING, + LOG_NOTICE, + LOG_INFO, + LOG_DEBUG, + /* custom notices */ + LOG_BLUE = 0x10, +}; +extern void applog(int prio, const char *fmt, ...); + +#ifdef _WIN32 +/* sizeof(unsigned long) = 4 for MinGW64 */ +typedef unsigned long long ulong; +#else +typedef unsigned long ulong; +#endif +typedef unsigned int uint; +typedef unsigned char uchar; + +#define MIN(a, b) ((a) < (b) ? a : b) +#define MAX(a, b) ((a) > (b) ? 
a : b) + + +/* SHA-256 */ + +static const uint32_t sha256_constants[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +#define Ch(x,y,z) (z ^ (x & (y ^ z))) +#define Maj(x,y,z) (((x | y) & z) | (x & y)) +#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22)) +#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25)) +#define G0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ (x >> 3)) +#define G1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ (x >> 10)) +#define W0(in,i) (U8TO32_BE(&in[i * 4])) +#define W1(i) (G1(w[i - 2]) + w[i - 7] + G0(w[i - 15]) + w[i - 16]) +#define STEP(i) \ + t1 = S0(r[0]) + Maj(r[0], r[1], r[2]); \ + t0 = r[7] + S1(r[4]) + Ch(r[4], r[5], r[6]) + sha256_constants[i] + w[i]; \ + r[7] = r[6]; \ + r[6] = r[5]; \ + r[5] = r[4]; \ + r[4] = r[3] + t0; \ + r[3] = r[2]; \ + r[2] = r[1]; \ + r[1] = r[0]; \ + r[0] = t0 + t1; + + +typedef struct sha256_hash_state_t { + uint32_t H[8]; + uint64_t T; + uint32_t leftover; + uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; +} sha256_hash_state; + + +static void sha256_blocks(sha256_hash_state *S, const uint8_t *in, size_t blocks) { + uint32_t r[8], w[64], t0, t1; + size_t i; + + for(i = 0; i < 8; i++) + r[i] = S->H[i]; + + while(blocks--) { + for(i = 0; i < 16; i++) { + w[i] = W0(in, i); + } + 
for(i = 16; i < 64; i++) { + w[i] = W1(i); + } + for(i = 0; i < 64; i++) { + STEP(i); + } + for(i = 0; i < 8; i++) { + r[i] += S->H[i]; + S->H[i] = r[i]; + } + S->T += SCRYPT_HASH_BLOCK_SIZE * 8; + in += SCRYPT_HASH_BLOCK_SIZE; + } +} + +static void neoscrypt_hash_init_sha256(sha256_hash_state *S) { + S->H[0] = 0x6a09e667; + S->H[1] = 0xbb67ae85; + S->H[2] = 0x3c6ef372; + S->H[3] = 0xa54ff53a; + S->H[4] = 0x510e527f; + S->H[5] = 0x9b05688c; + S->H[6] = 0x1f83d9ab; + S->H[7] = 0x5be0cd19; + S->T = 0; + S->leftover = 0; +} + +static void neoscrypt_hash_update_sha256(sha256_hash_state *S, const uint8_t *in, size_t inlen) { + size_t blocks, want; + + /* handle the previous data */ + if(S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + want = (want < inlen) ? want : inlen; + memcpy(S->buffer + S->leftover, in, want); + S->leftover += (uint32_t)want; + if(S->leftover < SCRYPT_HASH_BLOCK_SIZE) + return; + in += want; + inlen -= want; + sha256_blocks(S, S->buffer, 1); + } + + /* handle the current data */ + blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); + S->leftover = (uint32_t)(inlen - blocks); + if(blocks) { + sha256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); + in += blocks; + } + + /* handle leftover data */ + if(S->leftover) + memcpy(S->buffer, in, S->leftover); +} + +static void neoscrypt_hash_finish_sha256(sha256_hash_state *S, uint8_t *hash) { + uint64_t t = S->T + (S->leftover * 8); + + S->buffer[S->leftover] = 0x80; + if(S->leftover <= 55) { + memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover); + } else { + memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover); + sha256_blocks(S, S->buffer, 1); + memset(S->buffer, 0, 56); + } + + U64TO8_BE(S->buffer + 56, t); + sha256_blocks(S, S->buffer, 1); + + U32TO8_BE(&hash[ 0], S->H[0]); + U32TO8_BE(&hash[ 4], S->H[1]); + U32TO8_BE(&hash[ 8], S->H[2]); + U32TO8_BE(&hash[12], S->H[3]); + U32TO8_BE(&hash[16], S->H[4]); + U32TO8_BE(&hash[20], S->H[5]); + U32TO8_BE(&hash[24], S->H[6]); + 
U32TO8_BE(&hash[28], S->H[7]); +} + +static void neoscrypt_hash_sha256(hash_digest hash, const uint8_t *m, size_t mlen) { + sha256_hash_state st; + neoscrypt_hash_init_sha256(&st); + neoscrypt_hash_update_sha256(&st, m, mlen); + neoscrypt_hash_finish_sha256(&st, hash); +} + + +/* HMAC for SHA-256 */ + +typedef struct sha256_hmac_state_t { + sha256_hash_state inner, outer; +} sha256_hmac_state; + +static void neoscrypt_hmac_init_sha256(sha256_hmac_state *st, const uint8_t *key, size_t keylen) { + uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0}; + size_t i; + + neoscrypt_hash_init_sha256(&st->inner); + neoscrypt_hash_init_sha256(&st->outer); + + if(keylen <= SCRYPT_HASH_BLOCK_SIZE) { + /* use the key directly if it's <= blocksize bytes */ + memcpy(pad, key, keylen); + } else { + /* if it's > blocksize bytes, hash it */ + neoscrypt_hash_sha256(pad, key, keylen); + } + + /* inner = (key ^ 0x36) */ + /* h(inner || ...) */ + for(i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= 0x36; + neoscrypt_hash_update_sha256(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE); + + /* outer = (key ^ 0x5c) */ + /* h(outer || ...) */ + for(i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= (0x5c ^ 0x36); + neoscrypt_hash_update_sha256(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE); +} + +static void neoscrypt_hmac_update_sha256(sha256_hmac_state *st, const uint8_t *m, size_t mlen) { + /* h(inner || m...) 
*/ + neoscrypt_hash_update_sha256(&st->inner, m, mlen); +} + +static void neoscrypt_hmac_finish_sha256(sha256_hmac_state *st, hash_digest mac) { + /* h(inner || m) */ + hash_digest innerhash; + neoscrypt_hash_finish_sha256(&st->inner, innerhash); + + /* h(outer || h(inner || m)) */ + neoscrypt_hash_update_sha256(&st->outer, innerhash, sizeof(innerhash)); + neoscrypt_hash_finish_sha256(&st->outer, mac); +} + + +/* PBKDF2 for SHA-256 */ + +static void neoscrypt_pbkdf2_sha256(const uint8_t *password, size_t password_len, + const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *output, size_t output_len) { + sha256_hmac_state hmac_pw, hmac_pw_salt, work; + hash_digest ti, u; + uint8_t be[4]; + uint32_t i, j, k, blocks; + + /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */ + + /* hmac(password, ...) */ + neoscrypt_hmac_init_sha256(&hmac_pw, password, password_len); + + /* hmac(password, salt...) */ + hmac_pw_salt = hmac_pw; + neoscrypt_hmac_update_sha256(&hmac_pw_salt, salt, salt_len); + + blocks = ((uint32_t)output_len + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE; + for(i = 1; i <= blocks; i++) { + /* U1 = hmac(password, salt || be(i)) */ + U32TO8_BE(be, i); + work = hmac_pw_salt; + neoscrypt_hmac_update_sha256(&work, be, 4); + neoscrypt_hmac_finish_sha256(&work, ti); + memcpy(u, ti, sizeof(u)); + + /* T[i] = U1 ^ U2 ^ U3... */ + for(j = 0; j < N - 1; j++) { + /* UX = hmac(password, U{X-1}) */ + work = hmac_pw; + neoscrypt_hmac_update_sha256(&work, u, SCRYPT_HASH_DIGEST_SIZE); + neoscrypt_hmac_finish_sha256(&work, u); + + /* T[i] ^= UX */ + for(k = 0; k < sizeof(u); k++) + ti[k] ^= u[k]; + } + + memcpy(output, ti, (output_len > SCRYPT_HASH_DIGEST_SIZE) ? 
SCRYPT_HASH_DIGEST_SIZE : output_len); + output += SCRYPT_HASH_DIGEST_SIZE; + output_len -= SCRYPT_HASH_DIGEST_SIZE; + } +} + + +/* NeoScrypt */ + +#if defined(ASM) + +extern void neoscrypt_salsa(uint *X, uint rounds); +extern void neoscrypt_salsa_tangle(uint *X, uint count); +extern void neoscrypt_chacha(uint *X, uint rounds); + +extern void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len); +extern void neoscrypt_blkswp(void *blkAp, void *blkBp, uint len); +extern void neoscrypt_blkxor(void *dstp, const void *srcp, uint len); + +#else + +/* Salsa20, rounds must be a multiple of 2 */ +static void neoscrypt_salsa(uint *X, uint rounds) { + uint x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, t; + + x0 = X[0]; x1 = X[1]; x2 = X[2]; x3 = X[3]; + x4 = X[4]; x5 = X[5]; x6 = X[6]; x7 = X[7]; + x8 = X[8]; x9 = X[9]; x10 = X[10]; x11 = X[11]; + x12 = X[12]; x13 = X[13]; x14 = X[14]; x15 = X[15]; + +#define quarter(a, b, c, d) \ + t = a + d; t = ROTL32(t, 7); b ^= t; \ + t = b + a; t = ROTL32(t, 9); c ^= t; \ + t = c + b; t = ROTL32(t, 13); d ^= t; \ + t = d + c; t = ROTL32(t, 18); a ^= t; + + for(; rounds; rounds -= 2) { + quarter( x0, x4, x8, x12); + quarter( x5, x9, x13, x1); + quarter(x10, x14, x2, x6); + quarter(x15, x3, x7, x11); + quarter( x0, x1, x2, x3); + quarter( x5, x6, x7, x4); + quarter(x10, x11, x8, x9); + quarter(x15, x12, x13, x14); + } + + X[0] += x0; X[1] += x1; X[2] += x2; X[3] += x3; + X[4] += x4; X[5] += x5; X[6] += x6; X[7] += x7; + X[8] += x8; X[9] += x9; X[10] += x10; X[11] += x11; + X[12] += x12; X[13] += x13; X[14] += x14; X[15] += x15; + +#undef quarter +} + +/* ChaCha20, rounds must be a multiple of 2 */ +static void neoscrypt_chacha(uint *X, uint rounds) { + uint x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, t; + + x0 = X[0]; x1 = X[1]; x2 = X[2]; x3 = X[3]; + x4 = X[4]; x5 = X[5]; x6 = X[6]; x7 = X[7]; + x8 = X[8]; x9 = X[9]; x10 = X[10]; x11 = X[11]; + x12 = X[12]; x13 = X[13]; x14 = X[14]; 
x15 = X[15]; + +#define quarter(a,b,c,d) \ + a += b; t = d ^ a; d = ROTL32(t, 16); \ + c += d; t = b ^ c; b = ROTL32(t, 12); \ + a += b; t = d ^ a; d = ROTL32(t, 8); \ + c += d; t = b ^ c; b = ROTL32(t, 7); + + for(; rounds; rounds -= 2) { + quarter( x0, x4, x8, x12); + quarter( x1, x5, x9, x13); + quarter( x2, x6, x10, x14); + quarter( x3, x7, x11, x15); + quarter( x0, x5, x10, x15); + quarter( x1, x6, x11, x12); + quarter( x2, x7, x8, x13); + quarter( x3, x4, x9, x14); + } + + X[0] += x0; X[1] += x1; X[2] += x2; X[3] += x3; + X[4] += x4; X[5] += x5; X[6] += x6; X[7] += x7; + X[8] += x8; X[9] += x9; X[10] += x10; X[11] += x11; + X[12] += x12; X[13] += x13; X[14] += x14; X[15] += x15; + +#undef quarter +} + + +/* Fast 32-bit / 64-bit memcpy(); + * len must be a multiple of 32 bytes */ +static void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len) { + ulong *dst = (ulong *) dstp; + ulong *src = (ulong *) srcp; + uint i; + + for(i = 0; i < (len / sizeof(ulong)); i += 4) { + dst[i] = src[i]; + dst[i + 1] = src[i + 1]; + dst[i + 2] = src[i + 2]; + dst[i + 3] = src[i + 3]; + } +} + +/* Fast 32-bit / 64-bit block swapper; + * len must be a multiple of 32 bytes */ +static void neoscrypt_blkswp(void *blkAp, void *blkBp, uint len) { + ulong *blkA = (ulong *) blkAp; + ulong *blkB = (ulong *) blkBp; + register ulong t0, t1, t2, t3; + uint i; + + for(i = 0; i < (len / sizeof(ulong)); i += 4) { + t0 = blkA[i]; + t1 = blkA[i + 1]; + t2 = blkA[i + 2]; + t3 = blkA[i + 3]; + blkA[i] = blkB[i]; + blkA[i + 1] = blkB[i + 1]; + blkA[i + 2] = blkB[i + 2]; + blkA[i + 3] = blkB[i + 3]; + blkB[i] = t0; + blkB[i + 1] = t1; + blkB[i + 2] = t2; + blkB[i + 3] = t3; + } +} + +/* Fast 32-bit / 64-bit block XOR engine; + * len must be a multiple of 32 bytes */ +static void neoscrypt_blkxor(void *dstp, const void *srcp, uint len) { + ulong *dst = (ulong *) dstp; + ulong *src = (ulong *) srcp; + uint i; + + for(i = 0; i < (len / sizeof(ulong)); i += 4) { + dst[i] ^= src[i]; + dst[i + 1] ^= 
src[i + 1]; + dst[i + 2] ^= src[i + 2]; + dst[i + 3] ^= src[i + 3]; + } +} + +#endif + +/* 32-bit / 64-bit optimised memcpy() */ +static void neoscrypt_copy(void *dstp, const void *srcp, uint len) { + ulong *dst = (ulong *) dstp; + ulong *src = (ulong *) srcp; + uint i, tail; + + for(i = 0; i < (len / sizeof(ulong)); i++) + dst[i] = src[i]; + + tail = len & (sizeof(ulong) - 1); + if(tail) { + uchar *dstb = (uchar *) dstp; + uchar *srcb = (uchar *) srcp; + + for(i = len - tail; i < len; i++) + dstb[i] = srcb[i]; + } +} + +/* 32-bit / 64-bit optimised memory erase aka memset() to zero */ +static void neoscrypt_erase(void *dstp, uint len) { + const ulong null = 0; + ulong *dst = (ulong *) dstp; + uint i, tail; + + for(i = 0; i < (len / sizeof(ulong)); i++) + dst[i] = null; + + tail = len & (sizeof(ulong) - 1); + if(tail) { + uchar *dstb = (uchar *) dstp; + + for(i = len - tail; i < len; i++) + dstb[i] = (uchar)null; + } +} + +/* 32-bit / 64-bit optimised XOR engine */ +static void neoscrypt_xor(void *dstp, const void *srcp, uint len) { + ulong *dst = (ulong *) dstp; + ulong *src = (ulong *) srcp; + uint i, tail; + + for(i = 0; i < (len / sizeof(ulong)); i++) + dst[i] ^= src[i]; + + tail = len & (sizeof(ulong) - 1); + if(tail) { + uchar *dstb = (uchar *) dstp; + uchar *srcb = (uchar *) srcp; + + for(i = len - tail; i < len; i++) + dstb[i] ^= srcb[i]; + } +} + + +/* BLAKE2s */ + +#define BLAKE2S_BLOCK_SIZE 64U +#define BLAKE2S_OUT_SIZE 32U +#define BLAKE2S_KEY_SIZE 32U + +/* Parameter block of 32 bytes */ +typedef struct blake2s_param_t { + uchar digest_length; + uchar key_length; + uchar fanout; + uchar depth; + uint leaf_length; + uchar node_offset[6]; + uchar node_depth; + uchar inner_length; + uchar salt[8]; + uchar personal[8]; +} blake2s_param; + +/* State block of 180 bytes */ +typedef struct blake2s_state_t { + uint h[8]; + uint t[2]; + uint f[2]; + uchar buf[2 * BLAKE2S_BLOCK_SIZE]; + uint buflen; +} blake2s_state; + +static const uint blake2s_IV[8] = { + 
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +static const uint8_t blake2s_sigma[10][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , +}; + +static void blake2s_compress(blake2s_state *S, const uint *buf) { + uint i; + uint m[16]; + uint v[16]; + + neoscrypt_copy(m, buf, 64); + neoscrypt_copy(v, S, 32); + + v[ 8] = blake2s_IV[0]; + v[ 9] = blake2s_IV[1]; + v[10] = blake2s_IV[2]; + v[11] = blake2s_IV[3]; + v[12] = S->t[0] ^ blake2s_IV[4]; + v[13] = S->t[1] ^ blake2s_IV[5]; + v[14] = S->f[0] ^ blake2s_IV[6]; + v[15] = S->f[1] ^ blake2s_IV[7]; +#define G(r,i,a,b,c,d) \ + do { \ + a = a + b + m[blake2s_sigma[r][2*i+0]]; \ + d = ROTR32(d ^ a, 16); \ + c = c + d; \ + b = ROTR32(b ^ c, 12); \ + a = a + b + m[blake2s_sigma[r][2*i+1]]; \ + d = ROTR32(d ^ a, 8); \ + c = c + d; \ + b = ROTR32(b ^ c, 7); \ + } while(0) +#define ROUND(r) \ + do { \ + G(r, 0, v[ 0], v[ 4], v[ 8], v[12]); \ + G(r, 1, v[ 1], v[ 5], v[ 9], v[13]); \ + G(r, 2, v[ 2], v[ 6], v[10], v[14]); \ + G(r, 3, v[ 3], v[ 7], v[11], v[15]); \ + G(r, 4, v[ 0], v[ 5], v[10], v[15]); \ + G(r, 5, v[ 1], v[ 6], v[11], v[12]); \ + G(r, 6, v[ 2], v[ 7], v[ 8], v[13]); \ + G(r, 7, v[ 3], v[ 4], v[ 9], v[14]); \ + } while(0) + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + ROUND(4); + ROUND(5); + ROUND(6); + ROUND(7); + ROUND(8); + ROUND(9); + + for (i = 0; i < 8; i++) + S->h[i] = S->h[i] ^ v[i] ^ v[i + 
8]; + +#undef G +#undef ROUND +} + +static void blake2s_update(blake2s_state *S, const uchar *input, uint input_size) { + uint left, fill; + + while(input_size > 0) { + left = S->buflen; + fill = 2 * BLAKE2S_BLOCK_SIZE - left; + if(input_size > fill) { + /* Buffer fill */ + neoscrypt_copy(S->buf + left, input, fill); + S->buflen += fill; + /* Counter increment */ + S->t[0] += BLAKE2S_BLOCK_SIZE; + /* Compress */ + blake2s_compress(S, (uint *) S->buf); + /* Shift buffer left */ + neoscrypt_copy(S->buf, S->buf + BLAKE2S_BLOCK_SIZE, BLAKE2S_BLOCK_SIZE); + S->buflen -= BLAKE2S_BLOCK_SIZE; + input += fill; + input_size -= fill; + } else { + neoscrypt_copy(S->buf + left, input, input_size); + S->buflen += input_size; + /* Do not compress */ + input += input_size; + input_size = 0; + } + } +} + +static void neoscrypt_blake2s(const void *input, const uint input_size, const void *key, const uchar key_size, + void *output, const uchar output_size) { + uchar block[BLAKE2S_BLOCK_SIZE]; + blake2s_param P[1]; + blake2s_state S[1]; + + /* Initialise */ + neoscrypt_erase(P, 32); + P->digest_length = output_size; + P->key_length = key_size; + P->fanout = 1; + P->depth = 1; + + neoscrypt_erase(S, 180); + neoscrypt_copy(S, blake2s_IV, 32); + neoscrypt_xor(S, P, 32); + + neoscrypt_erase(block, BLAKE2S_BLOCK_SIZE); + neoscrypt_copy(block, key, key_size); + blake2s_update(S, (uchar *) block, BLAKE2S_BLOCK_SIZE); + + /* Update */ + blake2s_update(S, (uchar *) input, input_size); + + /* Finish */ + if(S->buflen > BLAKE2S_BLOCK_SIZE) { + S->t[0] += BLAKE2S_BLOCK_SIZE; + blake2s_compress(S, (uint *) S->buf); + S->buflen -= BLAKE2S_BLOCK_SIZE; + neoscrypt_copy(S->buf, S->buf + BLAKE2S_BLOCK_SIZE, S->buflen); + } + S->t[0] += S->buflen; + S->f[0] = ~0U; + neoscrypt_erase(S->buf + S->buflen, 2 * BLAKE2S_BLOCK_SIZE - S->buflen); + blake2s_compress(S, (uint *) S->buf); + + /* Write back */ + neoscrypt_copy(output, S, output_size); +// for (int k = 0; k<4; k++) { printf("cpu blake %d %08x 
%08x\n", k, ((unsigned int*)output)[2 * k], ((unsigned int*)output)[2 * k + 1]); } + +} + + +#define FASTKDF_BUFFER_SIZE 256U + +/* FastKDF, a fast buffered key derivation function: + * FASTKDF_BUFFER_SIZE must be a power of 2; + * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE; + * prf_output_size must be <= prf_key_size; */ +static void neoscrypt_fastkdf(const uchar *password, uint password_len, const uchar *salt, uint salt_len, + uint N, uchar *output, uint output_len) { +// for (int i = 0; i<10; i++) { printf("cpu password %d %08x %08x\n", i, ((unsigned int*)password)[2 * i], ((unsigned int*)password)[2 * i+1]); } + const uint stack_align = 0x40; + const uint kdf_buf_size = 256U; //FASTKDF_BUFFER_SIZE + const uint prf_input_size = 64U; //BLAKE2S_BLOCK_SIZE + const uint prf_key_size = 32U; //BLAKE2S_KEY_SIZE + const uint prf_output_size = 32U; //BLAKE2S_OUT_SIZE + uint bufptr, a, b, i, j; + uchar *A, *B, *prf_input, *prf_key, *prf_output; + uchar *stack; + stack = (uchar*)malloc(sizeof(uchar) * 2 * kdf_buf_size + prf_input_size + prf_key_size + prf_output_size + stack_align); + if(stack == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } + /* Align and set up the buffers in stack */ + //uchar stack[2 * kdf_buf_size + prf_input_size + prf_key_size + prf_output_size + stack_align]; + + A = &stack[stack_align & ~(stack_align - 1)]; + B = &A[kdf_buf_size + prf_input_size]; + prf_output = &A[2 * kdf_buf_size + prf_input_size + prf_key_size]; + + /* Initialise the password buffer */ + if(password_len > kdf_buf_size) + password_len = kdf_buf_size; + + a = kdf_buf_size / password_len; + for(i = 0; i < a; i++) + neoscrypt_copy(&A[i * password_len], &password[0], password_len); + b = kdf_buf_size - a * password_len; + if(b) + neoscrypt_copy(&A[a * password_len], &password[0], b); + neoscrypt_copy(&A[kdf_buf_size], &password[0], prf_input_size); + + /* Initialise the salt buffer */ + if(salt_len > kdf_buf_size) + salt_len = 
kdf_buf_size; + + a = kdf_buf_size / salt_len; + for(i = 0; i < a; i++) + neoscrypt_copy(&B[i * salt_len], &salt[0], salt_len); + b = kdf_buf_size - a * salt_len; + if(b) + neoscrypt_copy(&B[a * salt_len], &salt[0], b); + neoscrypt_copy(&B[kdf_buf_size], &salt[0], prf_key_size); + + /* The primary iteration */ + for(i = 0, bufptr = 0; i < N; i++) { + + /* Map the PRF input buffer */ + prf_input = &A[bufptr]; + + /* Map the PRF key buffer */ + prf_key = &B[bufptr]; + + /* PRF */ + +// for (int k = 0; k<(prf_input_size/4); k++) { printf("cpu bufptr %08x before blake %d %d %08x \n",bufptr, i, k, ((unsigned int*)prf_input)[k]); } + neoscrypt_blake2s(prf_input, prf_input_size, prf_key, prf_key_size, prf_output, prf_output_size); + //for (int k = 0; k<(prf_output_size/4); k++) { printf("cpu after blake %d %d %08x \n", i, k, ((unsigned int*)prf_output)[k]); } + + /* Calculate the next buffer pointer */ + for(j = 0, bufptr = 0; j < prf_output_size; j++) + bufptr += prf_output[j]; + bufptr &= (kdf_buf_size - 1); + + /* Modify the salt buffer */ + neoscrypt_xor(&B[bufptr], &prf_output[0], prf_output_size); + + /* Head modified, tail updated */ + if(bufptr < prf_key_size) + neoscrypt_copy(&B[kdf_buf_size + bufptr], &B[bufptr], MIN(prf_output_size, prf_key_size - bufptr)); + + /* Tail modified, head updated */ + if((kdf_buf_size - bufptr) < prf_output_size) + neoscrypt_copy(&B[0], &B[kdf_buf_size], prf_output_size - (kdf_buf_size - bufptr)); + + } + + /* Modify and copy into the output buffer */ + if(output_len > kdf_buf_size) + output_len = kdf_buf_size; + + a = kdf_buf_size - bufptr; + if(a >= output_len) { + neoscrypt_xor(&B[bufptr], &A[0], output_len); + neoscrypt_copy(&output[0], &B[bufptr], output_len); + } else { + neoscrypt_xor(&B[bufptr], &A[0], a); + neoscrypt_xor(&B[0], &A[a], output_len - a); + neoscrypt_copy(&output[0], &B[bufptr], a); + neoscrypt_copy(&output[a], &B[0], output_len - a); + } +// for (int i = 0; i<10; i++) { printf("cpu fastkdf %d %08x %08x\n", i, 
((unsigned int*)output)[2 * i], ((unsigned int*)output)[2 * i + 1]); } + +} + + +/* Configurable optimised block mixer */ +static void neoscrypt_blkmix(uint *X, uint *Y, uint r, uint mixmode) { + uint i, mixer, rounds; + + mixer = mixmode >> 8; + rounds = mixmode & 0xFF; + + /* NeoScrypt flow: Scrypt flow: + Xa ^= Xd; M(Xa'); Ya = Xa"; Xa ^= Xb; M(Xa'); Ya = Xa"; + Xb ^= Xa"; M(Xb'); Yb = Xb"; Xb ^= Xa"; M(Xb'); Yb = Xb"; + Xc ^= Xb"; M(Xc'); Yc = Xc"; Xa" = Ya; + Xd ^= Xc"; M(Xd'); Yd = Xd"; Xb" = Yb; + Xa" = Ya; Xb" = Yc; + Xc" = Yb; Xd" = Yd; */ + + if(r == 1) { + neoscrypt_blkxor(&X[0], &X[16], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[0], rounds); + else + neoscrypt_salsa(&X[0], rounds); + neoscrypt_blkxor(&X[16], &X[0], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[16], rounds); + else + neoscrypt_salsa(&X[16], rounds); + return; + } + + if(r == 2) { + neoscrypt_blkxor(&X[0], &X[48], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[0], rounds); + else + neoscrypt_salsa(&X[0], rounds); + neoscrypt_blkxor(&X[16], &X[0], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[16], rounds); + else + neoscrypt_salsa(&X[16], rounds); + neoscrypt_blkxor(&X[32], &X[16], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[32], rounds); + else + neoscrypt_salsa(&X[32], rounds); + neoscrypt_blkxor(&X[48], &X[32], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[48], rounds); + else + neoscrypt_salsa(&X[48], rounds); + neoscrypt_blkswp(&X[16], &X[32], SCRYPT_BLOCK_SIZE); + return; + } + + /* Reference code for any reasonable r */ + for(i = 0; i < 2 * r; i++) { + if(i) neoscrypt_blkxor(&X[16 * i], &X[16 * (i - 1)], SCRYPT_BLOCK_SIZE); + else neoscrypt_blkxor(&X[0], &X[16 * (2 * r - 1)], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[16 * i], rounds); + else + neoscrypt_salsa(&X[16 * i], rounds); + neoscrypt_blkcpy(&Y[16 * i], &X[16 * i], SCRYPT_BLOCK_SIZE); + } + for(i = 0; i < r; i++) + neoscrypt_blkcpy(&X[16 * i], &Y[16 * 2 * i], 
SCRYPT_BLOCK_SIZE); + for(i = 0; i < r; i++) + neoscrypt_blkcpy(&X[16 * (i + r)], &Y[16 * (2 * i + 1)], SCRYPT_BLOCK_SIZE); +} + +/* NeoScrypt core engine: + * p = 1, salt = password; + * Basic customisation (required): + * profile bit 0: + * 0 = NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20; + * 1 = Scrypt(1024, 1, 1) with Salsa20/8; + * profile bits 4 to 1: + * 0000 = FastKDF-BLAKE2s; + * 0001 = PBKDF2-HMAC-SHA256; + * Extended customisation (optional): + * profile bit 31: + * 0 = extended customisation absent; + * 1 = extended customisation present; + * profile bits 7 to 5 (rfactor): + * 000 = r of 1; + * 001 = r of 2; + * 010 = r of 4; + * ... + * 111 = r of 128; + * profile bits 12 to 8 (Nfactor): + * 00000 = N of 2; + * 00001 = N of 4; + * 00010 = N of 8; + * ..... + * 00110 = N of 128; + * ..... + * 01001 = N of 1024; + * ..... + * 11110 = N of 2147483648; + * profile bits 30 to 13 are reserved */ +void neoscrypt(const uchar *password, uchar *output, uint profile) { + uint N = 128, r = 2, dblmix = 1, mixmode = 0x14, stack_align = 0x40; + uint kdf, i, j; + uint *X, *Y, *Z, *V; + + if(profile & 0x1) { + N = 1024; /* N = (1 << (Nfactor + 1)); */ + r = 1; /* r = (1 << rfactor); */ + dblmix = 0; /* Salsa only */ + mixmode = 0x08; /* 8 rounds */ + } + + if(profile >> 31) { + N = (1 << (((profile >> 8) & 0x1F) + 1)); + r = (1 << ((profile >> 5) & 0x7)); + } + uchar *stack; + stack = (uchar*)malloc(((N + 3) * r * 2 * SCRYPT_BLOCK_SIZE + stack_align)*sizeof(uchar)); + if(stack == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } + /* X = r * 2 * SCRYPT_BLOCK_SIZE */ + X = (uint *) &stack[stack_align & ~(stack_align - 1)]; + /* Z is a copy of X for ChaCha */ + Z = &X[32 * r]; + /* Y is an X sized temporal space */ + Y = &X[64 * r]; + /* V = N * r * 2 * SCRYPT_BLOCK_SIZE */ + V = &X[96 * r]; + + /* X = KDF(password, salt) */ + kdf = (profile >> 1) & 0xF; + + switch(kdf) { + + default: + case(0x0): + neoscrypt_fastkdf(password, 80, password, 
80, 32, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE); + break; + + case(0x1): + neoscrypt_pbkdf2_sha256(password, 80, password, 80, 1, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE); + break; + + } + + /* Process ChaCha 1st, Salsa 2nd and XOR them into FastKDF; otherwise Salsa only */ + + if(dblmix) { + /* blkcpy(Z, X) */ + neoscrypt_blkcpy(&Z[0], &X[0], r * 2 * SCRYPT_BLOCK_SIZE); + + /* Z = SMix(Z) */ + for(i = 0; i < N; i++) { + /* blkcpy(V, Z) */ + neoscrypt_blkcpy(&V[i * (32 * r)], &Z[0], r * 2 * SCRYPT_BLOCK_SIZE); + /* blkmix(Z, Y) */ + neoscrypt_blkmix(&Z[0], &Y[0], r, (mixmode | 0x0100)); + } + for(i = 0; i < N; i++) { + /* integerify(Z) mod N */ + j = (32 * r) * (Z[16 * (2 * r - 1)] & (N - 1)); + /* blkxor(Z, V) */ + neoscrypt_blkxor(&Z[0], &V[j], r * 2 * SCRYPT_BLOCK_SIZE); + /* blkmix(Z, Y) */ + neoscrypt_blkmix(&Z[0], &Y[0], r, (mixmode | 0x0100)); + } + } + +#if (ASM) + /* Must be called before and after SSE2 Salsa */ + neoscrypt_salsa_tangle(&X[0], r * 2); +#endif + + /* X = SMix(X) */ + for(i = 0; i < N; i++) { + /* blkcpy(V, X) */ + neoscrypt_blkcpy(&V[i * (32 * r)], &X[0], r * 2 * SCRYPT_BLOCK_SIZE); + /* blkmix(X, Y) */ + neoscrypt_blkmix(&X[0], &Y[0], r, mixmode); + } + for(i = 0; i < N; i++) { + /* integerify(X) mod N */ + j = (32 * r) * (X[16 * (2 * r - 1)] & (N - 1)); + /* blkxor(X, V) */ + neoscrypt_blkxor(&X[0], &V[j], r * 2 * SCRYPT_BLOCK_SIZE); + /* blkmix(X, Y) */ + neoscrypt_blkmix(&X[0], &Y[0], r, mixmode); + } + +#if (ASM) + neoscrypt_salsa_tangle(&X[0], r * 2); +#endif + + if(dblmix) + /* blkxor(X, Z) */ + neoscrypt_blkxor(&X[0], &Z[0], r * 2 * SCRYPT_BLOCK_SIZE); + + /* output = KDF(password, X) */ + switch(kdf) { + + default: + case(0x0): + neoscrypt_fastkdf(password, 80, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE, 32, output, 32); + break; + + case(0x1): + neoscrypt_pbkdf2_sha256(password, 80, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE, 1, output, 32); + break; + + } + free(stack); +} + diff --git a/sph/neoscrypt.h b/sph/neoscrypt.h new file mode 
100644 index 0000000000..aec9d541df --- /dev/null +++ b/sph/neoscrypt.h @@ -0,0 +1,27 @@ +void neoscrypt(const unsigned char *input, unsigned char *output, unsigned int profile); + +#define SCRYPT_BLOCK_SIZE 64 +#define SCRYPT_HASH_BLOCK_SIZE 64 +#define SCRYPT_HASH_DIGEST_SIZE 32 + +typedef uint8_t hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + +#ifndef ROTL32 +#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b))) +#endif +#ifndef ROTR32 +#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b))) +#endif + +#define U8TO32_BE(p) \ + (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ + ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]))) + +#define U32TO8_BE(p, v) \ + (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \ + (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) ); + +#define U64TO8_BE(p, v) \ + U32TO8_BE((p), (uint32_t)((v) >> 32)); \ + U32TO8_BE((p) + 4, (uint32_t)((v) )); + diff --git a/sph/sha2.c b/sph/sha2.c index 46f0e5bee3..5780c76934 100644 --- a/sph/sha2.c +++ b/sph/sha2.c @@ -8,7 +8,6 @@ * any later version. See COPYING for more details. 
*/ -#include "cpuminer-config.h" #include "miner.h" #include @@ -462,169 +461,3 @@ static inline void sha256d_ms(uint32_t *hash, uint32_t *W, } #endif /* EXTERN_SHA256 */ - -#if HAVE_SHA256_4WAY - -void sha256d_ms_4way(uint32_t *hash, uint32_t *data, - const uint32_t *midstate, const uint32_t *prehash); - -static inline int scanhash_sha256d_4way(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) -{ - uint32_t data[4 * 64] __attribute__((aligned(128))); - uint32_t hash[4 * 8] __attribute__((aligned(32))); - uint32_t midstate[4 * 8] __attribute__((aligned(32))); - uint32_t prehash[4 * 8] __attribute__((aligned(32))); - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - int i, j; - - memcpy(data, pdata + 16, 64); - sha256d_preextend(data); - for (i = 31; i >= 0; i--) - for (j = 0; j < 4; j++) - data[i * 4 + j] = data[i]; - - sha256_init(midstate); - sha256_transform(midstate, pdata, 0); - memcpy(prehash, midstate, 32); - sha256d_prehash(prehash, pdata + 16); - for (i = 7; i >= 0; i--) { - for (j = 0; j < 4; j++) { - midstate[i * 4 + j] = midstate[i]; - prehash[i * 4 + j] = prehash[i]; - } - } - - do { - for (i = 0; i < 4; i++) - data[4 * 3 + i] = ++n; - - sha256d_ms_4way(hash, data, midstate, prehash); - - for (i = 0; i < 4; i++) { - if (swab32(hash[4 * 7 + i]) <= Htarg) { - pdata[19] = data[4 * 3 + i]; - sha256d_80_swap(hash, pdata); - if (fulltest(hash, ptarget)) { - *hashes_done = n - first_nonce + 1; - return 1; - } - } - } - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} - -#endif /* HAVE_SHA256_4WAY */ - -#if HAVE_SHA256_8WAY - -void sha256d_ms_8way(uint32_t *hash, uint32_t *data, - const uint32_t *midstate, const uint32_t *prehash); - -static inline int scanhash_sha256d_8way(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, unsigned long 
*hashes_done) -{ - uint32_t data[8 * 64] __attribute__((aligned(128))); - uint32_t hash[8 * 8] __attribute__((aligned(32))); - uint32_t midstate[8 * 8] __attribute__((aligned(32))); - uint32_t prehash[8 * 8] __attribute__((aligned(32))); - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - int i, j; - - memcpy(data, pdata + 16, 64); - sha256d_preextend(data); - for (i = 31; i >= 0; i--) - for (j = 0; j < 8; j++) - data[i * 8 + j] = data[i]; - - sha256_init(midstate); - sha256_transform(midstate, pdata, 0); - memcpy(prehash, midstate, 32); - sha256d_prehash(prehash, pdata + 16); - for (i = 7; i >= 0; i--) { - for (j = 0; j < 8; j++) { - midstate[i * 8 + j] = midstate[i]; - prehash[i * 8 + j] = prehash[i]; - } - } - - do { - for (i = 0; i < 8; i++) - data[8 * 3 + i] = ++n; - - sha256d_ms_8way(hash, data, midstate, prehash); - - for (i = 0; i < 8; i++) { - if (swab32(hash[8 * 7 + i]) <= Htarg) { - pdata[19] = data[8 * 3 + i]; - sha256d_80_swap(hash, pdata); - if (fulltest(hash, ptarget)) { - *hashes_done = n - first_nonce + 1; - return 1; - } - } - } - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} - -#endif /* HAVE_SHA256_8WAY */ - -int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) -{ - uint32_t data[64] /* __attribute__((aligned(128))) */; - uint32_t hash[8] /* __attribute__((aligned(32))) */; - uint32_t midstate[8] /* __attribute__((aligned(32))) */; - uint32_t prehash[8] /* __attribute__((aligned(32))) */; - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - -#if HAVE_SHA256_8WAY - if (sha256_use_8way()) - return scanhash_sha256d_8way(thr_id, pdata, ptarget, - max_nonce, hashes_done); -#endif -#if HAVE_SHA256_4WAY - if (sha256_use_4way()) - return scanhash_sha256d_4way(thr_id, pdata, ptarget, - 
max_nonce, hashes_done); -#endif - - memcpy(data, pdata + 16, 64); - sha256d_preextend(data); - - sha256_init(midstate); - sha256_transform(midstate, pdata, 0); - memcpy(prehash, midstate, 32); - sha256d_prehash(prehash, pdata + 16); - - do { - data[3] = ++n; - sha256d_ms(hash, data, midstate, prehash); - if (swab32(hash[7]) <= Htarg) { - pdata[19] = data[3]; - sha256d_80_swap(hash, pdata); - if (fulltest(hash, ptarget)) { - *hashes_done = n - first_nonce + 1; - return 1; - } - } - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} diff --git a/sph/sha256_Y.c b/sph/sha256_Y.c new file mode 100644 index 0000000000..d17cbe2c7a --- /dev/null +++ b/sph/sha256_Y.c @@ -0,0 +1,418 @@ +/*- + * Copyright 2005,2007,2009 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include + +#include "sph/sysendian.h" + +#include "sph/sha256_Y.h" + +/* + * Encode a length len/4 vector of (uint32_t) into a length len vector of + * (unsigned char) in big-endian form. Assumes len is a multiple of 4. + */ +static void +be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len) +{ + size_t i; + + for (i = 0; i < len / 4; i++) + be32enc(dst + i * 4, src[i]); +} + +/* + * Decode a big-endian length len vector of (unsigned char) into a length + * len/4 vector of (uint32_t). Assumes len is a multiple of 4. 
+ */ +static void +be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len) +{ + size_t i; + + for (i = 0; i < len / 4; i++) + dst[i] = be32dec(src + i * 4); +} + +/* Elementary functions used by SHA256 */ +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define SHR(x, n) (x >> n) +#define ROTR(x, n) ((x >> n) | (x << (32 - n))) +#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3)) +#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10)) + +/* SHA256 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + t0 = h + S1(e) + Ch(e, f, g) + k; \ + t1 = S0(a) + Maj(a, b, c); \ + d += t0; \ + h = t0 + t1; + +/* Adjusted round function for rotating state */ +#define RNDr(S, W, i, k) \ + RND(S[(64 - i) % 8], S[(65 - i) % 8], \ + S[(66 - i) % 8], S[(67 - i) % 8], \ + S[(68 - i) % 8], S[(69 - i) % 8], \ + S[(70 - i) % 8], S[(71 - i) % 8], \ + W[i] + k) + +/* + * SHA256 block compression function. The 256-bit state is transformed via + * the 512-bit input block to produce a new state. + */ +static void +SHA256_Transform(uint32_t * state, const unsigned char block[64]) +{ + uint32_t W[64]; + uint32_t S[8]; + uint32_t t0, t1; + int i; + /* 1. Prepare message schedule W. */ + be32dec_vect(W, block, 64); + + for (i = 16; i < 64; i++) + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + + /* 2. Initialize working variables. */ + memcpy(S, state, 32); + + /* 3. Mix. 
*/ + RNDr(S, W, 0, 0x428a2f98); + RNDr(S, W, 1, 0x71374491); + RNDr(S, W, 2, 0xb5c0fbcf); + RNDr(S, W, 3, 0xe9b5dba5); + RNDr(S, W, 4, 0x3956c25b); + RNDr(S, W, 5, 0x59f111f1); + RNDr(S, W, 6, 0x923f82a4); + RNDr(S, W, 7, 0xab1c5ed5); + RNDr(S, W, 8, 0xd807aa98); + RNDr(S, W, 9, 0x12835b01); + RNDr(S, W, 10, 0x243185be); + RNDr(S, W, 11, 0x550c7dc3); + RNDr(S, W, 12, 0x72be5d74); + RNDr(S, W, 13, 0x80deb1fe); + RNDr(S, W, 14, 0x9bdc06a7); + RNDr(S, W, 15, 0xc19bf174); + RNDr(S, W, 16, 0xe49b69c1); + RNDr(S, W, 17, 0xefbe4786); + RNDr(S, W, 18, 0x0fc19dc6); + RNDr(S, W, 19, 0x240ca1cc); + RNDr(S, W, 20, 0x2de92c6f); + RNDr(S, W, 21, 0x4a7484aa); + RNDr(S, W, 22, 0x5cb0a9dc); + RNDr(S, W, 23, 0x76f988da); + RNDr(S, W, 24, 0x983e5152); + RNDr(S, W, 25, 0xa831c66d); + RNDr(S, W, 26, 0xb00327c8); + RNDr(S, W, 27, 0xbf597fc7); + RNDr(S, W, 28, 0xc6e00bf3); + RNDr(S, W, 29, 0xd5a79147); + RNDr(S, W, 30, 0x06ca6351); + RNDr(S, W, 31, 0x14292967); + RNDr(S, W, 32, 0x27b70a85); + RNDr(S, W, 33, 0x2e1b2138); + RNDr(S, W, 34, 0x4d2c6dfc); + RNDr(S, W, 35, 0x53380d13); + RNDr(S, W, 36, 0x650a7354); + RNDr(S, W, 37, 0x766a0abb); + RNDr(S, W, 38, 0x81c2c92e); + RNDr(S, W, 39, 0x92722c85); + RNDr(S, W, 40, 0xa2bfe8a1); + RNDr(S, W, 41, 0xa81a664b); + RNDr(S, W, 42, 0xc24b8b70); + RNDr(S, W, 43, 0xc76c51a3); + RNDr(S, W, 44, 0xd192e819); + RNDr(S, W, 45, 0xd6990624); + RNDr(S, W, 46, 0xf40e3585); + RNDr(S, W, 47, 0x106aa070); + RNDr(S, W, 48, 0x19a4c116); + RNDr(S, W, 49, 0x1e376c08); + RNDr(S, W, 50, 0x2748774c); + RNDr(S, W, 51, 0x34b0bcb5); + RNDr(S, W, 52, 0x391c0cb3); + RNDr(S, W, 53, 0x4ed8aa4a); + RNDr(S, W, 54, 0x5b9cca4f); + RNDr(S, W, 55, 0x682e6ff3); + RNDr(S, W, 56, 0x748f82ee); + RNDr(S, W, 57, 0x78a5636f); + RNDr(S, W, 58, 0x84c87814); + RNDr(S, W, 59, 0x8cc70208); + RNDr(S, W, 60, 0x90befffa); + RNDr(S, W, 61, 0xa4506ceb); + RNDr(S, W, 62, 0xbef9a3f7); + RNDr(S, W, 63, 0xc67178f2); + + /* 4. 
Mix local working variables into global state */ + for (i = 0; i < 8; i++) { + state[i] += S[i]; + +} + + /* Clean the stack. */ + memset(W, 0, 256); + memset(S, 0, 32); + t0 = t1 = 0; +} + +static unsigned char PAD[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* Add padding and terminating bit-count. */ +static void +SHA256_Pad(SHA256_CTX_Y * ctx) +{ + unsigned char len[8]; + uint32_t r, plen; + + /* + * Convert length to a vector of bytes -- we do this now rather + * than later because the length will change after we pad. + */ + be32enc_vect(len, ctx->count, 8); + + /* Add 1--64 bytes so that the resulting length is 56 mod 64 */ + r = (ctx->count[1] >> 3) & 0x3f; + plen = (r < 56) ? (56 - r) : (120 - r); + SHA256_Update_Y(ctx, PAD, (size_t)plen); + + /* Add the terminating bit-count */ + SHA256_Update_Y(ctx, len, 8); +} + +/* SHA-256 initialization. Begins a SHA-256 operation. 
*/ +void +SHA256_Init_Y(SHA256_CTX_Y * ctx) +{ + + /* Zero bits processed so far */ + ctx->count[0] = ctx->count[1] = 0; + + /* Magic initialization constants */ + ctx->state[0] = 0x6A09E667; + ctx->state[1] = 0xBB67AE85; + ctx->state[2] = 0x3C6EF372; + ctx->state[3] = 0xA54FF53A; + ctx->state[4] = 0x510E527F; + ctx->state[5] = 0x9B05688C; + ctx->state[6] = 0x1F83D9AB; + ctx->state[7] = 0x5BE0CD19; +} + +/* Add bytes into the hash */ +void +SHA256_Update_Y(SHA256_CTX_Y * ctx, const void *in, size_t len) +{ + uint32_t bitlen[2]; + uint32_t r; + const unsigned char *src = in; + + /* Number of bytes left in the buffer from previous updates */ + r = (ctx->count[1] >> 3) & 0x3f; + + /* Convert the length into a number of bits */ + bitlen[1] = ((uint32_t)len) << 3; + bitlen[0] = (uint32_t)(len >> 29); + + /* Update number of bits */ + if ((ctx->count[1] += bitlen[1]) < bitlen[1]) + ctx->count[0]++; + ctx->count[0] += bitlen[0]; + + /* Handle the case where we don't need to perform any transforms */ + if (len < 64 - r) { + + memcpy(&ctx->buf[r], src, len); + return; + } + + /* Finish the current block */ + memcpy(&ctx->buf[r], src, 64 - r); + + SHA256_Transform(ctx->state, ctx->buf); + src += 64 - r; + len -= 64 - r; + + /* Perform complete blocks */ + + while (len >= 64) { + SHA256_Transform(ctx->state, src); + src += 64; + len -= 64; + } + + /* Copy left over data into buffer */ + memcpy(ctx->buf, src, len); +} + +/* + * SHA-256 finalization. Pads the input data, exports the hash value, + * and clears the context state. + */ +void +SHA256_Final_Y(unsigned char digest[32], SHA256_CTX_Y * ctx) +{ + /* Add padding */ + SHA256_Pad(ctx); + + /* Write the hash */ + be32enc_vect(digest, ctx->state, 32); + + /* Clear the context state */ + memset((void *)ctx, 0, sizeof(*ctx)); +} + +/* Initialize an HMAC-SHA256 operation with the given key. 
*/ +void +HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y * ctx, const void * _K, size_t Klen) +{ + unsigned char pad[64]; + unsigned char khash[32]; + const unsigned char * K = _K; + size_t i; + + /* If Klen > 64, the key is really SHA256(K). */ + if (Klen > 64) { + SHA256_Init_Y(&ctx->ictx); + SHA256_Update_Y(&ctx->ictx, K, Klen); + SHA256_Final_Y(khash, &ctx->ictx); + K = khash; + Klen = 32; + } + + /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */ + SHA256_Init_Y(&ctx->ictx); + memset(pad, 0x36, 64); + for (i = 0; i < Klen; i++) { + pad[i] ^= K[i]; + } + SHA256_Update_Y(&ctx->ictx, pad, 64); + + /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */ + SHA256_Init_Y(&ctx->octx); + memset(pad, 0x5c, 64); + for (i = 0; i < Klen; i++) + { + pad[i] ^= K[i]; + } + SHA256_Update_Y(&ctx->octx, pad, 64); + + /* Clean the stack. */ + memset(khash, 0, 32); +} + +/* Add bytes to the HMAC-SHA256 operation. */ +void +HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y * ctx, const void *in, size_t len) +{ + /* Feed data to the inner SHA256 operation. */ + SHA256_Update_Y(&ctx->ictx, in, len); +} + +/* Finish an HMAC-SHA256 operation. */ +void +HMAC_SHA256_Final_Y(unsigned char digest[32], HMAC_SHA256_CTX_Y * ctx) +{ + unsigned char ihash[32]; + + /* Finish the inner SHA256 operation. */ + SHA256_Final_Y(ihash, &ctx->ictx); + + /* Feed the inner hash to the outer SHA256 operation. */ + SHA256_Update_Y(&ctx->octx, ihash, 32); + + /* Finish the outer SHA256 operation. */ + SHA256_Final_Y(digest, &ctx->octx); + + /* Clean the stack. */ + memset(ihash, 0, 32); +} + +/** + * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): + * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and + * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). 
+ */ + +void +PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt, +size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen) +{ + HMAC_SHA256_CTX_Y PShctx, hctx; + size_t i; + uint8_t ivec[4]; + uint8_t U[32]; + uint8_t T[32]; + uint64_t j; + int k; + size_t clen; + + /* Compute HMAC state after processing P and S. */ + HMAC_SHA256_Init_Y(&PShctx, passwd, passwdlen); + HMAC_SHA256_Update_Y(&PShctx, salt, saltlen); + + /* Iterate through the blocks. */ + for (i = 0; i * 32 < dkLen; i++) { + /* Generate INT(i + 1). */ + be32enc(ivec, (uint32_t)(i + 1)); + + /* Compute U_1 = PRF(P, S || INT(i)). */ + memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX_Y)); + HMAC_SHA256_Update_Y(&hctx, ivec, 4); + HMAC_SHA256_Final_Y(U, &hctx); + + /* T_i = U_1 ... */ + memcpy(T, U, 32); + + for (j = 2; j <= c; j++) { + /* Compute U_j. */ + HMAC_SHA256_Init_Y(&hctx, passwd, passwdlen); + HMAC_SHA256_Update_Y(&hctx, U, 32); + HMAC_SHA256_Final_Y(U, &hctx); + + /* ... xor U_j ... */ + for (k = 0; k < 32; k++) + T[k] ^= U[k]; + } + + /* Copy as many bytes as necessary into buf. */ + clen = dkLen - i * 32; + if (clen > 32) + clen = 32; + memcpy(&buf[i * 32], T, clen); + } + + /* Clean PShctx, since we never called _Final on it. */ + memset(&PShctx, 0, sizeof(HMAC_SHA256_CTX_Y)); +} diff --git a/sph/sha256_Y.h b/sph/sha256_Y.h new file mode 100644 index 0000000000..e97b81ba21 --- /dev/null +++ b/sph/sha256_Y.h @@ -0,0 +1,63 @@ +/*- + * Copyright 2005,2007,2009 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/lib/libmd/sha256_Y.h,v 1.2 2006/01/17 15:35:56 phk Exp $ + */ + +#ifndef _SHA256_H_ +#define _SHA256_H_ + +#include + +#include + +typedef struct SHA256Context { + uint32_t state[8]; + uint32_t count[2]; + unsigned char buf[64]; +} SHA256_CTX_Y; + +typedef struct HMAC_SHA256Context { + SHA256_CTX_Y ictx; + SHA256_CTX_Y octx; +} HMAC_SHA256_CTX_Y; + +void SHA256_Init_Y(SHA256_CTX_Y *); +void SHA256_Update_Y(SHA256_CTX_Y *, const void *, size_t); +void SHA256_Final_Y(unsigned char [32], SHA256_CTX_Y *); +void HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y *, const void *, size_t); +void HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y *, const void *, size_t); +void HMAC_SHA256_Final_Y(unsigned char [32], HMAC_SHA256_CTX_Y *); + +/** + * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): + * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and + * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). 
+ */ +void PBKDF2_SHA256(const uint8_t *, size_t, const uint8_t *, size_t, + uint64_t, uint8_t *, size_t); + + +#endif /* !_SHA256_H_ */ diff --git a/sph/shabal.c b/sph/shabal.c index 06d368ce54..46fe962eaf 100644 --- a/sph/shabal.c +++ b/sph/shabal.c @@ -386,7 +386,7 @@ extern "C"{ if ((Wlow = T32(Wlow + 1)) == 0) \ Whigh = T32(Whigh + 1); \ } while (0) -#if 0 + static const sph_u32 A_init_192[] = { C32(0xFD749ED4), C32(0xB798E530), C32(0x33904B6F), C32(0x46BDA85E), C32(0x076934B4), C32(0x454B4058), C32(0x77F74527), C32(0xFB4CF465), @@ -466,7 +466,7 @@ static const sph_u32 C_init_384[] = { C32(0x6E6E36DC), C32(0x63130DAD), C32(0xA9C69BD6), C32(0x1E90EA0C), C32(0x7C35073B), C32(0x28D95E6D), C32(0xAA340E0D), C32(0xCB3DEE70) }; -#endif + static const sph_u32 A_init_512[] = { C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632), C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B), @@ -500,7 +500,6 @@ shabal_init(void *cc, unsigned size) sph_shabal_context *sc; switch (size) { -#if 0 case 192: A_init = A_init_192; B_init = B_init_192; @@ -521,7 +520,6 @@ shabal_init(void *cc, unsigned size) B_init = B_init_384; C_init = C_init_384; break; -#endif case 512: A_init = A_init_512; B_init = B_init_512; @@ -662,7 +660,6 @@ shabal_close(void *cc, unsigned ub, unsigned n, void *dst, unsigned size_words) memcpy(dst, u.tmp_out + (sizeof u.tmp_out) - out_len, out_len); shabal_init(sc, size_words << 5); } - #if 0 /* see sph_shabal.h */ void @@ -720,6 +717,7 @@ sph_shabal224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) shabal_close(cc, ub, n, dst, 7); } +#endif /* see sph_shabal.h */ void sph_shabal256_init(void *cc) @@ -748,6 +746,7 @@ sph_shabal256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) shabal_close(cc, ub, n, dst, 8); } +#if 0 /* see sph_shabal.h */ void sph_shabal384_init(void *cc) @@ -775,7 +774,6 @@ sph_shabal384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) { shabal_close(cc, ub, n, 
dst, 12); } - #endif /* see sph_shabal.h */ diff --git a/sph/sph_sha2.c b/sph/sph_sha2.c new file mode 100644 index 0000000000..0a7e0c3275 --- /dev/null +++ b/sph/sph_sha2.c @@ -0,0 +1,693 @@ +/* $Id: sha2.c 227 2010-06-16 17:28:38Z tp $ */ +/* + * SHA-224 / SHA-256 implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include + +#include "sph_sha2.h" + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SHA2 +#define SPH_SMALL_FOOTPRINT_SHA2 1 +#endif + +#define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z)) +#define MAJ(X, Y, Z) (((Y) & (Z)) | (((Y) | (Z)) & (X))) + +#define ROTR SPH_ROTR32 + +#define BSG2_0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define BSG2_1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define SSG2_0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SPH_T32((x) >> 3)) +#define SSG2_1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SPH_T32((x) >> 10)) + +static const sph_u32 H224[8] = { + SPH_C32(0xC1059ED8), SPH_C32(0x367CD507), SPH_C32(0x3070DD17), + SPH_C32(0xF70E5939), SPH_C32(0xFFC00B31), SPH_C32(0x68581511), + SPH_C32(0x64F98FA7), SPH_C32(0xBEFA4FA4) +}; + +static const sph_u32 H256[8] = { + SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85), SPH_C32(0x3C6EF372), + SPH_C32(0xA54FF53A), SPH_C32(0x510E527F), SPH_C32(0x9B05688C), + SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19) +}; + +/* + * The SHA2_ROUND_BODY defines the body for a SHA-224 / SHA-256 + * compression function implementation. The "in" parameter should + * evaluate, when applied to a numerical input parameter from 0 to 15, + * to an expression which yields the corresponding input block. The "r" + * parameter should evaluate to an array or pointer expression + * designating the array of 8 words which contains the input and output + * of the compression function. 
+ */ + +#if SPH_SMALL_FOOTPRINT_SHA2 + +static const sph_u32 K[64] = { + SPH_C32(0x428A2F98), SPH_C32(0x71374491), + SPH_C32(0xB5C0FBCF), SPH_C32(0xE9B5DBA5), + SPH_C32(0x3956C25B), SPH_C32(0x59F111F1), + SPH_C32(0x923F82A4), SPH_C32(0xAB1C5ED5), + SPH_C32(0xD807AA98), SPH_C32(0x12835B01), + SPH_C32(0x243185BE), SPH_C32(0x550C7DC3), + SPH_C32(0x72BE5D74), SPH_C32(0x80DEB1FE), + SPH_C32(0x9BDC06A7), SPH_C32(0xC19BF174), + SPH_C32(0xE49B69C1), SPH_C32(0xEFBE4786), + SPH_C32(0x0FC19DC6), SPH_C32(0x240CA1CC), + SPH_C32(0x2DE92C6F), SPH_C32(0x4A7484AA), + SPH_C32(0x5CB0A9DC), SPH_C32(0x76F988DA), + SPH_C32(0x983E5152), SPH_C32(0xA831C66D), + SPH_C32(0xB00327C8), SPH_C32(0xBF597FC7), + SPH_C32(0xC6E00BF3), SPH_C32(0xD5A79147), + SPH_C32(0x06CA6351), SPH_C32(0x14292967), + SPH_C32(0x27B70A85), SPH_C32(0x2E1B2138), + SPH_C32(0x4D2C6DFC), SPH_C32(0x53380D13), + SPH_C32(0x650A7354), SPH_C32(0x766A0ABB), + SPH_C32(0x81C2C92E), SPH_C32(0x92722C85), + SPH_C32(0xA2BFE8A1), SPH_C32(0xA81A664B), + SPH_C32(0xC24B8B70), SPH_C32(0xC76C51A3), + SPH_C32(0xD192E819), SPH_C32(0xD6990624), + SPH_C32(0xF40E3585), SPH_C32(0x106AA070), + SPH_C32(0x19A4C116), SPH_C32(0x1E376C08), + SPH_C32(0x2748774C), SPH_C32(0x34B0BCB5), + SPH_C32(0x391C0CB3), SPH_C32(0x4ED8AA4A), + SPH_C32(0x5B9CCA4F), SPH_C32(0x682E6FF3), + SPH_C32(0x748F82EE), SPH_C32(0x78A5636F), + SPH_C32(0x84C87814), SPH_C32(0x8CC70208), + SPH_C32(0x90BEFFFA), SPH_C32(0xA4506CEB), + SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2) +}; + +#define SHA2_MEXP1(in, pc) do { \ + W[pc] = in(pc); \ + } while (0) + +#define SHA2_MEXP2(in, pc) do { \ + W[(pc) & 0x0F] = SPH_T32(SSG2_1(W[((pc) - 2) & 0x0F]) \ + + W[((pc) - 7) & 0x0F] \ + + SSG2_0(W[((pc) - 15) & 0x0F]) + W[(pc) & 0x0F]); \ + } while (0) + +#define SHA2_STEPn(n, a, b, c, d, e, f, g, h, in, pc) do { \ + sph_u32 t1, t2; \ + SHA2_MEXP ## n(in, pc); \ + t1 = SPH_T32(h + BSG2_1(e) + CH(e, f, g) \ + + K[pcount + (pc)] + W[(pc) & 0x0F]); \ + t2 = SPH_T32(BSG2_0(a) + MAJ(a, b, c)); \ + d = 
SPH_T32(d + t1); \ + h = SPH_T32(t1 + t2); \ + } while (0) + +#define SHA2_STEP1(a, b, c, d, e, f, g, h, in, pc) \ + SHA2_STEPn(1, a, b, c, d, e, f, g, h, in, pc) +#define SHA2_STEP2(a, b, c, d, e, f, g, h, in, pc) \ + SHA2_STEPn(2, a, b, c, d, e, f, g, h, in, pc) + +#define SHA2_ROUND_BODY(in, r) do { \ + sph_u32 A, B, C, D, E, F, G, H; \ + sph_u32 W[16]; \ + unsigned pcount; \ + \ + A = (r)[0]; \ + B = (r)[1]; \ + C = (r)[2]; \ + D = (r)[3]; \ + E = (r)[4]; \ + F = (r)[5]; \ + G = (r)[6]; \ + H = (r)[7]; \ + pcount = 0; \ + SHA2_STEP1(A, B, C, D, E, F, G, H, in, 0); \ + SHA2_STEP1(H, A, B, C, D, E, F, G, in, 1); \ + SHA2_STEP1(G, H, A, B, C, D, E, F, in, 2); \ + SHA2_STEP1(F, G, H, A, B, C, D, E, in, 3); \ + SHA2_STEP1(E, F, G, H, A, B, C, D, in, 4); \ + SHA2_STEP1(D, E, F, G, H, A, B, C, in, 5); \ + SHA2_STEP1(C, D, E, F, G, H, A, B, in, 6); \ + SHA2_STEP1(B, C, D, E, F, G, H, A, in, 7); \ + SHA2_STEP1(A, B, C, D, E, F, G, H, in, 8); \ + SHA2_STEP1(H, A, B, C, D, E, F, G, in, 9); \ + SHA2_STEP1(G, H, A, B, C, D, E, F, in, 10); \ + SHA2_STEP1(F, G, H, A, B, C, D, E, in, 11); \ + SHA2_STEP1(E, F, G, H, A, B, C, D, in, 12); \ + SHA2_STEP1(D, E, F, G, H, A, B, C, in, 13); \ + SHA2_STEP1(C, D, E, F, G, H, A, B, in, 14); \ + SHA2_STEP1(B, C, D, E, F, G, H, A, in, 15); \ + for (pcount = 16; pcount < 64; pcount += 16) { \ + SHA2_STEP2(A, B, C, D, E, F, G, H, in, 0); \ + SHA2_STEP2(H, A, B, C, D, E, F, G, in, 1); \ + SHA2_STEP2(G, H, A, B, C, D, E, F, in, 2); \ + SHA2_STEP2(F, G, H, A, B, C, D, E, in, 3); \ + SHA2_STEP2(E, F, G, H, A, B, C, D, in, 4); \ + SHA2_STEP2(D, E, F, G, H, A, B, C, in, 5); \ + SHA2_STEP2(C, D, E, F, G, H, A, B, in, 6); \ + SHA2_STEP2(B, C, D, E, F, G, H, A, in, 7); \ + SHA2_STEP2(A, B, C, D, E, F, G, H, in, 8); \ + SHA2_STEP2(H, A, B, C, D, E, F, G, in, 9); \ + SHA2_STEP2(G, H, A, B, C, D, E, F, in, 10); \ + SHA2_STEP2(F, G, H, A, B, C, D, E, in, 11); \ + SHA2_STEP2(E, F, G, H, A, B, C, D, in, 12); \ + SHA2_STEP2(D, E, F, G, H, A, B, C, in, 13); 
\ + SHA2_STEP2(C, D, E, F, G, H, A, B, in, 14); \ + SHA2_STEP2(B, C, D, E, F, G, H, A, in, 15); \ + } \ + (r)[0] = SPH_T32((r)[0] + A); \ + (r)[1] = SPH_T32((r)[1] + B); \ + (r)[2] = SPH_T32((r)[2] + C); \ + (r)[3] = SPH_T32((r)[3] + D); \ + (r)[4] = SPH_T32((r)[4] + E); \ + (r)[5] = SPH_T32((r)[5] + F); \ + (r)[6] = SPH_T32((r)[6] + G); \ + (r)[7] = SPH_T32((r)[7] + H); \ + } while (0) + +#else + +#define SHA2_ROUND_BODY(in, r) do { \ + sph_u32 A, B, C, D, E, F, G, H, T1, T2; \ + sph_u32 W00, W01, W02, W03, W04, W05, W06, W07; \ + sph_u32 W08, W09, W10, W11, W12, W13, W14, W15; \ + int i; \ + \ +/* for (i=0;i<8;i++) {printf("in[%d]=%08x in[%d]=%08x \n",2*i,in(2*i),2*i+1,in(2*i+1));} */ \ + A = (r)[0]; \ + B = (r)[1]; \ + C = (r)[2]; \ + D = (r)[3]; \ + E = (r)[4]; \ + F = (r)[5]; \ + G = (r)[6]; \ + H = (r)[7]; \ + W00 = in(0); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0x428A2F98) + W00); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W01 = in(1); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0x71374491) + W01); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W02 = in(2); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0xB5C0FBCF) + W02); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W03 = in(3); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0xE9B5DBA5) + W03); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W04 = in(4); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0x3956C25B) + W04); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W05 = in(5); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0x59F111F1) + W05); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W06 = in(6); \ + T1 = 
SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0x923F82A4) + W06); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W07 = in(7); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0xAB1C5ED5) + W07); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W08 = in(8); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0xD807AA98) + W08); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W09 = in(9); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0x12835B01) + W09); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W10 = in(10); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0x243185BE) + W10); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W11 = in(11); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0x550C7DC3) + W11); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W12 = in(12); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0x72BE5D74) + W12); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W13 = in(13); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0x80DEB1FE) + W13); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W14 = in(14); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0x9BDC06A7) + W14); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W15 = in(15); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0xC19BF174) + W15); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + 
SPH_C32(0xE49B69C1) + W00); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0xEFBE4786) + W01); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0x0FC19DC6) + W02); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0x240CA1CC) + W03); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0x2DE92C6F) + W04); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0x4A7484AA) + W05); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0x5CB0A9DC) + W06); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0x76F988DA) + W07); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0x983E5152) + W08); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W09 = 
SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0xA831C66D) + W09); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0xB00327C8) + W10); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0xBF597FC7) + W11); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0xC6E00BF3) + W12); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0xD5A79147) + W13); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0x06CA6351) + W14); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0x14292967) + W15); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0x27B70A85) + W00); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0x2E1B2138) + W01); \ + T2 = 
SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0x4D2C6DFC) + W02); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0x53380D13) + W03); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0x650A7354) + W04); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0x766A0ABB) + W05); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0x81C2C92E) + W06); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0x92722C85) + W07); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0xA2BFE8A1) + W08); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0xA81A664B) + W09); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); 
\ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0xC24B8B70) + W10); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0xC76C51A3) + W11); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0xD192E819) + W12); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0xD6990624) + W13); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0xF40E3585) + W14); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0x106AA070) + W15); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0x19A4C116) + W00); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0x1E376C08) + W01); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0x2748774C) + W02); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + 
T1); \ + F = SPH_T32(T1 + T2); \ + W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0x34B0BCB5) + W03); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0x391C0CB3) + W04); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0x4ED8AA4A) + W05); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0x5B9CCA4F) + W06); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0x682E6FF3) + W07); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0x748F82EE) + W08); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0x78A5636F) + W09); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0x84C87814) + W10); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + 
SPH_C32(0x8CC70208) + W11); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0x90BEFFFA) + W12); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0xA4506CEB) + W13); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0xBEF9A3F7) + W14); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0xC67178F2) + W15); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + (r)[0] = SPH_T32((r)[0] + A); \ + (r)[1] = SPH_T32((r)[1] + B); \ + (r)[2] = SPH_T32((r)[2] + C); \ + (r)[3] = SPH_T32((r)[3] + D); \ + (r)[4] = SPH_T32((r)[4] + E); \ + (r)[5] = SPH_T32((r)[5] + F); \ + (r)[6] = SPH_T32((r)[6] + G); \ + (r)[7] = SPH_T32((r)[7] + H); \ +/* for (i=0;i<4;i++) {printf("r[%d]=%08x r[%d]=%08x\n",2*i,(r)[2*i],2*i+1,(r)[2*i+1]);} */ \ + } while (0) + +#endif + +/* + * One round of SHA-224 / SHA-256. The data must be aligned for 32-bit access. 
+ */ +static void +sha2_round(const unsigned char *data, sph_u32 r[8]) +{ +#define SHA2_IN(x) sph_dec32be_aligned(data + (4 * (x))) + SHA2_ROUND_BODY(SHA2_IN, r); +#undef SHA2_IN +} + +/* see sph_sha2.h */ +void +sph_sha224_init(void *cc) +{ + sph_sha224_context *sc; + + sc = cc; + memcpy(sc->val, H224, sizeof H224); +#if SPH_64 + sc->count = 0; +#else + sc->count_high = sc->count_low = 0; +#endif +} + +/* see sph_sha2.h */ +void +sph_sha256_init(void *cc) +{ + sph_sha256_context *sc; + + sc = cc; + memcpy(sc->val, H256, sizeof H256); +#if SPH_64 + sc->count = 0; +#else + sc->count_high = sc->count_low = 0; +#endif +} + +#define RFUN sha2_round +#define HASH sha224 +#define BE32 1 +#include "md_helper.c" + +/* see sph_sha2.h */ +void +sph_sha224_close(void *cc, void *dst) +{ + sha224_close(cc, dst, 7); + sph_sha224_init(cc); +} + +/* see sph_sha2.h */ +void +sph_sha224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + sha224_addbits_and_close(cc, ub, n, dst, 7); + sph_sha224_init(cc); +} + +/* see sph_sha2.h */ +void +sph_sha256_close(void *cc, void *dst) +{ + sha224_close(cc, dst, 8); + sph_sha256_init(cc); +} + +/* see sph_sha2.h */ +void +sph_sha256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + sha224_addbits_and_close(cc, ub, n, dst, 8); + sph_sha256_init(cc); +} + +/* see sph_sha2.h */ +void +sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8]) +{ +#define SHA2_IN(x) msg[x] + SHA2_ROUND_BODY(SHA2_IN, val); +#undef SHA2_IN +} diff --git a/sph/sph_types.h b/sph/sph_types.h index 5ec7bbf31d..85578a4292 100644 --- a/sph/sph_types.h +++ b/sph/sph_types.h @@ -816,7 +816,7 @@ static inline void sph_enc64be_aligned(void *dst, sph_u64 val); #undef SPH_64 #undef SPH_64_TRUE -#if defined __STDC__ && __STDC_VERSION__ >= 199901L +#if 1 // defined __STDC__ && __STDC_VERSION__ >= 199901L /* * On C99 implementations, we can use to get an exact 64-bit @@ -824,7 +824,11 @@ static inline void sph_enc64be_aligned(void *dst, sph_u64 
val); * C99 conformance). */ +#ifdef __cplusplus +#include +#else #include +#endif #ifdef UINT32_MAX typedef uint32_t sph_u32; @@ -930,14 +934,25 @@ typedef long long sph_s64; */ #define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) -#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) -#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) +#if defined _MSC_VER +#define SPH_ROTL32(x, n) _rotl(x, n) +#define SPH_ROTR32(x, n) _rotr(x, n) +#else +#define SPH_ROTL32(x, n) ((x) << (n)) | ((x) >> (32 - (n))) +#define SPH_ROTR32(x, n) ((x) >> (n)) | ((x) << (32 - (n))) +#endif #if SPH_64 +#if defined _MSC_VER +#define SPH_ROTR64(x, n) _rotr64(x, n) +#define SPH_ROTL64(x, n) _rotl64(x, n) +#else +#define SPH_ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) +#define SPH_ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) +#endif + #define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) -#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n)))) -#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) #endif @@ -1001,7 +1016,7 @@ typedef long long sph_s64; #define SPH_DETECT_UNALIGNED 1 #define SPH_DETECT_LITTLE_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u32 +#define SPH_DETECT_UPTR uintptr_t #ifdef __GNUC__ #define SPH_DETECT_I386_GCC 1 #endif @@ -1016,7 +1031,7 @@ typedef long long sph_s64; #define SPH_DETECT_UNALIGNED 1 #define SPH_DETECT_LITTLE_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u64 +#define SPH_DETECT_UPTR uintptr_t #ifdef __GNUC__ #define SPH_DETECT_AMD64_GCC 1 #endif diff --git a/sph/sysendian.h b/sph/sysendian.h new file mode 100644 index 0000000000..31ac985fb9 --- /dev/null +++ b/sph/sysendian.h @@ -0,0 +1,140 @@ +/*- + * Copyright 2007-2009 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ +#ifndef _SYSENDIAN_H_ +#define _SYSENDIAN_H_ + +/* If we don't have be64enc, the we have isn't usable. 
*/ +#if !HAVE_DECL_BE64ENC +#undef HAVE_SYS_ENDIAN_H +#endif + +#ifdef HAVE_SYS_ENDIAN_H + +#include + +#else + +#include + +#if !HAVE_DECL_LE32DEC +static uint32_t le32dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) + + ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24)); +} +#endif + +#if !HAVE_DECL_BE32ENC +static void be32enc(void *pp, uint32_t x) +{ + uint8_t *p = (uint8_t *)pp; + p[3] = x & 0xff; + p[2] = (x >> 8) & 0xff; + p[1] = (x >> 16) & 0xff; + p[0] = (x >> 24) & 0xff; +} +#endif + +#if !HAVE_DECL_BE32DEC +static uint32_t be32dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) + + ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); +} +#endif + +#if !HAVE_DECL_LE32ENC +static void le32enc(void *pp, uint32_t x) +{ + uint8_t *p = (uint8_t *)pp; + p[0] = x & 0xff; + p[1] = (x >> 8) & 0xff; + p[2] = (x >> 16) & 0xff; + p[3] = (x >> 24) & 0xff; +} +#endif + +static uint64_t +be64dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + + return ((uint64_t)(p[7]) + ((uint64_t)(p[6]) << 8) + + ((uint64_t)(p[5]) << 16) + ((uint64_t)(p[4]) << 24) + + ((uint64_t)(p[3]) << 32) + ((uint64_t)(p[2]) << 40) + + ((uint64_t)(p[1]) << 48) + ((uint64_t)(p[0]) << 56)); +} + +static void +be64enc(void *pp, uint64_t x) +{ + uint8_t * p = (uint8_t *)pp; + + p[7] = x & 0xff; + p[6] = (x >> 8) & 0xff; + p[5] = (x >> 16) & 0xff; + p[4] = (x >> 24) & 0xff; + p[3] = (x >> 32) & 0xff; + p[2] = (x >> 40) & 0xff; + p[1] = (x >> 48) & 0xff; + p[0] = (x >> 56) & 0xff; +} + + + +static uint64_t +le64dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + + return ((uint64_t)(p[0]) + ((uint64_t)(p[1]) << 8) + + ((uint64_t)(p[2]) << 16) + ((uint64_t)(p[3]) << 24) + + ((uint64_t)(p[4]) << 32) + ((uint64_t)(p[5]) << 40) + + ((uint64_t)(p[6]) << 48) + ((uint64_t)(p[7]) << 56)); +} + +static void +le64enc(void *pp, uint64_t x) +{ + 
uint8_t * p = (uint8_t *)pp; + + p[0] = x & 0xff; + p[1] = (x >> 8) & 0xff; + p[2] = (x >> 16) & 0xff; + p[3] = (x >> 24) & 0xff; + p[4] = (x >> 32) & 0xff; + p[5] = (x >> 40) & 0xff; + p[6] = (x >> 48) & 0xff; + p[7] = (x >> 56) & 0xff; +} +#endif /* !HAVE_SYS_ENDIAN_H */ + +#endif /* !_SYSENDIAN_H_ */ diff --git a/sph/yescrypt-opt.c b/sph/yescrypt-opt.c new file mode 100644 index 0000000000..3ec0eb8726 --- /dev/null +++ b/sph/yescrypt-opt.c @@ -0,0 +1,1392 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2013,2014 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. 
+ */ + +#ifdef __i386__ +#warning "This implementation does not use SIMD, and thus it runs a lot slower than the SIMD-enabled implementation. Enable at least SSE2 in the C compiler and use yescrypt-best.c instead unless you're building this SIMD-less implementation on purpose (portability to older CPUs or testing)." +#elif defined(__x86_64__) +#warning "This implementation does not use SIMD, and thus it runs a lot slower than the SIMD-enabled implementation. Use yescrypt-best.c instead unless you're building this SIMD-less implementation on purpose (for testing only)." +#endif + +#include +#include +#include +#include +#include "yescrypt.h" +#include "sha256_Y.h" +#include "sysendian.h" + +// #include "sph/yescrypt-platform.c" +#define HUGEPAGE_THRESHOLD (12 * 1024 * 1024) + +#ifdef __x86_64__ +#define HUGEPAGE_SIZE (2 * 1024 * 1024) +#else +#undef HUGEPAGE_SIZE +#endif + + +static void * +alloc_region(yescrypt_region_t * region, size_t size) +{ + size_t base_size = size; + uint8_t * base, *aligned; +#ifdef MAP_ANON + int flags = +#ifdef MAP_NOCORE + MAP_NOCORE | +#endif + MAP_ANON | MAP_PRIVATE; +#if defined(MAP_HUGETLB) && defined(HUGEPAGE_SIZE) + size_t new_size = size; + const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE - 1; + if (size >= HUGEPAGE_THRESHOLD && size + hugepage_mask >= size) { + flags |= MAP_HUGETLB; + /* + * Linux's munmap() fails on MAP_HUGETLB mappings if size is not a multiple of + * huge page size, so let's round up to huge page size here. 
+ */ + new_size = size + hugepage_mask; + new_size &= ~hugepage_mask; + } + base = mmap(NULL, new_size, PROT_READ | PROT_WRITE, flags, -1, 0); + if (base != MAP_FAILED) { + base_size = new_size; + } + else + if (flags & MAP_HUGETLB) { + flags &= ~MAP_HUGETLB; + base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); + } + +#else + base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); +#endif + if (base == MAP_FAILED) + base = NULL; + aligned = base; +#elif defined(HAVE_POSIX_MEMALIGN) + if ((errno = posix_memalign((void **)&base, 64, size)) != 0) + base = NULL; + aligned = base; +#else + base = aligned = NULL; + if (size + 63 < size) { + errno = ENOMEM; + } + else if ((base = malloc(size + 63)) != NULL) { + aligned = base + 63; + aligned -= (uintptr_t)aligned & 63; + } +#endif + region->base = base; + region->aligned = aligned; + region->base_size = base ? base_size : 0; + region->aligned_size = base ? size : 0; + return aligned; +} + +static void init_region(yescrypt_region_t * region) +{ + region->base = region->aligned = NULL; + region->base_size = region->aligned_size = 0; +} + +static int +free_region(yescrypt_region_t * region) +{ + if (region->base) { +#ifdef MAP_ANON + if (munmap(region->base, region->base_size)) + return -1; +#else + free(region->base); +#endif + } + init_region(region); + return 0; +} + +int +yescrypt_init_shared(yescrypt_shared_t * shared, +const uint8_t * param, size_t paramlen, +uint64_t N, uint32_t r, uint32_t p, +yescrypt_init_shared_flags_t flags, uint32_t mask, +uint8_t * buf, size_t buflen) +{ + yescrypt_shared1_t * shared1 = &shared->shared1; + yescrypt_shared_t dummy, half1, half2; + // yescrypt_shared_t * half2; + uint8_t salt[32]; + + if (flags & YESCRYPT_SHARED_PREALLOCATED) { + if (!shared1->aligned || !shared1->aligned_size) + return -1; + } + else { + init_region(shared1); + } + shared->mask1 = 1; + if (!param && !paramlen && !N && !r && !p && !buf && !buflen) + return 0; + + init_region(&dummy.shared1); + 
dummy.mask1 = 1; + if (yescrypt_kdf(&dummy, shared1, + param, paramlen, NULL, 0, N, r, p, 0, + YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1, + salt, sizeof(salt))) + goto out; + + half1 = half2 = *shared; + half1.shared1.aligned_size /= 2; + half2.shared1.aligned_size = half1.shared1.aligned_size; + half2.shared1.aligned = (char*)half2.shared1.aligned + half1.shared1.aligned_size; + + N /= 2; + + if (p > 1 && yescrypt_kdf(&half1, &half2.shared1, + param, paramlen, salt, sizeof(salt), N, r, p, 0, + YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_2, + salt, sizeof(salt))) + goto out; + + if (yescrypt_kdf(&half2, &half1.shared1, + param, paramlen, salt, sizeof(salt), N, r, p, 0, + YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1, + salt, sizeof(salt))) + goto out; + + if (yescrypt_kdf(&half1, &half2.shared1, + param, paramlen, salt, sizeof(salt), N, r, p, 0, + YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1, + buf, buflen)) + goto out; + + shared->mask1 = mask; + + return 0; + +out: + if (!(flags & YESCRYPT_SHARED_PREALLOCATED)) + free_region(shared1); + return -1; +} + +int +yescrypt_free_shared(yescrypt_shared_t * shared) +{ + return free_region(&shared->shared1); +} + +int +yescrypt_init_local(yescrypt_local_t * local) +{ + init_region(local); + return 0; +} + +int +yescrypt_free_local(yescrypt_local_t * local) +{ + return free_region(local); +} + + +static void +blkcpy(uint64_t * dest, const uint64_t * src, size_t count) +{ + do { + *dest++ = *src++; *dest++ = *src++; + *dest++ = *src++; *dest++ = *src++; + } while (count -= 4); +}; + +static void +blkxor(uint64_t * dest, const uint64_t * src, size_t count) +{ + do { + *dest++ ^= *src++; *dest++ ^= *src++; + *dest++ ^= *src++; *dest++ ^= *src++; + } while (count -= 4); +}; + +typedef union { + uint32_t w[16]; + uint64_t d[8]; +} salsa20_blk_t; + +static void +salsa20_simd_shuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout) +{ +#define 
COMBINE(out, in1, in2) \ + Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32); + COMBINE(0, 0, 2) + COMBINE(1, 5, 7) + COMBINE(2, 2, 4) + COMBINE(3, 7, 1) + COMBINE(4, 4, 6) + COMBINE(5, 1, 3) + COMBINE(6, 6, 0) + COMBINE(7, 3, 5) +#undef COMBINE +} + +static void +salsa20_simd_unshuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout) +{ +#define COMBINE(out, in1, in2) \ + Bout->w[out * 2] = Bin->d[in1]; \ + Bout->w[out * 2 + 1] = Bin->d[in2] >> 32; + COMBINE(0, 0, 6) + COMBINE(1, 5, 3) + COMBINE(2, 2, 0) + COMBINE(3, 7, 5) + COMBINE(4, 4, 2) + COMBINE(5, 1, 7) + COMBINE(6, 6, 4) + COMBINE(7, 3, 1) +#undef COMBINE +} + +/** + * salsa20_8(B): + * Apply the salsa20/8 core to the provided block. + */ + +static void +salsa20_8(uint64_t B[8]) +{ + size_t i; + salsa20_blk_t X; + +#define x X.w + + salsa20_simd_unshuffle((const salsa20_blk_t *)B, &X); + + for (i = 0; i < 8; i += 2) { +#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) + /* Operate on columns */ + x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9); + x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18); + + x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9); + x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18); + + x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9); + x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18); + + x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9); + x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18); + + /* Operate on rows */ + x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9); + x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18); + + x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9); + x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18); + + x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9); + x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18); + + x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9); + x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18); +#undef R + } +#undef x + + { + 
salsa20_blk_t Y; + salsa20_simd_shuffle(&X, &Y); + for (i = 0; i < 16; i += 4) { + ((salsa20_blk_t *)B)->w[i] += Y.w[i]; + ((salsa20_blk_t *)B)->w[i + 1] += Y.w[i + 1]; + ((salsa20_blk_t *)B)->w[i + 2] += Y.w[i + 2]; + ((salsa20_blk_t *)B)->w[i + 3] += Y.w[i + 3]; + } + } +} + +/** + * blockmix_salsa8(Bin, Bout, X, r): + * Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r + * bytes in length; the output Bout must also be the same size. The + * temporary space X must be 64 bytes. + */ +static void +blockmix_salsa8(const uint64_t * Bin, uint64_t * Bout, uint64_t * X, size_t r) +{ + size_t i; + + /* 1: X <-- B_{2r - 1} */ + blkcpy(X, &Bin[(2 * r - 1) * 8], 8); + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < 2 * r; i += 2) { + /* 3: X <-- H(X \xor B_i) */ + blkxor(X, &Bin[i * 8], 8); + salsa20_8(X); + + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + blkcpy(&Bout[i * 4], X, 8); + + /* 3: X <-- H(X \xor B_i) */ + blkxor(X, &Bin[i * 8 + 8], 8); + salsa20_8(X); + + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + blkcpy(&Bout[i * 4 + r * 8], X, 8); + } + +} + +/* These are tunable */ +#define S_BITS 8 +#define S_SIMD 2 +#define S_P 4 +#define S_ROUNDS 6 + +/* Number of S-boxes. Not tunable, hard-coded in a few places. */ +#define S_N 2 + +/* Derived values. Not tunable on their own. */ +#define S_SIZE1 (1 << S_BITS) +#define S_MASK ((S_SIZE1 - 1) * S_SIMD * 8) +#define S_MASK2 (((uint64_t)S_MASK << 32) | S_MASK) +#define S_SIZE_ALL (S_N * S_SIZE1 * S_SIMD) +#define S_P_SIZE (S_P * S_SIMD) +#define S_MIN_R ((S_P * S_SIMD + 15) / 16) + +/** + * pwxform(B): + * Transform the provided block using the provided S-boxes. 
+ */ + +static void +block_pwxform(uint64_t * B, const uint64_t * S) +{ + uint64_t(*X)[S_SIMD] = (uint64_t(*)[S_SIMD])B; + const uint8_t *S0 = (const uint8_t *)S; + const uint8_t *S1 = (const uint8_t *)(S + S_SIZE1 * S_SIMD); + size_t i, j; + + for (j = 0; j < S_P; j++) { + + uint64_t *Xj = X[j]; + uint64_t x0 = Xj[0]; + uint64_t x1 = Xj[1]; + + for (i = 0; i < S_ROUNDS; i++) { + uint64_t x = x0 & S_MASK2; + const uint64_t *p0, *p1; + + p0 = (const uint64_t *)(S0 + (uint32_t)x); + p1 = (const uint64_t *)(S1 + (x >> 32)); + + x0 = (uint64_t)(x0 >> 32) * (uint32_t)x0; + x0 += p0[0]; + x0 ^= p1[0]; + + x1 = (uint64_t)(x1 >> 32) * (uint32_t)x1; + x1 += p0[1]; + x1 ^= p1[1]; + } + Xj[0] = x0; + Xj[1] = x1; + } + + + +} + + +/** + * blockmix_pwxform(Bin, Bout, S, r): + * Compute Bout = BlockMix_pwxform{salsa20/8, S, r}(Bin). The input Bin must + * be 128r bytes in length; the output Bout must also be the same size. + * + * S lacks const qualifier to match blockmix_salsa8()'s prototype, which we + * need to refer to both functions via the same function pointers. 
+ */ +static void +blockmix_pwxform(const uint64_t * Bin, uint64_t * Bout, uint64_t * S, size_t r) +{ + size_t r1, r2, i; + // S_P_SIZE = 8; + /* Convert 128-byte blocks to (S_P_SIZE * 64-bit) blocks */ + + r1 = r * 128 / (S_P_SIZE * 8); + /* X <-- B_{r1 - 1} */ + blkcpy(Bout, &Bin[(r1 - 1) * S_P_SIZE], S_P_SIZE); + + /* X <-- X \xor B_i */ + blkxor(Bout, Bin, S_P_SIZE); + + /* X <-- H'(X) */ + /* B'_i <-- X */ + block_pwxform(Bout, S); + + /* for i = 0 to r1 - 1 do */ + for (i = 1; i < r1; i++) { + /* X <-- X \xor B_i */ + blkcpy(&Bout[i * S_P_SIZE], &Bout[(i - 1) * S_P_SIZE],S_P_SIZE); + blkxor(&Bout[i * S_P_SIZE], &Bin[i * S_P_SIZE], S_P_SIZE); + + /* X <-- H'(X) */ + /* B'_i <-- X */ + block_pwxform(&Bout[i * S_P_SIZE], S); + } + + /* Handle partial blocks */ + if (i * S_P_SIZE < r * 16) { + blkcpy(&Bout[i * S_P_SIZE], &Bin[i * S_P_SIZE],r * 16 - i * S_P_SIZE); +} + + i = (r1 - 1) * S_P_SIZE / 8; + /* Convert 128-byte blocks to 64-byte blocks */ + r2 = r * 2; + + /* B'_i <-- H(B'_i) */ + salsa20_8(&Bout[i * 8]); + + + i++; +/// not used yescrypt + + for (; i < r2; i++) { + /* B'_i <-- H(B'_i \xor B'_{i-1}) */ + blkxor(&Bout[i * 8], &Bout[(i - 1) * 8], 8); + salsa20_8(&Bout[i * 8]); + } +} + + + +/** + * integerify(B, r): + * Return the result of parsing B_{2r-1} as a little-endian integer. + */ +static uint64_t +integerify(const uint64_t * B, size_t r) +{ +/* + * Our 64-bit words are in host byte order, and word 6 holds the second 32-bit + * word of B_{2r-1} due to SIMD shuffling. The 64-bit value we return is also + * in host byte order, as it should be. + */ + const uint64_t * X = &B[(2 * r - 1) * 8]; + uint32_t lo = X[0]; + uint32_t hi = X[6] >> 32; + return ((uint64_t)hi << 32) + lo; +} + +/** + * smix1(B, r, N, flags, V, NROM, shared, XY, S): + * Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 256r + 64 bytes in length. 
The value N must be even and + * no smaller than 2. + */ +static void +smix1(uint64_t * B, size_t r, uint64_t N, yescrypt_flags_t flags, + uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared, + uint64_t * XY, uint64_t * S) +{ + void (*blockmix)(const uint64_t *, uint64_t *, uint64_t *, size_t) = (S ? blockmix_pwxform : blockmix_salsa8); + const uint64_t * VROM = shared->shared1.aligned; + uint32_t VROM_mask = shared->mask1; + size_t s = 16 * r; + uint64_t * X = V; + uint64_t * Y = &XY[s]; + uint64_t * Z = S ? S : &XY[2 * s]; + uint64_t n, i, j; + size_t k; + + /* 1: X <-- B */ + /* 3: V_i <-- X */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8]; + for (k = 0; k < 16; k++) + tmp->w[k] = le32dec(&src->w[k]); + + salsa20_simd_shuffle(tmp, dst); + } + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + + blockmix(X, Y, Z, r); + + blkcpy(&V[s], Y, s); + + X = XY; + + if (NROM && (VROM_mask & 1)) { + if ((1 & VROM_mask) == 1) { + /* j <-- Integerify(X) mod NROM */ + j = integerify(Y, r) & (NROM - 1); + + /* X <-- H(X \xor VROM_j) */ + blkxor(Y, &VROM[j * s], s); + } + + blockmix(Y, X, Z, r); + + /* 2: for i = 0 to N - 1 do */ + for (n = 1, i = 2; i < N; i += 2) { + /* 3: V_i <-- X */ + blkcpy(&V[i * s], X, s); + + if ((i & (i - 1)) == 0) + n <<= 1; + + /* j <-- Wrap(Integerify(X), i) */ + j = integerify(X, r) & (n - 1); + j += i - n; + + /* X <-- X \xor V_j */ + blkxor(X, &V[j * s], s); + + /* 4: X <-- H(X) */ + blockmix(X, Y, Z, r); + + /* 3: V_i <-- X */ + blkcpy(&V[(i + 1) * s], Y, s); + + j = integerify(Y, r); + if (((i + 1) & VROM_mask) == 1) { + /* j <-- Integerify(X) mod NROM */ + j &= NROM - 1; + + /* X <-- H(X \xor VROM_j) */ + blkxor(Y, &VROM[j * s], s); + } else { + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += i + 1 - n; + + /* X <-- H(X \xor V_j) */ + blkxor(Y, &V[j * s], s); + } + + blockmix(Y, X, Z, r); + 
} + } else { + yescrypt_flags_t rw = flags & YESCRYPT_RW; + /* 4: X <-- H(X) */ + blockmix(Y, X, Z, r); + + /* 2: for i = 0 to N - 1 do */ + for (n = 1, i = 2; i < N; i += 2) { + /* 3: V_i <-- X */ + blkcpy(&V[i * s], X, s); + + if (rw) { + if ((i & (i - 1)) == 0) + n <<= 1; + + /* j <-- Wrap(Integerify(X), i) */ + j = integerify(X, r) & (n - 1); + j += i - n; + + /* X <-- X \xor V_j */ + blkxor(X, &V[j * s], s); + } + + /* 4: X <-- H(X) */ + blockmix(X, Y, Z, r); + + /* 3: V_i <-- X */ + blkcpy(&V[(i + 1) * s], Y, s); + + if (rw) { + /* j <-- Wrap(Integerify(X), i) */ + j = integerify(Y, r) & (n - 1); + j += (i + 1) - n; + + + /* X <-- X \xor V_j */ + blkxor(Y, &V[j * s], s); + } + + /* 4: X <-- H(X) */ + blockmix(Y, X, Z, r); + } + } + + /* B' <-- X */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8]; + for (k = 0; k < 16; k++) + le32enc(&tmp->w[k], src->w[k]); + salsa20_simd_unshuffle(tmp, dst); + } +} + + + +/** + * smix2(B, r, N, Nloop, flags, V, NROM, shared, XY, S): + * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 256r + 64 bytes in length. The value N must be a + * power of 2 greater than 1. The value Nloop must be even. + */ +static void +smix2(uint64_t * B, size_t r, uint64_t N, uint64_t Nloop, + yescrypt_flags_t flags, + uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared, + uint64_t * XY, uint64_t * S) +{ + + void (*blockmix)(const uint64_t *, uint64_t *, uint64_t *, size_t) = + (S ? blockmix_pwxform : blockmix_salsa8); + const uint64_t * VROM = shared->shared1.aligned; + uint32_t VROM_mask = shared->mask1 | 1; + size_t s = 16 * r; + yescrypt_flags_t rw = flags & YESCRYPT_RW; + uint64_t * X = XY; + uint64_t * Y = &XY[s]; + uint64_t * Z = S ? 
S : &XY[2 * s]; + uint64_t i, j; + size_t k; + + if (Nloop == 0) + return; + + /* X <-- B' */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8]; + for (k = 0; k < 16; k++) + tmp->w[k] = le32dec(&src->w[k]); + salsa20_simd_shuffle(tmp, dst); + } + if (NROM) { + + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < Nloop; i += 2) { + /* 7: j <-- Integerify(X) mod N */ + j = integerify(X, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(X, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + if (rw) + blkcpy(&V[j * s], X, s); + blockmix(X, Y, Z, r); + + j = integerify(Y, r); + if (((i + 1) & VROM_mask) == 1) { + /* j <-- Integerify(X) mod NROM */ + j &= NROM - 1; + + /* X <-- H(X \xor VROM_j) */ + blkxor(Y, &VROM[j * s], s); + } else { + /* 7: j <-- Integerify(X) mod N */ + j &= N - 1; + + /* 8: X <-- H(X \xor V_j) */ + blkxor(Y, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + if (rw) + blkcpy(&V[j * s], Y, s); + } + + blockmix(Y, X, Z, r); + } + } else { + + /* 6: for i = 0 to N - 1 do */ + i = Nloop / 2; + do { + /* 7: j <-- Integerify(X) mod N */ + j = integerify(X, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(X, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + if (rw) + blkcpy(&V[j * s], X, s); + blockmix(X, Y, Z, r); + + /* 7: j <-- Integerify(X) mod N */ + j = integerify(Y, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(Y, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + if (rw) + blkcpy(&V[j * s], Y, s); + blockmix(Y, X, Z, r); + } while (--i); + } + + /* 10: B' <-- X */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8]; + for (k = 0; k < 16; k++) + le32enc(&tmp->w[k], src->w[k]); + salsa20_simd_unshuffle(tmp, dst); + } +} + + + + +/** + * p2floor(x): + * Largest power of 2 
not greater than argument. + */ +static uint64_t +p2floor(uint64_t x) +{ + uint64_t y; + while ((y = x & (x - 1))) + x = y; + return x; +} + +/** + * smix(B, r, N, p, t, flags, V, NROM, shared, XY, S): + * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the + * temporary storage V must be 128rN bytes in length; the temporary storage + * XY must be 256r+64 or (256r+64)*p bytes in length (the larger size is + * required with OpenMP-enabled builds). The value N must be a power of 2 + * greater than 1. + */ +static void +smix(uint64_t * B, size_t r, uint64_t N, uint32_t p, uint32_t t, + yescrypt_flags_t flags, + uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared, + uint64_t * XY, uint64_t * S) +{ + size_t s = 16 * r; + uint64_t Nchunk = N / p, Nloop_all, Nloop_rw; + uint32_t i; + + Nloop_all = Nchunk; + if (flags & YESCRYPT_RW) { + if (t <= 1) { + if (t) + Nloop_all *= 2; /* 2/3 */ + Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */ + } else { + Nloop_all *= t - 1; + } + } else if (t) { + if (t == 1) + Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */ + Nloop_all *= t; + } + + Nloop_rw = 0; + if (flags & __YESCRYPT_INIT_SHARED) + Nloop_rw = Nloop_all; + else if (flags & YESCRYPT_RW) + Nloop_rw = Nloop_all / p; + + Nchunk &= ~(uint64_t)1; /* round down to even */ + Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */ + Nloop_rw &= ~(uint64_t)1; /* round down to even */ + + + for (i = 0; i < p; i++) { + uint64_t Vchunk = i * Nchunk; + uint64_t * Bp = &B[i * s]; + uint64_t * Vp = &V[Vchunk * s]; + uint64_t * XYp = XY; + + uint64_t Np = (i < p - 1) ? Nchunk : (N - Vchunk); + uint64_t * Sp = S ? 
&S[i * S_SIZE_ALL] : S; + + if (Sp) + smix1(Bp, 1, S_SIZE_ALL / 16, flags & ~YESCRYPT_PWXFORM,Sp, NROM, shared, XYp, NULL); + + + + if (!(flags & __YESCRYPT_INIT_SHARED_2)) + smix1(Bp, r, Np, flags, Vp, NROM, shared, XYp, Sp); + + smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp, NROM, shared, XYp, Sp); + } + if (Nloop_all > Nloop_rw) { + + for (i = 0; i < p; i++) { + uint64_t * Bp = &B[i * s]; + + uint64_t * XYp = XY; + + uint64_t * Sp = S ? &S[i * S_SIZE_ALL] : S; + smix2(Bp, r, N, Nloop_all - Nloop_rw,flags & ~YESCRYPT_RW, V, NROM, shared, XYp, Sp); + } + } + +} + +static void +smix_old(uint64_t * B, size_t r, uint64_t N, uint32_t p, uint32_t t, +yescrypt_flags_t flags, +uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared, +uint64_t * XY, uint64_t * S) +{ + size_t s = 16 * r; + uint64_t Nchunk = N / p, Nloop_all, Nloop_rw; + uint32_t i; + + Nloop_all = Nchunk; + if (flags & YESCRYPT_RW) { + if (t <= 1) { + if (t) + Nloop_all *= 2; /* 2/3 */ + Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */ + } + else { + Nloop_all *= t - 1; + } + } + else if (t) { + if (t == 1) + Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */ + Nloop_all *= t; + } + + Nloop_rw = 0; + if (flags & __YESCRYPT_INIT_SHARED) + Nloop_rw = Nloop_all; + else if (flags & YESCRYPT_RW) + Nloop_rw = Nloop_all / p; + + Nchunk &= ~(uint64_t)1; /* round down to even */ + Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */ + Nloop_rw &= ~(uint64_t)1; /* round down to even */ + + + for (i = 0; i < p; i++) { + printf("smix first loop p=%d s=%d Nchunk=%d\n",p,s,(uint32_t)Nchunk); + uint64_t Vchunk = i * Nchunk; + uint64_t * Bp = &B[i * s]; + uint64_t * Vp = &V[Vchunk * s]; + uint64_t * XYp = XY; + printf("beofre XYp[0] %08x %08x XYp[1] %08x %08x\n", ((uint32_t*)XYp)[0], ((uint32_t*)XYp)[1], ((uint32_t*)XYp)[2], ((uint32_t*)XYp)[3]); + + uint64_t Np = (i < p - 1) ? Nchunk : (N - Vchunk); + uint64_t * Sp = S ? 
&S[i * S_SIZE_ALL] : S; + printf("Np %d beofre Sp[0] %08x %08x Sp[1] %08x %08x\n",(uint32_t)Np, ((uint32_t*)Sp)[0], ((uint32_t*)Sp)[1], ((uint32_t*)Sp)[2], ((uint32_t*)Sp)[3]); + + if (Sp) { + printf("sp condition s_size_all %d\n", S_SIZE_ALL); + smix1(Bp, 1, S_SIZE_ALL / 16, flags & ~YESCRYPT_PWXFORM, Sp, NROM, shared, XYp, NULL); + printf("after XYp[0] %08x %08x XYp[1] %08x %08x\n", ((uint32_t*)XYp)[0], ((uint32_t*)XYp)[1], ((uint32_t*)XYp)[2], ((uint32_t*)XYp)[3]); + printf("after Sp[0] %08x %08x Sp[1] %08x %08x\n", ((uint32_t*)Sp)[0], ((uint32_t*)Sp)[1], ((uint32_t*)Sp)[2], ((uint32_t*)Sp)[3]); + + + } + + + if (!(flags & __YESCRYPT_INIT_SHARED_2)) { + printf("flag condition Np smix1 and smix2 again %d \n", Np); + smix1(Bp, r, Np, flags, Vp, NROM, shared, XYp, Sp); + } + + + smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp, NROM, shared, XYp, Sp); + } + + printf("Nloop_all %d Nloop_rw %d\n", Nloop_all, Nloop_rw); + if (Nloop_all > Nloop_rw) { + + for (i = 0; i < p; i++) { + printf("smix second loop p=%d s=%d\n",p,s); + uint64_t * Bp = &B[i * s]; + + uint64_t * XYp = XY; + + uint64_t * Sp = S ? &S[i * S_SIZE_ALL] : S; + smix2(Bp, r, N, Nloop_all - Nloop_rw, flags & ~YESCRYPT_RW, V, NROM, shared, XYp, Sp); + } + } +} + +/** + * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, + * N, r, p, t, flags, buf, buflen): + * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, + * p, buflen), or a revision of scrypt as requested by flags and shared, and + * write the result into buf. The parameters r, p, and buflen must satisfy + * r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N must be a power + * of 2 greater than 1. + * + * t controls computation time while not affecting peak memory usage. shared + * and flags may request special modes as described in yescrypt.h. local is + * the thread-local data structure, allowing to preserve and reuse a memory + * allocation across calls, thereby reducing its overhead. 
+ * + * Return 0 on success; or -1 on error. + */ +int yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local, + const uint8_t * passwd, size_t passwdlen, + const uint8_t * salt, size_t saltlen, + uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags, + uint8_t * buf, size_t buflen) +{ + yescrypt_region_t tmp; + uint64_t NROM; + size_t B_size, V_size, XY_size, need; + uint64_t * B, * V, * XY, * S; + uint64_t sha256[4]; + + /* + * YESCRYPT_PARALLEL_SMIX is a no-op at p = 1 for its intended purpose, + * so don't let it have side-effects. Without this adjustment, it'd + * enable the SHA-256 password pre-hashing and output post-hashing, + * because any deviation from classic scrypt implies those. + */ + if (p == 1) + flags &= ~YESCRYPT_PARALLEL_SMIX; + + /* Sanity-check parameters */ + if (flags & ~YESCRYPT_KNOWN_FLAGS) { + errno = EINVAL; + return -1; + } +#if SIZE_MAX > UINT32_MAX + if (buflen > (((uint64_t)(1) << 32) - 1) * 32) { + errno = EFBIG; + return -1; + } +#endif + if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) { + errno = EFBIG; + return -1; + } + if (((N & (N - 1)) != 0) || (N <= 1) || (r < 1) || (p < 1)) { + errno = EINVAL; + return -1; + } + if ((flags & YESCRYPT_PARALLEL_SMIX) && (N / p <= 1)) { + errno = EINVAL; + return -1; + } +#if S_MIN_R > 1 + if ((flags & YESCRYPT_PWXFORM) && (r < S_MIN_R)) { + errno = EINVAL; + return -1; + } +#endif + if ((p > SIZE_MAX / ((size_t)256 * r + 64)) || +#if SIZE_MAX / 256 <= UINT32_MAX + (r > SIZE_MAX / 256) || +#endif + (N > SIZE_MAX / 128 / r)) { + errno = ENOMEM; + return -1; + } + if (N > UINT64_MAX / ((uint64_t)t + 1)) { + errno = EFBIG; + return -1; + } + + if ((flags & YESCRYPT_PWXFORM) && + p > SIZE_MAX / (S_SIZE_ALL * sizeof(*S))) { + errno = ENOMEM; + return -1; + } + + NROM = 0; + if (shared->shared1.aligned) { + NROM = shared->shared1.aligned_size / ((size_t)128 * r); + if (((NROM & (NROM - 1)) != 0) || (NROM <= 1) || + !(flags & YESCRYPT_RW)) { + errno = EINVAL; + 
return -1; + } + } + + /* Allocate memory */ + V = NULL; + V_size = (size_t)128 * r * N; + + need = V_size; + if (flags & __YESCRYPT_INIT_SHARED) { + if (local->aligned_size < need) { + if (local->base || local->aligned || + local->base_size || local->aligned_size) { + errno = EINVAL; + return -1; + } + if (!alloc_region(local, need)) + return -1; + } + V = (uint64_t *)local->aligned; + need = 0; + } + B_size = (size_t)128 * r * p; + need += B_size; + if (need < B_size) { + errno = ENOMEM; + return -1; + } + XY_size = (size_t)256 * r + 64; + + need += XY_size; + if (need < XY_size) { + errno = ENOMEM; + return -1; + } + if (flags & YESCRYPT_PWXFORM) { + size_t S_size = S_SIZE_ALL * sizeof(*S); + + if (flags & YESCRYPT_PARALLEL_SMIX) + S_size *= p; + + need += S_size; + if (need < S_size) { + errno = ENOMEM; + return -1; + } + } + if (flags & __YESCRYPT_INIT_SHARED) { + if (!alloc_region(&tmp, need)) + return -1; + B = (uint64_t *)tmp.aligned; + XY = (uint64_t *)((uint8_t *)B + B_size); + } else { + init_region(&tmp); + if (local->aligned_size < need) { + if (free_region(local)) + return -1; + if (!alloc_region(local, need)) + return -1; + } + B = (uint64_t *)local->aligned; + V = (uint64_t *)((uint8_t *)B + B_size); + XY = (uint64_t *)((uint8_t *)V + V_size); + } + S = NULL; + if (flags & YESCRYPT_PWXFORM) + S = (uint64_t *)((uint8_t *)XY + XY_size); + + + if (t || flags) { + SHA256_CTX_Y ctx; + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, passwd, passwdlen); + SHA256_Final_Y((uint8_t *)sha256, &ctx); + passwd = (uint8_t *)sha256; + passwdlen = sizeof(sha256); + } + + /* 1: (B_0 ... 
B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ + PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1,(uint8_t *)B, B_size); + + if (t || flags) + { + blkcpy(sha256, B, sizeof(sha256) / sizeof(sha256[0])); + } + if (p == 1 || (flags & YESCRYPT_PARALLEL_SMIX)) { + smix(B, r, N, p, t, flags, V, NROM, shared, XY, S); + } else { + uint32_t i; + /* 2: for i = 0 to p - 1 do */ + for (i = 0; i < p; i++) { + /* 3: B_i <-- MF(B_i, N) */ + smix(&B[(size_t)16 * r * i], r, N, 1, t, flags, V, NROM, shared, XY, S); + } + } + + /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ + PBKDF2_SHA256(passwd, passwdlen, (uint8_t *)B, B_size, 1, buf, buflen); + /* + * Except when computing classic scrypt, allow all computation so far + * to be performed on the client. The final steps below match those of + * SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so + * far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of + * SCRAM's use of SHA-1) would be usable with yescrypt hashes. + */ + if ((t || flags) && buflen == sizeof(sha256)) { + /* Compute ClientKey */ + + { + HMAC_SHA256_CTX_Y ctx; + HMAC_SHA256_Init_Y(&ctx, buf, buflen); + HMAC_SHA256_Update_Y(&ctx, salt, saltlen); + HMAC_SHA256_Final_Y((uint8_t *)sha256, &ctx); + } + /* Compute StoredKey */ + { + SHA256_CTX_Y ctx; + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, (uint8_t *)sha256, sizeof(sha256)); + SHA256_Final_Y(buf, &ctx); + } + } + + if (free_region(&tmp)) + return -1; + + /* Success! */ + return 0; +} + +int +yescrypt_kdf_old(const yescrypt_shared_t * shared, yescrypt_local_t * local, +const uint8_t * passwd, size_t passwdlen, +const uint8_t * salt, size_t saltlen, +uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags, +uint8_t * buf, size_t buflen) +{ + yescrypt_region_t tmp; + uint64_t NROM; + size_t B_size, V_size, XY_size, need; + uint64_t * B, *V, *XY, *S; + uint64_t sha256[4]; + + /* + * YESCRYPT_PARALLEL_SMIX is a no-op at p = 1 for its intended purpose, + * so don't let it have side-effects. 
Without this adjustment, it'd + * enable the SHA-256 password pre-hashing and output post-hashing, + * because any deviation from classic scrypt implies those. + */ + if (p == 1) + flags &= ~YESCRYPT_PARALLEL_SMIX; + + /* Sanity-check parameters */ + if (flags & ~YESCRYPT_KNOWN_FLAGS) { + errno = EINVAL; + return -1; + } +#if SIZE_MAX > UINT32_MAX + if (buflen > (((uint64_t)(1) << 32) - 1) * 32) { + errno = EFBIG; + return -1; + } +#endif + if ((uint64_t)(r)* (uint64_t)(p) >= (1 << 30)) { + errno = EFBIG; + return -1; + } + if (((N & (N - 1)) != 0) || (N <= 1) || (r < 1) || (p < 1)) { + errno = EINVAL; + return -1; + } + if ((flags & YESCRYPT_PARALLEL_SMIX) && (N / p <= 1)) { + errno = EINVAL; + return -1; + } +#if S_MIN_R > 1 + if ((flags & YESCRYPT_PWXFORM) && (r < S_MIN_R)) { + errno = EINVAL; + return -1; + } +#endif + if ((p > SIZE_MAX / ((size_t)256 * r + 64)) || +#if SIZE_MAX / 256 <= UINT32_MAX + (r > SIZE_MAX / 256) || +#endif + (N > SIZE_MAX / 128 / r)) { + errno = ENOMEM; + return -1; + } + if (N > UINT64_MAX / ((uint64_t)t + 1)) { + errno = EFBIG; + return -1; + } + + if ((flags & YESCRYPT_PWXFORM) && + p > SIZE_MAX / (S_SIZE_ALL * sizeof(*S))) { + errno = ENOMEM; + return -1; + } + + NROM = 0; + if (shared->shared1.aligned) { + NROM = shared->shared1.aligned_size / ((size_t)128 * r); + if (((NROM & (NROM - 1)) != 0) || (NROM <= 1) || + !(flags & YESCRYPT_RW)) { + errno = EINVAL; + return -1; + } + } + + /* Allocate memory */ + V = NULL; + V_size = (size_t)128 * r * N; + + need = V_size; + if (flags & __YESCRYPT_INIT_SHARED) { + if (local->aligned_size < need) { + if (local->base || local->aligned || + local->base_size || local->aligned_size) { + errno = EINVAL; + return -1; + } + if (!alloc_region(local, need)) + return -1; + } + V = (uint64_t *)local->aligned; + need = 0; + } + B_size = (size_t)128 * r * p; + need += B_size; + if (need < B_size) { + errno = ENOMEM; + return -1; + } + XY_size = (size_t)256 * r + 64; + + need += XY_size; + if (need < 
XY_size) { + errno = ENOMEM; + return -1; + } + if (flags & YESCRYPT_PWXFORM) { + size_t S_size = S_SIZE_ALL * sizeof(*S); + + if (flags & YESCRYPT_PARALLEL_SMIX) + S_size *= p; + + need += S_size; + if (need < S_size) { + errno = ENOMEM; + return -1; + } + } + if (flags & __YESCRYPT_INIT_SHARED) { + if (!alloc_region(&tmp, need)) + return -1; + B = (uint64_t *)tmp.aligned; + XY = (uint64_t *)((uint8_t *)B + B_size); + printf("yescrypt_init_shared and flag"); + } + else { + printf("NOT yescrypt_init_shared and flag"); + init_region(&tmp); + if (local->aligned_size < need) { + if (free_region(local)) + return -1; + if (!alloc_region(local, need)) + return -1; + } + B = (uint64_t *)local->aligned; + V = (uint64_t *)((uint8_t *)B + B_size); + XY = (uint64_t *)((uint8_t *)V + V_size); + } + S = NULL; + if (flags & YESCRYPT_PWXFORM) + S = (uint64_t *)((uint8_t *)XY + XY_size); + + printf("XY_size %d S_size %d B_size %d V_size %d\n", XY_size, S_SIZE_ALL*sizeof(S), B_size, V_size); + + if (t || flags) { + printf(" first sha t %d flag %d t or flag %d\n", t, flags, (t || flags)); + for (int i = 0; i<10; i++) { printf("i=%d passwd %08x %08x\n",i, ((uint32_t*)passwd)[2 * i], ((uint32_t*)passwd)[2 * i+1]); } + SHA256_CTX_Y ctx; + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, passwd, passwdlen); + SHA256_Final_Y((uint8_t *)sha256, &ctx); + passwd = (uint8_t *)sha256; + passwdlen = sizeof(sha256); + } + + /* 1: (B_0 ... 
B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ + printf("passwdlen=%d saltlen=%d before 1st pbkdf2 B_size %d\n",passwdlen,saltlen, B_size); + PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1, (uint8_t *)B, B_size); + //for (int k = 0; k<32; k++) + // printf("k=%d the buf %08x %08x %08x %08x %08x %08x %08x %08x\n", k, ((uint32_t*)B)[8 * k], ((uint32_t*)B)[8 * k + 1], + // ((uint32_t*)B)[8 * k+2], ((uint32_t*)B)[8 * k + 3], + // ((uint32_t*)B)[8 * k+4], ((uint32_t*)B)[8 * k + 5], + // ((uint32_t*)B)[8 * k+6], ((uint32_t*)B)[8 * k + 7]); + + + if (t || flags) + { + printf("before blkcpy count %d\n", sizeof(sha256) / sizeof(sha256[0])); + blkcpy(sha256, B, sizeof(sha256) / sizeof(sha256[0])); + } + printf("after pbkdf2 B0= %08x %08x %08x %08x %08x %08x %08x %08x\n", B[0], B[1], B[2], B[3]); + + printf("before smix p %d flag %d\n", p, (flags & YESCRYPT_PARALLEL_SMIX)); + printf("coef smix r %d N %d p %d t %d flags %d NROM %d\n", r, N, p, t, flags, NROM); + smix(B, r, N, p, t, flags, V, NROM, shared, XY, S); + + + /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ + printf("before 2nd pbkdf2 B_size %d buflen %d\n", B_size); + printf("paswd = %08x %08x %08x %08x\n", ((uint32_t*)passwd)[0], ((uint32_t*)passwd)[1],((uint32_t*)passwd)[2],((uint32_t*)passwd)[3]); + PBKDF2_SHA256(passwd, passwdlen, (uint8_t *)B, B_size, 1, buf, buflen); + printf("after 2nd pbkdf2 B_size %d buflen %d\n", B_size, buflen); + printf("buf = %08x %08x %08x %08x %08x %08x %08x %08x", + ((uint64_t*)buf)[0], ((uint64_t*)buf)[1], ((uint64_t*)buf)[2], ((uint64_t*)buf)[3]); + + /* + * Except when computing classic scrypt, allow all computation so far + * to be performed on the client. The final steps below match those of + * SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so + * far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of + * SCRAM's use of SHA-1) would be usable with yescrypt hashes. 
+ */ + if ((t || flags) && buflen == sizeof(sha256)) { + /* Compute ClientKey */ + printf("compute keys before end the flag %d\n", (t || flags)); + + { + HMAC_SHA256_CTX_Y ctx; + HMAC_SHA256_Init_Y(&ctx, buf, buflen); + HMAC_SHA256_Update_Y(&ctx, salt, saltlen); + HMAC_SHA256_Final_Y((uint8_t *)sha256, &ctx); + } + /* Compute StoredKey */ + { + SHA256_CTX_Y ctx; + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, (uint8_t *)sha256, sizeof(sha256)); + SHA256_Final_Y(buf, &ctx); + } + } + printf("buf = %08x %08x %08x %08x %08x %08x %08x %08x", + ((uint64_t*)buf)[0], ((uint64_t*)buf)[1], ((uint64_t*)buf)[2], ((uint64_t*)buf)[3]); + + if (free_region(&tmp)) + return -1; + + /* Success! */ + return 0; +} + diff --git a/sph/yescrypt-simd.c b/sph/yescrypt-simd.c new file mode 100644 index 0000000000..adc054d27d --- /dev/null +++ b/sph/yescrypt-simd.c @@ -0,0 +1,1380 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2012-2014 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ + +/* + * On 64-bit, enabling SSE4.1 helps our pwxform code indirectly, via avoiding + * gcc bug 54349 (fixed for gcc 4.9+). On 32-bit, it's of direct help. AVX + * and XOP are of further help either way. + */ +#ifndef __SSE4_1__ +#warning "Consider enabling SSE4.1, AVX, or XOP in the C compiler for significantly better performance" +#endif + +#include <emmintrin.h> +#ifdef __XOP__ +#include <x86intrin.h> +#endif + +#include <errno.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> + +#include "sha256_Y.h" +#include "sysendian.h" + +#include "sph/yescrypt.h" + +#include "sph/yescrypt-platform.c" + +#if __STDC_VERSION__ >= 199901L +/* have restrict */ +#elif defined(__GNUC__) +#define restrict __restrict +#else +#define restrict +#endif + +#define PREFETCH(x, hint) _mm_prefetch((const char *)(x), (hint)); +#define PREFETCH_OUT(x, hint) /* disabled */ + +#ifdef __XOP__ +#define ARX(out, in1, in2, s) \ + out = _mm_xor_si128(out, _mm_roti_epi32(_mm_add_epi32(in1, in2), s)); +#else +#define ARX(out, in1, in2, s) \ + { \ + __m128i T = _mm_add_epi32(in1, in2); \ + out = _mm_xor_si128(out, _mm_slli_epi32(T, s)); \ + out = _mm_xor_si128(out, _mm_srli_epi32(T, 32-s)); \ + } +#endif + +#define SALSA20_2ROUNDS \ + /* Operate on "columns" */ \ + ARX(X1, X0, X3, 7) \ + ARX(X2, X1, X0, 9) \ + ARX(X3, X2, X1, 13) \ + ARX(X0, X3, X2, 18) \ +\ + /* Rearrange data */ \ + X1 = _mm_shuffle_epi32(X1, 0x93); 
\ + X2 = _mm_shuffle_epi32(X2, 0x4E); \ + X3 = _mm_shuffle_epi32(X3, 0x39); \ +\ + /* Operate on "rows" */ \ + ARX(X3, X0, X1, 7) \ + ARX(X2, X3, X0, 9) \ + ARX(X1, X2, X3, 13) \ + ARX(X0, X1, X2, 18) \ +\ + /* Rearrange data */ \ + X1 = _mm_shuffle_epi32(X1, 0x39); \ + X2 = _mm_shuffle_epi32(X2, 0x4E); \ + X3 = _mm_shuffle_epi32(X3, 0x93); + +/** + * Apply the salsa20/8 core to the block provided in (X0 ... X3). + */ +#define SALSA20_8_BASE(maybe_decl, out) \ + { \ + maybe_decl Y0 = X0; \ + maybe_decl Y1 = X1; \ + maybe_decl Y2 = X2; \ + maybe_decl Y3 = X3; \ + SALSA20_2ROUNDS \ + SALSA20_2ROUNDS \ + SALSA20_2ROUNDS \ + SALSA20_2ROUNDS \ + (out)[0] = X0 = _mm_add_epi32(X0, Y0); \ + (out)[1] = X1 = _mm_add_epi32(X1, Y1); \ + (out)[2] = X2 = _mm_add_epi32(X2, Y2); \ + (out)[3] = X3 = _mm_add_epi32(X3, Y3); \ + } +#define SALSA20_8(out) \ + SALSA20_8_BASE(__m128i, out) + +/** + * Apply the salsa20/8 core to the block provided in (X0 ... X3) ^ (Z0 ... Z3). + */ +#define SALSA20_8_XOR_ANY(maybe_decl, Z0, Z1, Z2, Z3, out) \ + X0 = _mm_xor_si128(X0, Z0); \ + X1 = _mm_xor_si128(X1, Z1); \ + X2 = _mm_xor_si128(X2, Z2); \ + X3 = _mm_xor_si128(X3, Z3); \ + SALSA20_8_BASE(maybe_decl, out) + +#define SALSA20_8_XOR_MEM(in, out) \ + SALSA20_8_XOR_ANY(__m128i, (in)[0], (in)[1], (in)[2], (in)[3], out) + +#define SALSA20_8_XOR_REG(out) \ + SALSA20_8_XOR_ANY(/* empty */, Y0, Y1, Y2, Y3, out) + +typedef union { + uint32_t w[16]; + __m128i q[4]; +} salsa20_blk_t; + +/** + * blockmix_salsa8(Bin, Bout, r): + * Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r + * bytes in length; the output Bout must also be the same size. 
+ */ +static inline void +blockmix_salsa8(const salsa20_blk_t *restrict Bin, + salsa20_blk_t *restrict Bout, size_t r) +{ + __m128i X0, X1, X2, X3; + size_t i; + + r--; + PREFETCH(&Bin[r * 2 + 1], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin[i * 2], _MM_HINT_T0) + PREFETCH_OUT(&Bout[i], _MM_HINT_T0) + PREFETCH(&Bin[i * 2 + 1], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r + 1 + i], _MM_HINT_T0) + } + PREFETCH(&Bin[r * 2], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r * 2 + 1], _MM_HINT_T0) + + /* 1: X <-- B_{2r - 1} */ + X0 = Bin[r * 2 + 1].q[0]; + X1 = Bin[r * 2 + 1].q[1]; + X2 = Bin[r * 2 + 1].q[2]; + X3 = Bin[r * 2 + 1].q[3]; + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + SALSA20_8_XOR_MEM(Bin[0].q, Bout[0].q) + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < r;) { + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + SALSA20_8_XOR_MEM(Bin[i * 2 + 1].q, Bout[r + 1 + i].q) + + i++; + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + SALSA20_8_XOR_MEM(Bin[i * 2].q, Bout[i].q) + } + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + SALSA20_8_XOR_MEM(Bin[r * 2 + 1].q, Bout[r * 2 + 1].q) +} + +/* + * (V)PSRLDQ and (V)PSHUFD have higher throughput than (V)PSRLQ on some CPUs + * starting with Sandy Bridge. Additionally, PSHUFD uses separate source and + * destination registers, whereas the shifts would require an extra move + * instruction for our code when building without AVX. Unfortunately, PSHUFD + * is much slower on Conroe (4 cycles latency vs. 1 cycle latency for PSRLQ) + * and somewhat slower on some non-Intel CPUs (luckily not including AMD + * Bulldozer and Piledriver). 
Since for many other CPUs using (V)PSHUFD is a + * win in terms of throughput or/and not needing a move instruction, we + * currently use it despite of the higher latency on some older CPUs. As an + * alternative, the #if below may be patched to only enable use of (V)PSHUFD + * when building with SSE4.1 or newer, which is not available on older CPUs + * where this instruction has higher latency. + */ +#if 1 +#define HI32(X) \ + _mm_shuffle_epi32((X), _MM_SHUFFLE(2,3,0,1)) +#elif 0 +#define HI32(X) \ + _mm_srli_si128((X), 4) +#else +#define HI32(X) \ + _mm_srli_epi64((X), 32) +#endif + +#if defined(__x86_64__) && (defined(__ICC) || defined(__llvm__)) +/* Intel's name, also supported by recent gcc */ +#define EXTRACT64(X) _mm_cvtsi128_si64(X) +#elif defined(__x86_64__) && !defined(_MSC_VER) && !defined(__OPEN64__) +/* gcc got the 'x' name earlier than non-'x', MSVC and Open64 had bugs */ +#define EXTRACT64(X) _mm_cvtsi128_si64x(X) +#elif defined(__x86_64__) && defined(__SSE4_1__) +/* No known bugs for this intrinsic */ +#include +#define EXTRACT64(X) _mm_extract_epi64((X), 0) +#elif defined(__SSE4_1__) +/* 32-bit */ +#include +#if 0 +/* This is currently unused by the code below, which instead uses these two + * intrinsics explicitly when (!defined(__x86_64__) && defined(__SSE4_1__)) */ +#define EXTRACT64(X) \ + ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ + ((uint64_t)(uint32_t)_mm_extract_epi32((X), 1) << 32)) +#endif +#else +/* 32-bit or compilers with known past bugs in _mm_cvtsi128_si64*() */ +#define EXTRACT64(X) \ + ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ + ((uint64_t)(uint32_t)_mm_cvtsi128_si32(HI32(X)) << 32)) +#endif + +/* This is tunable */ +#define S_BITS 8 + +/* Not tunable in this implementation, hard-coded in a few places */ +#define S_SIMD 2 +#define S_P 4 + +/* Number of S-boxes. Not tunable by design, hard-coded in a few places. */ +#define S_N 2 + +/* Derived values. Not tunable except via S_BITS above. 
*/ +#define S_SIZE1 (1 << S_BITS) +#define S_MASK ((S_SIZE1 - 1) * S_SIMD * 8) +#define S_MASK2 (((uint64_t)S_MASK << 32) | S_MASK) +#define S_SIZE_ALL (S_N * S_SIZE1 * S_SIMD * 8) + +#if !defined(__x86_64__) && defined(__SSE4_1__) +/* 32-bit with SSE4.1 */ +#define PWXFORM_X_T __m128i +#define PWXFORM_SIMD(X, x, s0, s1) \ + x = _mm_and_si128(X, _mm_set1_epi64x(S_MASK2)); \ + s0 = *(const __m128i *)(S0 + (uint32_t)_mm_cvtsi128_si32(x)); \ + s1 = *(const __m128i *)(S1 + (uint32_t)_mm_extract_epi32(x, 1)); \ + X = _mm_mul_epu32(HI32(X), X); \ + X = _mm_add_epi64(X, s0); \ + X = _mm_xor_si128(X, s1); +#else +/* 64-bit, or 32-bit without SSE4.1 */ +#define PWXFORM_X_T uint64_t +#define PWXFORM_SIMD(X, x, s0, s1) \ + x = EXTRACT64(X) & S_MASK2; \ + s0 = *(const __m128i *)(S0 + (uint32_t)x); \ + s1 = *(const __m128i *)(S1 + (x >> 32)); \ + X = _mm_mul_epu32(HI32(X), X); \ + X = _mm_add_epi64(X, s0); \ + X = _mm_xor_si128(X, s1); +#endif + +#define PWXFORM_ROUND \ + PWXFORM_SIMD(X0, x0, s00, s01) \ + PWXFORM_SIMD(X1, x1, s10, s11) \ + PWXFORM_SIMD(X2, x2, s20, s21) \ + PWXFORM_SIMD(X3, x3, s30, s31) + +#define PWXFORM \ + { \ + PWXFORM_X_T x0, x1, x2, x3; \ + __m128i s00, s01, s10, s11, s20, s21, s30, s31; \ + PWXFORM_ROUND PWXFORM_ROUND \ + PWXFORM_ROUND PWXFORM_ROUND \ + PWXFORM_ROUND PWXFORM_ROUND \ + } + +#define XOR4(in) \ + X0 = _mm_xor_si128(X0, (in)[0]); \ + X1 = _mm_xor_si128(X1, (in)[1]); \ + X2 = _mm_xor_si128(X2, (in)[2]); \ + X3 = _mm_xor_si128(X3, (in)[3]); + +#define OUT(out) \ + (out)[0] = X0; \ + (out)[1] = X1; \ + (out)[2] = X2; \ + (out)[3] = X3; + +/** + * blockmix_pwxform(Bin, Bout, r, S): + * Compute Bout = BlockMix_pwxform{salsa20/8, r, S}(Bin). The input Bin must + * be 128r bytes in length; the output Bout must also be the same size. 
+ */ +static void +blockmix(const salsa20_blk_t *restrict Bin, salsa20_blk_t *restrict Bout, + size_t r, const __m128i *restrict S) +{ + const uint8_t * S0, * S1; + __m128i X0, X1, X2, X3; + size_t i; + + if (!S) { + blockmix_salsa8(Bin, Bout, r); + return; + } + + S0 = (const uint8_t *)S; + S1 = (const uint8_t *)S + S_SIZE_ALL / 2; + + /* Convert 128-byte blocks to 64-byte blocks */ + r *= 2; + + r--; + PREFETCH(&Bin[r], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin[i], _MM_HINT_T0) + PREFETCH_OUT(&Bout[i], _MM_HINT_T0) + } + PREFETCH_OUT(&Bout[r], _MM_HINT_T0) + + /* X <-- B_{r1 - 1} */ + X0 = Bin[r].q[0]; + X1 = Bin[r].q[1]; + X2 = Bin[r].q[2]; + X3 = Bin[r].q[3]; + + /* for i = 0 to r1 - 1 do */ + for (i = 0; i < r; i++) { + /* X <-- H'(X \xor B_i) */ + XOR4(Bin[i].q) + PWXFORM + /* B'_i <-- X */ + OUT(Bout[i].q) + } + + /* Last iteration of the loop above */ + XOR4(Bin[i].q) + PWXFORM + + /* B'_i <-- H(B'_i) */ + SALSA20_8(Bout[i].q) +} + +#define XOR4_2(in1, in2) \ + X0 = _mm_xor_si128((in1)[0], (in2)[0]); \ + X1 = _mm_xor_si128((in1)[1], (in2)[1]); \ + X2 = _mm_xor_si128((in1)[2], (in2)[2]); \ + X3 = _mm_xor_si128((in1)[3], (in2)[3]); + +static inline uint32_t +blockmix_salsa8_xor(const salsa20_blk_t *restrict Bin1, + const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, + size_t r, int Bin2_in_ROM) +{ + __m128i X0, X1, X2, X3; + size_t i; + + r--; + if (Bin2_in_ROM) { + PREFETCH(&Bin2[r * 2 + 1], _MM_HINT_NTA) + PREFETCH(&Bin1[r * 2 + 1], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i * 2], _MM_HINT_NTA) + PREFETCH(&Bin1[i * 2], _MM_HINT_T0) + PREFETCH(&Bin2[i * 2 + 1], _MM_HINT_NTA) + PREFETCH(&Bin1[i * 2 + 1], _MM_HINT_T0) + PREFETCH_OUT(&Bout[i], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r + 1 + i], _MM_HINT_T0) + } + PREFETCH(&Bin2[r * 2], _MM_HINT_T0) + } else { + PREFETCH(&Bin2[r * 2 + 1], _MM_HINT_T0) + PREFETCH(&Bin1[r * 2 + 1], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i * 2], _MM_HINT_T0) + 
PREFETCH(&Bin1[i * 2], _MM_HINT_T0) + PREFETCH(&Bin2[i * 2 + 1], _MM_HINT_T0) + PREFETCH(&Bin1[i * 2 + 1], _MM_HINT_T0) + PREFETCH_OUT(&Bout[i], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r + 1 + i], _MM_HINT_T0) + } + PREFETCH(&Bin2[r * 2], _MM_HINT_T0) + } + PREFETCH(&Bin1[r * 2], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r * 2 + 1], _MM_HINT_T0) + + /* 1: X <-- B_{2r - 1} */ + XOR4_2(Bin1[r * 2 + 1].q, Bin2[r * 2 + 1].q) + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + XOR4(Bin1[0].q) + SALSA20_8_XOR_MEM(Bin2[0].q, Bout[0].q) + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < r;) { + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + XOR4(Bin1[i * 2 + 1].q) + SALSA20_8_XOR_MEM(Bin2[i * 2 + 1].q, Bout[r + 1 + i].q) + + i++; + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + XOR4(Bin1[i * 2].q) + SALSA20_8_XOR_MEM(Bin2[i * 2].q, Bout[i].q) + } + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... 
Y_{2r-1}) */ + XOR4(Bin1[r * 2 + 1].q) + SALSA20_8_XOR_MEM(Bin2[r * 2 + 1].q, Bout[r * 2 + 1].q) + + return _mm_cvtsi128_si32(X0); +} + +static uint32_t +blockmix_xor(const salsa20_blk_t *restrict Bin1, + const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, + size_t r, int Bin2_in_ROM, const __m128i *restrict S) +{ + const uint8_t * S0, * S1; + __m128i X0, X1, X2, X3; + size_t i; + + if (!S) + return blockmix_salsa8_xor(Bin1, Bin2, Bout, r, Bin2_in_ROM); + + S0 = (const uint8_t *)S; + S1 = (const uint8_t *)S + S_SIZE_ALL / 2; + + /* Convert 128-byte blocks to 64-byte blocks */ + r *= 2; + + r--; + if (Bin2_in_ROM) { + PREFETCH(&Bin2[r], _MM_HINT_NTA) + PREFETCH(&Bin1[r], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i], _MM_HINT_NTA) + PREFETCH(&Bin1[i], _MM_HINT_T0) + PREFETCH_OUT(&Bout[i], _MM_HINT_T0) + } + } else { + PREFETCH(&Bin2[r], _MM_HINT_T0) + PREFETCH(&Bin1[r], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i], _MM_HINT_T0) + PREFETCH(&Bin1[i], _MM_HINT_T0) + PREFETCH_OUT(&Bout[i], _MM_HINT_T0) + } + } + PREFETCH_OUT(&Bout[r], _MM_HINT_T0); + + /* X <-- B_{r1 - 1} */ + XOR4_2(Bin1[r].q, Bin2[r].q) + + /* for i = 0 to r1 - 1 do */ + for (i = 0; i < r; i++) { + /* X <-- H'(X \xor B_i) */ + XOR4(Bin1[i].q) + XOR4(Bin2[i].q) + PWXFORM + /* B'_i <-- X */ + OUT(Bout[i].q) + } + + /* Last iteration of the loop above */ + XOR4(Bin1[i].q) + XOR4(Bin2[i].q) + PWXFORM + + /* B'_i <-- H(B'_i) */ + SALSA20_8(Bout[i].q) + + return _mm_cvtsi128_si32(X0); +} + +#undef XOR4 +#define XOR4(in, out) \ + (out)[0] = Y0 = _mm_xor_si128((in)[0], (out)[0]); \ + (out)[1] = Y1 = _mm_xor_si128((in)[1], (out)[1]); \ + (out)[2] = Y2 = _mm_xor_si128((in)[2], (out)[2]); \ + (out)[3] = Y3 = _mm_xor_si128((in)[3], (out)[3]); + +static inline uint32_t +blockmix_salsa8_xor_save(const salsa20_blk_t *restrict Bin1, + salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, + size_t r) +{ + __m128i X0, X1, X2, X3, Y0, Y1, Y2, Y3; + size_t i; + + r--; + 
PREFETCH(&Bin2[r * 2 + 1], _MM_HINT_T0) + PREFETCH(&Bin1[r * 2 + 1], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i * 2], _MM_HINT_T0) + PREFETCH(&Bin1[i * 2], _MM_HINT_T0) + PREFETCH(&Bin2[i * 2 + 1], _MM_HINT_T0) + PREFETCH(&Bin1[i * 2 + 1], _MM_HINT_T0) + PREFETCH_OUT(&Bout[i], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r + 1 + i], _MM_HINT_T0) + } + PREFETCH(&Bin2[r * 2], _MM_HINT_T0) + PREFETCH(&Bin1[r * 2], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r * 2 + 1], _MM_HINT_T0) + + /* 1: X <-- B_{2r - 1} */ + XOR4_2(Bin1[r * 2 + 1].q, Bin2[r * 2 + 1].q) + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + XOR4(Bin1[0].q, Bin2[0].q) + SALSA20_8_XOR_REG(Bout[0].q) + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < r;) { + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + XOR4(Bin1[i * 2 + 1].q, Bin2[i * 2 + 1].q) + SALSA20_8_XOR_REG(Bout[r + 1 + i].q) + + i++; + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + XOR4(Bin1[i * 2].q, Bin2[i * 2].q) + SALSA20_8_XOR_REG(Bout[i].q) + } + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... 
Y_{2r-1}) */ + XOR4(Bin1[r * 2 + 1].q, Bin2[r * 2 + 1].q) + SALSA20_8_XOR_REG(Bout[r * 2 + 1].q) + + return _mm_cvtsi128_si32(X0); +} + +#define XOR4_Y \ + X0 = _mm_xor_si128(X0, Y0); \ + X1 = _mm_xor_si128(X1, Y1); \ + X2 = _mm_xor_si128(X2, Y2); \ + X3 = _mm_xor_si128(X3, Y3); + +static uint32_t +blockmix_xor_save(const salsa20_blk_t *restrict Bin1, + salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, + size_t r, const __m128i *restrict S) +{ + const uint8_t * S0, * S1; + __m128i X0, X1, X2, X3, Y0, Y1, Y2, Y3; + size_t i; + + if (!S) + return blockmix_salsa8_xor_save(Bin1, Bin2, Bout, r); + + S0 = (const uint8_t *)S; + S1 = (const uint8_t *)S + S_SIZE_ALL / 2; + + /* Convert 128-byte blocks to 64-byte blocks */ + r *= 2; + + r--; + PREFETCH(&Bin2[r], _MM_HINT_T0) + PREFETCH(&Bin1[r], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i], _MM_HINT_T0) + PREFETCH(&Bin1[i], _MM_HINT_T0) + PREFETCH_OUT(&Bout[i], _MM_HINT_T0) + } + PREFETCH_OUT(&Bout[r], _MM_HINT_T0); + + /* X <-- B_{r1 - 1} */ + XOR4_2(Bin1[r].q, Bin2[r].q) + + /* for i = 0 to r1 - 1 do */ + for (i = 0; i < r; i++) { + XOR4(Bin1[i].q, Bin2[i].q) + /* X <-- H'(X \xor B_i) */ + XOR4_Y + PWXFORM + /* B'_i <-- X */ + OUT(Bout[i].q) + } + + /* Last iteration of the loop above */ + XOR4(Bin1[i].q, Bin2[i].q) + XOR4_Y + PWXFORM + + /* B'_i <-- H(B'_i) */ + SALSA20_8(Bout[i].q) + + return _mm_cvtsi128_si32(X0); +} + +#undef ARX +#undef SALSA20_2ROUNDS +#undef SALSA20_8 +#undef SALSA20_8_XOR_ANY +#undef SALSA20_8_XOR_MEM +#undef SALSA20_8_XOR_REG +#undef PWXFORM_SIMD_1 +#undef PWXFORM_SIMD_2 +#undef PWXFORM_ROUND +#undef PWXFORM +#undef OUT +#undef XOR4 +#undef XOR4_2 +#undef XOR4_Y + +/** + * integerify(B, r): + * Return the result of parsing B_{2r-1} as a little-endian integer. + */ +static inline uint32_t +integerify(const salsa20_blk_t * B, size_t r) +{ + return B[2 * r - 1].w[0]; +} + +/** + * smix1(B, r, N, flags, V, NROM, shared, XY, S): + * Compute first loop of B = SMix_r(B, N). 
The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 128r bytes in length. The value N must be even and no + * smaller than 2. The array V must be aligned to a multiple of 64 bytes, and + * arrays B and XY to a multiple of at least 16 bytes (aligning them to 64 + * bytes as well saves cache lines, but might result in cache bank conflicts). + */ +static void +smix1(uint8_t * B, size_t r, uint32_t N, yescrypt_flags_t flags, + salsa20_blk_t * V, uint32_t NROM, const yescrypt_shared_t * shared, + salsa20_blk_t * XY, void * S) +{ + const salsa20_blk_t * VROM = shared->shared1.aligned; + uint32_t VROM_mask = shared->mask1; + size_t s = 2 * r; + salsa20_blk_t * X = V, * Y; + uint32_t i, j; + size_t k; + + /* 1: X <-- B */ + /* 3: V_i <-- X */ + for (k = 0; k < 2 * r; k++) { + for (i = 0; i < 16; i++) { + X[k].w[i] = le32dec(&B[(k * 16 + (i * 5 % 16)) * 4]); + } + } + + if (NROM && (VROM_mask & 1)) { + uint32_t n; + salsa20_blk_t * V_n; + const salsa20_blk_t * V_j; + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[s]; + blockmix(X, Y, r, S); + + X = &V[2 * s]; + if ((1 & VROM_mask) == 1) { + /* j <-- Integerify(X) mod NROM */ + j = integerify(Y, r) & (NROM - 1); + V_j = &VROM[j * s]; + + /* X <-- H(X \xor VROM_j) */ + j = blockmix_xor(Y, V_j, X, r, 1, S); + } else { + /* X <-- H(X) */ + blockmix(Y, X, r, S); + j = integerify(X, r); + } + + for (n = 2; n < N; n <<= 1) { + uint32_t m = (n < N / 2) ? 
n : (N - 1 - n); + + V_n = &V[n * s]; + + /* 2: for i = 0 to N - 1 do */ + for (i = 1; i < m; i += 2) { + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += i - 1; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V_n[i * s]; + j = blockmix_xor(X, V_j, Y, r, 0, S); + + if (((n + i) & VROM_mask) == 1) { + /* j <-- Integerify(X) mod NROM */ + j &= NROM - 1; + V_j = &VROM[j * s]; + } else { + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += i; + V_j = &V[j * s]; + } + + /* X <-- H(X \xor VROM_j) */ + X = &V_n[(i + 1) * s]; + j = blockmix_xor(Y, V_j, X, r, 1, S); + } + } + + n >>= 1; + + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += N - 2 - n; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[(N - 1) * s]; + j = blockmix_xor(X, V_j, Y, r, 0, S); + + if (((N - 1) & VROM_mask) == 1) { + /* j <-- Integerify(X) mod NROM */ + j &= NROM - 1; + V_j = &VROM[j * s]; + } else { + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += N - 1 - n; + V_j = &V[j * s]; + } + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + X = XY; + blockmix_xor(Y, V_j, X, r, 1, S); + } else if (flags & YESCRYPT_RW) { + uint32_t n; + salsa20_blk_t * V_n, * V_j; + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[s]; + blockmix(X, Y, r, S); + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + X = &V[2 * s]; + blockmix(Y, X, r, S); + j = integerify(X, r); + + for (n = 2; n < N; n <<= 1) { + uint32_t m = (n < N / 2) ? 
n : (N - 1 - n); + + V_n = &V[n * s]; + + /* 2: for i = 0 to N - 1 do */ + for (i = 1; i < m; i += 2) { + Y = &V_n[i * s]; + + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += i - 1; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + j = blockmix_xor(X, V_j, Y, r, 0, S); + + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += i; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + X = &V_n[(i + 1) * s]; + j = blockmix_xor(Y, V_j, X, r, 0, S); + } + } + + n >>= 1; + + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += N - 2 - n; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[(N - 1) * s]; + j = blockmix_xor(X, V_j, Y, r, 0, S); + + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += N - 1 - n; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + X = XY; + blockmix_xor(Y, V_j, X, r, 0, S); + } else { + /* 2: for i = 0 to N - 1 do */ + for (i = 1; i < N - 1; i += 2) { + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[i * s]; + blockmix(X, Y, r, S); + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + X = &V[(i + 1) * s]; + blockmix(Y, X, r, S); + } + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[i * s]; + blockmix(X, Y, r, S); + + /* 4: X <-- H(X) */ + X = XY; + blockmix(Y, X, r, S); + } + + /* B' <-- X */ + for (k = 0; k < 2 * r; k++) { + for (i = 0; i < 16; i++) { + le32enc(&B[(k * 16 + (i * 5 % 16)) * 4], X[k].w[i]); + } + } +} + +/** + * smix2(B, r, N, Nloop, flags, V, NROM, shared, XY, S): + * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 256r bytes in length. The value N must be a power of 2 + * greater than 1. The value Nloop must be even. 
The array V must be aligned + * to a multiple of 64 bytes, and arrays B and XY to a multiple of at least 16 + * bytes (aligning them to 64 bytes as well saves cache lines, but might result + * in cache bank conflicts). + */ +static void +smix2(uint8_t * B, size_t r, uint32_t N, uint64_t Nloop, + yescrypt_flags_t flags, salsa20_blk_t * V, uint32_t NROM, + const yescrypt_shared_t * shared, salsa20_blk_t * XY, void * S) +{ + const salsa20_blk_t * VROM = shared->shared1.aligned; + uint32_t VROM_mask = shared->mask1; + size_t s = 2 * r; + salsa20_blk_t * X = XY, * Y = &XY[s]; + uint64_t i; + uint32_t j; + size_t k; + + if (Nloop == 0) + return; + + /* X <-- B' */ + /* 3: V_i <-- X */ + for (k = 0; k < 2 * r; k++) { + for (i = 0; i < 16; i++) { + X[k].w[i] = le32dec(&B[(k * 16 + (i * 5 % 16)) * 4]); + } + } + + i = Nloop / 2; + + /* 7: j <-- Integerify(X) mod N */ + j = integerify(X, r) & (N - 1); + +/* + * Normally, NROM implies YESCRYPT_RW, but we check for these separately + * because YESCRYPT_PARALLEL_SMIX resets YESCRYPT_RW for the smix2() calls + * operating on the entire V. 
+ */ + if (NROM && (flags & YESCRYPT_RW)) { + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < Nloop; i += 2) { + salsa20_blk_t * V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* V_j <-- Xprev \xor V_j */ + /* j <-- Integerify(X) mod NROM */ + j = blockmix_xor_save(X, V_j, Y, r, S); + + if (((i + 1) & VROM_mask) == 1) { + const salsa20_blk_t * VROM_j; + + j &= NROM - 1; + VROM_j = &VROM[j * s]; + + /* X <-- H(X \xor VROM_j) */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor(Y, VROM_j, X, r, 1, S); + } else { + j &= N - 1; + V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* V_j <-- Xprev \xor V_j */ + /* j <-- Integerify(X) mod NROM */ + j = blockmix_xor_save(Y, V_j, X, r, S); + } + j &= N - 1; + V_j = &V[j * s]; + } + } else if (NROM) { + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < Nloop; i += 2) { + const salsa20_blk_t * V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* V_j <-- Xprev \xor V_j */ + /* j <-- Integerify(X) mod NROM */ + j = blockmix_xor(X, V_j, Y, r, 0, S); + + if (((i + 1) & VROM_mask) == 1) { + j &= NROM - 1; + V_j = &VROM[j * s]; + } else { + j &= N - 1; + V_j = &V[j * s]; + } + + /* X <-- H(X \xor VROM_j) */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor(Y, V_j, X, r, 1, S); + j &= N - 1; + V_j = &V[j * s]; + } + } else if (flags & YESCRYPT_RW) { + /* 6: for i = 0 to N - 1 do */ + do { + salsa20_blk_t * V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* V_j <-- Xprev \xor V_j */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor_save(X, V_j, Y, r, S); + j &= N - 1; + V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* V_j <-- Xprev \xor V_j */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor_save(Y, V_j, X, r, S); + j &= N - 1; + } while (--i); + } else { + /* 6: for i = 0 to N - 1 do */ + do { + const salsa20_blk_t * V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor(X, V_j, Y, r, 0, S); + j &= N - 1; + V_j = &V[j * s]; + + /* 8: X <-- 
H(X \xor V_j) */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor(Y, V_j, X, r, 0, S); + j &= N - 1; + } while (--i); + } + + /* 10: B' <-- X */ + for (k = 0; k < 2 * r; k++) { + for (i = 0; i < 16; i++) { + le32enc(&B[(k * 16 + (i * 5 % 16)) * 4], X[k].w[i]); + } + } +} + +/** + * p2floor(x): + * Largest power of 2 not greater than argument. + */ +static uint64_t +p2floor(uint64_t x) +{ + uint64_t y; + while ((y = x & (x - 1))) + x = y; + return x; +} + +/** + * smix(B, r, N, p, t, flags, V, NROM, shared, XY, S): + * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the + * temporary storage V must be 128rN bytes in length; the temporary storage XY + * must be 256r or 256rp bytes in length (the larger size is required with + * OpenMP-enabled builds). The value N must be a power of 2 greater than 1. + * The array V must be aligned to a multiple of 64 bytes, and arrays B and + * XY to a multiple of at least 16 bytes (aligning them to 64 bytes as well + * saves cache lines and helps avoid false sharing in OpenMP-enabled builds + * when p > 1, but it might also result in cache bank conflicts). 
+ */ +static void +smix(uint8_t * B, size_t r, uint32_t N, uint32_t p, uint32_t t, + yescrypt_flags_t flags, + salsa20_blk_t * V, uint32_t NROM, const yescrypt_shared_t * shared, + salsa20_blk_t * XY, void * S) +{ + size_t s = 2 * r; + uint32_t Nchunk = N / p; + uint64_t Nloop_all, Nloop_rw; + uint32_t i; + + Nloop_all = Nchunk; + if (flags & YESCRYPT_RW) { + if (t <= 1) { + if (t) + Nloop_all *= 2; /* 2/3 */ + Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */ + } else { + Nloop_all *= t - 1; + } + } else if (t) { + if (t == 1) + Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */ + Nloop_all *= t; + } + + Nloop_rw = 0; + if (flags & __YESCRYPT_INIT_SHARED) + Nloop_rw = Nloop_all; + else if (flags & YESCRYPT_RW) + Nloop_rw = Nloop_all / p; + + Nchunk &= ~(uint32_t)1; /* round down to even */ + Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */ + Nloop_rw &= ~(uint64_t)1; /* round down to even */ + +#ifdef _OPENMP +#pragma omp parallel if (p > 1) default(none) private(i) shared(B, r, N, p, flags, V, NROM, shared, XY, S, s, Nchunk, Nloop_all, Nloop_rw) + { +#pragma omp for +#endif + for (i = 0; i < p; i++) { + uint32_t Vchunk = i * Nchunk; + uint8_t * Bp = &B[128 * r * i]; + salsa20_blk_t * Vp = &V[Vchunk * s]; +#ifdef _OPENMP + salsa20_blk_t * XYp = &XY[i * (2 * s)]; +#else + salsa20_blk_t * XYp = XY; +#endif + uint32_t Np = (i < p - 1) ? Nchunk : (N - Vchunk); + void * Sp = S ? ((uint8_t *)S + i * S_SIZE_ALL) : S; + if (Sp) + smix1(Bp, 1, S_SIZE_ALL / 128, + flags & ~YESCRYPT_PWXFORM, + Sp, NROM, shared, XYp, NULL); + if (!(flags & __YESCRYPT_INIT_SHARED_2)) + smix1(Bp, r, Np, flags, Vp, NROM, shared, XYp, Sp); + smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp, + NROM, shared, XYp, Sp); + } + + if (Nloop_all > Nloop_rw) { +#ifdef _OPENMP +#pragma omp for +#endif + for (i = 0; i < p; i++) { + uint8_t * Bp = &B[128 * r * i]; +#ifdef _OPENMP + salsa20_blk_t * XYp = &XY[i * (2 * s)]; +#else + salsa20_blk_t * XYp = XY; +#endif + void * Sp = S ? 
((uint8_t *)S + i * S_SIZE_ALL) : S; + smix2(Bp, r, N, Nloop_all - Nloop_rw, + flags & ~YESCRYPT_RW, V, NROM, shared, XYp, Sp); + } + } +#ifdef _OPENMP + } +#endif +} + +/** + * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, + * N, r, p, t, flags, buf, buflen): + * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, + * p, buflen), or a revision of scrypt as requested by flags and shared, and + * write the result into buf. The parameters r, p, and buflen must satisfy + * r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N must be a power + * of 2 greater than 1. (This optimized implementation currently additionally + * limits N to the range from 8 to 2^31, but other implementation might not.) + * + * t controls computation time while not affecting peak memory usage. shared + * and flags may request special modes as described in yescrypt.h. local is + * the thread-local data structure, allowing to preserve and reuse a memory + * allocation across calls, thereby reducing its overhead. + * + * Return 0 on success; or -1 on error. + */ +int +yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local, + const uint8_t * passwd, size_t passwdlen, + const uint8_t * salt, size_t saltlen, + uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags, + uint8_t * buf, size_t buflen) +{ + yescrypt_region_t tmp; + uint64_t NROM; + size_t B_size, V_size, XY_size, need; + uint8_t * B, * S; + salsa20_blk_t * V, * XY; + uint8_t sha256[32]; + + /* + * YESCRYPT_PARALLEL_SMIX is a no-op at p = 1 for its intended purpose, + * so don't let it have side-effects. Without this adjustment, it'd + * enable the SHA-256 password pre-hashing and output post-hashing, + * because any deviation from classic scrypt implies those. 
+ */ + if (p == 1) + flags &= ~YESCRYPT_PARALLEL_SMIX; + + /* Sanity-check parameters */ + if (flags & ~YESCRYPT_KNOWN_FLAGS) { + errno = EINVAL; + return -1; + } +#if SIZE_MAX > UINT32_MAX + if (buflen > (((uint64_t)(1) << 32) - 1) * 32) { + errno = EFBIG; + return -1; + } +#endif + if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) { + errno = EFBIG; + return -1; + } + if (N > UINT32_MAX) { + errno = EFBIG; + return -1; + } + if (((N & (N - 1)) != 0) || (N <= 7) || (r < 1) || (p < 1)) { + errno = EINVAL; + return -1; + } + if ((flags & YESCRYPT_PARALLEL_SMIX) && (N / p <= 7)) { + errno = EINVAL; + return -1; + } + if ((r > SIZE_MAX / 256 / p) || + (N > SIZE_MAX / 128 / r)) { + errno = ENOMEM; + return -1; + } +#ifdef _OPENMP + if (!(flags & YESCRYPT_PARALLEL_SMIX) && + (N > SIZE_MAX / 128 / (r * p))) { + errno = ENOMEM; + return -1; + } +#endif + if ((flags & YESCRYPT_PWXFORM) && +#ifndef _OPENMP + (flags & YESCRYPT_PARALLEL_SMIX) && +#endif + p > SIZE_MAX / S_SIZE_ALL) { + errno = ENOMEM; + return -1; + } + + NROM = 0; + if (shared->shared1.aligned) { + NROM = shared->shared1.aligned_size / ((size_t)128 * r); + if (NROM > UINT32_MAX) { + errno = EFBIG; + return -1; + } + if (((NROM & (NROM - 1)) != 0) || (NROM <= 7) || + !(flags & YESCRYPT_RW)) { + errno = EINVAL; + return -1; + } + } + + /* Allocate memory */ + V = NULL; + V_size = (size_t)128 * r * N; +#ifdef _OPENMP + if (!(flags & YESCRYPT_PARALLEL_SMIX)) + V_size *= p; +#endif + need = V_size; + if (flags & __YESCRYPT_INIT_SHARED) { + if (local->aligned_size < need) { + if (local->base || local->aligned || + local->base_size || local->aligned_size) { + errno = EINVAL; + return -1; + } + if (!alloc_region(local, need)) + return -1; + } + V = (salsa20_blk_t *)local->aligned; + need = 0; + } + B_size = (size_t)128 * r * p; + need += B_size; + if (need < B_size) { + errno = ENOMEM; + return -1; + } + XY_size = (size_t)256 * r; +#ifdef _OPENMP + XY_size *= p; +#endif + need += XY_size; + if (need < XY_size) { + 
errno = ENOMEM; + return -1; + } + if (flags & YESCRYPT_PWXFORM) { + size_t S_size = S_SIZE_ALL; +#ifdef _OPENMP + S_size *= p; +#else + if (flags & YESCRYPT_PARALLEL_SMIX) + S_size *= p; +#endif + need += S_size; + if (need < S_size) { + errno = ENOMEM; + return -1; + } + } + if (flags & __YESCRYPT_INIT_SHARED) { + if (!alloc_region(&tmp, need)) + return -1; + B = (uint8_t *)tmp.aligned; + XY = (salsa20_blk_t *)((uint8_t *)B + B_size); + } else { + init_region(&tmp); + if (local->aligned_size < need) { + if (free_region(local)) + return -1; + if (!alloc_region(local, need)) + return -1; + } + B = (uint8_t *)local->aligned; + V = (salsa20_blk_t *)((uint8_t *)B + B_size); + XY = (salsa20_blk_t *)((uint8_t *)V + V_size); + } + S = NULL; + if (flags & YESCRYPT_PWXFORM) + S = (uint8_t *)XY + XY_size; + + if (t || flags) { + SHA256_CTX_Y ctx; + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, passwd, passwdlen); + SHA256_Final_Y(sha256, &ctx); + passwd = sha256; + passwdlen = sizeof(sha256); + } + + /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ + PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1, B, B_size); + + if (t || flags) + memcpy(sha256, B, sizeof(sha256)); + + if (p == 1 || (flags & YESCRYPT_PARALLEL_SMIX)) { + smix(B, r, N, p, t, flags, V, NROM, shared, XY, S); + } else { + uint32_t i; + + /* 2: for i = 0 to p - 1 do */ +#ifdef _OPENMP +#pragma omp parallel for default(none) private(i) shared(B, r, N, p, t, flags, V, NROM, shared, XY, S) +#endif + for (i = 0; i < p; i++) { + /* 3: B_i <-- MF(B_i, N) */ +#ifdef _OPENMP + smix(&B[(size_t)128 * r * i], r, N, 1, t, flags, + &V[(size_t)2 * r * i * N], + NROM, shared, + &XY[(size_t)4 * r * i], + S ? 
&S[S_SIZE_ALL * i] : S); +#else + smix(&B[(size_t)128 * r * i], r, N, 1, t, flags, V, + NROM, shared, XY, S); +#endif + } + } + + /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ + PBKDF2_SHA256(passwd, passwdlen, B, B_size, 1, buf, buflen); + + /* + * Except when computing classic scrypt, allow all computation so far + * to be performed on the client. The final steps below match those of + * SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so + * far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of + * SCRAM's use of SHA-1) would be usable with yescrypt hashes. + */ + if ((t || flags) && buflen == sizeof(sha256)) { + /* Compute ClientKey */ + { + HMAC_SHA256_CTX_Y ctx; + HMAC_SHA256_Init_Y(&ctx, buf, buflen); +#if 0 +/* Proper yescrypt */ + HMAC_SHA256_Update_Y(&ctx, "Client Key", 10); +#else +/* GlobalBoost-Y buggy yescrypt */ + HMAC_SHA256_Update_Y(&ctx, salt, saltlen); +#endif + HMAC_SHA256_Final_Y(sha256, &ctx); + } + /* Compute StoredKey */ + { + SHA256_CTX_Y ctx; + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, sha256, sizeof(sha256)); + SHA256_Final_Y(buf, &ctx); + } + } + + if (free_region(&tmp)) + return -1; + + /* Success! */ + return 0; +} diff --git a/sph/yescrypt.h b/sph/yescrypt.h new file mode 100644 index 0000000000..651225833f --- /dev/null +++ b/sph/yescrypt.h @@ -0,0 +1,376 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2013,2014 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ +#ifndef _YESCRYPT_H_ +#define _YESCRYPT_H_ + +#include +#include /* for size_t */ +#include + +//#ifdef __cplusplus +//extern "C" { +//#endif + + +//extern void yescrypt_hash_sp(const unsigned char *input, unsigned char *output); +extern void yescrypt_hash(const unsigned char *input, unsigned char *output); + + + +/** + * crypto_scrypt(passwd, passwdlen, salt, saltlen, N, r, p, buf, buflen): + * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, + * p, buflen) and write the result into buf. The parameters r, p, and buflen + * must satisfy r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N + * must be a power of 2 greater than 1. + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as buf is local to the thread. + */ +extern int crypto_scrypt(const uint8_t * __passwd, size_t __passwdlen, + const uint8_t * __salt, size_t __saltlen, + uint64_t __N, uint32_t __r, uint32_t __p, + uint8_t * __buf, size_t __buflen); + +/** + * Internal type used by the memory allocator. Please do not use it directly. 
+ * Use yescrypt_shared_t and yescrypt_local_t as appropriate instead, since + * they might differ from each other in a future version. + */ +typedef struct { + void * base, * aligned; + size_t base_size, aligned_size; +} yescrypt_region_t; + +/** + * Types for shared (ROM) and thread-local (RAM) data structures. + */ +typedef yescrypt_region_t yescrypt_shared1_t; +typedef struct { + yescrypt_shared1_t shared1; + uint32_t mask1; +} yescrypt_shared_t; +typedef yescrypt_region_t yescrypt_local_t; + +/** + * Possible values for yescrypt_init_shared()'s flags argument. + */ +typedef enum { + YESCRYPT_SHARED_DEFAULTS = 0, + YESCRYPT_SHARED_PREALLOCATED = 0x100 +} yescrypt_init_shared_flags_t; + +/** + * Possible values for the flags argument of yescrypt_kdf(), + * yescrypt_gensalt_r(), yescrypt_gensalt(). These may be OR'ed together, + * except that YESCRYPT_WORM and YESCRYPT_RW are mutually exclusive. + * Please refer to the description of yescrypt_kdf() below for the meaning of + * these flags. + */ +typedef enum { +/* public */ + YESCRYPT_WORM = 0, + YESCRYPT_RW = 1, + YESCRYPT_PARALLEL_SMIX = 2, + YESCRYPT_PWXFORM = 4, +/* private */ + __YESCRYPT_INIT_SHARED_1 = 0x10000, + __YESCRYPT_INIT_SHARED_2 = 0x20000, + __YESCRYPT_INIT_SHARED = 0x30000 +} yescrypt_flags_t; + +#define YESCRYPT_KNOWN_FLAGS \ + (YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | YESCRYPT_PWXFORM | \ + __YESCRYPT_INIT_SHARED) + +/** + * yescrypt_init_shared(shared, param, paramlen, N, r, p, flags, mask, + * buf, buflen): + * Optionally allocate memory for and initialize the shared (ROM) data + * structure. The parameters N, r, and p must satisfy the same conditions as + * with crypto_scrypt(). param and paramlen specify a local parameter with + * which the ROM is seeded. If buf is not NULL, then it is used to return + * buflen bytes of message digest for the initialized ROM (the caller may use + * this to verify that the ROM has been computed in the same way that it was on + * a previous run). 
+ * + * Return 0 on success; or -1 on error. + * + * If bit YESCRYPT_SHARED_PREALLOCATED in flags is set, then memory for the + * ROM is assumed to have been preallocated by the caller, with + * shared->shared1.aligned being the start address of the ROM and + * shared->shared1.aligned_size being its size (which must be consistent with + * N, r, and p). This may be used e.g. when the ROM is to be placed in a SysV + * shared memory segment allocated by the caller. + * + * mask controls the frequency of ROM accesses by yescrypt_kdf(). Normally it + * should be set to 1, to interleave RAM and ROM accesses, which works well + * when both regions reside in the machine's RAM anyway. Other values may be + * used e.g. when the ROM is memory-mapped from a disk file. Recommended mask + * values are powers of 2 minus 1 or minus 2. Here's the effect of some mask + * values: + * mask value ROM accesses in SMix 1st loop ROM accesses in SMix 2nd loop + * 0 0 1/2 + * 1 1/2 1/2 + * 2 0 1/4 + * 3 1/4 1/4 + * 6 0 1/8 + * 7 1/8 1/8 + * 14 0 1/16 + * 15 1/16 1/16 + * 1022 0 1/1024 + * 1023 1/1024 1/1024 + * + * Actual computation of the ROM contents may be avoided, if you don't intend + * to use a ROM but need a dummy shared structure, by calling this function + * with NULL, 0, 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0 for the + * arguments starting with param and on. + * + * MT-safe as long as shared is local to the thread. + */ +extern int yescrypt_init_shared(yescrypt_shared_t * __shared, + const uint8_t * __param, size_t __paramlen, + uint64_t __N, uint32_t __r, uint32_t __p, + yescrypt_init_shared_flags_t __flags, uint32_t __mask, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt_free_shared(shared): + * Free memory that had been allocated with yescrypt_init_shared(). + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as shared is local to the thread. 
+ */ +extern int yescrypt_free_shared(yescrypt_shared_t * __shared); + +/** + * yescrypt_init_local(local): + * Initialize the thread-local (RAM) data structure. Actual memory allocation + * is currently fully postponed until a call to yescrypt_kdf() or yescrypt_r(). + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as local is local to the thread. + */ +extern int yescrypt_init_local(yescrypt_local_t * __local); + +/** + * yescrypt_free_local(local): + * Free memory that may have been allocated for an initialized thread-local + * (RAM) data structure. + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as local is local to the thread. + */ +extern int yescrypt_free_local(yescrypt_local_t * __local); + +/** + * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, + * N, r, p, t, flags, buf, buflen): + * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, + * p, buflen), or a revision of scrypt as requested by flags and shared, and + * write the result into buf. The parameters N, r, p, and buflen must satisfy + * the same conditions as with crypto_scrypt(). t controls computation time + * while not affecting peak memory usage. shared and flags may request + * special modes as described below. local is the thread-local data + * structure, allowing to preserve and reuse a memory allocation across calls, + * thereby reducing its overhead. + * + * Return 0 on success; or -1 on error. + * + * t controls computation time. t = 0 is optimal in terms of achieving the + * highest area-time for ASIC attackers. Thus, higher computation time, if + * affordable, is best achieved by increasing N rather than by increasing t. + * However, if the higher memory usage (which goes along with higher N) is not + * affordable, or if fine-tuning of the time is needed (recall that N must be a + * power of 2), then t = 1 or above may be used to increase time while staying + * at the same peak memory usage. 
t = 1 increases the time by 25% and + * decreases the normalized area-time to 96% of optimal. (Of course, in + * absolute terms the area-time increases with higher t. It's just that it + * would increase slightly more with higher N*r rather than with higher t.) + * t = 2 increases the time by another 20% and decreases the normalized + * area-time to 89% of optimal. Thus, these two values are reasonable to use + * for fine-tuning. Values of t higher than 2 result in further increase in + * time while reducing the efficiency much further (e.g., down to around 50% of + * optimal for t = 5, which runs 3 to 4 times slower than t = 0, with exact + * numbers varying by the flags settings). + * + * Classic scrypt is available by setting t = 0 and flags to YESCRYPT_WORM and + * passing a dummy shared structure (see the description of + * yescrypt_init_shared() above for how to produce one). In this mode, the + * thread-local memory region (RAM) is first sequentially written to and then + * randomly read from. This algorithm is friendly towards time-memory + * tradeoffs (TMTO), available both to defenders (albeit not in this + * implementation) and to attackers. + * + * Setting YESCRYPT_RW adds extra random reads and writes to the thread-local + * memory region (RAM), which makes TMTO a lot less efficient. This may be + * used to slow down the kinds of attackers who would otherwise benefit from + * classic scrypt's efficient TMTO. Since classic scrypt's TMTO allows not + * only for the tradeoff, but also for a decrease of attacker's area-time (by + * up to a constant factor), setting YESCRYPT_RW substantially increases the + * cost of attacks in area-time terms as well. Yet another benefit of it is + * that optimal area-time is reached at an earlier time than with classic + * scrypt, and t = 0 actually corresponds to this earlier completion time, + * resulting in quicker hash computations (and thus in higher request rate + * capacity). 
Due to these properties, YESCRYPT_RW should almost always be + * set, except when compatibility with classic scrypt or TMTO-friendliness are + * desired. + * + * YESCRYPT_PARALLEL_SMIX moves parallelism that is present with p > 1 to a + * lower level as compared to where it is in classic scrypt. This reduces + * flexibility for efficient computation (for both attackers and defenders) by + * requiring that, short of resorting to TMTO, the full amount of memory be + * allocated as needed for the specified p, regardless of whether that + * parallelism is actually being fully made use of or not. (For comparison, a + * single instance of classic scrypt may be computed in less memory without any + * CPU time overhead, but in more real time, by not making full use of the + * parallelism.) This may be desirable when the defender has enough memory + * with sufficiently low latency and high bandwidth for efficient full parallel + * execution, yet the required memory size is high enough that some likely + * attackers might end up being forced to choose between using higher latency + * memory than they could use otherwise (waiting for data longer) or using TMTO + * (waiting for data more times per one hash computation). The area-time cost + * for other kinds of attackers (who would use the same memory type and TMTO + * factor or no TMTO either way) remains roughly the same, given the same + * running time for the defender. In the TMTO-friendly YESCRYPT_WORM mode, as + * long as the defender has enough memory that is just as fast as the smaller + * per-thread regions would be, doesn't expect to ever need greater + * flexibility (except possibly via TMTO), and doesn't need backwards + * compatibility with classic scrypt, there are no other serious drawbacks to + * this setting. In the YESCRYPT_RW mode, which is meant to discourage TMTO, + * this new approach to parallelization makes TMTO less inefficient. 
(This is + * an unfortunate side-effect of avoiding some random writes, as we have to in + * order to allow for parallel threads to access a common memory region without + * synchronization overhead.) Thus, in this mode this setting poses an extra + * tradeoff of its own (higher area-time cost for a subset of attackers vs. + * better TMTO resistance). Setting YESCRYPT_PARALLEL_SMIX also changes the + * way the running time is to be controlled from N*r*p (for classic scrypt) to + * N*r (in this modification). All of this applies only when p > 1. For + * p = 1, this setting is a no-op. + * + * Passing a real shared structure, with ROM contents previously computed by + * yescrypt_init_shared(), enables the use of ROM and requires YESCRYPT_RW for + * the thread-local RAM region. In order to allow for initialization of the + * ROM to be split into a separate program, the shared->shared1.aligned and + * shared->shared1.aligned_size fields may be set by the caller of + * yescrypt_kdf() manually rather than with yescrypt_init_shared(). + * + * local must be initialized with yescrypt_init_local(). + * + * MT-safe as long as local and buf are local to the thread. + */ +extern int yescrypt_kdf(const yescrypt_shared_t * __shared, + yescrypt_local_t * __local, + const uint8_t * __passwd, size_t __passwdlen, + const uint8_t * __salt, size_t __saltlen, + uint64_t __N, uint32_t __r, uint32_t __p, uint32_t __t, + yescrypt_flags_t __flags, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt_r(shared, local, passwd, passwdlen, setting, buf, buflen): + * Compute and encode an scrypt or enhanced scrypt hash of passwd given the + * parameters and salt value encoded in setting. If the shared structure is + * not dummy, a ROM is used and YESCRYPT_RW is required. Otherwise, whether to + * use the YESCRYPT_WORM (classic scrypt) or YESCRYPT_RW (time-memory tradeoff + * discouraging modification) is determined by the setting string. 
shared and + * local must be initialized as described above for yescrypt_kdf(). buf must + * be large enough (as indicated by buflen) to hold the encoded hash string. + * + * Return the encoded hash string on success; or NULL on error. + * + * MT-safe as long as local and buf are local to the thread. + */ +extern uint8_t * yescrypt_r(const yescrypt_shared_t * __shared, + yescrypt_local_t * __local, + const uint8_t * __passwd, size_t __passwdlen, + const uint8_t * __setting, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt(passwd, setting): + * Compute and encode an scrypt or enhanced scrypt hash of passwd given the + * parameters and salt value encoded in setting. Whether to use the + * YESCRYPT_WORM (classic scrypt) or YESCRYPT_RW (time-memory tradeoff + * discouraging modification) is determined by the setting string. + * + * Return the encoded hash string on success; or NULL on error. + * + * This is a crypt(3)-like interface, which is simpler to use than + * yescrypt_r(), but it is not MT-safe, it does not allow for the use of a ROM, + * and it is slower than yescrypt_r() for repeated calls because it allocates + * and frees memory on each call. + * + * MT-unsafe. + */ +extern uint8_t * yescrypt(const uint8_t * __passwd, const uint8_t * __setting); + +/** + * yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen, buf, buflen): + * Generate a setting string for use with yescrypt_r() and yescrypt() by + * encoding into it the parameters N_log2 (which is to be set to base 2 + * logarithm of the desired value for N), r, p, flags, and a salt given by src + * (of srclen bytes). buf must be large enough (as indicated by buflen) to + * hold the setting string. + * + * Return the setting string on success; or NULL on error. + * + * MT-safe as long as buf is local to the thread. 
+ */ +extern uint8_t * yescrypt_gensalt_r( + uint32_t __N_log2, uint32_t __r, uint32_t __p, + yescrypt_flags_t __flags, + const uint8_t * __src, size_t __srclen, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt_gensalt(N_log2, r, p, flags, src, srclen): + * Generate a setting string for use with yescrypt_r() and yescrypt(). This + * function is the same as yescrypt_gensalt_r() except that it uses a static + * buffer and thus is not MT-safe. + * + * Return the setting string on success; or NULL on error. + * + * MT-unsafe. + */ +extern uint8_t * yescrypt_gensalt( + uint32_t __N_log2, uint32_t __r, uint32_t __p, + yescrypt_flags_t __flags, + const uint8_t * __src, size_t __srclen); + +//#ifdef __cplusplus +//} +//#endif + +#endif /* !_YESCRYPT_H_ */ diff --git a/sph/yescryptcommon.c b/sph/yescryptcommon.c new file mode 100644 index 0000000000..e5d76eb436 --- /dev/null +++ b/sph/yescryptcommon.c @@ -0,0 +1,365 @@ +/*- + * Copyright 2013,2014 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include "sph/yescrypt.h" +#include +//#include + +#define BYTES2CHARS(bytes) \ + ((((bytes) * 8) + 5) / 6) + +#define HASH_SIZE 32 /* bytes */ +#define HASH_LEN BYTES2CHARS(HASH_SIZE) /* base-64 chars */ +#define YESCRYPT_FLAGS (YESCRYPT_RW | YESCRYPT_PWXFORM) +static const char * const itoa64 = + "./0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + +static uint8_t * encode64_uint32(uint8_t * dst, size_t dstlen, + uint32_t src, uint32_t srcbits) +{ + uint32_t bit; + + for (bit = 0; bit < srcbits; bit += 6) { + if (dstlen < 1) + return NULL; + *dst++ = itoa64[src & 0x3f]; + dstlen--; + src >>= 6; + } + + return dst; +} + +static uint8_t * encode64(uint8_t * dst, size_t dstlen, + const uint8_t * src, size_t srclen) +{ + size_t i; + + for (i = 0; i < srclen; ) { + uint8_t * dnext; + uint32_t value = 0, bits = 0; + do { + value |= (uint32_t)src[i++] << bits; + bits += 8; + } while (bits < 24 && i < srclen); + dnext = encode64_uint32(dst, dstlen, value, bits); + if (!dnext) + return NULL; + dstlen -= dnext - dst; + dst = dnext; + } + + return dst; +} + +static int decode64_one(uint32_t * dst, uint8_t src) +{ + const char * ptr = strchr(itoa64, src); + if (ptr) { + *dst = ptr - itoa64; + return 0; + } + *dst = 0; + return -1; +} + +static const uint8_t * decode64_uint32(uint32_t * dst, uint32_t dstbits, + const uint8_t * src) +{ + uint32_t bit; + uint32_t value; + + value = 0; + for (bit = 0; bit < dstbits; bit += 6) { + uint32_t one; + if (decode64_one(&one, *src)) { + *dst = 0; + return NULL; + } + src++; + value |= one << bit; + } + + *dst = value; + return src; +} + +uint8_t * +yescrypt_r(const yescrypt_shared_t * shared, yescrypt_local_t * local, + const uint8_t * passwd, size_t passwdlen, + const uint8_t * setting, + uint8_t * buf, size_t buflen) +{ + uint8_t hash[HASH_SIZE]; + const uint8_t * src, * salt; + uint8_t * dst; + size_t prefixlen, saltlen, need; + uint8_t version; + uint64_t N; + uint32_t r, p; + 
yescrypt_flags_t flags = YESCRYPT_WORM; + fflush(stdout); + if (setting[0] != '$' || setting[1] != '7') + { + fflush(stdout); + return NULL; + } + fflush(stdout); + src = setting + 2; + fflush(stdout); + switch ((version = *src)) { + case '$': + fflush(stdout); + break; + case 'X': + src++; + flags = YESCRYPT_RW; + fflush(stdout); + break; + default: + { + fflush(stdout); + return NULL; + } + } + + fflush(stdout); + if (*src != '$') { + uint32_t decoded_flags; + if (decode64_one(&decoded_flags, *src)) + + { + fflush(stdout); + return NULL; + } + flags = decoded_flags; + if (*++src != '$') + { + fflush(stdout); + return NULL; + } + } + src++; + + { + uint32_t N_log2; + if (decode64_one(&N_log2, *src)) + { + return NULL; + } + src++; + N = (uint64_t)1 << N_log2; + } + + src = decode64_uint32(&r, 30, src); + if (!src) + { + return NULL; + } + + src = decode64_uint32(&p, 30, src); + if (!src) + { + return NULL; + } + + prefixlen = src - setting; + + salt = src; + src = (uint8_t *)strrchr((char *)salt, '$'); + if (src) + saltlen = src - salt; + else + saltlen = strlen((char *)salt); + + need = prefixlen + saltlen + 1 + HASH_LEN + 1; + if (need > buflen || need < saltlen) + + { + fflush(stdout); + return NULL; + } + +fflush(stdout); + if (yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, + N, r, p, 0, flags, hash, sizeof(hash))) + { + fflush(stdout); + return NULL; + } + + dst = buf; + memcpy(dst, setting, prefixlen + saltlen); + dst += prefixlen + saltlen; + *dst++ = '$'; + + dst = encode64(dst, buflen - (dst - buf), hash, sizeof(hash)); + /* Could zeroize hash[] here, but yescrypt_kdf() doesn't zeroize its + * memory allocations yet anyway. 
*/ + if (!dst || dst >= buf + buflen) /* Can't happen */ + { + return NULL; + } + + *dst = 0; /* NUL termination */ + fflush(stdout); + return buf; +} + +uint8_t * +yescrypt(const uint8_t * passwd, const uint8_t * setting) +{ + static uint8_t buf[4 + 1 + 5 + 5 + BYTES2CHARS(32) + 1 + HASH_LEN + 1]; + yescrypt_shared_t shared; + yescrypt_local_t local; + uint8_t * retval; + if (yescrypt_init_shared(&shared, NULL, 0, + 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0)) + return NULL; + if (yescrypt_init_local(&local)) { + yescrypt_free_shared(&shared); + return NULL; + } + retval = yescrypt_r(&shared, &local, + passwd, 80, setting, buf, sizeof(buf)); + // printf("hashse='%s'\n", (char *)retval); + if (yescrypt_free_local(&local)) { + yescrypt_free_shared(&shared); + return NULL; + } + if (yescrypt_free_shared(&shared)) + return NULL; + return retval; + +} + +uint8_t * +yescrypt_gensalt_r(uint32_t N_log2, uint32_t r, uint32_t p, + yescrypt_flags_t flags, + const uint8_t * src, size_t srclen, + uint8_t * buf, size_t buflen) +{ + uint8_t * dst; + size_t prefixlen = 3 + 1 + 5 + 5; + size_t saltlen = BYTES2CHARS(srclen); + size_t need; + + if (p == 1) + flags &= ~YESCRYPT_PARALLEL_SMIX; + + if (flags) { + if (flags & ~0x3f) + return NULL; + + prefixlen++; + if (flags != YESCRYPT_RW) + prefixlen++; + } + + need = prefixlen + saltlen + 1; + if (need > buflen || need < saltlen || saltlen < srclen) + return NULL; + + if (N_log2 > 63 || ((uint64_t)r * (uint64_t)p >= (1U << 30))) + return NULL; + + dst = buf; + *dst++ = '$'; + *dst++ = '7'; + if (flags) { + *dst++ = 'X'; /* eXperimental, subject to change */ + if (flags != YESCRYPT_RW) + *dst++ = itoa64[flags]; + } + *dst++ = '$'; + + *dst++ = itoa64[N_log2]; + + dst = encode64_uint32(dst, buflen - (dst - buf), r, 30); + if (!dst) /* Can't happen */ + return NULL; + + dst = encode64_uint32(dst, buflen - (dst - buf), p, 30); + if (!dst) /* Can't happen */ + return NULL; + + dst = encode64(dst, buflen - (dst - buf), src, srclen); + 
if (!dst || dst >= buf + buflen) /* Can't happen */ + return NULL; + + *dst = 0; /* NUL termination */ + + return buf; +} + +uint8_t * +yescrypt_gensalt(uint32_t N_log2, uint32_t r, uint32_t p, + yescrypt_flags_t flags, + const uint8_t * src, size_t srclen) +{ + static uint8_t buf[4 + 1 + 5 + 5 + BYTES2CHARS(32) + 1]; + return yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen, + buf, sizeof(buf)); +} + +static int +yescrypt_bsty(const uint8_t * passwd, size_t passwdlen, + const uint8_t * salt, size_t saltlen, uint64_t N, uint32_t r, uint32_t p, + uint8_t * buf, size_t buflen) +{ + +#ifdef WIN32 + static __declspec(thread) int initialized = 0; + static __declspec(thread) yescrypt_shared_t shared; + static __declspec(thread) yescrypt_local_t local; +#else + static __thread int initialized = 0; + static __thread yescrypt_shared_t shared; + static __thread yescrypt_local_t local; +#endif + + int retval; + if (!initialized) { +/* "shared" could in fact be shared, but it's simpler to keep it private + * along with "local". It's dummy and tiny anyway. 
*/ + if (yescrypt_init_shared(&shared, NULL, 0, + 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0)) + return -1; + if (yescrypt_init_local(&local)) { + yescrypt_free_shared(&shared); + return -1; + } + initialized = 1; + } + retval = yescrypt_kdf(&shared, &local, + passwd, passwdlen, salt, saltlen, N, r, p, 0, YESCRYPT_FLAGS, + buf, buflen); + + return retval; +} + +void yescrypt_hash(const unsigned char *input, unsigned char *output) +{ + + yescrypt_bsty((const uint8_t *)input, 80, (const uint8_t *) input, 80, 2048, 8, 1, (uint8_t *)output, 32); +} diff --git a/stats.cpp b/stats.cpp index f5a2d8280e..e30b4015ce 100644 --- a/stats.cpp +++ b/stats.cpp @@ -25,7 +25,7 @@ extern int opt_statsavg; */ void stats_remember_speed(int thr_id, uint32_t hashcount, double hashrate, uint8_t found, uint32_t height) { - const uint8_t gpu = (uint8_t) device_map[thr_id]; + uint8_t gpu = (uint8_t) device_map[thr_id]; const uint64_t key = ((uid++ % UINT32_MAX) << 32) + gpu; stats_data data; // to enough hashes to give right stats @@ -61,7 +61,8 @@ void stats_remember_speed(int thr_id, uint32_t hashcount, double hashrate, uint8 */ double stats_get_speed(int thr_id, double def_speed) { - const uint64_t gpu = device_map[thr_id]; + uint64_t gpu = device_map[thr_id]; + const uint64_t keymsk = 0xffULL; // last u8 is the gpu double speed = 0.0; int records = 0; diff --git a/sysinfos.cpp b/sysinfos.cpp index 4515830be2..6639411f36 100644 --- a/sysinfos.cpp +++ b/sysinfos.cpp @@ -6,11 +6,11 @@ * tpruvot 2014 */ -#include +#include #include -#include -#include - +#include +#include +using namespace std; #include "miner.h" #ifndef WIN32 @@ -46,12 +46,11 @@ static uint32_t linux_cpufreq(int core) FILE *fd = fopen(CPUFREQ_PATH, "r"); uint32_t freq = 0; - if (!fd) - return freq; - - if (!fscanf(fd, "%d", &freq)) - return freq; - + if(!fd) + { + fscanf(fd, "%d", &freq); + fclose(fd); + } return freq; } diff --git a/uint256.h b/uint256.h deleted file mode 100644 index 2a252c94f3..0000000000 --- 
a/uint256.h +++ /dev/null @@ -1,784 +0,0 @@ -// Copyright (c) 2009-2010 Satoshi Nakamoto -// Copyright (c) 2009-2012 The Bitcoin developers -// Distributed under the MIT/X11 software license, see the accompanying -// file COPYING or http://www.opensource.org/licenses/mit-license.php. -#ifndef BITCOIN_UINT256_H -#define BITCOIN_UINT256_H - -#include -#include -#include -#include -#include -#include - -typedef long long int64; -typedef unsigned long long uint64; - - -inline int Testuint256AdHoc(std::vector vArg); - - - -/** Base class without constructors for uint256 and uint160. - * This makes the compiler let you use it in a union. - */ -template -class base_uint -{ -protected: - enum { WIDTH=BITS/32 }; - uint32_t pn[WIDTH]; -public: - - bool operator!() const - { - for (int i = 0; i < WIDTH; i++) - if (pn[i] != 0) - return false; - return true; - } - - const base_uint operator~() const - { - base_uint ret; - for (int i = 0; i < WIDTH; i++) - ret.pn[i] = ~pn[i]; - return ret; - } - - const base_uint operator-() const - { - base_uint ret; - for (int i = 0; i < WIDTH; i++) - ret.pn[i] = ~pn[i]; - ret++; - return ret; - } - - double getdouble() const - { - double ret = 0.0; - double fact = 1.0; - for (int i = 0; i < WIDTH; i++) { - ret += fact * pn[i]; - fact *= 4294967296.0; - } - return ret; - } - - base_uint& operator=(uint64 b) - { - pn[0] = (unsigned int)b; - pn[1] = (unsigned int)(b >> 32); - for (int i = 2; i < WIDTH; i++) - pn[i] = 0; - return *this; - } - - base_uint& operator^=(const base_uint& b) - { - for (int i = 0; i < WIDTH; i++) - pn[i] ^= b.pn[i]; - return *this; - } - - base_uint& operator&=(const base_uint& b) - { - for (int i = 0; i < WIDTH; i++) - pn[i] &= b.pn[i]; - return *this; - } - - base_uint& operator|=(const base_uint& b) - { - for (int i = 0; i < WIDTH; i++) - pn[i] |= b.pn[i]; - return *this; - } - - base_uint& operator^=(uint64 b) - { - pn[0] ^= (unsigned int)b; - pn[1] ^= (unsigned int)(b >> 32); - return *this; - } - - base_uint& 
operator|=(uint64 b) - { - pn[0] |= (unsigned int)b; - pn[1] |= (unsigned int)(b >> 32); - return *this; - } - - base_uint& operator<<=(unsigned int shift) - { - base_uint a(*this); - for (int i = 0; i < WIDTH; i++) - pn[i] = 0; - int k = shift / 32; - shift = shift % 32; - for (int i = 0; i < WIDTH; i++) - { - if (i+k+1 < WIDTH && shift != 0) - pn[i+k+1] |= (a.pn[i] >> (32-shift)); - if (i+k < WIDTH) - pn[i+k] |= (a.pn[i] << shift); - } - return *this; - } - - base_uint& operator>>=(unsigned int shift) - { - base_uint a(*this); - for (int i = 0; i < WIDTH; i++) - pn[i] = 0; - int k = shift / 32; - shift = shift % 32; - for (int i = 0; i < WIDTH; i++) - { - if (i-k-1 >= 0 && shift != 0) - pn[i-k-1] |= (a.pn[i] << (32-shift)); - if (i-k >= 0) - pn[i-k] |= (a.pn[i] >> shift); - } - return *this; - } - - base_uint& operator+=(const base_uint& b) - { - uint64 carry = 0; - for (int i = 0; i < WIDTH; i++) - { - uint64 n = carry + pn[i] + b.pn[i]; - pn[i] = n & 0xffffffff; - carry = n >> 32; - } - return *this; - } - - base_uint& operator-=(const base_uint& b) - { - *this += -b; - return *this; - } - - base_uint& operator+=(uint64 b64) - { - base_uint b; - b = b64; - *this += b; - return *this; - } - - base_uint& operator-=(uint64 b64) - { - base_uint b; - b = b64; - *this += -b; - return *this; - } - - - base_uint& operator++() - { - // prefix operator - int i = 0; - while (++pn[i] == 0 && i < WIDTH-1) - i++; - return *this; - } - - const base_uint operator++(int) - { - // postfix operator - const base_uint ret = *this; - ++(*this); - return ret; - } - - base_uint& operator--() - { - // prefix operator - int i = 0; - while (--pn[i] == -1 && i < WIDTH-1) - i++; - return *this; - } - - const base_uint operator--(int) - { - // postfix operator - const base_uint ret = *this; - --(*this); - return ret; - } - - - friend inline bool operator<(const base_uint& a, const base_uint& b) - { - for (int i = base_uint::WIDTH-1; i >= 0; i--) - { - if (a.pn[i] < b.pn[i]) - return true; - 
else if (a.pn[i] > b.pn[i]) - return false; - } - return false; - } - - friend inline bool operator<=(const base_uint& a, const base_uint& b) - { - for (int i = base_uint::WIDTH-1; i >= 0; i--) - { - if (a.pn[i] < b.pn[i]) - return true; - else if (a.pn[i] > b.pn[i]) - return false; - } - return true; - } - - friend inline bool operator>(const base_uint& a, const base_uint& b) - { - for (int i = base_uint::WIDTH-1; i >= 0; i--) - { - if (a.pn[i] > b.pn[i]) - return true; - else if (a.pn[i] < b.pn[i]) - return false; - } - return false; - } - - friend inline bool operator>=(const base_uint& a, const base_uint& b) - { - for (int i = base_uint::WIDTH-1; i >= 0; i--) - { - if (a.pn[i] > b.pn[i]) - return true; - else if (a.pn[i] < b.pn[i]) - return false; - } - return true; - } - - friend inline bool operator==(const base_uint& a, const base_uint& b) - { - for (int i = 0; i < base_uint::WIDTH; i++) - if (a.pn[i] != b.pn[i]) - return false; - return true; - } - - friend inline bool operator==(const base_uint& a, uint64 b) - { - if (a.pn[0] != (unsigned int)b) - return false; - if (a.pn[1] != (unsigned int)(b >> 32)) - return false; - for (int i = 2; i < base_uint::WIDTH; i++) - if (a.pn[i] != 0) - return false; - return true; - } - - friend inline bool operator!=(const base_uint& a, const base_uint& b) - { - return (!(a == b)); - } - - friend inline bool operator!=(const base_uint& a, uint64 b) - { - return (!(a == b)); - } - - - - std::string GetHex() const - { - char psz[sizeof(pn)*2 + 1]; - for (unsigned int i = 0; i < sizeof(pn); i++) - sprintf(psz + i*2, "%02x", ((unsigned char*)pn)[sizeof(pn) - i - 1]); - return std::string(psz, psz + sizeof(pn)*2); - } - - void SetHex(const char* psz) - { - for (int i = 0; i < WIDTH; i++) - pn[i] = 0; - - // skip leading spaces - while (isspace(*psz)) - psz++; - - // skip 0x - if (psz[0] == '0' && tolower(psz[1]) == 'x') - psz += 2; - - // hex string to uint - static const unsigned char phexdigit[256] = { 
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,1,2,3,4,5,6,7,8,9,0,0,0,0,0,0, 0,0xa,0xb,0xc,0xd,0xe,0xf,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0xa,0xb,0xc,0xd,0xe,0xf,0,0,0,0,0,0,0,0,0 }; - const char* pbegin = psz; - while (phexdigit[(unsigned char)*psz] || *psz == '0') - psz++; - psz--; - unsigned char* p1 = (unsigned char*)pn; - unsigned char* pend = p1 + WIDTH * 4; - while (psz >= pbegin && p1 < pend) - { - *p1 = phexdigit[(unsigned char)*psz--]; - if (psz >= pbegin) - { - *p1 |= (phexdigit[(unsigned char)*psz--] << 4); - p1++; - } - } - } - - void SetHex(const std::string& str) - { - SetHex(str.c_str()); - } - - std::string ToString() const - { - return (GetHex()); - } - - unsigned char* begin() - { - return (unsigned char*)&pn[0]; - } - - unsigned char* end() - { - return (unsigned char*)&pn[WIDTH]; - } - - const unsigned char* begin() const - { - return (unsigned char*)&pn[0]; - } - - const unsigned char* end() const - { - return (unsigned char*)&pn[WIDTH]; - } - - unsigned int size() const - { - return sizeof(pn); - } - - uint64 Get64(int n=0) const - { - return pn[2*n] | (uint64)pn[2*n+1] << 32; - } - -// unsigned int GetSerializeSize(int nType=0, int nVersion=PROTOCOL_VERSION) const - unsigned int GetSerializeSize(int nType, int nVersion) const - { - return sizeof(pn); - } - - template -// void Serialize(Stream& s, int nType=0, int nVersion=PROTOCOL_VERSION) const - void Serialize(Stream& s, int nType, int nVersion) const - { - s.write((char*)pn, sizeof(pn)); - } - - template -// void Unserialize(Stream& s, int nType=0, int nVersion=PROTOCOL_VERSION) - void Unserialize(Stream& s, int nType, int nVersion) - { - s.read((char*)pn, sizeof(pn)); - } - - - friend class uint160; - friend class uint256; - friend inline int Testuint256AdHoc(std::vector vArg); -}; - -typedef base_uint<160> base_uint160; -typedef base_uint<256> base_uint256; - - - -// -// uint160 and uint256 could be implemented as 
templates, but to keep -// compile errors and debugging cleaner, they're copy and pasted. -// - - - -////////////////////////////////////////////////////////////////////////////// -// -// uint160 -// - -/** 160-bit unsigned integer */ -class uint160 : public base_uint160 -{ -public: - typedef base_uint160 basetype; - - uint160() - { - for (int i = 0; i < WIDTH; i++) - pn[i] = 0; - } - - uint160(const basetype& b) - { - for (int i = 0; i < WIDTH; i++) - pn[i] = b.pn[i]; - } - - uint160& operator=(const basetype& b) - { - for (int i = 0; i < WIDTH; i++) - pn[i] = b.pn[i]; - return *this; - } - - uint160(uint64 b) - { - pn[0] = (unsigned int)b; - pn[1] = (unsigned int)(b >> 32); - for (int i = 2; i < WIDTH; i++) - pn[i] = 0; - } - - uint160& operator=(uint64 b) - { - pn[0] = (unsigned int)b; - pn[1] = (unsigned int)(b >> 32); - for (int i = 2; i < WIDTH; i++) - pn[i] = 0; - return *this; - } - - explicit uint160(const std::string& str) - { - SetHex(str); - } - - explicit uint160(const std::vector& vch) - { - if (vch.size() == sizeof(pn)) - memcpy(pn, &vch[0], sizeof(pn)); - else - *this = 0; - } -}; - -inline bool operator==(const uint160& a, uint64 b) { return (base_uint160)a == b; } -inline bool operator!=(const uint160& a, uint64 b) { return (base_uint160)a != b; } -inline const uint160 operator<<(const base_uint160& a, unsigned int shift) { return uint160(a) <<= shift; } -inline const uint160 operator>>(const base_uint160& a, unsigned int shift) { return uint160(a) >>= shift; } -inline const uint160 operator<<(const uint160& a, unsigned int shift) { return uint160(a) <<= shift; } -inline const uint160 operator>>(const uint160& a, unsigned int shift) { return uint160(a) >>= shift; } - -inline const uint160 operator^(const base_uint160& a, const base_uint160& b) { return uint160(a) ^= b; } -inline const uint160 operator&(const base_uint160& a, const base_uint160& b) { return uint160(a) &= b; } -inline const uint160 operator|(const base_uint160& a, const 
base_uint160& b) { return uint160(a) |= b; } -inline const uint160 operator+(const base_uint160& a, const base_uint160& b) { return uint160(a) += b; } -inline const uint160 operator-(const base_uint160& a, const base_uint160& b) { return uint160(a) -= b; } - -inline bool operator<(const base_uint160& a, const uint160& b) { return (base_uint160)a < (base_uint160)b; } -inline bool operator<=(const base_uint160& a, const uint160& b) { return (base_uint160)a <= (base_uint160)b; } -inline bool operator>(const base_uint160& a, const uint160& b) { return (base_uint160)a > (base_uint160)b; } -inline bool operator>=(const base_uint160& a, const uint160& b) { return (base_uint160)a >= (base_uint160)b; } -inline bool operator==(const base_uint160& a, const uint160& b) { return (base_uint160)a == (base_uint160)b; } -inline bool operator!=(const base_uint160& a, const uint160& b) { return (base_uint160)a != (base_uint160)b; } -inline const uint160 operator^(const base_uint160& a, const uint160& b) { return (base_uint160)a ^ (base_uint160)b; } -inline const uint160 operator&(const base_uint160& a, const uint160& b) { return (base_uint160)a & (base_uint160)b; } -inline const uint160 operator|(const base_uint160& a, const uint160& b) { return (base_uint160)a | (base_uint160)b; } -inline const uint160 operator+(const base_uint160& a, const uint160& b) { return (base_uint160)a + (base_uint160)b; } -inline const uint160 operator-(const base_uint160& a, const uint160& b) { return (base_uint160)a - (base_uint160)b; } - -inline bool operator<(const uint160& a, const base_uint160& b) { return (base_uint160)a < (base_uint160)b; } -inline bool operator<=(const uint160& a, const base_uint160& b) { return (base_uint160)a <= (base_uint160)b; } -inline bool operator>(const uint160& a, const base_uint160& b) { return (base_uint160)a > (base_uint160)b; } -inline bool operator>=(const uint160& a, const base_uint160& b) { return (base_uint160)a >= (base_uint160)b; } -inline bool operator==(const 
uint160& a, const base_uint160& b) { return (base_uint160)a == (base_uint160)b; } -inline bool operator!=(const uint160& a, const base_uint160& b) { return (base_uint160)a != (base_uint160)b; } -inline const uint160 operator^(const uint160& a, const base_uint160& b) { return (base_uint160)a ^ (base_uint160)b; } -inline const uint160 operator&(const uint160& a, const base_uint160& b) { return (base_uint160)a & (base_uint160)b; } -inline const uint160 operator|(const uint160& a, const base_uint160& b) { return (base_uint160)a | (base_uint160)b; } -inline const uint160 operator+(const uint160& a, const base_uint160& b) { return (base_uint160)a + (base_uint160)b; } -inline const uint160 operator-(const uint160& a, const base_uint160& b) { return (base_uint160)a - (base_uint160)b; } - -inline bool operator<(const uint160& a, const uint160& b) { return (base_uint160)a < (base_uint160)b; } -inline bool operator<=(const uint160& a, const uint160& b) { return (base_uint160)a <= (base_uint160)b; } -inline bool operator>(const uint160& a, const uint160& b) { return (base_uint160)a > (base_uint160)b; } -inline bool operator>=(const uint160& a, const uint160& b) { return (base_uint160)a >= (base_uint160)b; } -inline bool operator==(const uint160& a, const uint160& b) { return (base_uint160)a == (base_uint160)b; } -inline bool operator!=(const uint160& a, const uint160& b) { return (base_uint160)a != (base_uint160)b; } -inline const uint160 operator^(const uint160& a, const uint160& b) { return (base_uint160)a ^ (base_uint160)b; } -inline const uint160 operator&(const uint160& a, const uint160& b) { return (base_uint160)a & (base_uint160)b; } -inline const uint160 operator|(const uint160& a, const uint160& b) { return (base_uint160)a | (base_uint160)b; } -inline const uint160 operator+(const uint160& a, const uint160& b) { return (base_uint160)a + (base_uint160)b; } -inline const uint160 operator-(const uint160& a, const uint160& b) { return (base_uint160)a - (base_uint160)b; } 
- - - - - - -////////////////////////////////////////////////////////////////////////////// -// -// uint256 -// - -/** 256-bit unsigned integer */ -class uint256 : public base_uint256 -{ -public: - typedef base_uint256 basetype; - - uint256() - { - for (int i = 0; i < WIDTH; i++) - pn[i] = 0; - } - - uint256(const basetype& b) - { - for (int i = 0; i < WIDTH; i++) - pn[i] = b.pn[i]; - } - - uint256& operator=(const basetype& b) - { - for (int i = 0; i < WIDTH; i++) - pn[i] = b.pn[i]; - return *this; - } - - uint256(uint64 b) - { - pn[0] = (unsigned int)b; - pn[1] = (unsigned int)(b >> 32); - for (int i = 2; i < WIDTH; i++) - pn[i] = 0; - } - - uint256& operator=(uint64 b) - { - pn[0] = (unsigned int)b; - pn[1] = (unsigned int)(b >> 32); - for (int i = 2; i < WIDTH; i++) - pn[i] = 0; - return *this; - } - - explicit uint256(const std::string& str) - { - SetHex(str); - } - - explicit uint256(const std::vector& vch) - { - if (vch.size() == sizeof(pn)) - memcpy(pn, &vch[0], sizeof(pn)); - else - *this = 0; - } -}; - -inline bool operator==(const uint256& a, uint64 b) { return (base_uint256)a == b; } -inline bool operator!=(const uint256& a, uint64 b) { return (base_uint256)a != b; } -inline const uint256 operator<<(const base_uint256& a, unsigned int shift) { return uint256(a) <<= shift; } -inline const uint256 operator>>(const base_uint256& a, unsigned int shift) { return uint256(a) >>= shift; } -inline const uint256 operator<<(const uint256& a, unsigned int shift) { return uint256(a) <<= shift; } -inline const uint256 operator>>(const uint256& a, unsigned int shift) { return uint256(a) >>= shift; } - -inline const uint256 operator^(const base_uint256& a, const base_uint256& b) { return uint256(a) ^= b; } -inline const uint256 operator&(const base_uint256& a, const base_uint256& b) { return uint256(a) &= b; } -inline const uint256 operator|(const base_uint256& a, const base_uint256& b) { return uint256(a) |= b; } -inline const uint256 operator+(const base_uint256& a, 
const base_uint256& b) { return uint256(a) += b; } -inline const uint256 operator-(const base_uint256& a, const base_uint256& b) { return uint256(a) -= b; } - -inline bool operator<(const base_uint256& a, const uint256& b) { return (base_uint256)a < (base_uint256)b; } -inline bool operator<=(const base_uint256& a, const uint256& b) { return (base_uint256)a <= (base_uint256)b; } -inline bool operator>(const base_uint256& a, const uint256& b) { return (base_uint256)a > (base_uint256)b; } -inline bool operator>=(const base_uint256& a, const uint256& b) { return (base_uint256)a >= (base_uint256)b; } -inline bool operator==(const base_uint256& a, const uint256& b) { return (base_uint256)a == (base_uint256)b; } -inline bool operator!=(const base_uint256& a, const uint256& b) { return (base_uint256)a != (base_uint256)b; } -inline const uint256 operator^(const base_uint256& a, const uint256& b) { return (base_uint256)a ^ (base_uint256)b; } -inline const uint256 operator&(const base_uint256& a, const uint256& b) { return (base_uint256)a & (base_uint256)b; } -inline const uint256 operator|(const base_uint256& a, const uint256& b) { return (base_uint256)a | (base_uint256)b; } -inline const uint256 operator+(const base_uint256& a, const uint256& b) { return (base_uint256)a + (base_uint256)b; } -inline const uint256 operator-(const base_uint256& a, const uint256& b) { return (base_uint256)a - (base_uint256)b; } - -inline bool operator<(const uint256& a, const base_uint256& b) { return (base_uint256)a < (base_uint256)b; } -inline bool operator<=(const uint256& a, const base_uint256& b) { return (base_uint256)a <= (base_uint256)b; } -inline bool operator>(const uint256& a, const base_uint256& b) { return (base_uint256)a > (base_uint256)b; } -inline bool operator>=(const uint256& a, const base_uint256& b) { return (base_uint256)a >= (base_uint256)b; } -inline bool operator==(const uint256& a, const base_uint256& b) { return (base_uint256)a == (base_uint256)b; } -inline bool 
operator!=(const uint256& a, const base_uint256& b) { return (base_uint256)a != (base_uint256)b; } -inline const uint256 operator^(const uint256& a, const base_uint256& b) { return (base_uint256)a ^ (base_uint256)b; } -inline const uint256 operator&(const uint256& a, const base_uint256& b) { return (base_uint256)a & (base_uint256)b; } -inline const uint256 operator|(const uint256& a, const base_uint256& b) { return (base_uint256)a | (base_uint256)b; } -inline const uint256 operator+(const uint256& a, const base_uint256& b) { return (base_uint256)a + (base_uint256)b; } -inline const uint256 operator-(const uint256& a, const base_uint256& b) { return (base_uint256)a - (base_uint256)b; } - -inline bool operator<(const uint256& a, const uint256& b) { return (base_uint256)a < (base_uint256)b; } -inline bool operator<=(const uint256& a, const uint256& b) { return (base_uint256)a <= (base_uint256)b; } -inline bool operator>(const uint256& a, const uint256& b) { return (base_uint256)a > (base_uint256)b; } -inline bool operator>=(const uint256& a, const uint256& b) { return (base_uint256)a >= (base_uint256)b; } -inline bool operator==(const uint256& a, const uint256& b) { return (base_uint256)a == (base_uint256)b; } -inline bool operator!=(const uint256& a, const uint256& b) { return (base_uint256)a != (base_uint256)b; } -inline const uint256 operator^(const uint256& a, const uint256& b) { return (base_uint256)a ^ (base_uint256)b; } -inline const uint256 operator&(const uint256& a, const uint256& b) { return (base_uint256)a & (base_uint256)b; } -inline const uint256 operator|(const uint256& a, const uint256& b) { return (base_uint256)a | (base_uint256)b; } -inline const uint256 operator+(const uint256& a, const uint256& b) { return (base_uint256)a + (base_uint256)b; } -inline const uint256 operator-(const uint256& a, const uint256& b) { return (base_uint256)a - (base_uint256)b; } - - - - - - - - - - -#ifdef TEST_UINT256 - -inline int Testuint256AdHoc(std::vector vArg) -{ - 
uint256 g(0); - - - printf("%s\n", g.ToString().c_str()); - g--; printf("g--\n"); - printf("%s\n", g.ToString().c_str()); - g--; printf("g--\n"); - printf("%s\n", g.ToString().c_str()); - g++; printf("g++\n"); - printf("%s\n", g.ToString().c_str()); - g++; printf("g++\n"); - printf("%s\n", g.ToString().c_str()); - g++; printf("g++\n"); - printf("%s\n", g.ToString().c_str()); - g++; printf("g++\n"); - printf("%s\n", g.ToString().c_str()); - - - - uint256 a(7); - printf("a=7\n"); - printf("%s\n", a.ToString().c_str()); - - uint256 b; - printf("b undefined\n"); - printf("%s\n", b.ToString().c_str()); - int c = 3; - - a = c; - a.pn[3] = 15; - printf("%s\n", a.ToString().c_str()); - uint256 k(c); - - a = 5; - a.pn[3] = 15; - printf("%s\n", a.ToString().c_str()); - b = 1; - b <<= 52; - - a |= b; - - a ^= 0x500; - - printf("a %s\n", a.ToString().c_str()); - - a = a | b | (uint256)0x1000; - - - printf("a %s\n", a.ToString().c_str()); - printf("b %s\n", b.ToString().c_str()); - - a = 0xfffffffe; - a.pn[4] = 9; - - printf("%s\n", a.ToString().c_str()); - a++; - printf("%s\n", a.ToString().c_str()); - a++; - printf("%s\n", a.ToString().c_str()); - a++; - printf("%s\n", a.ToString().c_str()); - a++; - printf("%s\n", a.ToString().c_str()); - - a--; - printf("%s\n", a.ToString().c_str()); - a--; - printf("%s\n", a.ToString().c_str()); - a--; - printf("%s\n", a.ToString().c_str()); - uint256 d = a--; - printf("%s\n", d.ToString().c_str()); - printf("%s\n", a.ToString().c_str()); - a--; - printf("%s\n", a.ToString().c_str()); - a--; - printf("%s\n", a.ToString().c_str()); - - d = a; - - printf("%s\n", d.ToString().c_str()); - for (int i = uint256::WIDTH-1; i >= 0; i--) printf("%08x", d.pn[i]); printf("\n"); - - uint256 neg = d; - neg = ~neg; - printf("%s\n", neg.ToString().c_str()); - - - uint256 e = uint256("0xABCDEF123abcdef12345678909832180000011111111"); - printf("\n"); - printf("%s\n", e.ToString().c_str()); - - - printf("\n"); - uint256 x1 = 
uint256("0xABCDEF123abcdef12345678909832180000011111111"); - uint256 x2; - printf("%s\n", x1.ToString().c_str()); - for (int i = 0; i < 270; i += 4) - { - x2 = x1 << i; - printf("%s\n", x2.ToString().c_str()); - } - - printf("\n"); - printf("%s\n", x1.ToString().c_str()); - for (int i = 0; i < 270; i += 4) - { - x2 = x1; - x2 >>= i; - printf("%s\n", x2.ToString().c_str()); - } - - - for (int i = 0; i < 100; i++) - { - uint256 k = (~uint256(0) >> i); - printf("%s\n", k.ToString().c_str()); - } - - for (int i = 0; i < 100; i++) - { - uint256 k = (~uint256(0) << i); - printf("%s\n", k.ToString().c_str()); - } - - return (0); -} - -#endif - -#endif diff --git a/util.c b/util.c deleted file mode 100644 index 350665ff2d..0000000000 --- a/util.c +++ /dev/null @@ -1,1617 +0,0 @@ -/* - * Copyright 2010 Jeff Garzik - * Copyright 2012-2014 pooler - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. See COPYING for more details. - */ - -#define _GNU_SOURCE -#include "cpuminer-config.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef WIN32 -#include "compat/winansi.h" -#include -#include -#else -#include -#include -#include -#include -#endif -#include "compat.h" -#include "miner.h" -#include "elist.h" - -struct data_buffer { - void *buf; - size_t len; -}; - -struct upload_buffer { - const void *buf; - size_t len; - size_t pos; -}; - -struct header_info { - char *lp_path; - char *reason; - char *stratum_url; -}; - -struct tq_ent { - void *data; - struct list_head q_node; -}; - -struct thread_q { - struct list_head q; - - bool frozen; - - pthread_mutex_t mutex; - pthread_cond_t cond; -}; - -void applog(int prio, const char *fmt, ...) 
-{ - va_list ap; - - va_start(ap, fmt); - -#ifdef HAVE_SYSLOG_H - if (use_syslog) { - va_list ap2; - char *buf; - int len; - - /* custom colors to syslog prio */ - if (prio > LOG_DEBUG) { - switch (prio) { - case LOG_BLUE: prio = LOG_NOTICE; break; - } - } - - va_copy(ap2, ap); - len = vsnprintf(NULL, 0, fmt, ap2) + 1; - va_end(ap2); - buf = alloca(len); - if (vsnprintf(buf, len, fmt, ap) >= 0) - syslog(prio, "%s", buf); - } -#else - if (0) {} -#endif - else { - const char* color = ""; - char *f; - int len; - time_t now; - struct tm tm, *tm_p; - - time(&now); - - pthread_mutex_lock(&applog_lock); - tm_p = localtime(&now); - memcpy(&tm, tm_p, sizeof(tm)); - pthread_mutex_unlock(&applog_lock); - - switch (prio) { - case LOG_ERR: color = CL_RED; break; - case LOG_WARNING: color = CL_YLW; break; - case LOG_NOTICE: color = CL_WHT; break; - case LOG_INFO: color = ""; break; - case LOG_DEBUG: color = CL_GRY; break; - - case LOG_BLUE: - prio = LOG_NOTICE; - color = CL_CYN; - break; - } - if (!use_colors) - color = ""; - - len = 40 + (int) strlen(fmt) + 2; - f = (char*) alloca(len); - sprintf(f, "[%d-%02d-%02d %02d:%02d:%02d]%s %s%s\n", - tm.tm_year + 1900, - tm.tm_mon + 1, - tm.tm_mday, - tm.tm_hour, - tm.tm_min, - tm.tm_sec, - color, - fmt, - use_colors ? 
CL_N : "" - ); - pthread_mutex_lock(&applog_lock); - vfprintf(stderr, f, ap); /* atomic write to stderr */ - fflush(stderr); - pthread_mutex_unlock(&applog_lock); - } - va_end(ap); -} - -static void databuf_free(struct data_buffer *db) -{ - if (!db) - return; - - free(db->buf); - - memset(db, 0, sizeof(*db)); -} - -static size_t all_data_cb(const void *ptr, size_t size, size_t nmemb, - void *user_data) -{ - struct data_buffer *db = (struct data_buffer *)user_data; - size_t len = size * nmemb; - size_t oldlen, newlen; - void *newmem; - static const unsigned char zero = 0; - - oldlen = db->len; - newlen = oldlen + len; - - newmem = realloc(db->buf, newlen + 1); - if (!newmem) - return 0; - - db->buf = newmem; - db->len = newlen; - memcpy((char*)db->buf + oldlen, ptr, len); - memcpy((char*)db->buf + newlen, &zero, 1); /* null terminate */ - - return len; -} - -static size_t upload_data_cb(void *ptr, size_t size, size_t nmemb, - void *user_data) -{ - struct upload_buffer *ub = (struct upload_buffer *)user_data; - unsigned int len = (unsigned int)(size * nmemb); - - if (len > ub->len - ub->pos) - len = (unsigned int)(ub->len - ub->pos); - - if (len) { - memcpy(ptr, (char*)ub->buf + ub->pos, len); - ub->pos += len; - } - - return len; -} - -#if LIBCURL_VERSION_NUM >= 0x071200 -static int seek_data_cb(void *user_data, curl_off_t offset, int origin) -{ - struct upload_buffer *ub = (struct upload_buffer *)user_data; - - switch (origin) { - case SEEK_SET: - ub->pos = (size_t)offset; - break; - case SEEK_CUR: - ub->pos += (size_t)offset; - break; - case SEEK_END: - ub->pos = ub->len + (size_t)offset; - break; - default: - return 1; /* CURL_SEEKFUNC_FAIL */ - } - - return 0; /* CURL_SEEKFUNC_OK */ -} -#endif - -static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data) -{ - struct header_info *hi = (struct header_info *)user_data; - size_t remlen, slen, ptrlen = size * nmemb; - char *rem, *val = NULL, *key = NULL; - void *tmp; - - val = (char*)calloc(1, 
ptrlen); - key = (char*)calloc(1, ptrlen); - if (!key || !val) - goto out; - - tmp = memchr(ptr, ':', ptrlen); - if (!tmp || (tmp == ptr)) /* skip empty keys / blanks */ - goto out; - slen = (size_t)((char*)tmp - (char*)ptr); - if ((slen + 1) == ptrlen) /* skip key w/ no value */ - goto out; - memcpy(key, ptr, slen); /* store & nul term key */ - key[slen] = 0; - - rem = (char*)ptr + slen + 1; /* trim value's leading whitespace */ - remlen = ptrlen - slen - 1; - while ((remlen > 0) && (isspace(*rem))) { - remlen--; - rem++; - } - - memcpy(val, rem, remlen); /* store value, trim trailing ws */ - val[remlen] = 0; - while ((*val) && (isspace(val[strlen(val) - 1]))) { - val[strlen(val) - 1] = 0; - } - if (!*val) /* skip blank value */ - goto out; - - if (!strcasecmp("X-Long-Polling", key)) { - hi->lp_path = val; /* X-Mining-Extensions: longpoll */ - val = NULL; - } - - if (!strcasecmp("X-Reject-Reason", key)) { - hi->reason = val; /* X-Mining-Extensions: reject-reason */ - //applog(LOG_WARNING, "%s:%s", key, val); - val = NULL; - } - - if (!strcasecmp("X-Stratum", key)) { - hi->stratum_url = val; /* steal memory reference */ - val = NULL; - } - - if (!strcasecmp("X-Nonce-Range", key)) { - /* todo when available: X-Mining-Extensions: noncerange */ - } -out: - free(key); - free(val); - return ptrlen; -} - -#if LIBCURL_VERSION_NUM >= 0x070f06 -static int sockopt_keepalive_cb(void *userdata, curl_socket_t fd, - curlsocktype purpose) -{ - int keepalive = 1; - int tcp_keepcnt = 3; - int tcp_keepidle = 50; - int tcp_keepintvl = 50; -#ifdef WIN32 - DWORD outputBytes; -#endif - -#ifndef WIN32 - if (unlikely(setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &keepalive, - sizeof(keepalive)))) - return 1; -#ifdef __linux - if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPCNT, - &tcp_keepcnt, sizeof(tcp_keepcnt)))) - return 1; - if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPIDLE, - &tcp_keepidle, sizeof(tcp_keepidle)))) - return 1; - if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPINTVL, - 
&tcp_keepintvl, sizeof(tcp_keepintvl)))) - return 1; -#endif /* __linux */ -#ifdef __APPLE_CC__ - if (unlikely(setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE, - &tcp_keepintvl, sizeof(tcp_keepintvl)))) - return 1; -#endif /* __APPLE_CC__ */ -#else /* WIN32 */ - struct tcp_keepalive vals; - vals.onoff = 1; - vals.keepalivetime = tcp_keepidle * 1000; - vals.keepaliveinterval = tcp_keepintvl * 1000; - if (unlikely(WSAIoctl(fd, SIO_KEEPALIVE_VALS, &vals, sizeof(vals), - NULL, 0, &outputBytes, NULL, NULL))) - return 1; -#endif /* WIN32 */ - - return 0; -} -#endif - -json_t *json_rpc_call(CURL *curl, const char *url, - const char *userpass, const char *rpc_req, - bool longpoll_scan, bool longpoll, int *curl_err) -{ - json_t *val, *err_val, *res_val; - int rc; - struct data_buffer all_data = {0}; - struct upload_buffer upload_data; - json_error_t err; - struct curl_slist *headers = NULL; - char len_hdr[64], hashrate_hdr[64]; - char curl_err_str[CURL_ERROR_SIZE]; - long timeout = longpoll ? opt_timeout : 30; - struct header_info hi = {0}; - bool lp_scanning = longpoll_scan && !have_longpoll; - - /* it is assumed that 'curl' is freshly [re]initialized at this pt */ - - if (opt_protocol) - curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); - curl_easy_setopt(curl, CURLOPT_URL, url); - if (opt_cert) - curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert); - curl_easy_setopt(curl, CURLOPT_ENCODING, ""); - curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1); - curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); - curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, all_data_cb); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data); - curl_easy_setopt(curl, CURLOPT_READFUNCTION, upload_data_cb); - curl_easy_setopt(curl, CURLOPT_READDATA, &upload_data); -#if LIBCURL_VERSION_NUM >= 0x071200 - curl_easy_setopt(curl, CURLOPT_SEEKFUNCTION, &seek_data_cb); - curl_easy_setopt(curl, CURLOPT_SEEKDATA, &upload_data); -#endif - curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, 
curl_err_str); - curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1); - curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout); - curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, resp_hdr_cb); - curl_easy_setopt(curl, CURLOPT_HEADERDATA, &hi); - if (opt_proxy) { - curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); - curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); - } - if (userpass) { - curl_easy_setopt(curl, CURLOPT_USERPWD, userpass); - curl_easy_setopt(curl, CURLOPT_HTTPAUTH, CURLAUTH_BASIC); - } -#if LIBCURL_VERSION_NUM >= 0x070f06 - if (longpoll) - curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb); -#endif - curl_easy_setopt(curl, CURLOPT_POST, 1); - - if (opt_protocol) - applog(LOG_DEBUG, "JSON protocol request:\n%s", rpc_req); - - upload_data.buf = rpc_req; - upload_data.len = strlen(rpc_req); - upload_data.pos = 0; - sprintf(len_hdr, "Content-Length: %lu", (unsigned long) upload_data.len); - sprintf(hashrate_hdr, "X-Mining-Hashrate: %llu", (unsigned long long) global_hashrate); - - headers = curl_slist_append(headers, "Content-Type: application/json"); - headers = curl_slist_append(headers, len_hdr); - headers = curl_slist_append(headers, "User-Agent: " USER_AGENT); - headers = curl_slist_append(headers, "X-Mining-Extensions: longpoll noncerange reject-reason"); - headers = curl_slist_append(headers, hashrate_hdr); - headers = curl_slist_append(headers, "Accept:"); /* disable Accept hdr*/ - headers = curl_slist_append(headers, "Expect:"); /* disable Expect hdr*/ - - curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); - - rc = curl_easy_perform(curl); - if (curl_err != NULL) - *curl_err = rc; - if (rc) { - if (!(longpoll && rc == CURLE_OPERATION_TIMEDOUT)) - applog(LOG_ERR, "HTTP request failed: %s", curl_err_str); - goto err_out; - } - - /* If X-Stratum was found, activate Stratum */ - if (want_stratum && hi.stratum_url && - !strncasecmp(hi.stratum_url, "stratum+tcp://", 14) && - !(opt_proxy && opt_proxy_type == CURLPROXY_HTTP)) { - 
have_stratum = true; - tq_push(thr_info[stratum_thr_id].q, hi.stratum_url); - hi.stratum_url = NULL; - } - - /* If X-Long-Polling was found, activate long polling */ - if (lp_scanning && hi.lp_path && !have_stratum) { - have_longpoll = true; - tq_push(thr_info[longpoll_thr_id].q, hi.lp_path); - hi.lp_path = NULL; - } - - if (!all_data.buf) { - applog(LOG_ERR, "Empty data received in json_rpc_call."); - goto err_out; - } - - val = JSON_LOADS((const char*)all_data.buf, &err); - if (!val) { - applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); - goto err_out; - } - - if (opt_protocol) { - char *s = json_dumps(val, JSON_INDENT(3)); - applog(LOG_DEBUG, "JSON protocol response:\n%s\n", s); - free(s); - } - - /* JSON-RPC valid response returns a non-null 'result', - * and a null 'error'. */ - res_val = json_object_get(val, "result"); - err_val = json_object_get(val, "error"); - - if (!res_val || json_is_null(res_val) || - (err_val && !json_is_null(err_val))) { - char *s; - - if (err_val) - s = json_dumps(err_val, JSON_INDENT(3)); - else - s = strdup("(unknown reason)"); - - applog(LOG_ERR, "JSON-RPC call failed: %s", s); - - free(s); - - goto err_out; - } - - if (hi.reason) - json_object_set_new(val, "reject-reason", json_string(hi.reason)); - - databuf_free(&all_data); - curl_slist_free_all(headers); - curl_easy_reset(curl); - return val; - -err_out: - free(hi.lp_path); - free(hi.reason); - free(hi.stratum_url); - databuf_free(&all_data); - curl_slist_free_all(headers); - curl_easy_reset(curl); - return NULL; -} - -/** - * Unlike malloc, calloc set the memory to zero - */ -void *aligned_calloc(int size) -{ - const int ALIGN = 64; // cache line -#ifdef _MSC_VER - void* res = _aligned_malloc(size, ALIGN); - memset(res, 0, size); - return res; -#else - void *mem = calloc(1, size+ALIGN+sizeof(void*)); - void **ptr = (void**)((size_t)(mem+ALIGN+sizeof(void*)) & ~(ALIGN-1)); - ptr[-1] = mem; - return ptr; -#endif -} - -void aligned_free(void *ptr) -{ -#ifdef 
_MSC_VER - return _aligned_free(ptr); -#else - free(((void**)ptr)[-1]); -#endif -} - -void cbin2hex(char *out, const char *in, size_t len) -{ - if (out) { - unsigned int i; - for (i = 0; i < len; i++) - sprintf(out + (i * 2), "%02x", (uint8_t)in[i]); - } -} - -char *bin2hex(const unsigned char *in, size_t len) -{ - char *s = (char*)malloc((len * 2) + 1); - if (!s) - return NULL; - - cbin2hex(s, (const char *) in, len); - - return s; -} - -bool hex2bin(unsigned char *p, const char *hexstr, size_t len) -{ - char hex_byte[3]; - char *ep; - - hex_byte[2] = '\0'; - - while (*hexstr && len) { - if (!hexstr[1]) { - applog(LOG_ERR, "hex2bin str truncated"); - return false; - } - hex_byte[0] = hexstr[0]; - hex_byte[1] = hexstr[1]; - *p = (unsigned char) strtol(hex_byte, &ep, 16); - if (*ep) { - applog(LOG_ERR, "hex2bin failed on '%s'", hex_byte); - return false; - } - p++; - hexstr += 2; - len--; - } - - return (len == 0 && *hexstr == 0) ? true : false; -} - -/* Subtract the `struct timeval' values X and Y, - storing the result in RESULT. - Return 1 if the difference is negative, otherwise 0. */ -int timeval_subtract(struct timeval *result, struct timeval *x, - struct timeval *y) -{ - /* Perform the carry for the later subtraction by updating Y. */ - if (x->tv_usec < y->tv_usec) { - int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1; - y->tv_usec -= 1000000 * nsec; - y->tv_sec += nsec; - } - if (x->tv_usec - y->tv_usec > 1000000) { - int nsec = (x->tv_usec - y->tv_usec) / 1000000; - y->tv_usec += 1000000 * nsec; - y->tv_sec -= nsec; - } - - /* Compute the time remaining to wait. - * `tv_usec' is certainly positive. */ - result->tv_sec = x->tv_sec - y->tv_sec; - result->tv_usec = x->tv_usec - y->tv_usec; - - /* Return 1 if result is negative. 
*/ - return x->tv_sec < y->tv_sec; -} - -bool fulltest(const uint32_t *hash, const uint32_t *target) -{ - int i; - bool rc = true; - - for (i = 7; i >= 0; i--) { - if (hash[i] > target[i]) { - rc = false; - break; - } - if (hash[i] < target[i]) { - rc = true; - break; - } - if (hash[1] == target[1]) { - applog(LOG_NOTICE, "We found a close match!"); - } - } - - if (!rc && opt_debug) { - uint32_t hash_be[8], target_be[8]; - char *hash_str, *target_str; - - for (i = 0; i < 8; i++) { - be32enc(hash_be + i, hash[7 - i]); - be32enc(target_be + i, target[7 - i]); - } - hash_str = bin2hex((unsigned char *)hash_be, 32); - target_str = bin2hex((unsigned char *)target_be, 32); - - applog(LOG_DEBUG, "DEBUG: %s\nHash: %s\nTarget: %s", - rc ? "hash <= target" - : CL_YLW "hash > target (false positive)" CL_N, - hash_str, - target_str); - - free(hash_str); - free(target_str); - } - - return rc; -} - -void diff_to_target(uint32_t *target, double diff) -{ - uint64_t m; - int k; - - for (k = 6; k > 0 && diff > 1.0; k--) - diff /= 4294967296.0; - m = (uint64_t)(4294901760.0 / diff); - if (m == 0 && k == 6) - memset(target, 0xff, 32); - else { - memset(target, 0, 32); - target[k] = (uint32_t)m; - target[k + 1] = (uint32_t)(m >> 32); - } -} - -#ifdef WIN32 -#define socket_blocks() (WSAGetLastError() == WSAEWOULDBLOCK) -#else -#define socket_blocks() (errno == EAGAIN || errno == EWOULDBLOCK) -#endif - -static bool send_line(curl_socket_t sock, char *s) -{ - ssize_t len, sent = 0; - - len = (ssize_t)strlen(s); - s[len++] = '\n'; - - while (len > 0) { - struct timeval timeout = {0, 0}; - ssize_t n; - fd_set wd; - - FD_ZERO(&wd); - FD_SET(sock, &wd); - if (select((int)sock + 1, NULL, &wd, NULL, &timeout) < 1) - return false; - n = send(sock, s + sent, len, 0); - if (n < 0) { - if (!socket_blocks()) - return false; - n = 0; - } - sent += n; - len -= n; - } - - return true; -} - -bool stratum_send_line(struct stratum_ctx *sctx, char *s) -{ - bool ret = false; - - if (opt_protocol) - 
applog(LOG_DEBUG, "> %s", s); - - pthread_mutex_lock(&sctx->sock_lock); - ret = send_line(sctx->sock, s); - pthread_mutex_unlock(&sctx->sock_lock); - - return ret; -} - -static bool socket_full(curl_socket_t sock, int timeout) -{ - struct timeval tv; - fd_set rd; - - FD_ZERO(&rd); - FD_SET(sock, &rd); - tv.tv_sec = timeout; - tv.tv_usec = 0; - if (select((int)sock + 1, &rd, NULL, NULL, &tv) > 0) - return true; - return false; -} - -bool stratum_socket_full(struct stratum_ctx *sctx, int timeout) -{ - return strlen(sctx->sockbuf) || socket_full(sctx->sock, timeout); -} - -#define RBUFSIZE 2048 -#define RECVSIZE (RBUFSIZE - 4) - -static void stratum_buffer_append(struct stratum_ctx *sctx, const char *s) -{ - size_t old, snew; - - old = strlen(sctx->sockbuf); - snew = old + strlen(s) + 1; - if (snew >= sctx->sockbuf_size) { - sctx->sockbuf_size = snew + (RBUFSIZE - (snew % RBUFSIZE)); - sctx->sockbuf = (char*)realloc(sctx->sockbuf, sctx->sockbuf_size); - } - strcpy(sctx->sockbuf + old, s); -} - -char *stratum_recv_line(struct stratum_ctx *sctx) -{ - ssize_t len, buflen; - char *tok, *sret = NULL; - - if (!strstr(sctx->sockbuf, "\n")) { - bool ret = true; - time_t rstart; - - time(&rstart); - if (!socket_full(sctx->sock, 60)) { - applog(LOG_ERR, "stratum_recv_line timed out"); - goto out; - } - do { - char s[RBUFSIZE]; - ssize_t n; - - memset(s, 0, RBUFSIZE); - n = recv(sctx->sock, s, RECVSIZE, 0); - if (!n) { - ret = false; - break; - } - if (n < 0) { - if (!socket_blocks() || !socket_full(sctx->sock, 1)) { - ret = false; - break; - } - } else - stratum_buffer_append(sctx, s); - } while (time(NULL) - rstart < 60 && !strstr(sctx->sockbuf, "\n")); - - if (!ret) { - applog(LOG_ERR, "stratum_recv_line failed"); - goto out; - } - } - - buflen = (ssize_t)strlen(sctx->sockbuf); - tok = strtok(sctx->sockbuf, "\n"); - if (!tok) { - applog(LOG_ERR, "stratum_recv_line failed to parse a newline-terminated string"); - goto out; - } - sret = strdup(tok); - len = 
(ssize_t)strlen(sret); - - if (buflen > len + 1) - memmove(sctx->sockbuf, sctx->sockbuf + len + 1, buflen - len + 1); - else - sctx->sockbuf[0] = '\0'; - -out: - if (sret && opt_protocol) - applog(LOG_DEBUG, "< %s", sret); - return sret; -} - -#if LIBCURL_VERSION_NUM >= 0x071101 -static curl_socket_t opensocket_grab_cb(void *clientp, curlsocktype purpose, - struct curl_sockaddr *addr) -{ - curl_socket_t *sock = (curl_socket_t *)clientp; - *sock = socket(addr->family, addr->socktype, addr->protocol); - return *sock; -} -#endif - -bool stratum_connect(struct stratum_ctx *sctx, const char *url) -{ - CURL *curl; - int rc; - - pthread_mutex_lock(&sctx->sock_lock); - if (sctx->curl) - curl_easy_cleanup(sctx->curl); - sctx->curl = curl_easy_init(); - if (!sctx->curl) { - applog(LOG_ERR, "CURL initialization failed"); - pthread_mutex_unlock(&sctx->sock_lock); - return false; - } - curl = sctx->curl; - if (!sctx->sockbuf) { - sctx->sockbuf = (char*)calloc(RBUFSIZE, 1); - sctx->sockbuf_size = RBUFSIZE; - } - sctx->sockbuf[0] = '\0'; - pthread_mutex_unlock(&sctx->sock_lock); - - if (url != sctx->url) { - free(sctx->url); - sctx->url = strdup(url); - } - free(sctx->curl_url); - sctx->curl_url = (char*)malloc(strlen(url)); - sprintf(sctx->curl_url, "http%s", strstr(url, "://")); - - if (opt_protocol) - curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); - curl_easy_setopt(curl, CURLOPT_URL, sctx->curl_url); - curl_easy_setopt(curl, CURLOPT_FRESH_CONNECT, 1); - curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 30); - curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, sctx->curl_err_str); - curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); - curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); - if (opt_proxy && opt_proxy_type != CURLPROXY_HTTP) { - curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); - curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); - } else if (getenv("http_proxy")) { - if (getenv("all_proxy")) - curl_easy_setopt(curl, CURLOPT_PROXY, getenv("all_proxy")); - else if 
(getenv("ALL_PROXY")) - curl_easy_setopt(curl, CURLOPT_PROXY, getenv("ALL_PROXY")); - else - curl_easy_setopt(curl, CURLOPT_PROXY, ""); - } -#if LIBCURL_VERSION_NUM >= 0x070f06 - curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb); -#endif -#if LIBCURL_VERSION_NUM >= 0x071101 - curl_easy_setopt(curl, CURLOPT_OPENSOCKETFUNCTION, opensocket_grab_cb); - curl_easy_setopt(curl, CURLOPT_OPENSOCKETDATA, &sctx->sock); -#endif - curl_easy_setopt(curl, CURLOPT_CONNECT_ONLY, 1); - - rc = curl_easy_perform(curl); - if (rc) { - applog(LOG_ERR, "Stratum connection failed: %s", sctx->curl_err_str); - curl_easy_cleanup(curl); - sctx->curl = NULL; - return false; - } - -#if LIBCURL_VERSION_NUM < 0x071101 - /* CURLINFO_LASTSOCKET is broken on Win64; only use it as a last resort */ - curl_easy_getinfo(curl, CURLINFO_LASTSOCKET, (long *)&sctx->sock); -#endif - - return true; -} - -void stratum_disconnect(struct stratum_ctx *sctx) -{ - pthread_mutex_lock(&sctx->sock_lock); - if (sctx->curl) { - curl_easy_cleanup(sctx->curl); - sctx->curl = NULL; - sctx->sockbuf[0] = '\0'; - } - pthread_mutex_unlock(&sctx->sock_lock); -} - -static const char *get_stratum_session_id(json_t *val) -{ - json_t *arr_val; - int i, n; - - arr_val = json_array_get(val, 0); - if (!arr_val || !json_is_array(arr_val)) - return NULL; - n = json_array_size(arr_val); - for (i = 0; i < n; i++) { - const char *notify; - json_t *arr = json_array_get(arr_val, i); - - if (!arr || !json_is_array(arr)) - break; - notify = json_string_value(json_array_get(arr, 0)); - if (!notify) - continue; - if (!strcasecmp(notify, "mining.notify")) - return json_string_value(json_array_get(arr, 1)); - } - return NULL; -} - -bool stratum_subscribe(struct stratum_ctx *sctx) -{ - char *s, *sret = NULL; - const char *sid, *xnonce1; - int xn2_size; - json_t *val = NULL, *res_val, *err_val; - json_error_t err; - bool ret = false, retry = false; - -start: - s = (char*)malloc(128 + (sctx->session_id ? 
strlen(sctx->session_id) : 0)); - if (retry) - sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": []}"); - else if (sctx->session_id) - sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": [\"" USER_AGENT "\", \"%s\"]}", sctx->session_id); - else - sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": [\"" USER_AGENT "\"]}"); - - if (!stratum_send_line(sctx, s)) - goto out; - - if (!socket_full(sctx->sock, 30)) { - applog(LOG_ERR, "stratum_subscribe timed out"); - goto out; - } - - sret = stratum_recv_line(sctx); - if (!sret) - goto out; - - val = JSON_LOADS(sret, &err); - free(sret); - if (!val) { - applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); - goto out; - } - - res_val = json_object_get(val, "result"); - err_val = json_object_get(val, "error"); - - if (!res_val || json_is_null(res_val) || - (err_val && !json_is_null(err_val))) { - if (opt_debug || retry) { - free(s); - if (err_val) - s = json_dumps(err_val, JSON_INDENT(3)); - else - s = strdup("(unknown reason)"); - applog(LOG_ERR, "JSON-RPC call failed: %s", s); - } - goto out; - } - - sid = get_stratum_session_id(res_val); - if (opt_debug && !sid) - applog(LOG_DEBUG, "Failed to get Stratum session id"); - xnonce1 = json_string_value(json_array_get(res_val, 1)); - if (!xnonce1) { - applog(LOG_ERR, "Failed to get extranonce1"); - goto out; - } - xn2_size = json_integer_value(json_array_get(res_val, 2)); - if (!xn2_size) { - applog(LOG_ERR, "Failed to get extranonce2_size"); - goto out; - } - - pthread_mutex_lock(&sctx->work_lock); - free(sctx->session_id); - free(sctx->xnonce1); - sctx->session_id = sid ? 
strdup(sid) : NULL; - sctx->xnonce1_size = strlen(xnonce1) / 2; - sctx->xnonce1 = (unsigned char*)malloc(sctx->xnonce1_size); - hex2bin(sctx->xnonce1, xnonce1, sctx->xnonce1_size); - sctx->xnonce2_size = xn2_size; - sctx->next_diff = 1.0; - pthread_mutex_unlock(&sctx->work_lock); - - if (opt_debug && sid) - applog(LOG_DEBUG, "Stratum session id: %s", sctx->session_id); - - ret = true; - -out: - free(s); - if (val) - json_decref(val); - - if (!ret) { - if (sret && !retry) { - retry = true; - goto start; - } - } - - return ret; -} - -bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass) -{ - json_t *val = NULL, *res_val, *err_val; - char *s, *sret; - json_error_t err; - bool ret = false; - - s = (char*)malloc(80 + strlen(user) + strlen(pass)); - sprintf(s, "{\"id\": 2, \"method\": \"mining.authorize\", \"params\": [\"%s\", \"%s\"]}", - user, pass); - - if (!stratum_send_line(sctx, s)) - goto out; - - while (1) { - sret = stratum_recv_line(sctx); - if (!sret) - goto out; - if (!stratum_handle_method(sctx, sret)) - break; - free(sret); - } - - val = JSON_LOADS(sret, &err); - free(sret); - if (!val) { - applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); - goto out; - } - - res_val = json_object_get(val, "result"); - err_val = json_object_get(val, "error"); - - if (!res_val || json_is_false(res_val) || - (err_val && !json_is_null(err_val))) { - applog(LOG_ERR, "Stratum authentication failed"); - goto out; - } - - ret = true; - -out: - free(s); - if (val) - json_decref(val); - - return ret; -} - -/** - * Extract bloc height L H... 
here len=3, height=0x1333e8 - * "...0000000000ffffffff2703e83313062f503253482f043d61105408" - */ -static uint32_t getblocheight(struct stratum_ctx *sctx) -{ - uint32_t height = 0; - uint8_t hlen = 0, *p, *m; - - // find 0xffff tag - p = (uint8_t*) sctx->job.coinbase + 32; - m = p + 128; - while (*p != 0xff && p < m) p++; - while (*p == 0xff && p < m) p++; - if (*(p-1) == 0xff && *(p-2) == 0xff) { - p++; hlen = *p; - p++; height = le16dec(p); - p += 2; - switch (hlen) { - case 4: - height += 0x10000UL * le16dec(p); - break; - case 3: - height += 0x10000UL * (*p); - break; - } - } - return height; -} - -static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) -{ - const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *stime, *nreward; - size_t coinb1_size, coinb2_size; - bool clean, ret = false; - int merkle_count, i; - json_t *merkle_arr; - unsigned char **merkle; - int ntime; - - job_id = json_string_value(json_array_get(params, 0)); - prevhash = json_string_value(json_array_get(params, 1)); - coinb1 = json_string_value(json_array_get(params, 2)); - coinb2 = json_string_value(json_array_get(params, 3)); - merkle_arr = json_array_get(params, 4); - if (!merkle_arr || !json_is_array(merkle_arr)) - goto out; - merkle_count = json_array_size(merkle_arr); - version = json_string_value(json_array_get(params, 5)); - nbits = json_string_value(json_array_get(params, 6)); - stime = json_string_value(json_array_get(params, 7)); - clean = json_is_true(json_array_get(params, 8)); - nreward = json_string_value(json_array_get(params, 9)); - - if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !stime || - strlen(prevhash) != 64 || strlen(version) != 8 || - strlen(nbits) != 8 || strlen(stime) != 8) { - applog(LOG_ERR, "Stratum notify: invalid parameters"); - goto out; - } - - /* store stratum server time diff */ - hex2bin((unsigned char *)&ntime, stime, 4); - ntime = swab32(ntime) - (uint32_t) time(0); - if (ntime > sctx->srvtime_diff) { 
- sctx->srvtime_diff = ntime; - if (!opt_quiet) - applog(LOG_DEBUG, "stratum time is at least %ds in the future", ntime); - } - - merkle = (unsigned char**)malloc(merkle_count * sizeof(char *)); - for (i = 0; i < merkle_count; i++) { - const char *s = json_string_value(json_array_get(merkle_arr, i)); - if (!s || strlen(s) != 64) { - while (i--) - free(merkle[i]); - free(merkle); - applog(LOG_ERR, "Stratum notify: invalid Merkle branch"); - goto out; - } - merkle[i] = (unsigned char*)malloc(32); - hex2bin(merkle[i], s, 32); - } - - pthread_mutex_lock(&sctx->work_lock); - - coinb1_size = strlen(coinb1) / 2; - coinb2_size = strlen(coinb2) / 2; - sctx->job.coinbase_size = coinb1_size + sctx->xnonce1_size + - sctx->xnonce2_size + coinb2_size; - - sctx->job.coinbase = (unsigned char*)realloc(sctx->job.coinbase, sctx->job.coinbase_size); - sctx->job.xnonce2 = sctx->job.coinbase + coinb1_size + sctx->xnonce1_size; - hex2bin(sctx->job.coinbase, coinb1, coinb1_size); - memcpy(sctx->job.coinbase + coinb1_size, sctx->xnonce1, sctx->xnonce1_size); - - if (!sctx->job.job_id || strcmp(sctx->job.job_id, job_id)) - memset(sctx->job.xnonce2, 0, sctx->xnonce2_size); - hex2bin(sctx->job.xnonce2 + sctx->xnonce2_size, coinb2, coinb2_size); - - free(sctx->job.job_id); - sctx->job.job_id = strdup(job_id); - hex2bin(sctx->job.prevhash, prevhash, 32); - - sctx->bloc_height = getblocheight(sctx); - - for (i = 0; i < sctx->job.merkle_count; i++) - free(sctx->job.merkle[i]); - free(sctx->job.merkle); - sctx->job.merkle = merkle; - sctx->job.merkle_count = merkle_count; - - hex2bin(sctx->job.version, version, 4); - hex2bin(sctx->job.nbits, nbits, 4); - hex2bin(sctx->job.ntime, stime, 4); - if(nreward != NULL) - { - if(strlen(nreward) == 4) - hex2bin(sctx->job.nreward, nreward, 2); - } - sctx->job.clean = clean; - - sctx->job.diff = sctx->next_diff; - - pthread_mutex_unlock(&sctx->work_lock); - - ret = true; - -out: - return ret; -} - -static bool stratum_set_difficulty(struct stratum_ctx *sctx, 
json_t *params) -{ - double diff; - - diff = json_number_value(json_array_get(params, 0)); - if (diff == 0) - return false; - - pthread_mutex_lock(&sctx->work_lock); - sctx->next_diff = diff; - pthread_mutex_unlock(&sctx->work_lock); - - applog(LOG_WARNING, "Stratum difficulty set to %g", diff); - - return true; -} - -static bool stratum_reconnect(struct stratum_ctx *sctx, json_t *params) -{ - json_t *port_val; - const char *host; - int port; - - host = json_string_value(json_array_get(params, 0)); - port_val = json_array_get(params, 1); - if (json_is_string(port_val)) - port = atoi(json_string_value(port_val)); - else - port = json_integer_value(port_val); - if (!host || !port) - return false; - - free(sctx->url); - sctx->url = (char*)malloc(32 + strlen(host)); - sprintf(sctx->url, "stratum+tcp://%s:%d", host, port); - - applog(LOG_NOTICE, "Server requested reconnection to %s", sctx->url); - - stratum_disconnect(sctx); - - return true; -} - -static bool stratum_get_version(struct stratum_ctx *sctx, json_t *id) -{ - char *s; - json_t *val; - bool ret; - - if (!id || json_is_null(id)) - return false; - - val = json_object(); - json_object_set(val, "id", id); - json_object_set_new(val, "error", json_null()); - json_object_set_new(val, "result", json_string(USER_AGENT)); - s = json_dumps(val, 0); - ret = stratum_send_line(sctx, s); - json_decref(val); - free(s); - - return ret; -} - -static bool stratum_show_message(struct stratum_ctx *sctx, json_t *id, json_t *params) -{ - char *s; - json_t *val; - bool ret; - - val = json_array_get(params, 0); - if (val) - applog(LOG_NOTICE, "MESSAGE FROM SERVER: %s", json_string_value(val)); - - if (!id || json_is_null(id)) - return true; - - val = json_object(); - json_object_set(val, "id", id); - json_object_set_new(val, "error", json_null()); - json_object_set_new(val, "result", json_true()); - s = json_dumps(val, 0); - ret = stratum_send_line(sctx, s); - json_decref(val); - free(s); - - return ret; -} - -bool 
stratum_handle_method(struct stratum_ctx *sctx, const char *s) -{ - json_t *val, *id, *params; - json_error_t err; - const char *method; - bool ret = false; - - val = JSON_LOADS(s, &err); - if (!val) { - applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); - goto out; - } - - method = json_string_value(json_object_get(val, "method")); - if (!method) - goto out; - id = json_object_get(val, "id"); - params = json_object_get(val, "params"); - - if (!strcasecmp(method, "mining.notify")) { - ret = stratum_notify(sctx, params); - goto out; - } - if (!strcasecmp(method, "mining.set_difficulty")) { - ret = stratum_set_difficulty(sctx, params); - goto out; - } - if (!strcasecmp(method, "client.reconnect")) { - ret = stratum_reconnect(sctx, params); - goto out; - } - if (!strcasecmp(method, "client.get_version")) { - ret = stratum_get_version(sctx, id); - goto out; - } - if (!strcasecmp(method, "client.show_message")) { - ret = stratum_show_message(sctx, id, params); - goto out; - } - -out: - if (val) - json_decref(val); - - return ret; -} - -struct thread_q *tq_new(void) -{ - struct thread_q *tq; - - tq = (struct thread_q *)calloc(1, sizeof(*tq)); - if (!tq) - return NULL; - - INIT_LIST_HEAD(&tq->q); - pthread_mutex_init(&tq->mutex, NULL); - pthread_cond_init(&tq->cond, NULL); - - return tq; -} - -void tq_free(struct thread_q *tq) -{ - struct tq_ent *ent, *iter; - - if (!tq) - return; - - list_for_each_entry_safe(ent, iter, &tq->q, q_node, struct tq_ent, struct tq_ent) { - list_del(&ent->q_node); - free(ent); - } - - pthread_cond_destroy(&tq->cond); - pthread_mutex_destroy(&tq->mutex); - - memset(tq, 0, sizeof(*tq)); /* poison */ - free(tq); -} - -static void tq_freezethaw(struct thread_q *tq, bool frozen) -{ - pthread_mutex_lock(&tq->mutex); - - tq->frozen = frozen; - - pthread_cond_signal(&tq->cond); - pthread_mutex_unlock(&tq->mutex); -} - -void tq_freeze(struct thread_q *tq) -{ - tq_freezethaw(tq, true); -} - -void tq_thaw(struct thread_q *tq) -{ - 
tq_freezethaw(tq, false); -} - -bool tq_push(struct thread_q *tq, void *data) -{ - struct tq_ent *ent; - bool rc = true; - - ent = (struct tq_ent *)calloc(1, sizeof(*ent)); - if (!ent) - return false; - - ent->data = data; - INIT_LIST_HEAD(&ent->q_node); - - pthread_mutex_lock(&tq->mutex); - - if (!tq->frozen) { - list_add_tail(&ent->q_node, &tq->q); - } else { - free(ent); - rc = false; - } - - pthread_cond_signal(&tq->cond); - pthread_mutex_unlock(&tq->mutex); - - return rc; -} - -void *tq_pop(struct thread_q *tq, const struct timespec *abstime) -{ - struct tq_ent *ent; - void *rval = NULL; - int rc; - - pthread_mutex_lock(&tq->mutex); - - if (!list_empty(&tq->q)) - goto pop; - - if (abstime) - rc = pthread_cond_timedwait(&tq->cond, &tq->mutex, abstime); - else - rc = pthread_cond_wait(&tq->cond, &tq->mutex); - if (rc) - goto out; - if (list_empty(&tq->q)) - goto out; - -pop: - ent = list_entry(tq->q.next, struct tq_ent, q_node); - rval = ent->data; - - list_del(&ent->q_node); - free(ent); - -out: - pthread_mutex_unlock(&tq->mutex); - return rval; -} - -/** - * @param buf char[9] mini - * @param time_t timer to convert - */ -size_t time2str(char* buf, time_t timer) -{ - struct tm* tm_info; - tm_info = localtime(&timer); - return strftime(buf, 19, "%H:%M:%S", tm_info); -} - -/** - * Alloc and returns time string (to be freed) - * @param time_t timer to convert - */ -char* atime2str(time_t timer) -{ - char* buf = (char*) malloc(16); - memset(buf, 0, 16); - time2str(buf, timer); - return buf; -} - -/* sprintf can be used in applog */ -static char* format_hash(char* buf, unsigned char *hash) -{ - int len = 0; - for (int i=0; i < 32; i += 4) { - len += sprintf(buf+len, "%02x%02x%02x%02x ", - hash[i], hash[i+1], hash[i+2], hash[i+3]); - } - return buf; -} - -/* to debug diff in data */ -extern void applog_compare_hash(unsigned char *hash, unsigned char *hash2) -{ - char s[256] = ""; - int len = 0; - for (int i=0; i < 32; i += 4) { - char *color = memcmp(hash+i, 
hash2+i, 4) ? CL_WHT : CL_GRY; - len += sprintf(s+len, "%s%02x%02x%02x%02x " CL_GRY, color, - hash[i], hash[i+1], hash[i+2], hash[i+3]); - s[len] = '\0'; - } - applog(LOG_DEBUG, "%s", s); -} - -extern void applog_hash(unsigned char *hash) -{ - char s[128] = {'\0'}; - applog(LOG_DEBUG, "%s", format_hash(s, hash)); -} - -#define printpfx(n,h) \ - printf("%s%12s%s: %s\n", CL_BLU, n, CL_N, format_hash(s, h)) - -extern bool opt_tracegpu; -void do_gpu_tests(void) -{ -#ifdef _DEBUG - unsigned long done; - char s[128] = { '\0' }; - unsigned char buf[128], hash[128]; - uint32_t tgt[8] = { 0 }; - memset(buf, 0, sizeof buf); - buf[0] = 1; buf[64] = 2; - opt_tracegpu = true; - work_restart = (struct work_restart*) malloc(sizeof(struct work_restart)); - work_restart[0].restart = 1; - tgt[6] = 0xffff; - scanhash_blake256(0, (uint32_t*)buf, tgt, 1, &done, 14); - free(work_restart); - work_restart = NULL; - opt_tracegpu = false; -#endif -} - -void print_hash_tests(void) -{ - char s[128] = {'\0'}; - unsigned char buf[128], hash[128]; - memset(buf, 0, sizeof buf); - // buf[0] = 1; buf[64] = 2; - - printf(CL_WHT "CPU HASH ON EMPTY BUFFER RESULTS:" CL_N "\n"); - - memset(hash, 0, sizeof hash); - animehash(&hash[0], &buf[0]); - printpfx("anime", hash); - - memset(hash, 0, sizeof hash); - blake256hash(&hash[0], &buf[0], 8); - printpfx("blakecoin", hash); - - memset(hash, 0, sizeof hash); - blake256hash(&hash[0], &buf[0], 14); - printpfx("blake", hash); - - do_gpu_tests(); - - memset(hash, 0, sizeof hash); - deephash(&hash[0], &buf[0]); - printpfx("deep", hash); - - memset(hash, 0, sizeof hash); - fresh_hash(&hash[0], &buf[0]); - printpfx("fresh", hash); - - memset(hash, 0, sizeof hash); - fugue256_hash(&hash[0], &buf[0], 32); - printpfx("fugue256", hash); - - memset(hash, 0, sizeof hash); - groestlhash(&hash[0], &buf[0]); - printpfx("groestl", hash); - - memset(hash, 0, sizeof hash); - heavycoin_hash(&hash[0], &buf[0], 32); - printpfx("heavy", hash); - - memset(hash, 0, sizeof hash); - 
keccak256_hash(&hash[0], &buf[0]); - printpfx("keccak", hash); - - memset(hash, 0, sizeof hash); - jackpothash(&hash[0], &buf[0]); - printpfx("jackpot", hash); - - memset(hash, 0, sizeof hash); - doomhash(&hash[0], &buf[0]); - printpfx("luffa", hash); - - memset(hash, 0, sizeof hash); - myriadhash(&hash[0], &buf[0]); - printpfx("myriad", hash); - - memset(hash, 0, sizeof hash); - nist5hash(&hash[0], &buf[0]); - printpfx("nist5", hash); - - memset(hash, 0, sizeof hash); - pentablakehash(&hash[0], &buf[0]); - printpfx("pentablake", hash); - - memset(hash, 0, sizeof hash); - quarkhash(&hash[0], &buf[0]); - printpfx("quark", hash); - - memset(hash, 0, sizeof hash); - qubithash(&hash[0], &buf[0]); - printpfx("qubit", hash); - - memset(hash, 0, sizeof hash); - s3hash(&hash[0], &buf[0]); - printpfx("S3", hash); - - memset(hash, 0, sizeof hash); - wcoinhash(&hash[0], &buf[0]); - printpfx("whirl", hash); - - memset(hash, 0, sizeof hash); - x11hash(&hash[0], &buf[0]); - printpfx("X11", hash); - - memset(hash, 0, sizeof hash); - x13hash(&hash[0], &buf[0]); - printpfx("X13", hash); - - memset(hash, 0, sizeof hash); - x14hash(&hash[0], &buf[0]); - printpfx("X14", hash); - - memset(hash, 0, sizeof hash); - x15hash(&hash[0], &buf[0]); - printpfx("X15", hash); - - memset(hash, 0, sizeof hash); - x17hash(&hash[0], &buf[0]); - printpfx("X17", hash); - - printf("\n"); -} diff --git a/util.cpp b/util.cpp index 42cee696e5..496f799064 100644 --- a/util.cpp +++ b/util.cpp @@ -1,27 +1,24 @@ /* - * Copyright 2010 Jeff Garzik - * Copyright 2012-2014 pooler - * Copyright 2014 ccminer team - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. See COPYING for more details. 
- */ - -//#define _GNU_SOURCE -#include "cpuminer-config.h" - -#include -#include +* Copyright 2010 Jeff Garzik +* Copyright 2012-2014 pooler +* Copyright 2014 ccminer team +* +* This program is free software; you can redistribute it and/or modify it +* under the terms of the GNU General Public License as published by the Free +* Software Foundation; either version 2 of the License, or (at your option) +* any later version. See COPYING for more details. +*/ + +#include +#include #include -#include -#include -#include +#include +#include +#include #include #include #include -#include +#include #ifdef WIN32 #include "compat/winansi.h" #include @@ -34,32 +31,41 @@ #endif #include "miner.h" #include "elist.h" +using namespace std; + +extern enum sha_algos opt_algo; +extern char curl_err_str[]; bool opt_tracegpu = false; -struct data_buffer { +struct data_buffer +{ void *buf; size_t len; }; -struct upload_buffer { +struct upload_buffer +{ const void *buf; size_t len; size_t pos; }; -struct header_info { +struct header_info +{ char *lp_path; char *reason; char *stratum_url; }; -struct tq_ent { +struct tq_ent +{ void *data; struct list_head q_node; }; -struct thread_q { +struct thread_q +{ struct list_head q; bool frozen; @@ -68,6 +74,20 @@ struct thread_q { pthread_cond_t cond; }; +// input and output may point to the same location +void hexstringreverse(void *output, const void *input, size_t length) +{ + uint16_t tmp1; + uint16_t tmp2; + for(size_t i = 0; i < length / 4; i++) + { + tmp1 = *(((uint16_t*)input) + i); + tmp2 = *(((uint16_t*)output) + (length / 2 - i)); + *(((uint16_t*)input) + i) = tmp2; + *(((uint16_t*)output) + (length / 2 - i)) = tmp1; + } +} + void applog(int prio, const char *fmt, ...) { va_list ap; @@ -75,29 +95,35 @@ void applog(int prio, const char *fmt, ...) 
va_start(ap, fmt); #ifdef HAVE_SYSLOG_H - if (use_syslog) { + if(use_syslog) + { va_list ap2; char *buf; int len; /* custom colors to syslog prio */ - if (prio > LOG_DEBUG) { - switch (prio) { - case LOG_BLUE: prio = LOG_NOTICE; break; + if(prio > LOG_DEBUG) + { + switch(prio) + { + case LOG_BLUE: prio = LOG_NOTICE; break; } } va_copy(ap2, ap); len = vsnprintf(NULL, 0, fmt, ap2) + 1; va_end(ap2); - buf = (char*) alloca(len); - if (vsnprintf(buf, len, fmt, ap) >= 0) + buf = (char*)alloca(len); + if(vsnprintf(buf, len, fmt, ap) >= 0) syslog(prio, "%s", buf); } #else - if (0) {} + if(0) + { + } #endif - else { + else + { const char* color = ""; char *f; int len; @@ -109,34 +135,35 @@ void applog(int prio, const char *fmt, ...) memcpy(&tm, tm_p, sizeof(tm)); pthread_mutex_unlock(&applog_lock); - switch (prio) { - case LOG_ERR: color = CL_RED; break; - case LOG_WARNING: color = CL_YLW; break; - case LOG_NOTICE: color = CL_WHT; break; - case LOG_INFO: color = ""; break; - case LOG_DEBUG: color = CL_GRY; break; - - case LOG_BLUE: - prio = LOG_NOTICE; - color = CL_CYN; - break; + switch(prio) + { + case LOG_ERR: color = CL_RED; break; + case LOG_WARNING: color = CL_YLW; break; + case LOG_NOTICE: color = CL_WHT; break; + case LOG_INFO: color = ""; break; + case LOG_DEBUG: color = CL_GRY; break; + + case LOG_BLUE: + prio = LOG_NOTICE; + color = CL_CYN; + break; } - if (!use_colors) + if(!use_colors) color = ""; - len = 40 + (int) strlen(fmt) + 2; - f = (char*) alloca(len); + len = 40 + (int)strlen(fmt) + 2; + f = (char*)alloca(len); sprintf(f, "[%d-%02d-%02d %02d:%02d:%02d]%s %s%s\n", - tm.tm_year + 1900, - tm.tm_mon + 1, - tm.tm_mday, - tm.tm_hour, - tm.tm_min, - tm.tm_sec, - color, - fmt, - use_colors ? CL_N : "" - ); + tm.tm_year + 1900, + tm.tm_mon + 1, + tm.tm_mday, + tm.tm_hour, + tm.tm_min, + tm.tm_sec, + color, + fmt, + use_colors ? 
CL_N : "" + ); pthread_mutex_lock(&applog_lock); vfprintf(stderr, f, ap); /* atomic write to stderr */ fflush(stderr); @@ -145,9 +172,45 @@ void applog(int prio, const char *fmt, ...) va_end(ap); } +void format_hashrate(double hashrate, char *output) +{ + char prefix = '\0'; + + if(hashrate < 10000) + { + // nop + } + else if(hashrate < 1e7) + { + prefix = 'k'; + hashrate *= 1e-3; + } + else if(hashrate < 1e10) + { + prefix = 'M'; + hashrate *= 1e-6; + } + else if(hashrate < 1e13) + { + prefix = 'G'; + hashrate *= 1e-9; + } + else + { + prefix = 'T'; + hashrate *= 1e-12; + } + + sprintf( + output, + prefix ? "%.2f %cH/s" : "%.2f H/s%c", + hashrate, prefix + ); +} + static void databuf_free(struct data_buffer *db) { - if (!db) + if(!db) return; free(db->buf); @@ -156,7 +219,7 @@ static void databuf_free(struct data_buffer *db) } static size_t all_data_cb(const void *ptr, size_t size, size_t nmemb, - void *user_data) + void *user_data) { struct data_buffer *db = (struct data_buffer *)user_data; size_t len = size * nmemb; @@ -168,8 +231,11 @@ static size_t all_data_cb(const void *ptr, size_t size, size_t nmemb, newlen = oldlen + len; newmem = realloc(db->buf, newlen + 1); - if (!newmem) - return 0; + if(newmem == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } db->buf = newmem; db->len = newlen; @@ -180,15 +246,16 @@ static size_t all_data_cb(const void *ptr, size_t size, size_t nmemb, } static size_t upload_data_cb(void *ptr, size_t size, size_t nmemb, - void *user_data) + void *user_data) { struct upload_buffer *ub = (struct upload_buffer *)user_data; unsigned int len = (unsigned int)(size * nmemb); - if (len > ub->len - ub->pos) + if(len > ub->len - ub->pos) len = (unsigned int)(ub->len - ub->pos); - if (len) { + if(len) + { memcpy(ptr, (char*)ub->buf + ub->pos, len); ub->pos += len; } @@ -200,8 +267,9 @@ static size_t upload_data_cb(void *ptr, size_t size, size_t nmemb, static int seek_data_cb(void *user_data, curl_off_t offset, int origin) { 
struct upload_buffer *ub = (struct upload_buffer *)user_data; - - switch (origin) { + + switch(origin) + { case SEEK_SET: ub->pos = (size_t)offset; break; @@ -227,51 +295,65 @@ static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data) void *tmp; val = (char*)calloc(1, ptrlen); + if(val == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } key = (char*)calloc(1, ptrlen); - if (!key || !val) - goto out; + if(key == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } tmp = memchr(ptr, ':', ptrlen); - if (!tmp || (tmp == ptr)) /* skip empty keys / blanks */ + if(!tmp || (tmp == ptr)) /* skip empty keys / blanks */ goto out; slen = (size_t)((char*)tmp - (char*)ptr); - if ((slen + 1) == ptrlen) /* skip key w/ no value */ + if((slen + 1) == ptrlen) /* skip key w/ no value */ goto out; memcpy(key, ptr, slen); /* store & nul term key */ key[slen] = 0; rem = (char*)ptr + slen + 1; /* trim value's leading whitespace */ remlen = ptrlen - slen - 1; - while ((remlen > 0) && (isspace(*rem))) { + while((remlen > 0) && (isspace(*rem))) + { remlen--; rem++; } memcpy(val, rem, remlen); /* store value, trim trailing ws */ val[remlen] = 0; - while ((*val) && (isspace(val[strlen(val) - 1]))) { + while((*val) && (isspace(val[strlen(val) - 1]))) + { val[strlen(val) - 1] = 0; } - if (!*val) /* skip blank value */ + if(!*val) /* skip blank value */ goto out; - if (!strcasecmp("X-Long-Polling", key)) { + if(!strcasecmp("X-Long-Polling", key)) + { hi->lp_path = val; /* X-Mining-Extensions: longpoll */ val = NULL; } - if (!strcasecmp("X-Reject-Reason", key)) { + if(!strcasecmp("X-Reject-Reason", key)) + { hi->reason = val; /* X-Mining-Extensions: reject-reason */ //applog(LOG_WARNING, "%s:%s", key, val); val = NULL; } - if (!strcasecmp("X-Stratum", key)) { + if(!strcasecmp("X-Stratum", key)) + { hi->stratum_url = val; /* steal memory reference */ val = NULL; } - if (!strcasecmp("X-Nonce-Range", key)) { + if(!strcasecmp("X-Nonce-Range", 
key)) + { /* todo when available: X-Mining-Extensions: noncerange */ } out: @@ -282,7 +364,7 @@ static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data) #if LIBCURL_VERSION_NUM >= 0x070f06 static int sockopt_keepalive_cb(void *userdata, curl_socket_t fd, - curlsocktype purpose) + curlsocktype purpose) { int keepalive = 1; int tcp_keepcnt = 3; @@ -293,22 +375,22 @@ static int sockopt_keepalive_cb(void *userdata, curl_socket_t fd, #endif #ifndef WIN32 - if (unlikely(setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &keepalive, + if(unlikely(setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &keepalive, sizeof(keepalive)))) return 1; #ifdef __linux - if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPCNT, + if(unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPCNT, &tcp_keepcnt, sizeof(tcp_keepcnt)))) return 1; - if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPIDLE, + if(unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPIDLE, &tcp_keepidle, sizeof(tcp_keepidle)))) return 1; - if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPINTVL, + if(unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPINTVL, &tcp_keepintvl, sizeof(tcp_keepintvl)))) return 1; #endif /* __linux */ #ifdef __APPLE_CC__ - if (unlikely(setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE, + if(unlikely(setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE, &tcp_keepintvl, sizeof(tcp_keepintvl)))) return 1; #endif /* __APPLE_CC__ */ @@ -316,8 +398,8 @@ static int sockopt_keepalive_cb(void *userdata, curl_socket_t fd, struct tcp_keepalive vals; vals.onoff = 1; vals.keepalivetime = tcp_keepidle * 1000; - vals.keepaliveinterval = tcp_keepintvl * 1000; - if (unlikely(WSAIoctl(fd, SIO_KEEPALIVE_VALS, &vals, sizeof(vals), + vals.keepaliveinterval = tcp_keepintvl * 1000; + if(unlikely(WSAIoctl(fd, SIO_KEEPALIVE_VALS, &vals, sizeof(vals), NULL, 0, &outputBytes, NULL, NULL))) return 1; #endif /* WIN32 */ @@ -327,28 +409,27 @@ static int sockopt_keepalive_cb(void *userdata, curl_socket_t fd, #endif json_t *json_rpc_call(CURL *curl, const char *url, - const char *userpass, 
const char *rpc_req, - bool longpoll_scan, bool longpoll, int *curl_err) + const char *userpass, const char *rpc_req, + bool longpoll_scan, bool longpoll, int *curl_err) { json_t *val, *err_val, *res_val; - int rc; + CURLcode rc; struct data_buffer all_data = { 0 }; struct upload_buffer upload_data; json_error_t err; struct curl_slist *headers = NULL; char* httpdata; char len_hdr[64], hashrate_hdr[64]; - char curl_err_str[CURL_ERROR_SIZE] = { 0 }; - long timeout = longpoll ? opt_timeout : 30; + long timeout = opt_timeout; struct header_info hi = { 0 }; bool lp_scanning = longpoll_scan && !have_longpoll; /* it is assumed that 'curl' is freshly [re]initialized at this pt */ - if (opt_protocol) + if(opt_protocol) curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); curl_easy_setopt(curl, CURLOPT_URL, url); - if (opt_cert) + if(opt_cert) curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert); curl_easy_setopt(curl, CURLOPT_ENCODING, ""); curl_easy_setopt(curl, CURLOPT_FAILONERROR, 0); @@ -367,27 +448,29 @@ json_t *json_rpc_call(CURL *curl, const char *url, curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout); curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, resp_hdr_cb); curl_easy_setopt(curl, CURLOPT_HEADERDATA, &hi); - if (opt_proxy) { + if(opt_proxy) + { curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); } - if (userpass) { + if(userpass) + { curl_easy_setopt(curl, CURLOPT_USERPWD, userpass); curl_easy_setopt(curl, CURLOPT_HTTPAUTH, CURLAUTH_BASIC); } #if LIBCURL_VERSION_NUM >= 0x070f06 - if (longpoll) + if(longpoll) curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb); #endif curl_easy_setopt(curl, CURLOPT_POST, 1); - if (opt_protocol) + if(opt_protocol) applog(LOG_DEBUG, "JSON protocol request:\n%s", rpc_req); upload_data.buf = rpc_req; upload_data.len = strlen(rpc_req); upload_data.pos = 0; - sprintf(len_hdr, "Content-Length: %lu", (unsigned long) upload_data.len); + sprintf(len_hdr, "Content-Length: %lu", 
(unsigned long)upload_data.len); sprintf(hashrate_hdr, "X-Mining-Hashrate: %llu", (unsigned long long) global_hashrate); headers = curl_slist_append(headers, "Content-Type: application/json"); @@ -399,91 +482,118 @@ json_t *json_rpc_call(CURL *curl, const char *url, headers = curl_slist_append(headers, "Expect:"); /* disable Expect hdr*/ curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); - + curl_err_str[0] = 0; rc = curl_easy_perform(curl); - if (curl_err != NULL) + if(curl_err != NULL) *curl_err = rc; - if (rc) { - if (!(longpoll && rc == CURLE_OPERATION_TIMEDOUT)) { - applog(LOG_ERR, "HTTP request failed: %s", curl_err_str); + if(rc != CURLE_OK) + { + if(!(longpoll && rc == CURLE_OPERATION_TIMEDOUT)) + { + if(strlen(curl_err_str)>0) + applog(LOG_ERR, "HTTP request failed: %s", curl_err_str); + else + applog(LOG_ERR, "HTTP request failed: %s", curl_easy_strerror(rc)); goto err_out; } } /* If X-Stratum was found, activate Stratum */ - if (want_stratum && hi.stratum_url && - !strncasecmp(hi.stratum_url, "stratum+tcp://", 14) && - !(opt_proxy && opt_proxy_type == CURLPROXY_HTTP)) { + if(want_stratum && hi.stratum_url && + !strncasecmp(hi.stratum_url, "stratum+tcp://", 14) && + !(opt_proxy && opt_proxy_type == CURLPROXY_HTTP)) + { have_stratum = true; tq_push(thr_info[stratum_thr_id].q, hi.stratum_url); hi.stratum_url = NULL; } /* If X-Long-Polling was found, activate long polling */ - if (lp_scanning && hi.lp_path && !have_stratum) { + if(lp_scanning && hi.lp_path && !have_stratum) + { have_longpoll = true; tq_push(thr_info[longpoll_thr_id].q, hi.lp_path); hi.lp_path = NULL; } - if (!all_data.buf || !all_data.len) { + if(!all_data.buf || !all_data.len) + { applog(LOG_ERR, "Empty data received in json_rpc_call."); goto err_out; } - httpdata = (char*) all_data.buf; + httpdata = (char*)all_data.buf; - if (*httpdata != '{' && *httpdata != '[') { + if(*httpdata != '{' && *httpdata != '[') + { long errcode = 0; CURLcode c = curl_easy_getinfo(curl, 
CURLINFO_RESPONSE_CODE, &errcode); - if (c == CURLE_OK && errcode == 401) { + if(c == CURLE_OK && errcode == 401) + { applog(LOG_ERR, "You are not authorized, check your login and password."); goto err_out; } } val = JSON_LOADS(httpdata, &err); - if (!val) { + if(!val) + { applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); - if (opt_protocol) + if(opt_protocol) applog(LOG_DEBUG, "%s", httpdata); goto err_out; } - if (opt_protocol) { + if(opt_protocol) + { char *s = json_dumps(val, JSON_INDENT(3)); applog(LOG_DEBUG, "JSON protocol response:\n%s\n", s); free(s); } /* JSON-RPC valid response returns a non-null 'result', - * and a null 'error'. */ + * and a null 'error'. */ res_val = json_object_get(val, "result"); err_val = json_object_get(val, "error"); - if (!res_val || json_is_null(res_val) || - (err_val && !json_is_null(err_val))) { - char *s; + if(!res_val || json_is_null(res_val) || + (err_val && !json_is_null(err_val))) + { + char *s = NULL; - if (err_val) { + if(err_val) + { + s = json_dumps(err_val, 0); json_t *msg = json_object_get(err_val, "message"); - s = json_dumps(err_val, JSON_INDENT(3)); - if (json_is_string(msg)) { + json_t *err_code = json_object_get(err_val, "code"); + if(curl_err && json_integer_value(err_code)) + *curl_err = (int)json_integer_value(err_code); + + if(json_is_string(msg)) + { free(s); s = strdup(json_string_value(msg)); + if(have_longpoll && s && !strcmp(s, "method not getwork")) + { + json_decref(err_val); + free(s); + goto err_out; + } } + json_decref(err_val); } else s = strdup("(unknown reason)"); - applog(LOG_ERR, "JSON-RPC call failed: %s", s); + if(!curl_err || opt_debug) + applog(LOG_ERR, "JSON-RPC call failed: %s", s); free(s); goto err_out; } - if (hi.reason) + if(hi.reason) json_object_set_new(val, "reject-reason", json_string(hi.reason)); databuf_free(&all_data); @@ -502,18 +612,28 @@ json_t *json_rpc_call(CURL *curl, const char *url, } /** - * Unlike malloc, calloc set the memory to zero - */ +* Unlike 
malloc, calloc set the memory to zero +*/ void *aligned_calloc(int size) { const int ALIGN = 64; // cache line #ifdef _MSC_VER void* res = _aligned_malloc(size, ALIGN); + if(res == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } memset(res, 0, size); return res; #else - void *mem = calloc(1, size+ALIGN+sizeof(uintptr_t)); - void **ptr = (void**)((size_t)(((uintptr_t)(mem))+ALIGN+sizeof(uintptr_t)) & ~(ALIGN-1)); + void *mem = calloc(1, size + ALIGN + sizeof(uintptr_t)); + if(mem == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } + void **ptr = (void**)((size_t)(((uintptr_t)(mem)) + ALIGN + sizeof(uintptr_t)) & ~(ALIGN - 1)); ptr[-1] = mem; return ptr; #endif @@ -530,9 +650,10 @@ void aligned_free(void *ptr) void cbin2hex(char *out, const char *in, size_t len) { - if (out) { + if(out) + { unsigned int i; - for (i = 0; i < len; i++) + for(i = 0; i < len; i++) sprintf(out + (i * 2), "%02x", (uint8_t)in[i]); } } @@ -540,10 +661,13 @@ void cbin2hex(char *out, const char *in, size_t len) char *bin2hex(const uchar *in, size_t len) { char *s = (char*)malloc((len * 2) + 1); - if (!s) - return NULL; + if(s == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } - cbin2hex(s, (const char *) in, len); + cbin2hex(s, (const char *)in, len); return s; } @@ -555,15 +679,18 @@ bool hex2bin(uchar *p, const char *hexstr, size_t len) hex_byte[2] = '\0'; - while (*hexstr && len) { - if (!hexstr[1]) { + while(*hexstr && len) + { + if(!hexstr[1]) + { applog(LOG_ERR, "hex2bin str truncated"); return false; } hex_byte[0] = hexstr[0]; hex_byte[1] = hexstr[1]; - *p = (uchar) strtol(hex_byte, &ep, 16); - if (*ep) { + *p = (uchar)strtol(hex_byte, &ep, 16); + if(*ep) + { applog(LOG_ERR, "hex2bin failed on '%s'", hex_byte); return false; } @@ -576,19 +703,19 @@ bool hex2bin(uchar *p, const char *hexstr, size_t len) } /* Subtract the `struct timeval' values X and Y, - storing the result in RESULT. 
- Return 1 if the difference is negative, otherwise 0. */ +storing the result in RESULT. +Return 1 if the difference is negative, otherwise 0. */ int timeval_subtract(struct timeval *result, struct timeval *x, - struct timeval *y) +struct timeval *y) { uint64_t start, end; end = x->tv_usec + 1000000 * x->tv_sec; - start = y->tv_usec + 1000000 * y->tv_sec; - if (start <= end) + start = y->tv_usec + 1000000 * y->tv_sec; + if(start <= end) { uint64_t diff = end - start; - result->tv_sec = diff / 1000000; + result->tv_sec = (long)(diff / 1000000); result->tv_usec = diff % 1000000; } else @@ -605,26 +732,32 @@ bool fulltest(const uint32_t *hash, const uint32_t *target) { int i; bool rc = true; - - for (i = 7; i >= 0; i--) { - if (hash[i] > target[i]) { + + for(i = 7; i >= 0; i--) + { + if(hash[i] > target[i]) + { rc = false; break; } - if (hash[i] < target[i]) { + if(hash[i] < target[i]) + { rc = true; break; } - if (hash[1] == target[1]) { + if(hash[1] == target[1]) + { applog(LOG_NOTICE, "We found a close match!"); } } - if (!rc && opt_debug) { + if(!rc && opt_debug) + { uint32_t hash_be[8], target_be[8]; char *hash_str, *target_str; - - for (i = 0; i < 8; i++) { + + for(i = 0; i < 8; i++) + { be32enc(hash_be + i, hash[7 - i]); be32enc(target_be + i, target[7 - i]); } @@ -632,10 +765,10 @@ bool fulltest(const uint32_t *hash, const uint32_t *target) target_str = bin2hex((uchar *)target_be, 32); applog(LOG_DEBUG, "DEBUG: %s\nHash: %s\nTarget: %s", - rc ? "hash <= target" + rc ? 
"hash <= target" : CL_YLW "hash > target (false positive)" CL_N, - hash_str, - target_str); + hash_str, + target_str); free(hash_str); free(target_str); @@ -644,17 +777,39 @@ bool fulltest(const uint32_t *hash, const uint32_t *target) return rc; } +bool fulltest_sia(const uint64_t *hash, const uint64_t *target) +{ + int i; + bool rc = true; + + for(i = 0; i < 4; i--) + { + if(swab64(hash[i]) > target[3 - i]) + { + rc = false; + break; + } + if(swab64(hash[i]) < target[3 - i]) + { + rc = true; + break; + } + } + return rc; +} + void diff_to_target(uint32_t *target, double diff) { uint64_t m; int k; - - for (k = 6; k > 0 && diff > 1.0; k--) + + for(k = 6; k > 0 && diff > 1.0; k--) diff /= 4294967296.0; m = (uint64_t)(4294901760.0 / diff); - if (m == 0 && k == 6) + if(m == 0 && k == 6) memset(target, 0xff, 32); - else { + else + { memset(target, 0, 32); target[k] = (uint32_t)m; target[k + 1] = (uint32_t)(m >> 32); @@ -670,22 +825,24 @@ void diff_to_target(uint32_t *target, double diff) static bool send_line(curl_socket_t sock, char *s) { ssize_t len, sent = 0; - + len = (ssize_t)strlen(s); s[len++] = '\n'; - while (len > 0) { - struct timeval timeout = {0, 0}; + while(len > 0) + { + struct timeval timeout = { 0, 0 }; ssize_t n; fd_set wd; FD_ZERO(&wd); FD_SET(sock, &wd); - if (select((int)sock + 1, NULL, &wd, NULL, &timeout) < 1) + if(select((int)sock + 1, NULL, &wd, NULL, &timeout) < 1) return false; - n = send(sock, s + sent, len, 0); - if (n < 0) { - if (!socket_blocks()) + n = send(sock, s + sent, (int)len, 0); + if(n < 0) + { + if(!socket_blocks()) return false; n = 0; } @@ -700,7 +857,7 @@ bool stratum_send_line(struct stratum_ctx *sctx, char *s) { bool ret = false; - if (opt_protocol) + if(opt_protocol) applog(LOG_DEBUG, "> %s", s); pthread_mutex_lock(&sctx->sock_lock); @@ -719,7 +876,7 @@ static bool socket_full(curl_socket_t sock, int timeout) FD_SET(sock, &rd); tv.tv_sec = timeout; tv.tv_usec = 0; - if (select((int)sock + 1, &rd, NULL, NULL, &tv) > 0) + 
if(select((int)sock + 1, &rd, NULL, NULL, &tv) > 0) return true; return false; } @@ -738,9 +895,15 @@ static void stratum_buffer_append(struct stratum_ctx *sctx, const char *s) old = strlen(sctx->sockbuf); snew = old + strlen(s) + 1; - if (snew >= sctx->sockbuf_size) { + if(snew >= sctx->sockbuf_size) + { sctx->sockbuf_size = snew + (RBUFSIZE - (snew % RBUFSIZE)); sctx->sockbuf = (char*)realloc(sctx->sockbuf, sctx->sockbuf_size); + if(sctx->sockbuf == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } } strcpy(sctx->sockbuf + old, s); } @@ -749,62 +912,75 @@ char *stratum_recv_line(struct stratum_ctx *sctx) { ssize_t len, buflen; char *tok, *sret = NULL; + int timeout = opt_timeout; + + if(!sctx->sockbuf) + return NULL; - if (!strstr(sctx->sockbuf, "\n")) { + if(!strstr(sctx->sockbuf, "\n")) + { bool ret = true; time_t rstart = time(NULL); - if (!socket_full(sctx->sock, 60)) { + if(!socket_full(sctx->sock, timeout)) + { applog(LOG_ERR, "stratum_recv_line timed out"); goto out; } - do { + do + { char s[RBUFSIZE]; ssize_t n; memset(s, 0, RBUFSIZE); n = recv(sctx->sock, s, RECVSIZE, 0); - if (!n) { + if(!n) + { ret = false; break; } - if (n < 0) { - if (!socket_blocks() || !socket_full(sctx->sock, 1)) { + if(n < 0) + { + if(!socket_blocks() || !socket_full(sctx->sock, 10)) + { ret = false; break; } - } else + } + else stratum_buffer_append(sctx, s); - } while (time(NULL) - rstart < 60 && !strstr(sctx->sockbuf, "\n")); + } while(time(NULL) - rstart < timeout && !strstr(sctx->sockbuf, "\n")); - if (!ret) { - applog(LOG_ERR, "stratum_recv_line failed"); + if(!ret) + { + if(opt_debug) applog(LOG_ERR, "stratum_recv_line failed"); goto out; } } buflen = (ssize_t)strlen(sctx->sockbuf); tok = strtok(sctx->sockbuf, "\n"); - if (!tok) { + if(!tok) + { applog(LOG_ERR, "stratum_recv_line failed to parse a newline-terminated string"); goto out; } sret = strdup(tok); len = (ssize_t)strlen(sret); - if (buflen > len + 1) + if(buflen > len + 1) memmove(sctx->sockbuf, 
sctx->sockbuf + len + 1, buflen - len + 1); else sctx->sockbuf[0] = '\0'; out: - if (sret && opt_protocol) + if(sret && opt_protocol) applog(LOG_DEBUG, "< %s", sret); return sret; } #if LIBCURL_VERSION_NUM >= 0x071101 static curl_socket_t opensocket_grab_cb(void *clientp, curlsocktype purpose, - struct curl_sockaddr *addr) +struct curl_sockaddr *addr) { curl_socket_t *sock = (curl_socket_t *)clientp; *sock = socket(addr->family, addr->socktype, addr->protocol); @@ -815,48 +991,64 @@ static curl_socket_t opensocket_grab_cb(void *clientp, curlsocktype purpose, bool stratum_connect(struct stratum_ctx *sctx, const char *url) { CURL *curl; - int rc; + CURLcode rc; pthread_mutex_lock(&sctx->sock_lock); - if (sctx->curl) + if(sctx->curl) curl_easy_cleanup(sctx->curl); sctx->curl = curl_easy_init(); - if (!sctx->curl) { + if(!sctx->curl) + { applog(LOG_ERR, "CURL initialization failed"); pthread_mutex_unlock(&sctx->sock_lock); return false; } curl = sctx->curl; - if (!sctx->sockbuf) { + if(!sctx->sockbuf) + { sctx->sockbuf = (char*)calloc(RBUFSIZE, 1); + if(sctx->sockbuf == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } sctx->sockbuf_size = RBUFSIZE; } sctx->sockbuf[0] = '\0'; pthread_mutex_unlock(&sctx->sock_lock); - if (url != sctx->url) { + if(url != sctx->url) + { free(sctx->url); sctx->url = strdup(url); } free(sctx->curl_url); sctx->curl_url = (char*)malloc(strlen(url)); + if(sctx->curl_url == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } sprintf(sctx->curl_url, "http%s", strstr(url, "://")); - if (opt_protocol) + if(opt_protocol) curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); curl_easy_setopt(curl, CURLOPT_URL, sctx->curl_url); curl_easy_setopt(curl, CURLOPT_FRESH_CONNECT, 1); - curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 30); - curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, sctx->curl_err_str); + curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, opt_timeout); + curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str); 
curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); - if (opt_proxy && opt_proxy_type != CURLPROXY_HTTP) { + if(opt_proxy && opt_proxy_type != CURLPROXY_HTTP) + { curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); - } else if (getenv("http_proxy")) { - if (getenv("all_proxy")) + } + else if(getenv("http_proxy")) + { + if(getenv("all_proxy")) curl_easy_setopt(curl, CURLOPT_PROXY, getenv("all_proxy")); - else if (getenv("ALL_PROXY")) + else if(getenv("ALL_PROXY")) curl_easy_setopt(curl, CURLOPT_PROXY, getenv("ALL_PROXY")); else curl_easy_setopt(curl, CURLOPT_PROXY, ""); @@ -869,10 +1061,14 @@ bool stratum_connect(struct stratum_ctx *sctx, const char *url) curl_easy_setopt(curl, CURLOPT_OPENSOCKETDATA, &sctx->sock); #endif curl_easy_setopt(curl, CURLOPT_CONNECT_ONLY, 1); - + curl_err_str[0] = 0; rc = curl_easy_perform(curl); - if (rc) { - applog(LOG_ERR, "Stratum connection failed: %s", sctx->curl_err_str); + if(rc != CURLE_OK) + { + if(strlen(curl_err_str)>0) + applog(LOG_ERR, "HTTP request failed: %s", curl_err_str); + else + applog(LOG_ERR, "HTTP request failed: %s", curl_easy_strerror(rc)); curl_easy_cleanup(curl); sctx->curl = NULL; return false; @@ -886,78 +1082,109 @@ bool stratum_connect(struct stratum_ctx *sctx, const char *url) return true; } +static void stratum_free_job(struct stratum_ctx *sctx) +{ + pthread_mutex_lock(&sctx->sock_lock); + if(sctx->job.job_id) + { + free(sctx->job.job_id); + } + if(sctx->job.merkle_count) + { + for(int i = 0; i < sctx->job.merkle_count; i++) + { + free(sctx->job.merkle[i]); + sctx->job.merkle[i] = NULL; + } + free(sctx->job.merkle); + } + free(sctx->job.coinbase); + // note: xnonce2 is not allocated + memset(&(sctx->job.job_id), 0, sizeof(struct stratum_job)); + pthread_mutex_unlock(&sctx->sock_lock); +} + void stratum_disconnect(struct stratum_ctx *sctx) { pthread_mutex_lock(&sctx->sock_lock); - if (sctx->curl) { + 
if(sctx->curl) + { sctx->disconnects++; curl_easy_cleanup(sctx->curl); sctx->curl = NULL; sctx->sockbuf[0] = '\0'; } + if(sctx->job.job_id) + { + stratum_free_job(sctx); + } pthread_mutex_unlock(&sctx->sock_lock); } -static const char *get_stratum_session_id(json_t *val) +static const char *get_stratum_session_id(const json_t *val) { json_t *arr_val; int i, n; arr_val = json_array_get(val, 0); - if (!arr_val || !json_is_array(arr_val)) + if(!arr_val || !json_is_array(arr_val)) return NULL; - n = json_array_size(arr_val); - for (i = 0; i < n; i++) { + n = (int)json_array_size(arr_val); + for(i = 0; i < n; i++) + { const char *notify; json_t *arr = json_array_get(arr_val, i); - if (!arr || !json_is_array(arr)) + if(!arr || !json_is_array(arr)) break; notify = json_string_value(json_array_get(arr, 0)); - if (!notify) + if(!notify) continue; - if (!strcasecmp(notify, "mining.notify")) + if(!strcasecmp(notify, "mining.notify")) return json_string_value(json_array_get(arr, 1)); } return NULL; } -static bool stratum_parse_extranonce(struct stratum_ctx *sctx, json_t *params, int pndx) +static bool stratum_parse_extranonce(struct stratum_ctx *sctx, const json_t *params, int pndx) { const char* xnonce1; int xn2_size; xnonce1 = json_string_value(json_array_get(params, pndx)); - if (!xnonce1) { + if(!xnonce1) + { applog(LOG_ERR, "Failed to get extranonce1"); goto out; } - xn2_size = (int) json_integer_value(json_array_get(params, pndx+1)); - if (!xn2_size) { + xn2_size = (int)json_integer_value(json_array_get(params, pndx + 1)); + if(!xn2_size) + { applog(LOG_ERR, "Failed to get extranonce2_size"); goto out; } - if (xn2_size < 2 || xn2_size > 16) { - applog(LOG_INFO, "Failed to get valid n2size in parse_extranonce"); + if(xn2_size < 2 || xn2_size > 16) + { + applog(LOG_ERR, "invalid n2size in parse_extranonce: size=%d", xn2_size); goto out; } pthread_mutex_lock(&sctx->work_lock); - if (sctx->xnonce1) + if(sctx->xnonce1) free(sctx->xnonce1); sctx->xnonce1_size = strlen(xnonce1) 
/ 2; - sctx->xnonce1 = (uchar*) calloc(1, sctx->xnonce1_size); - if (unlikely(!sctx->xnonce1)) { - applog(LOG_ERR, "Failed to alloc xnonce1"); - pthread_mutex_unlock(&sctx->work_lock); - goto out; + sctx->xnonce1 = (uchar*)calloc(1, sctx->xnonce1_size); + if(sctx->xnonce1 == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); } hex2bin(sctx->xnonce1, xnonce1, sctx->xnonce1_size); sctx->xnonce2_size = xn2_size; pthread_mutex_unlock(&sctx->work_lock); - if (pndx == 0 && opt_debug) /* pool dynamic change */ + if(pndx == 0 && opt_debug) /* pool dynamic change */ applog(LOG_DEBUG, "Stratum set nonce %s with extranonce2 size=%d", - xnonce1, xn2_size); + xnonce1, xn2_size); return true; out: @@ -966,52 +1193,63 @@ static bool stratum_parse_extranonce(struct stratum_ctx *sctx, json_t *params, i bool stratum_subscribe(struct stratum_ctx *sctx) { - char *s, *sret = NULL; - const char *sid; - json_t *val = NULL, *res_val, *err_val; json_error_t err; + json_t *val; + json_t *res_val; + json_t *err_val; bool ret = false, retry = false; + char *sret; + char *sid; start: - s = (char*)malloc(128 + (sctx->session_id ? strlen(sctx->session_id) : 0)); - if (retry) + char *s = (char*)malloc(128 + (sctx->session_id ? 
strlen(sctx->session_id) : 0)); + if(s == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } + if(retry) sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": []}"); - else if (sctx->session_id) + else if(sctx->session_id) sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": [\"" USER_AGENT "\", \"%s\"]}", sctx->session_id); else sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": [\"" USER_AGENT "\"]}"); - if (!stratum_send_line(sctx, s)) + if(!stratum_send_line(sctx, s)) goto out; - if (!socket_full(sctx->sock, 10)) { + if(!socket_full(sctx->sock, 10)) + { applog(LOG_ERR, "stratum_subscribe timed out"); goto out; } sret = stratum_recv_line(sctx); - if (!sret) + if(!sret) goto out; val = JSON_LOADS(sret, &err); free(sret); - if (!val) { + if(!val) + { applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); goto out; } - if (json_integer_value(json_object_get(val, "id")) != 1) { + if(json_integer_value(json_object_get(val, "id")) != 1) + { applog(LOG_WARNING, "Stratum subscribe answer id is not correct!"); } res_val = json_object_get(val, "result"); err_val = json_object_get(val, "error"); - if (!res_val || json_is_null(res_val) || - (err_val && !json_is_null(err_val))) { - if (opt_debug || retry) { + if(!res_val || json_is_null(res_val) || (err_val && !json_is_null(err_val))) + { + if(opt_debug || retry) + { free(s); - if (err_val) + if(err_val) s = json_dumps(err_val, JSON_INDENT(3)); else s = strdup("(unknown reason)"); @@ -1021,19 +1259,20 @@ bool stratum_subscribe(struct stratum_ctx *sctx) } // sid is param 1, extranonce params are 2 and 3 - if (!stratum_parse_extranonce(sctx, res_val, 1)) { + if(!stratum_parse_extranonce(sctx, res_val, 1)) + { goto out; } ret = true; // session id (optional) - sid = get_stratum_session_id(res_val); - if (opt_debug && sid) + sid = (char*)get_stratum_session_id(res_val); + if(opt_debug && sid) applog(LOG_DEBUG, "Stratum session id: %s", sid); 
pthread_mutex_lock(&sctx->work_lock); - if (sctx->session_id) + if(sctx->session_id) free(sctx->session_id); sctx->session_id = sid ? strdup(sid) : NULL; sctx->next_diff = 1.0; @@ -1041,20 +1280,19 @@ bool stratum_subscribe(struct stratum_ctx *sctx) out: free(s); - if (val) + if(val) json_decref(val); - if (!ret) { - if (sret && !retry) { - retry = true; - goto start; - } + if(!ret && sret && !retry) + { + retry = true; + goto start; } return ret; } -bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass) +bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass, bool extranonce) { json_t *val = NULL, *res_val, *err_val; char *s, *sret; @@ -1062,109 +1300,128 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p bool ret = false; s = (char*)malloc(80 + strlen(user) + strlen(pass)); + if(s == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } sprintf(s, "{\"id\": 2, \"method\": \"mining.authorize\", \"params\": [\"%s\", \"%s\"]}", - user, pass); + user, pass); - if (!stratum_send_line(sctx, s)) + if(!stratum_send_line(sctx, s)) + { + applog(LOG_ERR, "Error: couldn't send stratum authorization request"); goto out; + } - while (1) { + while(1) + { sret = stratum_recv_line(sctx); - if (!sret) + if(!sret) goto out; - if (!stratum_handle_method(sctx, sret)) + if(!stratum_handle_method(sctx, sret)) break; free(sret); } val = JSON_LOADS(sret, &err); free(sret); - if (!val) { + if(!val) + { applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); goto out; } - - if (json_integer_value(json_object_get(val, "id")) != 2) { + if(json_integer_value(json_object_get(val, "id")) != 2) + { applog(LOG_WARNING, "Stratum authorize answer id is not correct!"); } res_val = json_object_get(val, "result"); err_val = json_object_get(val, "error"); - - if (!res_val || json_is_false(res_val) || - (err_val && !json_is_null(err_val))) { + if(!res_val || json_is_false(res_val) || + 
(err_val && !json_is_null(err_val))) + { applog(LOG_ERR, "Stratum authentication failed"); goto out; } - sctx->tm_connected = time(NULL); ret = true; + if(extranonce) + { + // subscribe to extranonce (optional) + sprintf(s, "{\"id\": 3, \"method\": \"mining.extranonce.subscribe\", \"params\": []}"); + if(!stratum_send_line(sctx, s)) + goto out; + // reduced timeout to handle pools ignoring this method without answer (like xpool.ca) + if(!socket_full(sctx->sock, 10)) + { + if(opt_debug) + applog(LOG_DEBUG, "stratum extranonce subscribe timed out"); + goto out; + } - // subscribe to extranonce (optional) - sprintf(s, "{\"id\": 3, \"method\": \"mining.extranonce.subscribe\", \"params\": []}"); - - if (!stratum_send_line(sctx, s)) - goto out; - - // reduced timeout to handle pools ignoring this method without answer (like xpool.ca) - if (!socket_full(sctx->sock, 1)) { - if (opt_debug) - applog(LOG_DEBUG, "stratum extranonce subscribe timed out"); - goto out; - } - - sret = stratum_recv_line(sctx); - if (sret) { - json_t *extra = JSON_LOADS(sret, &err); - if (!extra) { - applog(LOG_WARNING, "JSON decode failed(%d): %s", err.line, err.text); - } else { - if (json_integer_value(json_object_get(extra, "id")) != 3) { - // we receive a standard method if extranonce is ignored - if (!stratum_handle_method(sctx, sret)) - applog(LOG_WARNING, "Stratum extranonce answer id was not correct!"); - } else { - res_val = json_object_get(extra, "result"); - if (opt_debug && (!res_val || json_is_false(res_val))) - applog(LOG_DEBUG, "extranonce subscribe not supported"); + sret = stratum_recv_line(sctx); + if(sret) + { + json_t *extra = JSON_LOADS(sret, &err); + if(!extra) + { + applog(LOG_WARNING, "JSON decode failed(%d): %s", err.line, err.text); + } + else + { + if(json_integer_value(json_object_get(extra, "id")) != 3) + { + // we receive a standard method if extranonce is ignored + if(!stratum_handle_method(sctx, sret)) + applog(LOG_WARNING, "Stratum extranonce answer id was not 
correct!"); + } + else + { + res_val = json_object_get(extra, "result"); + if(opt_debug && (!res_val || json_is_false(res_val))) + applog(LOG_DEBUG, "extranonce subscribe not supported"); + } + json_decref(extra); } - json_decref(extra); + free(sret); } - free(sret); } out: free(s); - if (val) + if(val) json_decref(val); return ret; } /** - * Extract bloc height L H... here len=3, height=0x1333e8 - * "...0000000000ffffffff2703e83313062f503253482f043d61105408" - */ +* Extract block height L H... here len=3, height=0x1333e8 +* "...0000000000ffffffff2703e83313062f503253482f043d61105408" +*/ static uint32_t getblocheight(struct stratum_ctx *sctx) { uint32_t height = 0; uint8_t hlen = 0, *p, *m; // find 0xffff tag - p = (uint8_t*) sctx->job.coinbase + 32; + p = (uint8_t*)sctx->job.coinbase + 32; m = p + 128; - while (*p != 0xff && p < m) p++; - while (*p == 0xff && p < m) p++; - if (*(p-1) == 0xff && *(p-2) == 0xff) { + while(*p != 0xff && p < m) p++; + while(*p == 0xff && p < m) p++; + if(*(p - 1) == 0xff && *(p - 2) == 0xff) + { p++; hlen = *p; p++; height = le16dec(p); p += 2; - switch (hlen) { - case 4: - height += 0x10000UL * le16dec(p); - break; - case 3: - height += 0x10000UL * (*p); - break; + switch(hlen) + { + case 4: + height += 0x10000UL * le16dec(p); + break; + case 3: + height += 0x10000UL * (*p); + break; } } return height; @@ -1172,71 +1429,117 @@ static uint32_t getblocheight(struct stratum_ctx *sctx) static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) { - const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *stime, *nreward; + const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *nreward; + char *stime; size_t coinb1_size, coinb2_size; bool clean, ret = false; int merkle_count, i; json_t *merkle_arr; - uchar **merkle; - int ntime; + uchar **merkle = NULL; + int32_t ntime; job_id = json_string_value(json_array_get(params, 0)); prevhash = json_string_value(json_array_get(params, 1)); coinb1 = 
json_string_value(json_array_get(params, 2)); coinb2 = json_string_value(json_array_get(params, 3)); merkle_arr = json_array_get(params, 4); - if (!merkle_arr || !json_is_array(merkle_arr)) + if(!merkle_arr || !json_is_array(merkle_arr)) goto out; - merkle_count = json_array_size(merkle_arr); - version = json_string_value(json_array_get(params, 5)); + merkle_count = (int)json_array_size(merkle_arr); + if(opt_algo != ALGO_SIA) + version = json_string_value(json_array_get(params, 5)); + else + version = "00000001"; //unused nbits = json_string_value(json_array_get(params, 6)); - stime = json_string_value(json_array_get(params, 7)); + stime = (char *)json_string_value(json_array_get(params, 7)); clean = json_is_true(json_array_get(params, 8)); nreward = json_string_value(json_array_get(params, 9)); - if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !stime || - strlen(prevhash) != 64 || strlen(version) != 8 || - strlen(nbits) != 8 || strlen(stime) != 8) { + if(!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !stime || + strlen(prevhash) != 64 || strlen(version) != 8 || strlen(nbits) != 8) + { applog(LOG_ERR, "Stratum notify: invalid parameters"); goto out; } + if(opt_algo == ALGO_SIA) + { + if(strlen(stime) != 16) + { + applog(LOG_ERR, "Stratum notify: invalid time parameter"); + goto out; + } + } + else + { + if(strlen(stime) != 8) + { + applog(LOG_ERR, "Stratum notify: invalid time parameter"); + goto out; + } + } /* store stratum server time diff */ hex2bin((uchar *)&ntime, stime, 4); - ntime = swab32(ntime) - (uint32_t) time(0); - if (ntime > sctx->srvtime_diff) { + if(opt_algo!=ALGO_SIA) + ntime = swab32(ntime) - (uint32_t)time(0); + else + ntime = ntime - (uint32_t)time(0); + + pthread_mutex_lock(&sctx->work_lock); + + if(ntime > sctx->srvtime_diff) + { sctx->srvtime_diff = ntime; - if (!opt_quiet && ntime > 20) + if(!opt_quiet && ntime > 20) applog(LOG_DEBUG, "stratum time is at least %ds in the future", ntime); } - 
merkle = (uchar**) malloc(merkle_count * sizeof(char *)); - for (i = 0; i < merkle_count; i++) { + if(merkle_count) + { + merkle = (uchar**)malloc(merkle_count * sizeof(char *)); + if(merkle == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } + } + for(i = 0; i < merkle_count; i++) + { const char *s = json_string_value(json_array_get(merkle_arr, i)); - if (!s || strlen(s) != 64) { - while (i--) + if(!s || strlen(s) != 64) + { + while(i--) free(merkle[i]); free(merkle); applog(LOG_ERR, "Stratum notify: invalid Merkle branch"); + pthread_mutex_unlock(&sctx->work_lock); goto out; } - merkle[i] = (uchar*) malloc(32); + merkle[i] = (uchar*)malloc(32); + if(merkle[i] == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } hex2bin(merkle[i], s, 32); } - pthread_mutex_lock(&sctx->work_lock); - coinb1_size = strlen(coinb1) / 2; coinb2_size = strlen(coinb2) / 2; sctx->job.coinbase_size = coinb1_size + sctx->xnonce1_size + - sctx->xnonce2_size + coinb2_size; + sctx->xnonce2_size + coinb2_size; - sctx->job.coinbase = (uchar*) realloc(sctx->job.coinbase, sctx->job.coinbase_size); + sctx->job.coinbase = (uchar*)realloc(sctx->job.coinbase, sctx->job.coinbase_size); + if(sctx->job.coinbase == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } sctx->job.xnonce2 = sctx->job.coinbase + coinb1_size + sctx->xnonce1_size; hex2bin(sctx->job.coinbase, coinb1, coinb1_size); memcpy(sctx->job.coinbase + coinb1_size, sctx->xnonce1, sctx->xnonce1_size); - if (!sctx->job.job_id || strcmp(sctx->job.job_id, job_id)) + if(!sctx->job.job_id || strcmp(sctx->job.job_id, job_id)) memset(sctx->job.xnonce2, 0, sctx->xnonce2_size); hex2bin(sctx->job.xnonce2 + sctx->xnonce2_size, coinb2, coinb2_size); @@ -1244,9 +1547,12 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) sctx->job.job_id = strdup(job_id); hex2bin(sctx->job.prevhash, prevhash, 32); - sctx->job.height = getblocheight(sctx); + if(opt_algo != ALGO_SIA) + 
sctx->job.height = getblocheight(sctx); + else + sctx->job.height = 1; - for (i = 0; i < sctx->job.merkle_count; i++) + for(i = 0; i < sctx->job.merkle_count; i++) free(sctx->job.merkle[i]); free(sctx->job.merkle); sctx->job.merkle = merkle; @@ -1272,12 +1578,13 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) return ret; } +extern time_t g_work_time; static bool stratum_set_difficulty(struct stratum_ctx *sctx, json_t *params) { double diff; diff = json_number_value(json_array_get(params, 0)); - if (diff <= 0.0) + if(diff <= 0.0) return false; pthread_mutex_lock(&sctx->work_lock); @@ -1285,9 +1592,11 @@ static bool stratum_set_difficulty(struct stratum_ctx *sctx, json_t *params) pthread_mutex_unlock(&sctx->work_lock); /* store for api stats */ - if (diff != global_diff) { + if(diff != global_diff) + { global_diff = diff; applog(LOG_WARNING, "Stratum difficulty set to %g", diff); + g_work_time = 0; } return true; @@ -1301,15 +1610,20 @@ static bool stratum_reconnect(struct stratum_ctx *sctx, json_t *params) host = json_string_value(json_array_get(params, 0)); port_val = json_array_get(params, 1); - if (json_is_string(port_val)) + if(json_is_string(port_val)) port = atoi(json_string_value(port_val)); else - port = (int) json_integer_value(port_val); - if (!host || !port) + port = (int)json_integer_value(port_val); + if(!host || !port) return false; - + free(sctx->url); sctx->url = (char*)malloc(32 + strlen(host)); + if(sctx->url == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } sprintf(sctx->url, "stratum+tcp://%s:%d", host, port); applog(LOG_NOTICE, "Server requested reconnection to %s", sctx->url); @@ -1318,20 +1632,165 @@ static bool stratum_reconnect(struct stratum_ctx *sctx, json_t *params) return true; } +static bool stratum_pong(struct stratum_ctx *sctx, json_t *id) +{ + char buf[64]; + bool ret = false; + + if(!id || json_is_null(id)) + return ret; + + sprintf(buf, "{\"id\":%d,\"result\":\"pong\",\"error\":null}", 
+ (int)json_integer_value(id)); + ret = stratum_send_line(sctx, buf); + + return ret; +} + +static bool stratum_get_algo(struct stratum_ctx *sctx, json_t *id, json_t *params) +{ + char algo[64] = {0}; + char *s; + json_t *val; + bool ret = true; + + if(!id || json_is_null(id)) + return false; + + get_currentalgo(algo, sizeof(algo)); + + val = json_object(); + json_object_set(val, "id", id); + json_object_set_new(val, "error", json_null()); + json_object_set_new(val, "result", json_string(algo)); + + s = json_dumps(val, 0); + ret = stratum_send_line(sctx, s); + json_decref(val); + free(s); + + return ret; +} + +#include "nvml.h" +extern char driver_version[32]; +extern int cuda_arch[MAX_GPUS]; + +static bool json_object_set_error(json_t *result, int code, const char *msg) +{ + json_t *val = json_object(); + json_object_set_new(val, "code", json_integer(code)); + json_object_set_new(val, "message", json_string(msg)); + return json_object_set_new(result, "error", val) != -1; +} + +/* allow to report algo/device perf to the pool for algo stats */ +static bool stratum_benchdata(json_t *result, json_t *params, int thr_id) +{ + char algo[64] = {0}; + char vid[32], arch[8], driver[32]; + char *card; + char os[8]; + uint32_t watts = 0; + int dev_id = device_map[thr_id]; + int cuda_ver = cuda_version(); + struct cgpu_info *cgpu = &thr_info[thr_id].gpu; + json_t *val; + + if(!cgpu || !opt_stratum_stats) return false; + +#if defined(WIN32) && (defined(_M_X64) || defined(__x86_64__)) + strcpy(os, "win64"); +#else + strcpy(os, is_windows() ? "win32" : "linux"); +#endif + +#ifdef USE_WRAPNVML + cgpu->has_monitoring = true; + cgpu->gpu_power = gpu_power(cgpu); // mWatts + watts = (cgpu->gpu_power >= 1000) ? 
cgpu->gpu_power / 1000 : 0; // ignore nvapi % + gpu_info(cgpu); +#endif + cuda_gpu_clocks(cgpu); + get_currentalgo(algo, sizeof(algo)); + + card = device_name[dev_id]; + cgpu->khashes = stats_get_speed(thr_id, 0.0) / 1000.0; + + sprintf(vid, "%04hx:%04hx", cgpu->gpu_vid, cgpu->gpu_pid); + sprintf(arch, "%d", (int)cgpu->gpu_arch); + if(cuda_arch[dev_id] > 0 && cuda_arch[dev_id] != cgpu->gpu_arch) + { + // if binary was not compiled for the highest cuda arch, add it + snprintf(arch, 8, "%d@%d", (int)cgpu->gpu_arch, cuda_arch[dev_id]); + } + snprintf(driver, 32, "CUDA %d.%d %s", cuda_ver / 1000, (cuda_ver % 1000) / 10, driver_version); + driver[31] = '\0'; + + val = json_object(); + json_object_set_new(val, "algo", json_string(algo)); + json_object_set_new(val, "type", json_string("gpu")); + json_object_set_new(val, "device", json_string(card)); + json_object_set_new(val, "vendorid", json_string(vid)); + json_object_set_new(val, "arch", json_string(arch)); + json_object_set_new(val, "freq", json_integer(cgpu->gpu_clock / 1000)); + json_object_set_new(val, "memf", json_integer(cgpu->gpu_memclock / 1000)); + json_object_set_new(val, "power", json_integer(watts)); + json_object_set_new(val, "khashes", json_real(cgpu->khashes)); + json_object_set_new(val, "intensity", json_real(cgpu->intensity)); + json_object_set_new(val, "throughput", json_integer(cgpu->throughput)); + json_object_set_new(val, "client", json_string(PACKAGE_NAME "/" PACKAGE_VERSION)); + json_object_set_new(val, "os", json_string(os)); + json_object_set_new(val, "driver", json_string(driver)); + + json_object_set_new(result, "result", val); + + return true; +} + +static bool stratum_get_stats(struct stratum_ctx *sctx, json_t *id, json_t *params) +{ + char *s; + json_t *val; + bool ret; + + if(!id || json_is_null(id)) + return false; + + val = json_object(); + json_object_set(val, "id", id); + + ret = stratum_benchdata(val, params, 0); + + if(!ret) + { + json_object_set_error(val, 1, "disabled"); //EPERM + 
} + else + { + json_object_set_new(val, "error", json_null()); + } + s = json_dumps(val, 0); + ret = stratum_send_line(sctx, s); + json_decref(val); + free(s); + + return ret; +} static bool stratum_get_version(struct stratum_ctx *sctx, json_t *id) { char *s; json_t *val; bool ret; - - if (!id || json_is_null(id)) + + if(!id || json_is_null(id)) return false; val = json_object(); json_object_set(val, "id", id); - json_object_set_new(val, "error", json_null()); json_object_set_new(val, "result", json_string(USER_AGENT)); + json_object_set_new(val, "error", json_null()); s = json_dumps(val, 0); ret = stratum_send_line(sctx, s); json_decref(val); @@ -1347,10 +1806,10 @@ static bool stratum_show_message(struct stratum_ctx *sctx, json_t *id, json_t *p bool ret; val = json_array_get(params, 0); - if (val) + if(val) applog(LOG_NOTICE, "MESSAGE FROM SERVER: %s", json_string_value(val)); - - if (!id || json_is_null(id)) + + if(!id || json_is_null(id)) return true; val = json_object(); @@ -1364,6 +1823,27 @@ static bool stratum_show_message(struct stratum_ctx *sctx, json_t *id, json_t *p return ret; } +static bool stratum_unknown_method(struct stratum_ctx *sctx, json_t *id) +{ + char *s; + json_t *val; + bool ret = false; + + if(!id || json_is_null(id)) + return ret; + + val = json_object(); + json_object_set(val, "id", id); + json_object_set_new(val, "result", json_false()); + json_object_set_error(val, 38, "unknown method"); // ENOSYS + + s = json_dumps(val, 0); + ret = stratum_send_line(sctx, s); + json_decref(val); + free(s); + + return ret; +} bool stratum_handle_method(struct stratum_ctx *sctx, const char *s) { @@ -1373,44 +1853,75 @@ bool stratum_handle_method(struct stratum_ctx *sctx, const char *s) bool ret = false; val = JSON_LOADS(s, &err); - if (!val) { + if(!val) + { applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); goto out; } method = json_string_value(json_object_get(val, "method")); - if (!method) + if(!method) goto out; id = 
json_object_get(val, "id"); params = json_object_get(val, "params"); - if (!strcasecmp(method, "mining.notify")) { + if(!strcasecmp(method, "mining.notify")) + { ret = stratum_notify(sctx, params); goto out; } - if (!strcasecmp(method, "mining.set_difficulty")) { + if(!strcasecmp(method, "mining.ping")) + { // cgminer 4.7.1+ + if(opt_debug) applog(LOG_DEBUG, "Pool ping"); + ret = stratum_pong(sctx, id); + goto out; + } + if(!strcasecmp(method, "mining.set_difficulty")) + { ret = stratum_set_difficulty(sctx, params); goto out; } - if (!strcasecmp(method, "mining.set_extranonce")) { + if(!strcasecmp(method, "mining.set_extranonce")) + { ret = stratum_parse_extranonce(sctx, params, 0); goto out; } - if (!strcasecmp(method, "client.reconnect")) { + if(!strcasecmp(method, "client.reconnect")) + { ret = stratum_reconnect(sctx, params); goto out; } - if (!strcasecmp(method, "client.get_version")) { + if(!strcasecmp(method, "client.get_algo")) + { // ccminer only yet! + // will prevent wrong algo parameters on a pool, will be used as test on rejects + if(!opt_quiet) applog(LOG_NOTICE, "Pool asked your algo parameter"); + ret = stratum_get_algo(sctx, id, params); + goto out; + } + if(!strcasecmp(method, "client.get_stats")) + { // ccminer/yiimp only yet! + // optional to fill device benchmarks + ret = stratum_get_stats(sctx, id, params); + goto out; + } + if(!strcasecmp(method, "client.get_version")) + { ret = stratum_get_version(sctx, id); goto out; } - if (!strcasecmp(method, "client.show_message")) { + if(!strcasecmp(method, "client.show_message")) + { ret = stratum_show_message(sctx, id, params); goto out; } - + if(!ret) + { + // don't fail = disconnect stratum on unknown (and optional?) 
methods + if(opt_debug) applog(LOG_WARNING, "unknown stratum method %s!", method); + ret = stratum_unknown_method(sctx, id); + } out: - if (val) + if(val) json_decref(val); return ret; @@ -1421,8 +1932,11 @@ struct thread_q *tq_new(void) struct thread_q *tq; tq = (struct thread_q *)calloc(1, sizeof(*tq)); - if (!tq) - return NULL; + if(tq == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } INIT_LIST_HEAD(&tq->q); pthread_mutex_init(&tq->mutex, NULL); @@ -1435,10 +1949,11 @@ void tq_free(struct thread_q *tq) { struct tq_ent *ent, *iter; - if (!tq) + if(!tq) return; - list_for_each_entry_safe(ent, iter, &tq->q, q_node, struct tq_ent, struct tq_ent) { + list_for_each_entry_safe(ent, iter, &tq->q, q_node, struct tq_ent, struct tq_ent) + { list_del(&ent->q_node); free(ent); } @@ -1476,17 +1991,23 @@ bool tq_push(struct thread_q *tq, void *data) bool rc = true; ent = (struct tq_ent *)calloc(1, sizeof(*ent)); - if (!ent) - return false; + if(ent == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } ent->data = data; INIT_LIST_HEAD(&ent->q_node); pthread_mutex_lock(&tq->mutex); - if (!tq->frozen) { + if(!tq->frozen) + { list_add_tail(&ent->q_node, &tq->q); - } else { + } + else + { free(ent); rc = false; } @@ -1505,16 +2026,16 @@ void *tq_pop(struct thread_q *tq, const struct timespec *abstime) pthread_mutex_lock(&tq->mutex); - if (!list_empty(&tq->q)) + if(!list_empty(&tq->q)) goto pop; - if (abstime) + if(abstime) rc = pthread_cond_timedwait(&tq->cond, &tq->mutex, abstime); else rc = pthread_cond_wait(&tq->cond, &tq->mutex); - if (rc) + if(rc) goto out; - if (list_empty(&tq->q)) + if(list_empty(&tq->q)) goto out; pop: @@ -1530,9 +2051,9 @@ void *tq_pop(struct thread_q *tq, const struct timespec *abstime) } /** - * @param buf char[9] mini - * @param time_t timer to convert - */ +* @param buf char[9] mini +* @param time_t timer to convert +*/ size_t time2str(char* buf, time_t timer) { struct tm* tm_info; @@ -1541,12 +2062,17 @@ size_t 
time2str(char* buf, time_t timer) } /** - * Alloc and returns time string (to be freed) - * @param time_t timer to convert - */ +* Alloc and returns time string (to be freed) +* @param time_t timer to convert +*/ char* atime2str(time_t timer) { - char* buf = (char*) malloc(16); + char* buf = (char*)malloc(16); + if(buf == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } memset(buf, 0, 16); time2str(buf, timer); return buf; @@ -1556,9 +2082,10 @@ char* atime2str(time_t timer) static char* format_hash(char* buf, uchar *hash) { int len = 0; - for (int i=0; i < 32; i += 4) { - len += sprintf(buf+len, "%02x%02x%02x%02x ", - hash[i], hash[i+1], hash[i+2], hash[i+3]); + for(int i = 0; i < 32; i += 4) + { + len += sprintf(buf + len, "%02x%02x%02x%02x ", + hash[i], hash[i + 1], hash[i + 2], hash[i + 3]); } return buf; } @@ -1568,10 +2095,11 @@ extern void applog_compare_hash(uchar *hash, uchar *hash2) { char s[256] = ""; int len = 0; - for (int i=0; i < 32; i += 4) { - const char *color = memcmp(hash+i, hash2+i, 4) ? CL_WHT : CL_GRY; - len += sprintf(s+len, "%s%02x%02x%02x%02x " CL_GRY, color, - hash[i], hash[i+1], hash[i+2], hash[i+3]); + for(int i = 0; i < 32; i += 4) + { + const char *color = memcmp(hash + i, hash2 + i, 4) ? 
CL_WHT : CL_GRY; + len += sprintf(s + len, "%s%02x%02x%02x%02x " CL_GRY, color, + hash[i], hash[i + 1], hash[i + 2], hash[i + 3]); s[len] = '\0'; } applog(LOG_DEBUG, "%s", s); @@ -1579,7 +2107,7 @@ extern void applog_compare_hash(uchar *hash, uchar *hash2) extern void applog_hash(uchar *hash) { - char s[128] = {'\0'}; + char s[128] = { '\0' }; applog(LOG_DEBUG, "%s", format_hash(s, hash)); } @@ -1600,11 +2128,13 @@ void do_gpu_tests(void) tgt[7] = 0xffff; memset(buf, 0, sizeof buf); + scanhash_x11(0, (uint32_t*)buf, tgt, 1, &done); + + //memset(buf, 0, sizeof buf); // buf[0] = 1; buf[64] = 2; // for endian tests scanhash_blake256(0, (uint32_t*)buf, tgt, 1, &done, 14); memset(buf, 0, sizeof buf); - scanhash_heavy(0, (uint32_t*)buf, tgt, 1, &done, 1, 84); // HEAVYCOIN_BLKHDR_SZ=84 free(work_restart); work_restart = NULL; @@ -1614,17 +2144,13 @@ void do_gpu_tests(void) void print_hash_tests(void) { - char s[128] = {'\0'}; + char s[128] = { '\0' }; uchar buf[128], hash[128]; memset(buf, 0, sizeof buf); // buf[0] = 1; buf[64] = 2; // for endian tests printf(CL_WHT "CPU HASH ON EMPTY BUFFER RESULTS:" CL_N "\n"); - memset(hash, 0, sizeof hash); - animehash(&hash[0], &buf[0]); - printpfx("anime", hash); - memset(hash, 0, sizeof hash); blake256hash(&hash[0], &buf[0], 8); printpfx("blakecoin", hash); @@ -1649,10 +2175,6 @@ void print_hash_tests(void) groestlhash(&hash[0], &buf[0]); printpfx("groestl", hash); - memset(hash, 0, sizeof hash); - heavycoin_hash(&hash[0], &buf[0], 32); - printpfx("heavy", hash); - memset(hash, 0, sizeof hash); jackpothash(&hash[0], &buf[0]); printpfx("jackpot", hash); @@ -1665,10 +2187,6 @@ void print_hash_tests(void) doomhash(&hash[0], &buf[0]); printpfx("luffa", hash); - memset(hash, 0, sizeof hash); - lyra2_hash(&hash[0], &buf[0]); - printpfx("lyra2", hash); - memset(hash, 0, sizeof hash); myriadhash(&hash[0], &buf[0]); printpfx("myriad", hash); @@ -1689,6 +2207,9 @@ void print_hash_tests(void) qubithash(&hash[0], &buf[0]); printpfx("qubit", 
hash); + skeincoinhash(&hash[0], &buf[0]); + printpfx("skein", hash); + memset(hash, 0, sizeof hash); s3hash(&hash[0], &buf[0]); printpfx("S3", hash); @@ -1721,3 +2242,27 @@ void print_hash_tests(void) do_gpu_tests(); } + +void bin2hex(char *s, const unsigned char *p, size_t len) +{ + for(size_t i = 0; i < len; i++) + sprintf(s + (i * 2), "%02x", (unsigned int)p[i]); +} + +char *abin2hex(const unsigned char *p, size_t len) +{ + char *s = (char*)malloc((len * 2) + 1); + if(s == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } + bin2hex(s, p, len); + return s; +} +void applog_hex(void *data, int len) +{ + char* hex = abin2hex((uchar*)data, len); + applog(LOG_INFO, "%s", hex); + free(hex); +} diff --git a/x11/c11.cu b/x11/c11.cu new file mode 100644 index 0000000000..47021b1eca --- /dev/null +++ b/x11/c11.cu @@ -0,0 +1,266 @@ +extern "C" +{ +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" + +#include "sph/sph_luffa.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_echo.h" +} + +#include "miner.h" +//#include +//#include +#include "cuda_helper.h" + +#include +#include + +extern void quark_blake512_cpu_init(int thr_id); +extern void quark_blake512_cpu_setBlock_80(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_setBlock_80_multi(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void quark_blake512_cpu_hash_80_multi(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); + +extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads); +extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); + +extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); +extern void 
quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); + +extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); +extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); + +extern void quark_keccak512_cpu_init(int thr_id, uint32_t threads); +extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); + +extern void cuda_jh512Keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); + +extern void x11_luffaCubehash512_cpu_init(int thr_id, uint32_t threads); +extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); + +extern void x11_shavite512_cpu_init(int thr_id, uint32_t threads); +extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); + +extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); +extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const uint32_t simdthreads); + +extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); +extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void x11_echo512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *d_hash, uint32_t target, uint32_t *h_found); +extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); + +extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads); +extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *inpHashes, + const uint32_t *d_noncesTrue, uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse); + +extern "C" void c11hash(void *output, const void *input) +{ + // 
blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11 + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + sph_luffa512_context ctx_luffa; + sph_cubehash512_context ctx_cubehash; + sph_shavite512_context ctx_shavite; + sph_simd512_context ctx_simd; + sph_echo512_context ctx_echo; + + unsigned char hash[128]; + memset(hash, 0, sizeof hash); + + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, input, 80); + sph_blake512_close(&ctx_blake, (void*)hash); + + sph_bmw512_init(&ctx_bmw); + sph_bmw512(&ctx_bmw, (const void*)hash, 64); + sph_bmw512_close(&ctx_bmw, (void*)hash); + + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, (const void*)hash, 64); + sph_groestl512_close(&ctx_groestl, (void*)hash); + + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, (const void*)hash, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, (const void*)hash, 64); + sph_keccak512_close(&ctx_keccak, (void*)hash); + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, (const void*)hash, 64); + sph_skein512_close(&ctx_skein, (void*)hash); + + sph_luffa512_init(&ctx_luffa); + sph_luffa512(&ctx_luffa, (const void*)hash, 64); + sph_luffa512_close(&ctx_luffa, (void*)hash); + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, (const void*)hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*)hash); + + sph_shavite512_init(&ctx_shavite); + sph_shavite512(&ctx_shavite, (const void*)hash, 64); + sph_shavite512_close(&ctx_shavite, (void*)hash); + + sph_simd512_init(&ctx_simd); + sph_simd512(&ctx_simd, (const void*)hash, 64); + sph_simd512_close(&ctx_simd, (void*)hash); + + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + memcpy(output, hash, 32); 
+} + +static THREAD uint32_t *d_hash = nullptr; + +int scanhash_c11(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) +{ + uint32_t foundnonces[2]; + const uint32_t first_nonce = pdata[19]; + + cudaDeviceProp props; + CUDA_SAFE_CALL(cudaGetDeviceProperties(&props, device_map[thr_id])); + static THREAD uint32_t throughputmax; + + if(opt_benchmark) + ptarget[7] = 0x4f; + + static THREAD bool init = false; + if(!init) + { + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + CUDA_SAFE_CALL(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync)); + CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1)); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + get_cuda_arch(&cuda_arch[thr_id]); + + unsigned int intensity; +#if defined WIN32 && !defined _WIN64 + intensity = 256 * 256 * 16; +#else + if(strstr(props.name, "970")) intensity = (256 * 256 * 22); + else if(strstr(props.name, "980")) intensity = (256 * 256 * 22); + else if(strstr(props.name, "1070")) intensity = (256 * 256 * 22); + else if(strstr(props.name, "1080")) intensity = (256 * 256 * 22); + else if(strstr(props.name, "750 Ti")) intensity = (256 * 256 * 20); + else if(strstr(props.name, "750")) intensity = (256 * 256 * 19); + else if(strstr(props.name, "960")) intensity = (256 * 256 * 19); + else intensity = (256 * 256 * 19); +#endif + throughputmax = device_intensity(device_map[thr_id], __func__, intensity); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (64 * sizeof(uint4))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif + + quark_groestl512_cpu_init(thr_id, throughputmax); + quark_bmw512_cpu_init(thr_id, throughputmax); + x11_echo512_cpu_init(thr_id, throughputmax); + x11_simd512_cpu_init(thr_id, throughputmax); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * 4 * throughputmax)); + 
mining_has_stopped[thr_id] = false; + init = true; + } + uint32_t throughput = min(throughputmax, max_nonce - first_nonce) & 0xfffffc00; + uint32_t simdthreads = (device_sm[device_map[thr_id]] > 500) ? 256 : 32; + + uint32_t endiandata[20]; + for(int k = 0; k < 20; k++) + be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + + quark_blake512_cpu_setBlock_80(thr_id, (uint64_t *)endiandata); + + do + { + + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + cuda_jh512Keccak512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash, simdthreads); + x11_echo512_cpu_hash_64_final(thr_id, throughput, pdata[19], d_hash, ptarget[7], foundnonces); + cudaStreamSynchronize(gpustream[thr_id]); + if(stop_mining) + { + mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr); + } + if (foundnonces[0] != 0xffffffff) + { + const uint32_t Htarg = ptarget[7]; + uint32_t vhash64[8]={0}; + if(opt_verify) + { + be32enc(&endiandata[19], foundnonces[0]); + c11hash(vhash64, endiandata); + } + if(vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + int res = 1; + *hashes_done = pdata[19] - first_nonce + throughput; + if(foundnonces[1] != 0xffffffff) + { + if(opt_verify) + { + be32enc(&endiandata[19], foundnonces[1]); + c11hash(vhash64, endiandata); + } + if(vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + pdata[21] = foundnonces[1]; + res++; + if(opt_benchmark) + applog(LOG_INFO, "GPU #%d: Found second nonce %08x", thr_id, foundnonces[1]); + } + else + { + if(vhash64[7] != Htarg) + { + applog(LOG_INFO, 
"GPU #%d: result for %08x does not validate on CPU!", thr_id, foundnonces[1]); + } + } + } + pdata[19] = foundnonces[0]; + if(opt_benchmark) + applog(LOG_INFO, "GPU #%d: Found nonce %08x", thr_id, foundnonces[0]); + return res; + } + else + { + if(vhash64[7] != Htarg) + { + applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundnonces[0]); + } + } + } + pdata[19] += throughput; + } while(!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); + + *hashes_done = pdata[19] - first_nonce ; + return 0; +} diff --git a/x11/cuda_x11_aes.cu b/x11/cuda_x11_aes.cu index 45a7fde991..c99f1bfbc3 100644 --- a/x11/cuda_x11_aes.cu +++ b/x11/cuda_x11_aes.cu @@ -1,353 +1,154 @@ - -/* AES Helper for inline-usage from SPH */ -#define AESx(x) SPH_C32(x) - +#include "cuda_helper.h" __constant__ __align__(64) uint32_t d_AES0[256] = { - AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6), - AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591), - AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56), - AESx(0x19FEFEE7), AESx(0x62D7D7B5), AESx(0xE6ABAB4D), AESx(0x9A7676EC), - AESx(0x45CACA8F), AESx(0x9D82821F), AESx(0x40C9C989), AESx(0x877D7DFA), - AESx(0x15FAFAEF), AESx(0xEB5959B2), AESx(0xC947478E), AESx(0x0BF0F0FB), - AESx(0xECADAD41), AESx(0x67D4D4B3), AESx(0xFDA2A25F), AESx(0xEAAFAF45), - AESx(0xBF9C9C23), AESx(0xF7A4A453), AESx(0x967272E4), AESx(0x5BC0C09B), - AESx(0xC2B7B775), AESx(0x1CFDFDE1), AESx(0xAE93933D), AESx(0x6A26264C), - AESx(0x5A36366C), AESx(0x413F3F7E), AESx(0x02F7F7F5), AESx(0x4FCCCC83), - AESx(0x5C343468), AESx(0xF4A5A551), AESx(0x34E5E5D1), AESx(0x08F1F1F9), - AESx(0x937171E2), AESx(0x73D8D8AB), AESx(0x53313162), AESx(0x3F15152A), - AESx(0x0C040408), AESx(0x52C7C795), AESx(0x65232346), AESx(0x5EC3C39D), - AESx(0x28181830), AESx(0xA1969637), AESx(0x0F05050A), AESx(0xB59A9A2F), - AESx(0x0907070E), AESx(0x36121224), AESx(0x9B80801B), 
AESx(0x3DE2E2DF), - AESx(0x26EBEBCD), AESx(0x6927274E), AESx(0xCDB2B27F), AESx(0x9F7575EA), - AESx(0x1B090912), AESx(0x9E83831D), AESx(0x742C2C58), AESx(0x2E1A1A34), - AESx(0x2D1B1B36), AESx(0xB26E6EDC), AESx(0xEE5A5AB4), AESx(0xFBA0A05B), - AESx(0xF65252A4), AESx(0x4D3B3B76), AESx(0x61D6D6B7), AESx(0xCEB3B37D), - AESx(0x7B292952), AESx(0x3EE3E3DD), AESx(0x712F2F5E), AESx(0x97848413), - AESx(0xF55353A6), AESx(0x68D1D1B9), AESx(0x00000000), AESx(0x2CEDEDC1), - AESx(0x60202040), AESx(0x1FFCFCE3), AESx(0xC8B1B179), AESx(0xED5B5BB6), - AESx(0xBE6A6AD4), AESx(0x46CBCB8D), AESx(0xD9BEBE67), AESx(0x4B393972), - AESx(0xDE4A4A94), AESx(0xD44C4C98), AESx(0xE85858B0), AESx(0x4ACFCF85), - AESx(0x6BD0D0BB), AESx(0x2AEFEFC5), AESx(0xE5AAAA4F), AESx(0x16FBFBED), - AESx(0xC5434386), AESx(0xD74D4D9A), AESx(0x55333366), AESx(0x94858511), - AESx(0xCF45458A), AESx(0x10F9F9E9), AESx(0x06020204), AESx(0x817F7FFE), - AESx(0xF05050A0), AESx(0x443C3C78), AESx(0xBA9F9F25), AESx(0xE3A8A84B), - AESx(0xF35151A2), AESx(0xFEA3A35D), AESx(0xC0404080), AESx(0x8A8F8F05), - AESx(0xAD92923F), AESx(0xBC9D9D21), AESx(0x48383870), AESx(0x04F5F5F1), - AESx(0xDFBCBC63), AESx(0xC1B6B677), AESx(0x75DADAAF), AESx(0x63212142), - AESx(0x30101020), AESx(0x1AFFFFE5), AESx(0x0EF3F3FD), AESx(0x6DD2D2BF), - AESx(0x4CCDCD81), AESx(0x140C0C18), AESx(0x35131326), AESx(0x2FECECC3), - AESx(0xE15F5FBE), AESx(0xA2979735), AESx(0xCC444488), AESx(0x3917172E), - AESx(0x57C4C493), AESx(0xF2A7A755), AESx(0x827E7EFC), AESx(0x473D3D7A), - AESx(0xAC6464C8), AESx(0xE75D5DBA), AESx(0x2B191932), AESx(0x957373E6), - AESx(0xA06060C0), AESx(0x98818119), AESx(0xD14F4F9E), AESx(0x7FDCDCA3), - AESx(0x66222244), AESx(0x7E2A2A54), AESx(0xAB90903B), AESx(0x8388880B), - AESx(0xCA46468C), AESx(0x29EEEEC7), AESx(0xD3B8B86B), AESx(0x3C141428), - AESx(0x79DEDEA7), AESx(0xE25E5EBC), AESx(0x1D0B0B16), AESx(0x76DBDBAD), - AESx(0x3BE0E0DB), AESx(0x56323264), AESx(0x4E3A3A74), AESx(0x1E0A0A14), - AESx(0xDB494992), AESx(0x0A06060C), AESx(0x6C242448), 
AESx(0xE45C5CB8), - AESx(0x5DC2C29F), AESx(0x6ED3D3BD), AESx(0xEFACAC43), AESx(0xA66262C4), - AESx(0xA8919139), AESx(0xA4959531), AESx(0x37E4E4D3), AESx(0x8B7979F2), - AESx(0x32E7E7D5), AESx(0x43C8C88B), AESx(0x5937376E), AESx(0xB76D6DDA), - AESx(0x8C8D8D01), AESx(0x64D5D5B1), AESx(0xD24E4E9C), AESx(0xE0A9A949), - AESx(0xB46C6CD8), AESx(0xFA5656AC), AESx(0x07F4F4F3), AESx(0x25EAEACF), - AESx(0xAF6565CA), AESx(0x8E7A7AF4), AESx(0xE9AEAE47), AESx(0x18080810), - AESx(0xD5BABA6F), AESx(0x887878F0), AESx(0x6F25254A), AESx(0x722E2E5C), - AESx(0x241C1C38), AESx(0xF1A6A657), AESx(0xC7B4B473), AESx(0x51C6C697), - AESx(0x23E8E8CB), AESx(0x7CDDDDA1), AESx(0x9C7474E8), AESx(0x211F1F3E), - AESx(0xDD4B4B96), AESx(0xDCBDBD61), AESx(0x868B8B0D), AESx(0x858A8A0F), - AESx(0x907070E0), AESx(0x423E3E7C), AESx(0xC4B5B571), AESx(0xAA6666CC), - AESx(0xD8484890), AESx(0x05030306), AESx(0x01F6F6F7), AESx(0x120E0E1C), - AESx(0xA36161C2), AESx(0x5F35356A), AESx(0xF95757AE), AESx(0xD0B9B969), - AESx(0x91868617), AESx(0x58C1C199), AESx(0x271D1D3A), AESx(0xB99E9E27), - AESx(0x38E1E1D9), AESx(0x13F8F8EB), AESx(0xB398982B), AESx(0x33111122), - AESx(0xBB6969D2), AESx(0x70D9D9A9), AESx(0x898E8E07), AESx(0xA7949433), - AESx(0xB69B9B2D), AESx(0x221E1E3C), AESx(0x92878715), AESx(0x20E9E9C9), - AESx(0x49CECE87), AESx(0xFF5555AA), AESx(0x78282850), AESx(0x7ADFDFA5), - AESx(0x8F8C8C03), AESx(0xF8A1A159), AESx(0x80898909), AESx(0x170D0D1A), - AESx(0xDABFBF65), AESx(0x31E6E6D7), AESx(0xC6424284), AESx(0xB86868D0), - AESx(0xC3414182), AESx(0xB0999929), AESx(0x772D2D5A), AESx(0x110F0F1E), - AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C) -}; - -__constant__ __align__(64) uint32_t d_AES1[256] = { - AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D), - AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154), - AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D), - AESx(0xFEFEE719), AESx(0xD7D7B562), AESx(0xABAB4DE6), AESx(0x7676EC9A), 
- AESx(0xCACA8F45), AESx(0x82821F9D), AESx(0xC9C98940), AESx(0x7D7DFA87), - AESx(0xFAFAEF15), AESx(0x5959B2EB), AESx(0x47478EC9), AESx(0xF0F0FB0B), - AESx(0xADAD41EC), AESx(0xD4D4B367), AESx(0xA2A25FFD), AESx(0xAFAF45EA), - AESx(0x9C9C23BF), AESx(0xA4A453F7), AESx(0x7272E496), AESx(0xC0C09B5B), - AESx(0xB7B775C2), AESx(0xFDFDE11C), AESx(0x93933DAE), AESx(0x26264C6A), - AESx(0x36366C5A), AESx(0x3F3F7E41), AESx(0xF7F7F502), AESx(0xCCCC834F), - AESx(0x3434685C), AESx(0xA5A551F4), AESx(0xE5E5D134), AESx(0xF1F1F908), - AESx(0x7171E293), AESx(0xD8D8AB73), AESx(0x31316253), AESx(0x15152A3F), - AESx(0x0404080C), AESx(0xC7C79552), AESx(0x23234665), AESx(0xC3C39D5E), - AESx(0x18183028), AESx(0x969637A1), AESx(0x05050A0F), AESx(0x9A9A2FB5), - AESx(0x07070E09), AESx(0x12122436), AESx(0x80801B9B), AESx(0xE2E2DF3D), - AESx(0xEBEBCD26), AESx(0x27274E69), AESx(0xB2B27FCD), AESx(0x7575EA9F), - AESx(0x0909121B), AESx(0x83831D9E), AESx(0x2C2C5874), AESx(0x1A1A342E), - AESx(0x1B1B362D), AESx(0x6E6EDCB2), AESx(0x5A5AB4EE), AESx(0xA0A05BFB), - AESx(0x5252A4F6), AESx(0x3B3B764D), AESx(0xD6D6B761), AESx(0xB3B37DCE), - AESx(0x2929527B), AESx(0xE3E3DD3E), AESx(0x2F2F5E71), AESx(0x84841397), - AESx(0x5353A6F5), AESx(0xD1D1B968), AESx(0x00000000), AESx(0xEDEDC12C), - AESx(0x20204060), AESx(0xFCFCE31F), AESx(0xB1B179C8), AESx(0x5B5BB6ED), - AESx(0x6A6AD4BE), AESx(0xCBCB8D46), AESx(0xBEBE67D9), AESx(0x3939724B), - AESx(0x4A4A94DE), AESx(0x4C4C98D4), AESx(0x5858B0E8), AESx(0xCFCF854A), - AESx(0xD0D0BB6B), AESx(0xEFEFC52A), AESx(0xAAAA4FE5), AESx(0xFBFBED16), - AESx(0x434386C5), AESx(0x4D4D9AD7), AESx(0x33336655), AESx(0x85851194), - AESx(0x45458ACF), AESx(0xF9F9E910), AESx(0x02020406), AESx(0x7F7FFE81), - AESx(0x5050A0F0), AESx(0x3C3C7844), AESx(0x9F9F25BA), AESx(0xA8A84BE3), - AESx(0x5151A2F3), AESx(0xA3A35DFE), AESx(0x404080C0), AESx(0x8F8F058A), - AESx(0x92923FAD), AESx(0x9D9D21BC), AESx(0x38387048), AESx(0xF5F5F104), - AESx(0xBCBC63DF), AESx(0xB6B677C1), AESx(0xDADAAF75), AESx(0x21214263), - 
AESx(0x10102030), AESx(0xFFFFE51A), AESx(0xF3F3FD0E), AESx(0xD2D2BF6D), - AESx(0xCDCD814C), AESx(0x0C0C1814), AESx(0x13132635), AESx(0xECECC32F), - AESx(0x5F5FBEE1), AESx(0x979735A2), AESx(0x444488CC), AESx(0x17172E39), - AESx(0xC4C49357), AESx(0xA7A755F2), AESx(0x7E7EFC82), AESx(0x3D3D7A47), - AESx(0x6464C8AC), AESx(0x5D5DBAE7), AESx(0x1919322B), AESx(0x7373E695), - AESx(0x6060C0A0), AESx(0x81811998), AESx(0x4F4F9ED1), AESx(0xDCDCA37F), - AESx(0x22224466), AESx(0x2A2A547E), AESx(0x90903BAB), AESx(0x88880B83), - AESx(0x46468CCA), AESx(0xEEEEC729), AESx(0xB8B86BD3), AESx(0x1414283C), - AESx(0xDEDEA779), AESx(0x5E5EBCE2), AESx(0x0B0B161D), AESx(0xDBDBAD76), - AESx(0xE0E0DB3B), AESx(0x32326456), AESx(0x3A3A744E), AESx(0x0A0A141E), - AESx(0x494992DB), AESx(0x06060C0A), AESx(0x2424486C), AESx(0x5C5CB8E4), - AESx(0xC2C29F5D), AESx(0xD3D3BD6E), AESx(0xACAC43EF), AESx(0x6262C4A6), - AESx(0x919139A8), AESx(0x959531A4), AESx(0xE4E4D337), AESx(0x7979F28B), - AESx(0xE7E7D532), AESx(0xC8C88B43), AESx(0x37376E59), AESx(0x6D6DDAB7), - AESx(0x8D8D018C), AESx(0xD5D5B164), AESx(0x4E4E9CD2), AESx(0xA9A949E0), - AESx(0x6C6CD8B4), AESx(0x5656ACFA), AESx(0xF4F4F307), AESx(0xEAEACF25), - AESx(0x6565CAAF), AESx(0x7A7AF48E), AESx(0xAEAE47E9), AESx(0x08081018), - AESx(0xBABA6FD5), AESx(0x7878F088), AESx(0x25254A6F), AESx(0x2E2E5C72), - AESx(0x1C1C3824), AESx(0xA6A657F1), AESx(0xB4B473C7), AESx(0xC6C69751), - AESx(0xE8E8CB23), AESx(0xDDDDA17C), AESx(0x7474E89C), AESx(0x1F1F3E21), - AESx(0x4B4B96DD), AESx(0xBDBD61DC), AESx(0x8B8B0D86), AESx(0x8A8A0F85), - AESx(0x7070E090), AESx(0x3E3E7C42), AESx(0xB5B571C4), AESx(0x6666CCAA), - AESx(0x484890D8), AESx(0x03030605), AESx(0xF6F6F701), AESx(0x0E0E1C12), - AESx(0x6161C2A3), AESx(0x35356A5F), AESx(0x5757AEF9), AESx(0xB9B969D0), - AESx(0x86861791), AESx(0xC1C19958), AESx(0x1D1D3A27), AESx(0x9E9E27B9), - AESx(0xE1E1D938), AESx(0xF8F8EB13), AESx(0x98982BB3), AESx(0x11112233), - AESx(0x6969D2BB), AESx(0xD9D9A970), AESx(0x8E8E0789), AESx(0x949433A7), - 
AESx(0x9B9B2DB6), AESx(0x1E1E3C22), AESx(0x87871592), AESx(0xE9E9C920), - AESx(0xCECE8749), AESx(0x5555AAFF), AESx(0x28285078), AESx(0xDFDFA57A), - AESx(0x8C8C038F), AESx(0xA1A159F8), AESx(0x89890980), AESx(0x0D0D1A17), - AESx(0xBFBF65DA), AESx(0xE6E6D731), AESx(0x424284C6), AESx(0x6868D0B8), - AESx(0x414182C3), AESx(0x999929B0), AESx(0x2D2D5A77), AESx(0x0F0F1E11), - AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A) + 0xA56363C6, 0x847C7CF8, 0x997777EE, 0x8D7B7BF6, + 0x0DF2F2FF, 0xBD6B6BD6, 0xB16F6FDE, 0x54C5C591, + 0x50303060, 0x03010102, 0xA96767CE, 0x7D2B2B56, + 0x19FEFEE7, 0x62D7D7B5, 0xE6ABAB4D, 0x9A7676EC, + 0x45CACA8F, 0x9D82821F, 0x40C9C989, 0x877D7DFA, + 0x15FAFAEF, 0xEB5959B2, 0xC947478E, 0x0BF0F0FB, + 0xECADAD41, 0x67D4D4B3, 0xFDA2A25F, 0xEAAFAF45, + 0xBF9C9C23, 0xF7A4A453, 0x967272E4, 0x5BC0C09B, + 0xC2B7B775, 0x1CFDFDE1, 0xAE93933D, 0x6A26264C, + 0x5A36366C, 0x413F3F7E, 0x02F7F7F5, 0x4FCCCC83, + 0x5C343468, 0xF4A5A551, 0x34E5E5D1, 0x08F1F1F9, + 0x937171E2, 0x73D8D8AB, 0x53313162, 0x3F15152A, + 0x0C040408, 0x52C7C795, 0x65232346, 0x5EC3C39D, + 0x28181830, 0xA1969637, 0x0F05050A, 0xB59A9A2F, + 0x0907070E, 0x36121224, 0x9B80801B, 0x3DE2E2DF, + 0x26EBEBCD, 0x6927274E, 0xCDB2B27F, 0x9F7575EA, + 0x1B090912, 0x9E83831D, 0x742C2C58, 0x2E1A1A34, + 0x2D1B1B36, 0xB26E6EDC, 0xEE5A5AB4, 0xFBA0A05B, + 0xF65252A4, 0x4D3B3B76, 0x61D6D6B7, 0xCEB3B37D, + 0x7B292952, 0x3EE3E3DD, 0x712F2F5E, 0x97848413, + 0xF55353A6, 0x68D1D1B9, 0x00000000, 0x2CEDEDC1, + 0x60202040, 0x1FFCFCE3, 0xC8B1B179, 0xED5B5BB6, + 0xBE6A6AD4, 0x46CBCB8D, 0xD9BEBE67, 0x4B393972, + 0xDE4A4A94, 0xD44C4C98, 0xE85858B0, 0x4ACFCF85, + 0x6BD0D0BB, 0x2AEFEFC5, 0xE5AAAA4F, 0x16FBFBED, + 0xC5434386, 0xD74D4D9A, 0x55333366, 0x94858511, + 0xCF45458A, 0x10F9F9E9, 0x06020204, 0x817F7FFE, + 0xF05050A0, 0x443C3C78, 0xBA9F9F25, 0xE3A8A84B, + 0xF35151A2, 0xFEA3A35D, 0xC0404080, 0x8A8F8F05, + 0xAD92923F, 0xBC9D9D21, 0x48383870, 0x04F5F5F1, + 0xDFBCBC63, 0xC1B6B677, 0x75DADAAF, 0x63212142, + 
0x30101020, 0x1AFFFFE5, 0x0EF3F3FD, 0x6DD2D2BF, + 0x4CCDCD81, 0x140C0C18, 0x35131326, 0x2FECECC3, + 0xE15F5FBE, 0xA2979735, 0xCC444488, 0x3917172E, + 0x57C4C493, 0xF2A7A755, 0x827E7EFC, 0x473D3D7A, + 0xAC6464C8, 0xE75D5DBA, 0x2B191932, 0x957373E6, + 0xA06060C0, 0x98818119, 0xD14F4F9E, 0x7FDCDCA3, + 0x66222244, 0x7E2A2A54, 0xAB90903B, 0x8388880B, + 0xCA46468C, 0x29EEEEC7, 0xD3B8B86B, 0x3C141428, + 0x79DEDEA7, 0xE25E5EBC, 0x1D0B0B16, 0x76DBDBAD, + 0x3BE0E0DB, 0x56323264, 0x4E3A3A74, 0x1E0A0A14, + 0xDB494992, 0x0A06060C, 0x6C242448, 0xE45C5CB8, + 0x5DC2C29F, 0x6ED3D3BD, 0xEFACAC43, 0xA66262C4, + 0xA8919139, 0xA4959531, 0x37E4E4D3, 0x8B7979F2, + 0x32E7E7D5, 0x43C8C88B, 0x5937376E, 0xB76D6DDA, + 0x8C8D8D01, 0x64D5D5B1, 0xD24E4E9C, 0xE0A9A949, + 0xB46C6CD8, 0xFA5656AC, 0x07F4F4F3, 0x25EAEACF, + 0xAF6565CA, 0x8E7A7AF4, 0xE9AEAE47, 0x18080810, + 0xD5BABA6F, 0x887878F0, 0x6F25254A, 0x722E2E5C, + 0x241C1C38, 0xF1A6A657, 0xC7B4B473, 0x51C6C697, + 0x23E8E8CB, 0x7CDDDDA1, 0x9C7474E8, 0x211F1F3E, + 0xDD4B4B96, 0xDCBDBD61, 0x868B8B0D, 0x858A8A0F, + 0x907070E0, 0x423E3E7C, 0xC4B5B571, 0xAA6666CC, + 0xD8484890, 0x05030306, 0x01F6F6F7, 0x120E0E1C, + 0xA36161C2, 0x5F35356A, 0xF95757AE, 0xD0B9B969, + 0x91868617, 0x58C1C199, 0x271D1D3A, 0xB99E9E27, + 0x38E1E1D9, 0x13F8F8EB, 0xB398982B, 0x33111122, + 0xBB6969D2, 0x70D9D9A9, 0x898E8E07, 0xA7949433, + 0xB69B9B2D, 0x221E1E3C, 0x92878715, 0x20E9E9C9, + 0x49CECE87, 0xFF5555AA, 0x78282850, 0x7ADFDFA5, + 0x8F8C8C03, 0xF8A1A159, 0x80898909, 0x170D0D1A, + 0xDABFBF65, 0x31E6E6D7, 0xC6424284, 0xB86868D0, + 0xC3414182, 0xB0999929, 0x772D2D5A, 0x110F0F1E, + 0xCBB0B07B, 0xFC5454A8, 0xD6BBBB6D, 0x3A16162C }; -__constant__ __align__(64) uint32_t d_AES2[256] = { - AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B), - AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5), - AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B), - AESx(0xFEE719FE), AESx(0xD7B562D7), AESx(0xAB4DE6AB), AESx(0x76EC9A76), 
- AESx(0xCA8F45CA), AESx(0x821F9D82), AESx(0xC98940C9), AESx(0x7DFA877D), - AESx(0xFAEF15FA), AESx(0x59B2EB59), AESx(0x478EC947), AESx(0xF0FB0BF0), - AESx(0xAD41ECAD), AESx(0xD4B367D4), AESx(0xA25FFDA2), AESx(0xAF45EAAF), - AESx(0x9C23BF9C), AESx(0xA453F7A4), AESx(0x72E49672), AESx(0xC09B5BC0), - AESx(0xB775C2B7), AESx(0xFDE11CFD), AESx(0x933DAE93), AESx(0x264C6A26), - AESx(0x366C5A36), AESx(0x3F7E413F), AESx(0xF7F502F7), AESx(0xCC834FCC), - AESx(0x34685C34), AESx(0xA551F4A5), AESx(0xE5D134E5), AESx(0xF1F908F1), - AESx(0x71E29371), AESx(0xD8AB73D8), AESx(0x31625331), AESx(0x152A3F15), - AESx(0x04080C04), AESx(0xC79552C7), AESx(0x23466523), AESx(0xC39D5EC3), - AESx(0x18302818), AESx(0x9637A196), AESx(0x050A0F05), AESx(0x9A2FB59A), - AESx(0x070E0907), AESx(0x12243612), AESx(0x801B9B80), AESx(0xE2DF3DE2), - AESx(0xEBCD26EB), AESx(0x274E6927), AESx(0xB27FCDB2), AESx(0x75EA9F75), - AESx(0x09121B09), AESx(0x831D9E83), AESx(0x2C58742C), AESx(0x1A342E1A), - AESx(0x1B362D1B), AESx(0x6EDCB26E), AESx(0x5AB4EE5A), AESx(0xA05BFBA0), - AESx(0x52A4F652), AESx(0x3B764D3B), AESx(0xD6B761D6), AESx(0xB37DCEB3), - AESx(0x29527B29), AESx(0xE3DD3EE3), AESx(0x2F5E712F), AESx(0x84139784), - AESx(0x53A6F553), AESx(0xD1B968D1), AESx(0x00000000), AESx(0xEDC12CED), - AESx(0x20406020), AESx(0xFCE31FFC), AESx(0xB179C8B1), AESx(0x5BB6ED5B), - AESx(0x6AD4BE6A), AESx(0xCB8D46CB), AESx(0xBE67D9BE), AESx(0x39724B39), - AESx(0x4A94DE4A), AESx(0x4C98D44C), AESx(0x58B0E858), AESx(0xCF854ACF), - AESx(0xD0BB6BD0), AESx(0xEFC52AEF), AESx(0xAA4FE5AA), AESx(0xFBED16FB), - AESx(0x4386C543), AESx(0x4D9AD74D), AESx(0x33665533), AESx(0x85119485), - AESx(0x458ACF45), AESx(0xF9E910F9), AESx(0x02040602), AESx(0x7FFE817F), - AESx(0x50A0F050), AESx(0x3C78443C), AESx(0x9F25BA9F), AESx(0xA84BE3A8), - AESx(0x51A2F351), AESx(0xA35DFEA3), AESx(0x4080C040), AESx(0x8F058A8F), - AESx(0x923FAD92), AESx(0x9D21BC9D), AESx(0x38704838), AESx(0xF5F104F5), - AESx(0xBC63DFBC), AESx(0xB677C1B6), AESx(0xDAAF75DA), AESx(0x21426321), - 
AESx(0x10203010), AESx(0xFFE51AFF), AESx(0xF3FD0EF3), AESx(0xD2BF6DD2), - AESx(0xCD814CCD), AESx(0x0C18140C), AESx(0x13263513), AESx(0xECC32FEC), - AESx(0x5FBEE15F), AESx(0x9735A297), AESx(0x4488CC44), AESx(0x172E3917), - AESx(0xC49357C4), AESx(0xA755F2A7), AESx(0x7EFC827E), AESx(0x3D7A473D), - AESx(0x64C8AC64), AESx(0x5DBAE75D), AESx(0x19322B19), AESx(0x73E69573), - AESx(0x60C0A060), AESx(0x81199881), AESx(0x4F9ED14F), AESx(0xDCA37FDC), - AESx(0x22446622), AESx(0x2A547E2A), AESx(0x903BAB90), AESx(0x880B8388), - AESx(0x468CCA46), AESx(0xEEC729EE), AESx(0xB86BD3B8), AESx(0x14283C14), - AESx(0xDEA779DE), AESx(0x5EBCE25E), AESx(0x0B161D0B), AESx(0xDBAD76DB), - AESx(0xE0DB3BE0), AESx(0x32645632), AESx(0x3A744E3A), AESx(0x0A141E0A), - AESx(0x4992DB49), AESx(0x060C0A06), AESx(0x24486C24), AESx(0x5CB8E45C), - AESx(0xC29F5DC2), AESx(0xD3BD6ED3), AESx(0xAC43EFAC), AESx(0x62C4A662), - AESx(0x9139A891), AESx(0x9531A495), AESx(0xE4D337E4), AESx(0x79F28B79), - AESx(0xE7D532E7), AESx(0xC88B43C8), AESx(0x376E5937), AESx(0x6DDAB76D), - AESx(0x8D018C8D), AESx(0xD5B164D5), AESx(0x4E9CD24E), AESx(0xA949E0A9), - AESx(0x6CD8B46C), AESx(0x56ACFA56), AESx(0xF4F307F4), AESx(0xEACF25EA), - AESx(0x65CAAF65), AESx(0x7AF48E7A), AESx(0xAE47E9AE), AESx(0x08101808), - AESx(0xBA6FD5BA), AESx(0x78F08878), AESx(0x254A6F25), AESx(0x2E5C722E), - AESx(0x1C38241C), AESx(0xA657F1A6), AESx(0xB473C7B4), AESx(0xC69751C6), - AESx(0xE8CB23E8), AESx(0xDDA17CDD), AESx(0x74E89C74), AESx(0x1F3E211F), - AESx(0x4B96DD4B), AESx(0xBD61DCBD), AESx(0x8B0D868B), AESx(0x8A0F858A), - AESx(0x70E09070), AESx(0x3E7C423E), AESx(0xB571C4B5), AESx(0x66CCAA66), - AESx(0x4890D848), AESx(0x03060503), AESx(0xF6F701F6), AESx(0x0E1C120E), - AESx(0x61C2A361), AESx(0x356A5F35), AESx(0x57AEF957), AESx(0xB969D0B9), - AESx(0x86179186), AESx(0xC19958C1), AESx(0x1D3A271D), AESx(0x9E27B99E), - AESx(0xE1D938E1), AESx(0xF8EB13F8), AESx(0x982BB398), AESx(0x11223311), - AESx(0x69D2BB69), AESx(0xD9A970D9), AESx(0x8E07898E), AESx(0x9433A794), - 
AESx(0x9B2DB69B), AESx(0x1E3C221E), AESx(0x87159287), AESx(0xE9C920E9), - AESx(0xCE8749CE), AESx(0x55AAFF55), AESx(0x28507828), AESx(0xDFA57ADF), - AESx(0x8C038F8C), AESx(0xA159F8A1), AESx(0x89098089), AESx(0x0D1A170D), - AESx(0xBF65DABF), AESx(0xE6D731E6), AESx(0x4284C642), AESx(0x68D0B868), - AESx(0x4182C341), AESx(0x9929B099), AESx(0x2D5A772D), AESx(0x0F1E110F), - AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16) -}; - -__constant__ __align__(64) uint32_t d_AES3[256] = { - AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B), - AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5), - AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B), - AESx(0xE719FEFE), AESx(0xB562D7D7), AESx(0x4DE6ABAB), AESx(0xEC9A7676), - AESx(0x8F45CACA), AESx(0x1F9D8282), AESx(0x8940C9C9), AESx(0xFA877D7D), - AESx(0xEF15FAFA), AESx(0xB2EB5959), AESx(0x8EC94747), AESx(0xFB0BF0F0), - AESx(0x41ECADAD), AESx(0xB367D4D4), AESx(0x5FFDA2A2), AESx(0x45EAAFAF), - AESx(0x23BF9C9C), AESx(0x53F7A4A4), AESx(0xE4967272), AESx(0x9B5BC0C0), - AESx(0x75C2B7B7), AESx(0xE11CFDFD), AESx(0x3DAE9393), AESx(0x4C6A2626), - AESx(0x6C5A3636), AESx(0x7E413F3F), AESx(0xF502F7F7), AESx(0x834FCCCC), - AESx(0x685C3434), AESx(0x51F4A5A5), AESx(0xD134E5E5), AESx(0xF908F1F1), - AESx(0xE2937171), AESx(0xAB73D8D8), AESx(0x62533131), AESx(0x2A3F1515), - AESx(0x080C0404), AESx(0x9552C7C7), AESx(0x46652323), AESx(0x9D5EC3C3), - AESx(0x30281818), AESx(0x37A19696), AESx(0x0A0F0505), AESx(0x2FB59A9A), - AESx(0x0E090707), AESx(0x24361212), AESx(0x1B9B8080), AESx(0xDF3DE2E2), - AESx(0xCD26EBEB), AESx(0x4E692727), AESx(0x7FCDB2B2), AESx(0xEA9F7575), - AESx(0x121B0909), AESx(0x1D9E8383), AESx(0x58742C2C), AESx(0x342E1A1A), - AESx(0x362D1B1B), AESx(0xDCB26E6E), AESx(0xB4EE5A5A), AESx(0x5BFBA0A0), - AESx(0xA4F65252), AESx(0x764D3B3B), AESx(0xB761D6D6), AESx(0x7DCEB3B3), - AESx(0x527B2929), AESx(0xDD3EE3E3), AESx(0x5E712F2F), AESx(0x13978484), - AESx(0xA6F55353), 
AESx(0xB968D1D1), AESx(0x00000000), AESx(0xC12CEDED), - AESx(0x40602020), AESx(0xE31FFCFC), AESx(0x79C8B1B1), AESx(0xB6ED5B5B), - AESx(0xD4BE6A6A), AESx(0x8D46CBCB), AESx(0x67D9BEBE), AESx(0x724B3939), - AESx(0x94DE4A4A), AESx(0x98D44C4C), AESx(0xB0E85858), AESx(0x854ACFCF), - AESx(0xBB6BD0D0), AESx(0xC52AEFEF), AESx(0x4FE5AAAA), AESx(0xED16FBFB), - AESx(0x86C54343), AESx(0x9AD74D4D), AESx(0x66553333), AESx(0x11948585), - AESx(0x8ACF4545), AESx(0xE910F9F9), AESx(0x04060202), AESx(0xFE817F7F), - AESx(0xA0F05050), AESx(0x78443C3C), AESx(0x25BA9F9F), AESx(0x4BE3A8A8), - AESx(0xA2F35151), AESx(0x5DFEA3A3), AESx(0x80C04040), AESx(0x058A8F8F), - AESx(0x3FAD9292), AESx(0x21BC9D9D), AESx(0x70483838), AESx(0xF104F5F5), - AESx(0x63DFBCBC), AESx(0x77C1B6B6), AESx(0xAF75DADA), AESx(0x42632121), - AESx(0x20301010), AESx(0xE51AFFFF), AESx(0xFD0EF3F3), AESx(0xBF6DD2D2), - AESx(0x814CCDCD), AESx(0x18140C0C), AESx(0x26351313), AESx(0xC32FECEC), - AESx(0xBEE15F5F), AESx(0x35A29797), AESx(0x88CC4444), AESx(0x2E391717), - AESx(0x9357C4C4), AESx(0x55F2A7A7), AESx(0xFC827E7E), AESx(0x7A473D3D), - AESx(0xC8AC6464), AESx(0xBAE75D5D), AESx(0x322B1919), AESx(0xE6957373), - AESx(0xC0A06060), AESx(0x19988181), AESx(0x9ED14F4F), AESx(0xA37FDCDC), - AESx(0x44662222), AESx(0x547E2A2A), AESx(0x3BAB9090), AESx(0x0B838888), - AESx(0x8CCA4646), AESx(0xC729EEEE), AESx(0x6BD3B8B8), AESx(0x283C1414), - AESx(0xA779DEDE), AESx(0xBCE25E5E), AESx(0x161D0B0B), AESx(0xAD76DBDB), - AESx(0xDB3BE0E0), AESx(0x64563232), AESx(0x744E3A3A), AESx(0x141E0A0A), - AESx(0x92DB4949), AESx(0x0C0A0606), AESx(0x486C2424), AESx(0xB8E45C5C), - AESx(0x9F5DC2C2), AESx(0xBD6ED3D3), AESx(0x43EFACAC), AESx(0xC4A66262), - AESx(0x39A89191), AESx(0x31A49595), AESx(0xD337E4E4), AESx(0xF28B7979), - AESx(0xD532E7E7), AESx(0x8B43C8C8), AESx(0x6E593737), AESx(0xDAB76D6D), - AESx(0x018C8D8D), AESx(0xB164D5D5), AESx(0x9CD24E4E), AESx(0x49E0A9A9), - AESx(0xD8B46C6C), AESx(0xACFA5656), AESx(0xF307F4F4), AESx(0xCF25EAEA), - AESx(0xCAAF6565), 
AESx(0xF48E7A7A), AESx(0x47E9AEAE), AESx(0x10180808), - AESx(0x6FD5BABA), AESx(0xF0887878), AESx(0x4A6F2525), AESx(0x5C722E2E), - AESx(0x38241C1C), AESx(0x57F1A6A6), AESx(0x73C7B4B4), AESx(0x9751C6C6), - AESx(0xCB23E8E8), AESx(0xA17CDDDD), AESx(0xE89C7474), AESx(0x3E211F1F), - AESx(0x96DD4B4B), AESx(0x61DCBDBD), AESx(0x0D868B8B), AESx(0x0F858A8A), - AESx(0xE0907070), AESx(0x7C423E3E), AESx(0x71C4B5B5), AESx(0xCCAA6666), - AESx(0x90D84848), AESx(0x06050303), AESx(0xF701F6F6), AESx(0x1C120E0E), - AESx(0xC2A36161), AESx(0x6A5F3535), AESx(0xAEF95757), AESx(0x69D0B9B9), - AESx(0x17918686), AESx(0x9958C1C1), AESx(0x3A271D1D), AESx(0x27B99E9E), - AESx(0xD938E1E1), AESx(0xEB13F8F8), AESx(0x2BB39898), AESx(0x22331111), - AESx(0xD2BB6969), AESx(0xA970D9D9), AESx(0x07898E8E), AESx(0x33A79494), - AESx(0x2DB69B9B), AESx(0x3C221E1E), AESx(0x15928787), AESx(0xC920E9E9), - AESx(0x8749CECE), AESx(0xAAFF5555), AESx(0x50782828), AESx(0xA57ADFDF), - AESx(0x038F8C8C), AESx(0x59F8A1A1), AESx(0x09808989), AESx(0x1A170D0D), - AESx(0x65DABFBF), AESx(0xD731E6E6), AESx(0x84C64242), AESx(0xD0B86868), - AESx(0x82C34141), AESx(0x29B09999), AESx(0x5A772D2D), AESx(0x1E110F0F), - AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616) -}; - - __device__ __forceinline__ void aes_gpu_init(uint32_t *const sharedMemory) { /* each thread startup will fill a uint32 */ if (threadIdx.x < 256) { sharedMemory[threadIdx.x] = d_AES0[threadIdx.x]; - sharedMemory[threadIdx.x+256] = d_AES1[threadIdx.x]; - sharedMemory[threadIdx.x+512] = d_AES2[threadIdx.x]; - sharedMemory[threadIdx.x+768] = d_AES3[threadIdx.x]; + sharedMemory[threadIdx.x + 256] = ROL8(sharedMemory[threadIdx.x]); + sharedMemory[threadIdx.x + 512] = ROL16(sharedMemory[threadIdx.x]); + sharedMemory[threadIdx.x + 768] = ROL24(sharedMemory[threadIdx.x]); } } -/* tried with 3 xor.b32 asm, not faster */ -#define xor4_32(a,b,c,d) ((a ^ b) ^ (c ^ d)); +__device__ __forceinline__ +uint32_t bfe(uint32_t x, uint8_t bit, uint8_t numBits) +{ + 
uint32_t ret; + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"((uint32_t)bit), "r"((uint32_t)numBits)); + return ret; +} + +__device__ __forceinline__ +uint32_t bfi(uint32_t x, uint32_t a, uint32_t bit, uint32_t numBits) +{ + uint32_t ret; + asm("bfi.b32 %0, %1, %2, %3, %4;" : "=r"(ret) : "r"(x), "r"(a), "r"(bit), "r"(numBits)); + return ret; +} -__device__ +__device__ __forceinline__ static void aes_round( const uint32_t *const __restrict__ sharedMemory, const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, const uint32_t k0, - uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) + uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3 ) { - - y0 = xor4_32( - sharedMemory[__byte_perm(x0, 0, 0x4440)], - sharedMemory[__byte_perm(x1, 0, 0x4441) + 256], - sharedMemory[__byte_perm(x2, 0, 0x4442) + 512], - sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]); - - y1 = xor4_32( - sharedMemory[__byte_perm(x1, 0, 0x4440)], - sharedMemory[__byte_perm(x2, 0, 0x4441) + 256], - sharedMemory[__byte_perm(x3, 0, 0x4442) + 512], - sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]); - - y2 = xor4_32( - sharedMemory[__byte_perm(x2, 0, 0x4440)], - sharedMemory[__byte_perm(x3, 0, 0x4441) + 256], - sharedMemory[__byte_perm(x0, 0, 0x4442) + 512], - sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]); // ^k2 - - y0 ^= k0; - - y3 = xor4_32( - sharedMemory[__byte_perm(x3, 0, 0x4440)], - sharedMemory[__byte_perm(x0, 0, 0x4441) + 256], - sharedMemory[__byte_perm(x1, 0, 0x4442) + 512], - sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); // ^k3 + const uint32_t a0 = (uint32_t) &sharedMemory[0]; + y0 = *(uint32_t *)(bfi(x0, a0, 2, 8)) + ^ sharedMemory[bfe(x1, 8, 8) + 256] + ^ sharedMemory[bfe(x2, 16, 8) + 512] + ^ sharedMemory[bfe(x3, 24, 8) + 768] ^ k0; + + y1 = *(uint32_t *)(bfi(x1, a0, 2, 8)) + ^sharedMemory[bfe(x2, 8, 8) + 256] + ^sharedMemory[bfe(x3, 16, 8) + 512] + ^ sharedMemory[bfe(x0, 24, 8) + 768]; + + y2 = *(uint32_t *)(bfi(x2, a0, 2, 8)) + ^sharedMemory[bfe(x3, 
8, 8) + 256] + ^sharedMemory[bfe(x0, 16, 8) + 512] + ^ sharedMemory[bfe(x1, 24, 8) + 768]; + + y3 = *(uint32_t *)(bfi(x3, a0, 2, 8)) + ^ sharedMemory[bfe(x0, 8, 8) + 256] + ^ sharedMemory[bfe(x1, 16, 8) + 512] + ^ sharedMemory[bfe(x2, 24, 8) + 768]; } -__device__ +__device__ __forceinline__ static void aes_round( const uint32_t *const __restrict__ sharedMemory, const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) { - y0 = xor4_32( - sharedMemory[__byte_perm(x0, 0, 0x4440)], - sharedMemory[__byte_perm(x1, 0, 0x4441) + 256], - sharedMemory[__byte_perm(x2, 0, 0x4442) + 512], - sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]); - - y1 = xor4_32( - sharedMemory[__byte_perm(x1, 0, 0x4440)], - sharedMemory[__byte_perm(x2, 0, 0x4441) + 256], - sharedMemory[__byte_perm(x3, 0, 0x4442) + 512], - sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]); - - y2 = xor4_32( - sharedMemory[__byte_perm(x2, 0, 0x4440)], - sharedMemory[__byte_perm(x3, 0, 0x4441) + 256], - sharedMemory[__byte_perm(x0, 0, 0x4442) + 512], - sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]); // ^k2 - y3 = xor4_32( - sharedMemory[__byte_perm(x3, 0, 0x4440)], - sharedMemory[__byte_perm(x0, 0, 0x4441) + 256], - sharedMemory[__byte_perm(x1, 0, 0x4442) + 512], - sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); // ^k3 + const uint32_t a0 = (uint32_t)&sharedMemory[0]; + y0 = *(uint32_t *)(bfi(x0, a0, 2, 8)) + ^ sharedMemory[bfe(x1, 8, 8) + 256] + ^ sharedMemory[bfe(x2, 16, 8) + 512] + ^ sharedMemory[__byte_perm(x3, 0, 0x4443)+ 768]; + + + y1 = *(uint32_t *)(bfi(x1, a0, 2, 8)) + ^ sharedMemory[bfe(x2, 8, 8) + 256] + ^ sharedMemory[bfe(x3, 16, 8) + 512] + ^ sharedMemory[bfe(x0, 24, 8) + 768]; + + y2 = *(uint32_t *)(bfi(x2, a0, 2, 8)) + ^ sharedMemory[bfe(x3, 8, 8) + 256] + ^ sharedMemory[bfe(x0, 16, 8) + 512] + ^ sharedMemory[bfe(x1, 24, 8) + 768]; + + y3 = *(uint32_t *)(bfi(x3, a0, 2, 8)) + ^ sharedMemory[bfe(x0, 8, 8) + 256] + ^ 
sharedMemory[bfe(x1, 16, 8) + 512] + ^ sharedMemory[bfe(x2, 24, 8) + 768]; } diff --git a/x11/cuda_x11_cubehash512.cu b/x11/cuda_x11_cubehash512.cu index dad5a6b511..b5e0552601 100644 --- a/x11/cuda_x11_cubehash512.cu +++ b/x11/cuda_x11_cubehash512.cu @@ -1,296 +1,307 @@ #include "cuda_helper.h" -typedef unsigned char BitSequence; - -#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */ -#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */ - -#if __CUDA_ARCH__ < 350 -#define LROT(x,bits) ((x << bits) | (x >> (32 - bits))) +#define ROUND_EVEN \ + xg = (x0 + xg); \ + x0 = ROTL32(x0, 7); \ + xh = (x1 + xh); \ + x1 = ROTL32(x1, 7); \ + xi = (x2 + xi); \ + x2 = ROTL32(x2, 7); \ + xj = (x3 + xj); \ + x3 = ROTL32(x3, 7); \ + xk = (x4 + xk); \ + x4 = ROTL32(x4, 7); \ + xl = (x5 + xl); \ + x5 = ROTL32(x5, 7); \ + xm = (x6 + xm); \ + x6 = ROTL32(x6, 7); \ + xn = (x7 + xn); \ + x7 = ROTL32(x7, 7); \ + xo = (x8 + xo); \ + x8 = ROTL32(x8, 7); \ + xp = (x9 + xp); \ + x9 = ROTL32(x9, 7); \ + xq = (xa + xq); \ + xa = ROTL32(xa, 7); \ + xr = (xb + xr); \ + xb = ROTL32(xb, 7); \ + xs = (xc + xs); \ + xc = ROTL32(xc, 7); \ + xt = (xd + xt); \ + xd = ROTL32(xd, 7); \ + xu = (xe + xu); \ + xe = ROTL32(xe, 7); \ + xv = (xf + xv); \ + xf = ROTL32(xf, 7); \ + x8 ^= xg; \ + x9 ^= xh; \ + xa ^= xi; \ + xb ^= xj; \ + xc ^= xk; \ + xd ^= xl; \ + xe ^= xm; \ + xf ^= xn; \ + x0 ^= xo; \ + x1 ^= xp; \ + x2 ^= xq; \ + x3 ^= xr; \ + x4 ^= xs; \ + x5 ^= xt; \ + x6 ^= xu; \ + x7 ^= xv; \ + xi = (x8 + xi); \ + x8 = ROTL32(x8, 11); \ + xj = (x9 + xj); \ + x9 = ROTL32(x9, 11); \ + xg = (xa + xg); \ + xa = ROTL32(xa, 11); \ + xh = (xb + xh); \ + xb = ROTL32(xb, 11); \ + xm = (xc + xm); \ + xc = ROTL32(xc, 11); \ + xn = (xd + xn); \ + xd = ROTL32(xd, 11); \ + xk = (xe + xk); \ + xe = ROTL32(xe, 11); \ + xl = (xf + xl); \ + xf = ROTL32(xf, 11); \ + xq = (x0 + xq); \ + x0 = ROTL32(x0, 11); \ + xr = (x1 + xr); \ + x1 = ROTL32(x1, 11); \ + xo = (x2 + xo); \ + x2 = ROTL32(x2, 11); \ + xp = (x3 
+ xp); \ + x3 = ROTL32(x3, 11); \ + xu = (x4 + xu); \ + x4 = ROTL32(x4, 11); \ + xv = (x5 + xv); \ + x5 = ROTL32(x5, 11); \ + xs = (x6 + xs); \ + x6 = ROTL32(x6, 11); \ + xt = (x7 + xt); \ + x7 = ROTL32(x7, 11); \ + xc ^= xi; \ + xd ^= xj; \ + xe ^= xg; \ + xf ^= xh; \ + x8 ^= xm; \ + x9 ^= xn; \ + xa ^= xk; \ + xb ^= xl; \ + x4 ^= xq; \ + x5 ^= xr; \ + x6 ^= xo; \ + x7 ^= xp; \ + x0 ^= xu; \ + x1 ^= xv; \ + x2 ^= xs; \ + x3 ^= xt; + +#define ROUND_ODD \ + xj = (xc + xj); \ + xc = ROTL32(xc, 7); \ + xi = (xd + xi); \ + xd = ROTL32(xd, 7); \ + xh = (xe + xh); \ + xe = ROTL32(xe, 7); \ + xg = (xf + xg); \ + xf = ROTL32(xf, 7); \ + xn = (x8 + xn); \ + x8 = ROTL32(x8, 7); \ + xm = (x9 + xm); \ + x9 = ROTL32(x9, 7); \ + xl = (xa + xl); \ + xa = ROTL32(xa, 7); \ + xk = (xb + xk); \ + xb = ROTL32(xb, 7); \ + xr = (x4 + xr); \ + x4 = ROTL32(x4, 7); \ + xq = (x5 + xq); \ + x5 = ROTL32(x5, 7); \ + xp = (x6 + xp); \ + x6 = ROTL32(x6, 7); \ + xo = (x7 + xo); \ + x7 = ROTL32(x7, 7); \ + xv = (x0 + xv); \ + x0 = ROTL32(x0, 7); \ + xu = (x1 + xu); \ + x1 = ROTL32(x1, 7); \ + xt = (x2 + xt); \ + x2 = ROTL32(x2, 7); \ + xs = (x3 + xs); \ + x3 = ROTL32(x3, 7); \ + x4 ^= xj; \ + x5 ^= xi; \ + x6 ^= xh; \ + x7 ^= xg; \ + x0 ^= xn; \ + x1 ^= xm; \ + x2 ^= xl; \ + x3 ^= xk; \ + xc ^= xr; \ + xd ^= xq; \ + xe ^= xp; \ + xf ^= xo; \ + x8 ^= xv; \ + x9 ^= xu; \ + xa ^= xt; \ + xb ^= xs; \ + xh = (x4 + xh); \ + x4 = ROTL32(x4, 11); \ + xg = (x5 + xg); \ + x5 = ROTL32(x5, 11); \ + xj = (x6 + xj); \ + x6 = ROTL32(x6, 11); \ + xi = (x7 + xi); \ + x7 = ROTL32(x7, 11); \ + xl = (x0 + xl); \ + x0 = ROTL32(x0, 11); \ + xk = (x1 + xk); \ + x1 = ROTL32(x1, 11); \ + xn = (x2 + xn); \ + x2 = ROTL32(x2, 11); \ + xm = (x3 + xm); \ + x3 = ROTL32(x3, 11); \ + xp = (xc + xp); \ + xc = ROTL32(xc, 11); \ + xo = (xd + xo); \ + xd = ROTL32(xd, 11); \ + xr = (xe + xr); \ + xe = ROTL32(xe, 11); \ + xq = (xf + xq); \ + xf = ROTL32(xf, 11); \ + xt = (x8 + xt); \ + x8 = ROTL32(x8, 11); \ + xs = (x9 + xs); \ + x9 = 
ROTL32(x9, 11); \ + xv = (xa + xv); \ + xa = ROTL32(xa, 11); \ + xu = (xb + xu); \ + xb = ROTL32(xb, 11); \ + x0 ^= xh; \ + x1 ^= xg; \ + x2 ^= xj; \ + x3 ^= xi; \ + x4 ^= xl; \ + x5 ^= xk; \ + x6 ^= xn; \ + x7 ^= xm; \ + x8 ^= xp; \ + x9 ^= xo; \ + xa ^= xr; \ + xb ^= xq; \ + xc ^= xt; \ + xd ^= xs; \ + xe ^= xv; \ + xf ^= xu; + +#define SIXTEEN_ROUNDS \ + for (int j = 0; j < 8; j ++) { \ + ROUND_EVEN; \ + ROUND_ODD;} +__global__ +#if __CUDA_ARCH__ > 500 +__launch_bounds__(512, 2) #else -#define LROT(x, bits) __funnelshift_l(x, x, bits) +__launch_bounds__(256, 5) #endif - -#define ROTATEUPWARDS7(a) LROT(a,7) -#define ROTATEUPWARDS11(a) LROT(a,11) - -#define SWAP(a,b) { uint32_t u = a; a = b; b = u; } - -__device__ __constant__ -static const uint32_t c_IV_512[32] = { - 0x2AEA2A61, 0x50F494D4, 0x2D538B8B, - 0x4167D83E, 0x3FEE2313, 0xC701CF8C, - 0xCC39968E, 0x50AC5695, 0x4D42C787, - 0xA647A8B3, 0x97CF0BEF, 0x825B4537, - 0xEEF864D2, 0xF22090C4, 0xD0E5CD33, - 0xA23911AE, 0xFCD398D9, 0x148FE485, - 0x1B017BEF, 0xB6444532, 0x6A536159, - 0x2FF5781C, 0x91FA7934, 0x0DBADEA9, - 0xD65C8A2B, 0xA5A70E75, 0xB1C62456, - 0xBC796576, 0x1921C8F7, 0xE7989AF1, - 0x7795D246, 0xD43E3B44 -}; - -static __device__ __forceinline__ void rrounds(uint32_t x[2][2][2][2][2]) -{ - int r; - int j; - int k; - int l; - int m; - -//#pragma unroll 16 - for (r = 0;r < CUBEHASH_ROUNDS;++r) { - - /* "add x_0jklm into x_1jklmn modulo 2^32" */ -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - x[1][j][k][l][m] += x[0][j][k][l][m]; - - /* "rotate x_0jklm upwards by 7 bits" */ -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]); - - /* "swap x_00klm with x_01klm" */ -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 
- for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - SWAP(x[0][0][k][l][m],x[0][1][k][l][m]) - - /* "xor x_1jklm into x_0jklm" */ -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - x[0][j][k][l][m] ^= x[1][j][k][l][m]; - - /* "swap x_1jk0m with x_1jk1m" */ -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (m = 0;m < 2;++m) - SWAP(x[1][j][k][0][m],x[1][j][k][1][m]) - - /* "add x_0jklm into x_1jklm modulo 2^32" */ -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - x[1][j][k][l][m] += x[0][j][k][l][m]; - - /* "rotate x_0jklm upwards by 11 bits" */ -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]); - - /* "swap x_0j0lm with x_0j1lm" */ -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - SWAP(x[0][j][0][l][m],x[0][j][1][l][m]) - - /* "xor x_1jklm into x_0jklm" */ -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - x[0][j][k][l][m] ^= x[1][j][k][l][m]; - - /* "swap x_1jkl0 with x_1jkl1" */ -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) - SWAP(x[1][j][k][l][0],x[1][j][k][l][1]) - - } -} - - -static __device__ __forceinline__ void block_tox(uint32_t block[16], uint32_t x[2][2][2][2][2]) -{ - int k; - int l; - int m; - uint32_t *in = block; - -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) 
-#pragma unroll 2 - for (m = 0;m < 2;++m) - x[0][0][k][l][m] ^= *in++; -} - -static __device__ __forceinline__ void hash_fromx(uint32_t hash[16], uint32_t x[2][2][2][2][2]) -{ - int j; - int k; - int l; - int m; - uint32_t *out = hash; - -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - *out++ = x[0][j][k][l][m]; -} - -void __device__ __forceinline__ Init(uint32_t x[2][2][2][2][2]) +void x11_cubehash512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_hash) { - int i,j,k,l,m; -#if 0 - /* "the first three state words x_00000, x_00001, x_00010" */ - /* "are set to the integers h/8, b, r respectively." */ - /* "the remaining state words are set to 0." */ -#pragma unroll 2 - for (i = 0;i < 2;++i) -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - x[i][j][k][l][m] = 0; - x[0][0][0][0][0] = 512/8; - x[0][0][0][0][1] = CUBEHASH_BLOCKBYTES; - x[0][0][0][1][0] = CUBEHASH_ROUNDS; - - /* "the state is then transformed invertibly through 10r identical rounds */ - for (i = 0;i < 10;++i) rrounds(x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if(thread < threads) + { + uint32_t *Hash = &g_hash[16 * thread]; + + uint32_t x0 = 0x2AEA2A61 ^ Hash[0]; + uint32_t x1 = 0x50F494D4 ^ Hash[1]; + uint32_t x2 = 0x2D538B8B ^ Hash[2]; + uint32_t x3 = 0x4167D83E ^ Hash[3]; + uint32_t x4 = 0x3FEE2313 ^ Hash[4]; + uint32_t x5 = 0xC701CF8C ^ Hash[5]; + uint32_t x6 = 0xCC39968E ^ Hash[6]; + uint32_t x7 = 0x50AC5695 ^ Hash[7]; + uint32_t x8 = 0x4D42C787, x9 = 0xA647A8B3, xa = 0x97CF0BEF, xb = 0x825B4537; + uint32_t xc = 0xEEF864D2, xd = 0xF22090C4, xe = 0xD0E5CD33, xf = 0xA23911AE; + uint32_t xg = 0xFCD398D9, xh = 0x148FE485, xi = 0x1B017BEF, xj = 0xB6444532; + uint32_t xk = 0x6A536159, xl = 0x2FF5781C, xm = 0x91FA7934, xn 
= 0x0DBADEA9; + uint32_t xo = 0xD65C8A2B, xp = 0xA5A70E75, xq = 0xB1C62456, xr = 0xBC796576; + uint32_t xs = 0x1921C8F7, xt = 0xE7989AF1, xu = 0x7795D246, xv = 0xD43E3B44; + +#if __CUDA_ARCH__ > 500 + #pragma unroll + for (int j = 0; j < 8; j++) #else - const uint32_t *iv = c_IV_512; - -#pragma unroll 2 - for (i = 0;i < 2;++i) -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - x[i][j][k][l][m] = *iv++; + #pragma unroll 1 + for (int j = 0; j < 8; j++) #endif + { + ROUND_EVEN; + ROUND_ODD; + } + + x0 ^= (Hash[8]); + x1 ^= (Hash[9]); + x2 ^= (Hash[10]); + x3 ^= (Hash[11]); + x4 ^= (Hash[12]); + x5 ^= (Hash[13]); + x6 ^= (Hash[14]); + x7 ^= (Hash[15]); +#if __CUDA_ARCH__ > 500 + #pragma unroll + for (int j = 0; j < 8; j++) +#else + for (int j = 0; j < 8; j++) +#endif + { + ROUND_EVEN; + ROUND_ODD; + } + x0 ^= 0x80; + + for (int j = 0; j < 8; j++) + { + ROUND_EVEN; + ROUND_ODD; + } + xv ^= 1; + + for(int i = 3; i < 13; i++) + { + for (int j = 0; j < 8; j++) + { + ROUND_EVEN; + ROUND_ODD; + } + } + + Hash[0] = x0; + Hash[1] = x1; + Hash[2] = x2; + Hash[3] = x3; + Hash[4] = x4; + Hash[5] = x5; + Hash[6] = x6; + Hash[7] = x7; + Hash[8] = x8; + Hash[9] = x9; + Hash[10] = xa; + Hash[11] = xb; + Hash[12] = xc; + Hash[13] = xd; + Hash[14] = xe; + Hash[15] = xf; + } } - -void __device__ __forceinline__ Update32(uint32_t x[2][2][2][2][2], const BitSequence *data) -{ - /* "xor the block into the first b bytes of the state" */ - /* "and then transform the state invertibly through r identical rounds" */ - block_tox((uint32_t*)data, x); - rrounds(x); -} - -void __device__ __forceinline__ Final(uint32_t x[2][2][2][2][2], BitSequence *hashval) -{ - int i; - - /* "the integer 1 is xored into the last state word x_11111" */ - x[1][1][1][1][1] ^= 1; - - /* "the state is then transformed invertibly through 10r identical rounds" */ -#pragma unroll 10 - for (i = 0;i < 
10;++i) rrounds(x); - - /* "output the first h/8 bytes of the state" */ - hash_fromx((uint32_t*)hashval, x); -} - - -/***************************************************/ -// Die Hash-Funktion -__global__ __launch_bounds__(256, 4) -void x11_cubehash512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; - - uint32_t x[2][2][2][2][2]; - Init(x); - - // erste Hälfte des Hashes (32 bytes) - Update32(x, (const BitSequence*)Hash); - - // zweite Hälfte des Hashes (32 bytes) - Update32(x, (const BitSequence*)(Hash+8)); - - // Padding Block - uint32_t last[8]; - last[0] = 0x80; -#pragma unroll 7 - for (int i=1; i < 8; i++) last[i] = 0; - Update32(x, (const BitSequence*)last); - - Final(x, (BitSequence*)Hash); - } -} - - __host__ -void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) { const uint32_t threadsperblock = 256; - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); - x11_cubehash512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x11_cubehash512_gpu_hash_64 <<> >(threads, startNounce, d_hash); } - diff --git a/x11/cuda_x11_echo.cu b/x11/cuda_x11_echo.cu index c26cffd427..ddfc91b789 100644 --- a/x11/cuda_x11_echo.cu +++ b/x11/cuda_x11_echo.cu @@ -2,105 +2,147 @@ #include #include "cuda_helper.h" 
+#include "cuda_vector.h" -#include "cuda_x11_aes.cu" -static uint2 *d_nonce[MAX_GPUS]; +// #ifdef NOASM +#include "cuda_x11_aes_noasm.cu" +// #else +// #include "cuda_x11_aes.cu" +// #endif + static uint32_t *d_found[MAX_GPUS]; +__constant__ uint32_t P[48] = { + 0xe7e9f5f5, + 0xf5e7e9f5, + 0xb3b36b23, + 0xb3dbe7af, + + 0xa4213d7e, + 0xf5e7e9f5, + 0xb3b36b23, + 0xb3dbe7af, + //8-12 + 0x01425eb8, + 0xf5e7e9f5, + 0xb3b36b23, + 0xb3dbe7af, + + 0x65978b09, + 0xf5e7e9f5, + 0xb3b36b23, + 0xb3dbe7af, + + //21-25 + 0x2cb6b661, + 0x6b23b3b3, + 0xcf93a7cf, + 0x9d9d3751, + + 0x9ac2dea3, + 0xf5e7e9f5, + 0xb3b36b23, + 0xb3dbe7af, + + //34-38 + 0x579f9f33, + 0xfbfbfbfb, + 0xfbfbfbfb, + 0xefefd3c7, + + 0xdbfde1dd, + 0xf5e7e9f5, + 0xb3b36b23, + 0xb3dbe7af, + + 0x34514d9e, + 0xf5e7e9f5, + 0xb3b36b23, + 0xb3dbe7af, + + + 0xb134347e, + 0xea6f7e7e, + 0xbd7731bd, + 0x8a8a1968, + + 0x14b8a457, + 0xf5e7e9f5, + 0xb3b36b23, + 0xb3dbe7af, + + 0x265f4382, + 0xf5e7e9f5, + 0xb3b36b23, + 0xb3dbe7af + //58-61 +}; + __device__ __forceinline__ void AES_2ROUND( const uint32_t*const __restrict__ sharedMemory, uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, const uint32_t k0) { - aes_round(sharedMemory, - x0, x1, x2, x3, - k0, - x0, x1, x2, x3); - - aes_round(sharedMemory, - x0, x1, x2, x3, - x0, x1, x2, x3); - - + uint32_t y0 = + sharedMemory[__byte_perm(x0, 0, 0x4440)] ^ + sharedMemory[__byte_perm(x1, 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(x2, 0, 0x4442) + 512] ^ + sharedMemory[__byte_perm(x3, 0, 0x4443) + 768] ^ k0; + + uint32_t y1 = + sharedMemory[__byte_perm(x1, 0, 0x4440)] ^ + sharedMemory[__byte_perm(x2, 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(x3, 0, 0x4442) + 512] ^ + sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]; + + uint32_t y2 = + sharedMemory[__byte_perm(x2, 0, 0x4440)] ^ + sharedMemory[__byte_perm(x3, 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(x0, 0, 0x4442) + 512] ^ + sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]; + + uint32_t y3 = + 
sharedMemory[__byte_perm(x3, 0, 0x4440)] ^ + sharedMemory[__byte_perm(x0, 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(x1, 0, 0x4442) + 512] ^ + sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]; + + x0 = + sharedMemory[__byte_perm(y0, 0, 0x4440)] ^ + sharedMemory[__byte_perm(y1, 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(y2, 0, 0x4442) + 512] ^ + sharedMemory[__byte_perm(y3, 0, 0x4443) + 768]; + + x1 = + sharedMemory[__byte_perm(y1, 0, 0x4440)] ^ + sharedMemory[__byte_perm(y2, 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(y3, 0, 0x4442) + 512] ^ + sharedMemory[__byte_perm(y0, 0, 0x4443) + 768]; + + x2 = + sharedMemory[__byte_perm(y2, 0, 0x4440)] ^ + sharedMemory[__byte_perm(y3, 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(y0, 0, 0x4442) + 512] ^ + sharedMemory[__byte_perm(y1, 0, 0x4443) + 768]; + + x3 = + sharedMemory[__byte_perm(y3, 0, 0x4440)] ^ + sharedMemory[__byte_perm(y0, 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(y1, 0, 0x4442) + 512] ^ + sharedMemory[__byte_perm(y2, 0, 0x4443) + 768]; } __device__ __forceinline__ void cuda_echo_round( const uint32_t *const __restrict__ sharedMemory, uint32_t *const __restrict__ hash) { - uint32_t h[16]; - const uint32_t P[48] = { - 0xe7e9f5f5, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - 0xa4213d7e, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - //8-12 - 0x01425eb8, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - 0x65978b09, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - //21-25 - 0x2cb6b661, - 0x6b23b3b3, - 0xcf93a7cf, - 0x9d9d3751, - - 0x9ac2dea3, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - //34-38 - 0x579f9f33, - 0xfbfbfbfb, - 0xfbfbfbfb, - 0xefefd3c7, - - 0xdbfde1dd, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - 0x34514d9e, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - - 0xb134347e, - 0xea6f7e7e, - 0xbd7731bd, - 0x8a8a1968, - - 0x14b8a457, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - 0x265f4382, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af - //58-61 - }; uint32_t k0; -#pragma unroll - for (int i = 0; i < 16; i++) - 
{ - h[i] = hash[i]; - } + uint32_t h[16]; + uint28 *phash = (uint28*)hash; + uint28 *outpt = (uint28*)h; + outpt[0] = phash[0]; + outpt[1] = phash[1]; k0 = 512 + 8; @@ -283,9 +325,9 @@ __device__ __forceinline__ void cuda_echo_round( t2 = (bc & 0x80808080); t3 = (cd & 0x80808080); - uint32_t abx = (t >> 7) * 27 ^ ((ab^t) << 1); - uint32_t bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); - uint32_t cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); + uint32_t abx = ((t >> 7) * 27 ^ ((ab^t) << 1)); + uint32_t bcx = ((t2 >> 7) * 27 ^ ((bc^t2) << 1)); + uint32_t cdx = ((t3 >> 7) * 27 ^ ((cd^t3) << 1)); W[idx + i] = abx ^ bc ^ d; W[idx + i + 4] = bcx ^ a ^ cd; @@ -309,13 +351,12 @@ __device__ __forceinline__ void cuda_echo_round( hash[i] ^= W[i]; } - - +/* __device__ __forceinline__ -void echo_gpu_init(uint32_t *const __restrict__ sharedMemory) +void echo_gpu_init_128(uint32_t *const __restrict__ sharedMemory) { - /* each thread startup will fill a uint32 */ - if (threadIdx.x < 128) { + if (threadIdx.x < 128) + { sharedMemory[threadIdx.x] = d_AES0[threadIdx.x]; sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x]; sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x]; @@ -327,21 +368,34 @@ void echo_gpu_init(uint32_t *const __restrict__ sharedMemory) sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2]; } } +*/ -__global__ __launch_bounds__(128) -void x11_echo512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector) +__device__ __forceinline__ +void echo_gpu_init(uint32_t *const __restrict__ sharedMemory) { - __shared__ uint32_t sharedMemory[1024]; + /* each thread startup will fill a uint32 */ + if (threadIdx.x < 256) { + sharedMemory[threadIdx.x] = d_AES0[threadIdx.x]; + sharedMemory[threadIdx.x + 256] = ROL8(sharedMemory[threadIdx.x]); + sharedMemory[threadIdx.x + 512] = ROL16(sharedMemory[threadIdx.x]); + sharedMemory[threadIdx.x + 768] = ROL24(sharedMemory[threadIdx.x]); + } +} - 
echo_gpu_init(sharedMemory); +__global__ __launch_bounds__(256, 3) +void x11_echo512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ g_hash) +{ + __shared__ __align__(128) uint32_t sharedMemory[1024]; - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + echo_gpu_init(sharedMemory); + __syncthreads(); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3]; + const uint32_t nounce = (startNounce + thread); + const uint32_t hashPosition = nounce - startNounce; + uint32_t *const Hash = &g_hash[hashPosition<<4]; cuda_echo_round(sharedMemory, Hash); } } @@ -349,114 +403,47 @@ void x11_echo512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *c // Setup-Funktionen __host__ void x11_echo512_cpu_init(int thr_id, uint32_t threads) { - cudaMalloc(&d_nonce[thr_id], sizeof(uint2)); - CUDA_SAFE_CALL(cudaMalloc(&(d_found[thr_id]), 4 * sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMalloc(&(d_found[thr_id]), 2 * sizeof(uint32_t))); } -__host__ void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) { - const uint32_t threadsperblock = 128; + const uint32_t threadsperblock = 256; // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x11_echo512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x11_echo512_gpu_hash_64<<>>(threads, startNounce, d_hash); //MyStreamSynchronize(NULL, order, thr_id); } __host__ void x11_echo512_cpu_free(int32_t thr_id) { - cudaFreeHost(&d_nonce[thr_id]); } 
-__global__ __launch_bounds__(128) -void x11_echo512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint64_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector, uint32_t *const __restrict__ d_found, uint32_t target) +__global__ __launch_bounds__(256, 3) +void x11_echo512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, const uint64_t *const __restrict__ g_hash, uint32_t *const __restrict__ d_found, uint32_t target) { - __shared__ uint32_t sharedMemory[1024]; - echo_gpu_init(sharedMemory); - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[hashPosition *8]; + __shared__ __align__(128) uint32_t sharedMemory[1024]; + echo_gpu_init(sharedMemory); + __syncthreads(); + const uint32_t nounce = (startNounce + thread); + + const uint32_t hashPosition = nounce - startNounce; + const uint32_t *const Hash = (uint32_t*)&g_hash[hashPosition *8]; uint32_t h[16]; - const uint32_t P[48] = { - 0xe7e9f5f5, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - 0xa4213d7e, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - //8-12 - 0x01425eb8, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - 0x65978b09, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - //21-25 - 0x2cb6b661, - 0x6b23b3b3, - 0xcf93a7cf, - 0x9d9d3751, - - 0x9ac2dea3, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - //34-38 - 0x579f9f33, - 0xfbfbfbfb, - 0xfbfbfbfb, - 0xefefd3c7, - - 0xdbfde1dd, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - 0x34514d9e, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - - 0xb134347e, - 0xea6f7e7e, - 0xbd7731bd, - 0x8a8a1968, - - 0x14b8a457, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - 0x265f4382, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af - //58-61 - }; - - -#pragma unroll 16 - 
for (int i = 0; i < 16; i++) - { - h[i] = Hash[i]; - } + uint28 *phash = (uint28*)Hash; + uint28 *outpt = (uint28*)h; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + uint32_t backup = h[7]; AES_2ROUND(sharedMemory, @@ -470,101 +457,81 @@ void x11_echo512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint6 uint32_t W[64]; -//#pragma unroll 4 + #pragma unroll 4 for (int i = 0; i < 4; i++) { - uint32_t a = P[i]; - uint32_t b = P[i + 4]; - uint32_t c = h[i + 8]; - uint32_t d = P[i + 8]; - - uint32_t ab = a ^ b; - uint32_t bc = b ^ c; - uint32_t cd = c ^ d; - - - uint32_t t = (ab & 0x80808080); - uint32_t t2 = (bc & 0x80808080); - uint32_t t3 = (cd & 0x80808080); - - uint32_t abx = (t >> 7) * 27 ^ ((ab^t) << 1); - uint32_t bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); - uint32_t cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); + const uint32_t a = P[i]; + const uint32_t a2 = P[12 + i]; + const uint32_t a3 = h[i]; + const uint32_t a4 = P[36 + i]; + const uint32_t b = P[i + 4]; + const uint32_t b2 = h[i + 4]; + const uint32_t b3 = P[24 + i + 0]; + const uint32_t b4 = P[36 + i + 4]; + const uint32_t c = h[i + 8]; + const uint32_t c2 = P[12 + i + 4]; + const uint32_t c3 = P[24 + i + 4]; + const uint32_t c4 = P[36 + i + 8]; + const uint32_t d = P[i + 8]; + const uint32_t d2 = P[12 + i + 8]; + const uint32_t d3 = P[24 + i + 8]; + const uint32_t d4 = h[i + 12]; + + const uint32_t ab = a ^ b; + const uint32_t ab2 = a2 ^ b2; + const uint32_t ab3 = a3 ^ b3; + const uint32_t ab4 = a4 ^ b4; + const uint32_t bc = b ^ c; + const uint32_t bc2 = b2 ^ c2; + const uint32_t bc3 = b3 ^ c3; + const uint32_t bc4 = b4 ^ c4; + const uint32_t cd = c ^ d; + const uint32_t cd2 = c2 ^ d2; + const uint32_t cd3 = c3 ^ d3; + const uint32_t cd4 = c4 ^ d4; + + const uint32_t t = (ab & 0x80808080); + const uint32_t ta2 = (ab2 & 0x80808080); + const uint32_t ta3 = (ab3 & 0x80808080); + const uint32_t t4 = (ab4 & 0x80808080); + const uint32_t t2 = (bc & 0x80808080); + const uint32_t t22 = (bc2 & 0x80808080); + 
const uint32_t t23 = (bc3 & 0x80808080); + const uint32_t t24 = (bc4 & 0x80808080); + const uint32_t t3 = (cd & 0x80808080); + const uint32_t t32 = (cd2 & 0x80808080); + const uint32_t t33 = (cd3 & 0x80808080); + const uint32_t t34 = (cd4 & 0x80808080); + + const uint32_t abx = (t >> 7) * 27 ^ ((ab^t) << 1); + const uint32_t abx2 = (ta2 >> 7) * 27 ^ ((ab2^ta2) << 1); + const uint32_t abx3 = (ta3 >> 7) * 27 ^ ((ab3^ta3) << 1); + const uint32_t abx4 = (t4 >> 7) * 27 ^ ((ab4^t4) << 1); + const uint32_t bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); + const uint32_t bcx2 = (t22 >> 7) * 27 ^ ((bc2^t22) << 1); + const uint32_t bcx3 = (t23 >> 7) * 27 ^ ((bc3^t23) << 1); + const uint32_t bcx4 = (t24 >> 7) * 27 ^ ((bc4^t24) << 1); + const uint32_t cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); + const uint32_t cdx2 = (t32 >> 7) * 27 ^ ((cd2^t32) << 1); + const uint32_t cdx3 = (t33 >> 7) * 27 ^ ((cd3^t33) << 1); + const uint32_t cdx4 = (t34 >> 7) * 27 ^ ((cd4^t34) << 1); W[0 + i] = abx ^ bc ^ d; W[0 + i + 4] = bcx ^ a ^ cd; W[0 + i + 8] = cdx ^ ab ^ d; W[0 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c; - - a = P[12 + i]; - b = h[i + 4]; - c = P[12 + i + 4]; - d = P[12 + i + 8]; - - ab = a ^ b; - bc = b ^ c; - cd = c ^ d; - - - t = (ab & 0x80808080); - t2 = (bc & 0x80808080); - t3 = (cd & 0x80808080); - - abx = (t >> 7) * 27 ^ ((ab^t) << 1); - bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); - cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); - - W[16 + i] = abx ^ bc ^ d; - W[16 + i + 4] = bcx ^ a ^ cd; - W[16 + i + 8] = cdx ^ ab ^ d; - W[16 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c; - - a = h[i]; - b = P[24 + i + 0]; - c = P[24 + i + 4]; - d = P[24 + i + 8]; - - ab = a ^ b; - bc = b ^ c; - cd = c ^ d; - - - t = (ab & 0x80808080); - t2 = (bc & 0x80808080); - t3 = (cd & 0x80808080); - - abx = (t >> 7) * 27 ^ ((ab^t) << 1); - bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); - cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); - - W[32 + i] = abx ^ bc ^ d; - W[32 + i + 4] = bcx ^ a ^ cd; - W[32 + i + 8] = cdx ^ ab ^ d; - W[32 + i + 12] = abx ^ bcx ^ 
cdx ^ ab ^ c; - - a = P[36 + i]; - b = P[36 + i + 4]; - c = P[36 + i + 8]; - d = h[i + 12]; - - ab = a ^ b; - bc = b ^ c; - cd = c ^ d; - - t = (ab & 0x80808080); - t2 = (bc & 0x80808080); - t3 = (cd & 0x80808080); - - abx = (t >> 7) * 27 ^ ((ab^t) << 1); - bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); - cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); - - W[48 + i] = abx ^ bc ^ d; - W[48 + i + 4] = bcx ^ a ^ cd; - W[48 + i + 8] = cdx ^ ab ^ d; - W[48 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c; - - } + W[16 + i] = abx2 ^ bc2 ^ d2; + W[16 + i + 4] = bcx2 ^ a2 ^ cd2; + W[16 + i + 8] = cdx2 ^ ab2 ^ d2; + W[16 + i + 12] = abx2 ^ bcx2 ^ cdx2 ^ ab2 ^ c2; + W[32 + i] = abx3 ^ bc3 ^ d3; + W[32 + i + 4] = bcx3 ^ a3 ^ cd3; + W[32 + i + 8] = cdx3 ^ ab3 ^ d3; + W[32 + i + 12] = abx3 ^ bcx3 ^ cdx3 ^ ab3 ^ c3; + W[48 + i] = abx4 ^ bc4 ^ d4; + W[48 + i + 4] = bcx4 ^ a4 ^ cd4; + W[48 + i + 8] = cdx4 ^ ab4 ^ d4; + W[48 + i + 12] = abx4 ^ bcx4 ^ cdx4 ^ ab4 ^ c4;} uint32_t k0 = 512 + 16; @@ -572,7 +539,8 @@ void x11_echo512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint6 { // Big Sub Words - #pragma unroll 4 + + #pragma unroll 4 for (int idx = 0; idx < 64; idx += 16) { AES_2ROUND(sharedMemory, @@ -593,10 +561,9 @@ void x11_echo512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint6 #pragma unroll 4 for (int i = 0; i < 4; i++) { - uint32_t t; /// 1, 5, 9, 13 - t = W[4 + i]; + uint32_t t = W[4 + i]; W[4 + i] = W[20 + i]; W[20 + i] = W[36 + i]; W[36 + i] = W[52 + i]; @@ -620,34 +587,34 @@ void x11_echo512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint6 // Mix Columns #pragma unroll - for (int i = 0; i < 4; i++) // Schleife über je 2*uint32_t - { + for (int i = 0; i < 4; i++) // Schleife über je 2*uint32_t + { #pragma unroll - for (int idx = 0; idx < 64; idx += 16) // Schleife über die elemnte - { + for (int idx = 0; idx < 64; idx += 16) // Schleife über die elemnte + { - uint32_t a = W[idx + i]; - uint32_t b = W[idx + i + 4]; - uint32_t c = W[idx + i + 8]; - 
uint32_t d = W[idx + i + 12]; + const uint32_t a = W[idx + i]; + const uint32_t b = W[idx + i + 4]; + const uint32_t c = W[idx + i + 8]; + const uint32_t d = W[idx + i + 12]; - uint32_t ab = a ^ b; - uint32_t bc = b ^ c; - uint32_t cd = c ^ d; + const uint32_t ab = a ^ b; + const uint32_t bc = b ^ c; + const uint32_t cd = c ^ d; - uint32_t t, t2, t3; - t = (ab & 0x80808080); - t2 = (bc & 0x80808080); - t3 = (cd & 0x80808080); + const uint32_t t = (ab & 0x80808080); + const uint32_t t2 = (bc & 0x80808080); + const uint32_t t3 = (cd & 0x80808080); - uint32_t abx = (t >> 7) * 27 ^ ((ab^t) << 1); - uint32_t bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); - uint32_t cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); + const uint32_t abx = (t >> 7) * 27 ^ ((ab^t) << 1); + const uint32_t bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); + const uint32_t cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); W[idx + i] = abx ^ bc ^ d; W[idx + i + 4] = bcx ^ a ^ cd; W[idx + i + 8] = cdx ^ ab ^ d; W[idx + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c; + } } } @@ -681,10 +648,8 @@ void x11_echo512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint6 512 + (9 * 16) + 15); uint32_t bc = W[23] ^ W[43]; - uint32_t cd = W[43] ^ W[63]; uint32_t t2 = (bc & 0x80808080); - - uint32_t test = (t2 >> 7) * 27 ^ ((bc^t2) << 1) ^ W[3] ^ cd; + uint32_t test = (t2 >> 7) * 27 ^ ((bc^t2) << 1) ^ W[3] ^ W[43] ^ W[63]; bc = W[55] ^ W[11]; t2 = (bc & 0x80808080); test ^= (t2 >> 7) * 27 ^ ((bc^t2) << 1) ^ W[35] ^ W[11] ^ W[31] ^ backup; @@ -696,16 +661,16 @@ void x11_echo512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint6 } } } -__host__ void x11_echo512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, uint32_t target, uint32_t *h_found, int order) +__host__ void x11_echo512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *d_hash, uint32_t target, uint32_t *h_found) { - const uint32_t threadsperblock = 128; + const uint32_t 
threadsperblock = 256; // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - cudaMemset(d_found[thr_id], 0xff, 4*sizeof(uint32_t)); + CUDA_SAFE_CALL(cudaMemsetAsync(d_found[thr_id], 0xff, 2 * sizeof(uint32_t), gpustream[thr_id])); - x11_echo512_gpu_hash_64_final << > >(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_found[thr_id], target); - //MyStreamSynchronize(NULL, order, thr_id); - cudaMemcpy(h_found, d_found[thr_id], 4*sizeof(uint32_t), cudaMemcpyDeviceToHost); + x11_echo512_gpu_hash_64_final << >>(threads, startNounce, (uint64_t*)d_hash, d_found[thr_id], target); + CUDA_SAFE_CALL(cudaStreamSynchronize(gpustream[thr_id])); + CUDA_SAFE_CALL(cudaMemcpy(h_found, d_found[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); } diff --git a/x11/cuda_x11_luffa512.cu b/x11/cuda_x11_luffa512.cu index eef6c5db1b..edecd2df05 100644 --- a/x11/cuda_x11_luffa512.cu +++ b/x11/cuda_x11_luffa512.cu @@ -333,15 +333,15 @@ void finalization512(hashState *state, uint32_t *b) /***************************************************/ // Die Hash-Funktion -__global__ void x11_luffa512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +__global__ void x11_luffa512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + const uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; + const int hashPosition = nounce - startNounce; + uint32_t *const Hash = (uint32_t*)&g_hash[8 * hashPosition]; hashState state; #pragma unroll 40 @@ -353,7 +353,7 @@ __global__ void x11_luffa512_gpu_hash_64(uint32_t threads, uint32_t startNounce, } } -__host__ void x11_luffa512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void x11_luffa512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash) { const uint32_t threadsperblock = 256; diff --git a/x11/cuda_x11_luffa512_Cubehash.cu b/x11/cuda_x11_luffa512_Cubehash.cu index ef5bb7f963..aceade3f0b 100644 --- a/x11/cuda_x11_luffa512_Cubehash.cu +++ b/x11/cuda_x11_luffa512_Cubehash.cu @@ -1,30 +1,31 @@ /* - * luffa_for_32.c - * Version 2.0 (Sep 15th 2009) - * - * Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved. - * - * Hitachi, Ltd. is the owner of this software and hereby grant - * the U.S. Government and any interested party the right to use - * this software for the purposes of the SHA-3 evaluation process, - * notwithstanding that this software is copyrighted. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ +* luffa_for_32.c +* Version 2.0 (Sep 15th 2009) +* +* Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved. +* +* Hitachi, Ltd. 
is the owner of this software and hereby grant +* the U.S. Government and any interested party the right to use +* this software for the purposes of the SHA-3 evaluation process, +* notwithstanding that this software is copyrighted. +* +* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ #include "cuda_helper.h" typedef unsigned char BitSequence; -typedef struct { - uint32_t buffer[8]; /* Buffer to be hashed */ - uint32_t chainv[40]; /* Chaining values */ +typedef struct +{ + uint32_t buffer[8]; /* Buffer to be hashed */ + uint32_t chainv[40]; /* Chaining values */ } hashState; #define MULT2(a,j)\ @@ -38,11 +39,13 @@ typedef struct { a[1+(8*j)] = a[0+(8*j)] ^ tmp;\ a[0+(8*j)] = tmp; -#if __CUDA_ARCH__ < 350 -#define LROT(x,bits) ((x << bits) | (x >> (32 - bits))) -#else -#define LROT(x, bits) __funnelshift_l(x, x, bits) -#endif +#define LROT ROTL32 + +#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */ +#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */ + +#define ROTATEUPWARDS7(a) LROT(a,7) +#define ROTATEUPWARDS11(a) LROT(a,11) #define TWEAK(a0,a1,a2,a3,j)\ a0 = LROT(a0,j);\ @@ -50,15 +53,6 @@ typedef struct { a2 = LROT(a2,j);\ a3 = LROT(a3,j); -#define STEP(c0,c1)\ - SUBCRUMB(chainv[0],chainv[1],chainv[2],chainv[3],tmp);\ - SUBCRUMB(chainv[5],chainv[6],chainv[7],chainv[4],tmp);\ - MIXWORD(chainv[0],chainv[4]);\ - MIXWORD(chainv[1],chainv[5]);\ - MIXWORD(chainv[2],chainv[6]);\ - MIXWORD(chainv[3],chainv[7]);\ - ADD_CONSTANT(chainv[0],chainv[4],c0,c1); - #define SUBCRUMB(a0,a1,a2,a3,a4)\ a4 = 
a0;\ a0 |= a1;\ @@ -92,6 +86,15 @@ typedef struct { a0 ^= c0;\ b0 ^= c1; +#define STEP(c0,c1)\ + SUBCRUMB(chainv[0],chainv[1],chainv[2],chainv[3],tmp);\ + SUBCRUMB(chainv[5],chainv[6],chainv[7],chainv[4],tmp);\ + MIXWORD(chainv[0],chainv[4]);\ + MIXWORD(chainv[1],chainv[5]);\ + MIXWORD(chainv[2],chainv[6]);\ + MIXWORD(chainv[3],chainv[7]);\ + ADD_CONSTANT(chainv[0],chainv[4],c0,c1); + // Precalculated chaining values __device__ __constant__ uint32_t c_IV[40] = { 0x8bb0a761, 0xc2e4aa8b, 0x2d539bc9, 0x381408f8, @@ -103,196 +106,369 @@ __device__ __constant__ uint32_t c_IV[40] = 0x66e66a8a, 0x2303208f, 0x486dafb4, 0xc0d37dc6, 0x634d15af, 0xe5af6747, 0x10af7e38, 0xee7e6428, 0x01262e5d, 0xc92c2e64, 0x82fee966, 0xcea738d3, -0x867de2b0, 0xe0714818, 0xda6e831f, 0xa7062529}; +0x867de2b0, 0xe0714818, 0xda6e831f, 0xa7062529 }; /* old chaining values __device__ __constant__ uint32_t c_IV[40] = { - 0x6d251e69,0x44b051e0,0x4eaa6fb4,0xdbf78465, - 0x6e292011,0x90152df4,0xee058139,0xdef610bb, - 0xc3b44b95,0xd9d2f256,0x70eee9a0,0xde099fa3, - 0x5d9b0557,0x8fc944b3,0xcf1ccf0e,0x746cd581, - 0xf7efc89d,0x5dba5781,0x04016ce5,0xad659c05, - 0x0306194f,0x666d1836,0x24aa230a,0x8b264ae7, - 0x858075d5,0x36d79cce,0xe571f7d7,0x204b1f67, - 0x35870c6a,0x57e9e923,0x14bcb808,0x7cde72ce, - 0x6c68e9be,0x5ec41e22,0xc825b7c7,0xaffb4363, - 0xf5df3999,0x0fc688f1,0xb07224cc,0x03e86cea}; +0x6d251e69,0x44b051e0,0x4eaa6fb4,0xdbf78465, +0x6e292011,0x90152df4,0xee058139,0xdef610bb, +0xc3b44b95,0xd9d2f256,0x70eee9a0,0xde099fa3, +0x5d9b0557,0x8fc944b3,0xcf1ccf0e,0x746cd581, +0xf7efc89d,0x5dba5781,0x04016ce5,0xad659c05, +0x0306194f,0x666d1836,0x24aa230a,0x8b264ae7, +0x858075d5,0x36d79cce,0xe571f7d7,0x204b1f67, +0x35870c6a,0x57e9e923,0x14bcb808,0x7cde72ce, +0x6c68e9be,0x5ec41e22,0xc825b7c7,0xaffb4363, +0xf5df3999,0x0fc688f1,0xb07224cc,0x03e86cea}; */ - __device__ __constant__ uint32_t c_CNS[80] = { - 0x303994a6,0xe0337818,0xc0e65299,0x441ba90d, - 0x6cc33a12,0x7f34d442,0xdc56983e,0x9389217f, - 
0x1e00108f,0xe5a8bce6,0x7800423d,0x5274baf4, - 0x8f5b7882,0x26889ba7,0x96e1db12,0x9a226e9d, - 0xb6de10ed,0x01685f3d,0x70f47aae,0x05a17cf4, - 0x0707a3d4,0xbd09caca,0x1c1e8f51,0xf4272b28, - 0x707a3d45,0x144ae5cc,0xaeb28562,0xfaa7ae2b, - 0xbaca1589,0x2e48f1c1,0x40a46f3e,0xb923c704, - 0xfc20d9d2,0xe25e72c1,0x34552e25,0xe623bb72, - 0x7ad8818f,0x5c58a4a4,0x8438764a,0x1e38e2e7, - 0xbb6de032,0x78e38b9d,0xedb780c8,0x27586719, - 0xd9847356,0x36eda57f,0xa2c78434,0x703aace7, - 0xb213afa5,0xe028c9bf,0xc84ebe95,0x44756f91, - 0x4e608a22,0x7e8fce32,0x56d858fe,0x956548be, - 0x343b138f,0xfe191be2,0xd0ec4e3d,0x3cb226e5, - 0x2ceb4882,0x5944a28e,0xb3ad2208,0xa1c4c355, - 0xf0d2e9e3,0x5090d577,0xac11d7fa,0x2d1925ab, - 0x1bcb66f2,0xb46496ac,0x6f2d9bc9,0xd1925ab0, - 0x78602649,0x29131ab6,0x8edae952,0x0fc053c3, - 0x3b6ba548,0x3f014f0c,0xedae9520,0xfc053c31}; + 0x303994a6, 0xe0337818, 0xc0e65299, 0x441ba90d, + 0x6cc33a12, 0x7f34d442, 0xdc56983e, 0x9389217f, + 0x1e00108f, 0xe5a8bce6, 0x7800423d, 0x5274baf4, + 0x8f5b7882, 0x26889ba7, 0x96e1db12, 0x9a226e9d, + 0xb6de10ed, 0x01685f3d, 0x70f47aae, 0x05a17cf4, + 0x0707a3d4, 0xbd09caca, 0x1c1e8f51, 0xf4272b28, + 0x707a3d45, 0x144ae5cc, 0xaeb28562, 0xfaa7ae2b, + 0xbaca1589, 0x2e48f1c1, 0x40a46f3e, 0xb923c704, + 0xfc20d9d2, 0xe25e72c1, 0x34552e25, 0xe623bb72, + 0x7ad8818f, 0x5c58a4a4, 0x8438764a, 0x1e38e2e7, + 0xbb6de032, 0x78e38b9d, 0xedb780c8, 0x27586719, + 0xd9847356, 0x36eda57f, 0xa2c78434, 0x703aace7, + 0xb213afa5, 0xe028c9bf, 0xc84ebe95, 0x44756f91, + 0x4e608a22, 0x7e8fce32, 0x56d858fe, 0x956548be, + 0x343b138f, 0xfe191be2, 0xd0ec4e3d, 0x3cb226e5, + 0x2ceb4882, 0x5944a28e, 0xb3ad2208, 0xa1c4c355, + 0xf0d2e9e3, 0x5090d577, 0xac11d7fa, 0x2d1925ab, + 0x1bcb66f2, 0xb46496ac, 0x6f2d9bc9, 0xd1925ab0, + 0x78602649, 0x29131ab6, 0x8edae952, 0x0fc053c3, + 0x3b6ba548, 0x3f014f0c, 0xedae9520, 0xfc053c31 }; /***************************************************/ __device__ __forceinline__ -void rnd512(hashState *state) +void rnd512(uint32_t *const 
__restrict__ statebuffer, uint32_t *const __restrict__ statechainv) { - int i,j; - uint32_t t[40]; - uint32_t chainv[8]; - uint32_t tmp; + int i, j; + uint32_t t[40]; + uint32_t chainv[8]; + uint32_t tmp; #pragma unroll 8 - for(i=0;i<8;i++) + for(i = 0; i<8; i++) { t[i] = 0; #pragma unroll 5 - for(j=0;j<5;j++) + for(j = 0; j<5; j++) { - t[i] ^= state->chainv[i+8*j]; - } + t[i] ^= statechainv[i + 8 * j]; + } } - MULT2(t, 0); + MULT2(t, 0); #pragma unroll 5 - for(j=0;j<5;j++) { + for(j = 0; j<5; j++) + { #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+8*j] ^= t[i]; - } - } + for(i = 0; i<8; i++) + { + statechainv[i + 8 * j] ^= t[i]; + } + } #pragma unroll 5 - for(j=0;j<5;j++) { + for(j = 0; j<5; j++) + { #pragma unroll 8 - for(i=0;i<8;i++) { - t[i+8*j] = state->chainv[i+8*j]; - } - } + for(i = 0; i<8; i++) + { + t[i + 8 * j] = statechainv[i + 8 * j]; + } + } #pragma unroll 5 - for(j=0;j<5;j++) { - MULT2(state->chainv, j); - } + for(j = 0; j<5; j++) + { + MULT2(statechainv, j); + } #pragma unroll 5 - for(j=0;j<5;j++) { + for(j = 0; j<5; j++) + { #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[8*j+i] ^= t[8*((j+1)%5)+i]; - } - } + for(i = 0; i<8; i++) + { + statechainv[8 * j + i] ^= t[8 * ((j + 1) % 5) + i]; + } + } #pragma unroll 5 - for(j=0;j<5;j++) { + for(j = 0; j<5; j++) + { #pragma unroll 8 - for(i=0;i<8;i++) { - t[i+8*j] = state->chainv[i+8*j]; - } - } + for(i = 0; i<8; i++) + { + t[i + 8 * j] = statechainv[i + 8 * j]; + } + } #pragma unroll 5 - for(j=0;j<5;j++) { - MULT2(state->chainv, j); - } + for(j = 0; j<5; j++) + { + MULT2(statechainv, j); + } #pragma unroll 5 - for(j=0;j<5;j++) { + for(j = 0; j<5; j++) + { #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[8*j+i] ^= t[8*((j+4)%5)+i]; - } - } + for(i = 0; i<8; i++) + { + statechainv[8 * j + i] ^= t[8 * ((j + 4) % 5) + i]; + } + } #pragma unroll 5 - for(j=0;j<5;j++) { + for(j = 0; j<5; j++) + { #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+8*j] ^= state->buffer[i]; - } - 
MULT2(state->buffer, 0); - } + for(i = 0; i<8; i++) + { + statechainv[i + 8 * j] ^= statebuffer[i]; + } + MULT2(statebuffer, 0); + } #pragma unroll 8 - for(i=0;i<8;i++) { - chainv[i] = state->chainv[i]; - } + for(i = 0; i<8; i++) + { + chainv[i] = statechainv[i]; + } + +#pragma unroll 1 + for(i = 0; i<=14; i+=2) + { + STEP(c_CNS[i], c_CNS[i + 1]); + } #pragma unroll 8 - for(i=0;i<8;i++) { - STEP(c_CNS[(2*i)],c_CNS[(2*i)+1]); - } + for(i = 0; i<8; i++) + { + statechainv[i] = chainv[i]; + chainv[i] = statechainv[i + 8]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1); + +#pragma unroll 1 + for(i = 0; i<=14; i+=2) + { + STEP(c_CNS[i + 16], c_CNS[i + 16 + 1]); + } #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i] = chainv[i]; - chainv[i] = state->chainv[i+8]; - } + for(i = 0; i<8; i++) + { + statechainv[i + 8] = chainv[i]; + chainv[i] = statechainv[i + 16]; + } - TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],1); + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2); + +#pragma unroll 1 + for(i = 0; i<=14; i+=2) + { + STEP(c_CNS[i + 32], c_CNS[i + 32 + 1]); + } #pragma unroll 8 - for(i=0;i<8;i++) { - STEP(c_CNS[(2*i)+16],c_CNS[(2*i)+16+1]); - } + for(i = 0; i<8; i++) + { + statechainv[i + 16] = chainv[i]; + chainv[i] = statechainv[i + 24]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3); +#pragma unroll 1 + for(i = 0; i<=14; i+=2) + { + STEP(c_CNS[i + 48], c_CNS[i + 48 + 1]); + } + +#pragma unroll 8 + for(i = 0; i<8; i++) + { + statechainv[i + 24] = chainv[i]; + chainv[i] = statechainv[i + 32]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4); + +#pragma unroll 1 + for(i = 0; i<=14; i+=2) + { + STEP(c_CNS[i + 64], c_CNS[i + 64 + 1]); + } + +#pragma unroll 8 + for(i = 0; i<8; i++) + { + statechainv[i + 32] = chainv[i]; + } +} + + +__device__ __forceinline__ +void rnd512_finalfirst(uint32_t *const statechainv) +{ + int i, j; + uint32_t t[40]; + uint32_t chainv[8]; + uint32_t tmp; + +#pragma unroll 8 + for (i = 0; i<8; i++) + { 
+ t[i] = 0; +#pragma unroll 5 + for (j = 0; j<5; j++) + { + t[i] ^= statechainv[i + 8 * j]; + } + } + + MULT2(t, 0); + +#pragma unroll 5 + for (j = 0; j<5; j++) { +#pragma unroll 8 + for (i = 0; i<8; i++) { + statechainv[i + 8 * j] ^= t[i]; + } + } + +#pragma unroll 5 + for (j = 0; j<5; j++) { +#pragma unroll 8 + for (i = 0; i<8; i++) { + t[i + 8 * j] = statechainv[i + 8 * j]; + } + } + +#pragma unroll 5 + for (j = 0; j<5; j++) { + MULT2(statechainv, j); + } + +#pragma unroll 5 + for (j = 0; j<5; j++) { #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+8] = chainv[i]; - chainv[i] = state->chainv[i+16]; - } + for (i = 0; i<8; i++) { + statechainv[8 * j + i] ^= t[8 * ((j + 1) % 5) + i]; + } + } - TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],2); +#pragma unroll 5 + for (j = 0; j<5; j++) { +#pragma unroll 8 + for (i = 0; i<8; i++) { + t[i + 8 * j] = statechainv[i + 8 * j]; + } + } +#pragma unroll 5 + for (j = 0; j<5; j++) { + MULT2(statechainv, j); + } + +#pragma unroll 5 + for (j = 0; j<5; j++) { #pragma unroll 8 - for(i=0;i<8;i++) { - STEP(c_CNS[(2*i)+32],c_CNS[(2*i)+32+1]); - } + for (i = 0; i<8; i++) { + statechainv[8 * j + i] ^= t[8 * ((j + 4) % 5) + i]; + } + } + + statechainv[0 + 8 * 0] ^= 0x80000000; + statechainv[1 + 8 * 1] ^= 0x80000000; + statechainv[2 + 8 * 2] ^= 0x80000000; + statechainv[3 + 8 * 3] ^= 0x80000000; + statechainv[4 + 8 * 4] ^= 0x80000000; + #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+16] = chainv[i]; - chainv[i] = state->chainv[i+24]; - } + for (i = 0; i<8; i++) { + chainv[i] = statechainv[i]; + } - TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],3); +#pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i)], c_CNS[(2 * i) + 1]); + } + +#pragma unroll 8 + for (i = 0; i<8; i++) { + statechainv[i] = chainv[i]; + chainv[i] = statechainv[i + 8]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1); + +#pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 16], c_CNS[(2 * i) + 16 + 1]); + } #pragma unroll 
8 - for(i=0;i<8;i++) { - STEP(c_CNS[(2*i)+48],c_CNS[(2*i)+48+1]); - } + for (i = 0; i<8; i++) { + statechainv[i + 8] = chainv[i]; + chainv[i] = statechainv[i + 16]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2); + +#pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 32], c_CNS[(2 * i) + 32 + 1]); + } #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+24] = chainv[i]; - chainv[i] = state->chainv[i+32]; - } + for (i = 0; i<8; i++) { + statechainv[i + 16] = chainv[i]; + chainv[i] = statechainv[i + 24]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3); - TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],4); +#pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 48], c_CNS[(2 * i) + 48 + 1]); + } #pragma unroll 8 - for(i=0;i<8;i++) { - STEP(c_CNS[(2*i)+64],c_CNS[(2*i)+64+1]); - } + for (i = 0; i<8; i++) { + statechainv[i + 24] = chainv[i]; + chainv[i] = statechainv[i + 32]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4); + +#pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 64], c_CNS[(2 * i) + 64 + 1]); + } #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+32] = chainv[i]; - } + for (i = 0; i<8; i++) { + statechainv[i + 32] = chainv[i]; + } } + + __device__ __forceinline__ void rnd512_first(uint32_t state[40], uint32_t buffer[8]) { @@ -301,87 +477,100 @@ void rnd512_first(uint32_t state[40], uint32_t buffer[8]) uint32_t tmp; #pragma unroll 5 - for (j = 0; j<5; j++) { + for(j = 0; j<5; j++) + { state[0 + 8 * j] ^= buffer[0]; #pragma unroll 7 - for (i = 1; i<8; i++) { + for(i = 1; i<8; i++) + { state[i + 8 * j] ^= buffer[i]; } MULT2(buffer, 0); } #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { chainv[i] = state[i]; } -#pragma unroll 8 - for (i = 0; i<8; i++) { +#pragma unroll 1 + for(i = 0; i<8; i++) + { STEP(c_CNS[(2 * i)], c_CNS[(2 * i) + 1]); } #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { state[i] = chainv[i]; chainv[i] = state[i + 
8]; } TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1); -#pragma unroll 8 - for (i = 0; i<8; i++) { +#pragma unroll 1 + for(i = 0; i<8; i++) + { STEP(c_CNS[(2 * i) + 16], c_CNS[(2 * i) + 16 + 1]); } #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { state[i + 8] = chainv[i]; chainv[i] = state[i + 16]; } TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2); -#pragma unroll 8 - for (i = 0; i<8; i++) { +#pragma unroll 1 + for(i = 0; i<8; i++) + { STEP(c_CNS[(2 * i) + 32], c_CNS[(2 * i) + 32 + 1]); } #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { state[i + 16] = chainv[i]; chainv[i] = state[i + 24]; } TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3); -#pragma unroll 8 - for (i = 0; i<8; i++) { +#pragma unroll 1 + for(i = 0; i<8; i++) + { STEP(c_CNS[(2 * i) + 48], c_CNS[(2 * i) + 48 + 1]); } #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { state[i + 24] = chainv[i]; chainv[i] = state[i + 32]; } TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4); -#pragma unroll 8 - for (i = 0; i<8; i++) { +#pragma unroll 1 + for(i = 0; i<8; i++) + { STEP(c_CNS[(2 * i) + 64], c_CNS[(2 * i) + 64 + 1]); } #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { state[i + 32] = chainv[i]; } } /***************************************************/ __device__ __forceinline__ -void rnd512_nullhash(uint32_t *state) +void rnd512_nullhash(uint32_t *const state) { int i, j; uint32_t t[40]; @@ -389,10 +578,12 @@ void rnd512_nullhash(uint32_t *state) uint32_t tmp; #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { t[i] = state[i + 8 * 0]; #pragma unroll 4 - for (j = 1; j<5; j++) { + for(j = 1; j<5; j++) + { t[i] ^= state[i + 8 * j]; } } @@ -400,467 +591,688 @@ void rnd512_nullhash(uint32_t *state) MULT2(t, 0); #pragma unroll 5 - for (j = 0; j<5; j++) { + for(j = 0; j<5; j++) + { #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { state[i + 8 * j] ^= t[i]; } } #pragma unroll 5 - 
for (j = 0; j<5; j++) { + for(j = 0; j<5; j++) + { #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { t[i + 8 * j] = state[i + 8 * j]; } } #pragma unroll 5 - for (j = 0; j<5; j++) { + for(j = 0; j<5; j++) + { MULT2(state, j); } #pragma unroll 5 - for (j = 0; j<5; j++) { + for(j = 0; j<5; j++) + { #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { state[8 * j + i] ^= t[8 * ((j + 1) % 5) + i]; } } #pragma unroll 5 - for (j = 0; j<5; j++) { + for(j = 0; j<5; j++) + { #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { t[i + 8 * j] = state[i + 8 * j]; } } #pragma unroll 5 - for (j = 0; j<5; j++) { + for(j = 0; j<5; j++) + { MULT2(state, j); } #pragma unroll 5 - for (j = 0; j<5; j++) { + for(j = 0; j<5; j++) + { #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { state[8 * j + i] ^= t[8 * ((j + 4) % 5) + i]; } } #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { chainv[i] = state[i]; } -#pragma unroll 8 - for (i = 0; i<8; i++) { +#pragma unroll 1 + for(i = 0; i<8; i++) + { STEP(c_CNS[(2 * i)], c_CNS[(2 * i) + 1]); } #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { state[i] = chainv[i]; chainv[i] = state[i + 8]; } TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1); -#pragma unroll 8 - for (i = 0; i<8; i++) { +#pragma unroll 1 + for(i = 0; i<8; i++) + { STEP(c_CNS[(2 * i) + 16], c_CNS[(2 * i) + 16 + 1]); } #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { state[i + 8] = chainv[i]; chainv[i] = state[i + 16]; } TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2); -#pragma unroll 8 - for (i = 0; i<8; i++) { +#pragma unroll 1 + for(i = 0; i<8; i++) + { STEP(c_CNS[(2 * i) + 32], c_CNS[(2 * i) + 32 + 1]); } #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { state[i + 16] = chainv[i]; chainv[i] = state[i + 24]; } TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3); -#pragma unroll 8 - for (i = 0; i<8; i++) { +#pragma unroll 
1 + for(i = 0; i<8; i++) + { STEP(c_CNS[(2 * i) + 48], c_CNS[(2 * i) + 48 + 1]); } #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { state[i + 24] = chainv[i]; chainv[i] = state[i + 32]; } TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4); -#pragma unroll 8 - for (i = 0; i<8; i++) { +#pragma unroll 1 + for(i = 0; i<8; i++) + { STEP(c_CNS[(2 * i) + 64], c_CNS[(2 * i) + 64 + 1]); } #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { state[i + 32] = chainv[i]; } } __device__ __forceinline__ -void Update512(hashState *state, const uint32_t*data) +void Update512(uint32_t *const __restrict__ statebuffer, uint32_t *const __restrict__ statechainv, const uint32_t *const __restrict__ data) { #pragma unroll 8 - for (int i = 0; i < 8; i++) state->buffer[i] = cuda_swab32(data[i]); - rnd512_first(state->chainv, state->buffer); + for(int i = 0; i < 8; i++) + statebuffer[i] = cuda_swab32(data[i]); + rnd512_first(statechainv, statebuffer); #pragma unroll 8 - for (int i = 0; i < 8; i++) state->buffer[i] = cuda_swab32(data[i + 8]); - rnd512(state); + for(int i = 0; i < 8; i++) + statebuffer[i] = cuda_swab32(data[i + 8]); + rnd512(statebuffer, statechainv); } /***************************************************/ __device__ __forceinline__ -void finalization512(hashState *state, uint32_t *b) +void finalization512(uint32_t *const __restrict__ statechainv, uint32_t *const __restrict__ b) { - int i,j; - - state->buffer[0] = 0x80000000; - #pragma unroll 7 - for(int i=1;i<8;i++) state->buffer[i] = 0; - rnd512(state); - - /*---- blank round with m=0 ----*/ - rnd512_nullhash(state->chainv); + int i, j; + rnd512_finalfirst(statechainv); + /*---- blank round with m=0 ----*/ + rnd512_nullhash(statechainv); #pragma unroll 8 - for(i=0;i<8;i++) { - b[i] = state->chainv[i + 8 * 0]; + for(i = 0; i<8; i++) + { + b[i] = statechainv[i + 8 * 0]; #pragma unroll 4 - for(j=1;j<5;j++) { - b[i] ^= state->chainv[i+8*j]; - } - b[i] = cuda_swab32((b[i])); - } + for(j = 1; 
j<5; j++) + { + b[i] ^= statechainv[i + 8 * j]; + } + b[i] = cuda_swab32((b[i])); + } - rnd512_nullhash(state->chainv); + rnd512_nullhash(statechainv); #pragma unroll 8 - for(i=0;i<8;i++) { - b[8 + i] = state->chainv[i + 8 * 0]; + for(i = 0; i<8; i++) + { + b[8 + i] = statechainv[i + 8 * 0]; #pragma unroll 4 - for(j=1;j<5;j++) { - b[8+i] ^= state->chainv[i+8*j]; - } - b[8 + i] = cuda_swab32((b[8 + i])); - } -} - - -typedef unsigned char BitSequence; - -#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */ -#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */ - -#if __CUDA_ARCH__ < 350 -#define LROT(x,bits) ((x << bits) | (x >> (32 - bits))) -#else -#define LROT(x, bits) __funnelshift_l(x, x, bits) -#endif - -#define ROTATEUPWARDS7(a) LROT(a,7) -#define ROTATEUPWARDS11(a) LROT(a,11) - -#define SWAP(a,b) { uint32_t u = a; a = b; b = u; } - -__device__ __constant__ -static const uint32_t c_IV_512[32] = { - - 0x2AEA2A61, 0x50F494D4, 0x2D538B8B, - 0x4167D83E, 0x3FEE2313, 0xC701CF8C, - 0xCC39968E, 0x50AC5695, 0x4D42C787, - 0xA647A8B3, 0x97CF0BEF, 0x825B4537, - 0xEEF864D2, 0xF22090C4, 0xD0E5CD33, - 0xA23911AE, 0xFCD398D9, 0x148FE485, - 0x1B017BEF, 0xB6444532, 0x6A536159, - 0x2FF5781C, 0x91FA7934, 0x0DBADEA9, - 0xD65C8A2B, 0xA5A70E75, 0xB1C62456, - 0xBC796576, 0x1921C8F7, 0xE7989AF1, - 0x7795D246, 0xD43E3B44 -}; - -__device__ __forceinline__ void rrounds(uint32_t x[2][2][2][2][2]) -{ - int r; - int j; - int k; - int l; - int m; - -// #pragma unroll - for (r = 0; r < CUBEHASH_ROUNDS; ++r) { - - /* "add x_0jklm into x_1jklmn modulo 2^32" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[1][j][k][l][m] += x[0][j][k][l][m]; - - /* "rotate x_0jklm upwards by 7 bits" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m 
< 2; ++m) - x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]); - - /* "swap x_00klm with x_01klm" */ -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - SWAP(x[0][0][k][l][m], x[0][1][k][l][m]) - - /* "xor x_1jklm into x_0jklm" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[0][j][k][l][m] ^= x[1][j][k][l][m]; - - /* "swap x_1jk0m with x_1jk1m" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - SWAP(x[1][j][k][0][m], x[1][j][k][1][m]) - - /* "add x_0jklm into x_1jklm modulo 2^32" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[1][j][k][l][m] += x[0][j][k][l][m]; - - /* "rotate x_0jklm upwards by 11 bits" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]); - - /* "swap x_0j0lm with x_0j1lm" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - SWAP(x[0][j][0][l][m], x[0][j][1][l][m]) - - /* "xor x_1jklm into x_0jklm" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[0][j][k][l][m] ^= x[1][j][k][l][m]; - - /* "swap x_1jkl0 with x_1jkl1" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) - SWAP(x[1][j][k][l][0], x[1][j][k][l][1]) - + for(j = 1; j<5; j++) + { + 
b[8 + i] ^= statechainv[i + 8 * j]; + } + b[8 + i] = cuda_swab32((b[8 + i])); } } +#define ROUND_EVEN \ + xg = (x0 + xg); \ + x0 = ROTL32(x0, 7); \ + xh = (x1 + xh); \ + x1 = ROTL32(x1, 7); \ + xi = (x2 + xi); \ + x2 = ROTL32(x2, 7); \ + xj = (x3 + xj); \ + x3 = ROTL32(x3, 7); \ + xk = (x4 + xk); \ + x4 = ROTL32(x4, 7); \ + xl = (x5 + xl); \ + x5 = ROTL32(x5, 7); \ + xm = (x6 + xm); \ + x6 = ROTL32(x6, 7); \ + xn = (x7 + xn); \ + x7 = ROTL32(x7, 7); \ + xo = (x8 + xo); \ + x8 = ROTL32(x8, 7); \ + xp = (x9 + xp); \ + x9 = ROTL32(x9, 7); \ + xq = (xa + xq); \ + xa = ROTL32(xa, 7); \ + xr = (xb + xr); \ + xb = ROTL32(xb, 7); \ + xs = (xc + xs); \ + xc = ROTL32(xc, 7); \ + xt = (xd + xt); \ + xd = ROTL32(xd, 7); \ + xu = (xe + xu); \ + xe = ROTL32(xe, 7); \ + xv = (xf + xv); \ + xf = ROTL32(xf, 7); \ + x8 ^= xg; \ + x9 ^= xh; \ + xa ^= xi; \ + xb ^= xj; \ + xc ^= xk; \ + xd ^= xl; \ + xe ^= xm; \ + xf ^= xn; \ + x0 ^= xo; \ + x1 ^= xp; \ + x2 ^= xq; \ + x3 ^= xr; \ + x4 ^= xs; \ + x5 ^= xt; \ + x6 ^= xu; \ + x7 ^= xv; \ + xi = (x8 + xi); \ + x8 = ROTL32(x8, 11); \ + xj = (x9 + xj); \ + x9 = ROTL32(x9, 11); \ + xg = (xa + xg); \ + xa = ROTL32(xa, 11); \ + xh = (xb + xh); \ + xb = ROTL32(xb, 11); \ + xm = (xc + xm); \ + xc = ROTL32(xc, 11); \ + xn = (xd + xn); \ + xd = ROTL32(xd, 11); \ + xk = (xe + xk); \ + xe = ROTL32(xe, 11); \ + xl = (xf + xl); \ + xf = ROTL32(xf, 11); \ + xq = (x0 + xq); \ + x0 = ROTL32(x0, 11); \ + xr = (x1 + xr); \ + x1 = ROTL32(x1, 11); \ + xo = (x2 + xo); \ + x2 = ROTL32(x2, 11); \ + xp = (x3 + xp); \ + x3 = ROTL32(x3, 11); \ + xu = (x4 + xu); \ + x4 = ROTL32(x4, 11); \ + xv = (x5 + xv); \ + x5 = ROTL32(x5, 11); \ + xs = (x6 + xs); \ + x6 = ROTL32(x6, 11); \ + xt = (x7 + xt); \ + x7 = ROTL32(x7, 11); \ + xc ^= xi; \ + xd ^= xj; \ + xe ^= xg; \ + xf ^= xh; \ + x8 ^= xm; \ + x9 ^= xn; \ + xa ^= xk; \ + xb ^= xl; \ + x4 ^= xq; \ + x5 ^= xr; \ + x6 ^= xo; \ + x7 ^= xp; \ + x0 ^= xu; \ + x1 ^= xv; \ + x2 ^= xs; \ + x3 ^= xt; + +#define ROUND_ODD \ + 
xj = (xc + xj); \ + xc = ROTL32(xc, 7); \ + xi = (xd + xi); \ + xd = ROTL32(xd, 7); \ + xh = (xe + xh); \ + xe = ROTL32(xe, 7); \ + xg = (xf + xg); \ + xf = ROTL32(xf, 7); \ + xn = (x8 + xn); \ + x8 = ROTL32(x8, 7); \ + xm = (x9 + xm); \ + x9 = ROTL32(x9, 7); \ + xl = (xa + xl); \ + xa = ROTL32(xa, 7); \ + xk = (xb + xk); \ + xb = ROTL32(xb, 7); \ + xr = (x4 + xr); \ + x4 = ROTL32(x4, 7); \ + xq = (x5 + xq); \ + x5 = ROTL32(x5, 7); \ + xp = (x6 + xp); \ + x6 = ROTL32(x6, 7); \ + xo = (x7 + xo); \ + x7 = ROTL32(x7, 7); \ + xv = (x0 + xv); \ + x0 = ROTL32(x0, 7); \ + xu = (x1 + xu); \ + x1 = ROTL32(x1, 7); \ + xt = (x2 + xt); \ + x2 = ROTL32(x2, 7); \ + xs = (x3 + xs); \ + x3 = ROTL32(x3, 7); \ + x4 ^= xj; \ + x5 ^= xi; \ + x6 ^= xh; \ + x7 ^= xg; \ + x0 ^= xn; \ + x1 ^= xm; \ + x2 ^= xl; \ + x3 ^= xk; \ + xc ^= xr; \ + xd ^= xq; \ + xe ^= xp; \ + xf ^= xo; \ + x8 ^= xv; \ + x9 ^= xu; \ + xa ^= xt; \ + xb ^= xs; \ + xh = (x4 + xh); \ + x4 = ROTL32(x4, 11); \ + xg = (x5 + xg); \ + x5 = ROTL32(x5, 11); \ + xj = (x6 + xj); \ + x6 = ROTL32(x6, 11); \ + xi = (x7 + xi); \ + x7 = ROTL32(x7, 11); \ + xl = (x0 + xl); \ + x0 = ROTL32(x0, 11); \ + xk = (x1 + xk); \ + x1 = ROTL32(x1, 11); \ + xn = (x2 + xn); \ + x2 = ROTL32(x2, 11); \ + xm = (x3 + xm); \ + x3 = ROTL32(x3, 11); \ + xp = (xc + xp); \ + xc = ROTL32(xc, 11); \ + xo = (xd + xo); \ + xd = ROTL32(xd, 11); \ + xr = (xe + xr); \ + xe = ROTL32(xe, 11); \ + xq = (xf + xq); \ + xf = ROTL32(xf, 11); \ + xt = (x8 + xt); \ + x8 = ROTL32(x8, 11); \ + xs = (x9 + xs); \ + x9 = ROTL32(x9, 11); \ + xv = (xa + xv); \ + xa = ROTL32(xa, 11); \ + xu = (xb + xu); \ + xb = ROTL32(xb, 11); \ + x0 ^= xh; \ + x1 ^= xg; \ + x2 ^= xj; \ + x3 ^= xi; \ + x4 ^= xl; \ + x5 ^= xk; \ + x6 ^= xn; \ + x7 ^= xm; \ + x8 ^= xp; \ + x9 ^= xo; \ + xa ^= xr; \ + xb ^= xq; \ + xc ^= xt; \ + xd ^= xs; \ + xe ^= xv; \ + xf ^= xu; + +#define SIXTEEN_ROUNDS \ + for (int j = 0; j < 8; j ++) { \ + ROUND_EVEN; \ + ROUND_ODD;} -__device__ __forceinline__ void 
block_tox(uint32_t *in, uint32_t x[2][2][2][2][2]) -{ - int k; - int l; - int m; -// uint32_t *in = block; - -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[0][0][k][l][m] ^= *in++; -} - -__device__ __forceinline__ void hash_fromx(uint32_t *out, uint32_t x[2][2][2][2][2]) -{ - int j; - int k; - int l; - int m; -// uint32_t *out = hash; - -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - *out++ = x[0][j][k][l][m]; -} - -void __device__ __forceinline__ Init(uint32_t x[2][2][2][2][2]) -{ - int i, j, k, l, m; -#if 0 - /* "the first three state words x_00000, x_00001, x_00010" */ - /* "are set to the integers h/8, b, r respectively." */ - /* "the remaining state words are set to 0." */ -#pragma unroll 2 - for (i = 0; i < 2; ++i) -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[i][j][k][l][m] = 0; - x[0][0][0][0][0] = 512 / 8; - x[0][0][0][0][1] = CUBEHASH_BLOCKBYTES; - x[0][0][0][1][0] = CUBEHASH_ROUNDS; - - /* "the state is then transformed invertibly through 10r identical rounds */ - for (i = 0; i < 10; ++i) rrounds(x); +__global__ +#if __CUDA_ARCH__ > 500 +__launch_bounds__(256, 4) #else - const uint32_t *iv = c_IV_512; - -#pragma unroll 2 - for (i = 0; i < 2; ++i) -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[i][j][k][l][m] = *iv++; +__launch_bounds__(256, 3) #endif -} - -void __device__ __forceinline__ Update32(uint32_t x[2][2][2][2][2], const uint32_t *data) -{ - /* "xor the block into the first b bytes of the state" */ - /* "and then transform the state invertibly through r identical rounds" 
*/ - block_tox((uint32_t*)data, x); - rrounds(x); -} - -void __device__ __forceinline__ Final(uint32_t x[2][2][2][2][2], uint32_t *hashval) +void x11_luffaCubehash512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *const g_hash) { - int i; - - /* "the integer 1 is xored into the last state word x_11111" */ - x[1][1][1][1][1] ^= 1; - - /* "the state is then transformed invertibly through 10r identical rounds" */ -// #pragma unroll 10 - for (i = 0; i < 10; ++i) rrounds(x); - - /* "output the first h/8 bytes of the state" */ - hash_fromx(hashval, x); -} + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if(thread < threads) + { + uint32_t *const Hash = (uint32_t*)&g_hash[8 * thread]; + uint32_t statebuffer[8]; + uint32_t statechainv[40] = + { + 0x8bb0a761, 0xc2e4aa8b, 0x2d539bc9, 0x381408f8, + 0x478f6633, 0x255a46ff, 0x581c37f7, 0x601c2e8e, + 0x266c5f9d, 0xc34715d8, 0x8900670e, 0x51a540be, + 0xe4ce69fb, 0x5089f4d4, 0x3cc0a506, 0x609bcb02, + 0xa4e3cd82, 0xd24fd6ca, 0xc0f196dc, 0xcf41eafe, + 0x0ff2e673, 0x303804f2, 0xa7b3cd48, 0x677addd4, + 0x66e66a8a, 0x2303208f, 0x486dafb4, 0xc0d37dc6, + 0x634d15af, 0xe5af6747, 0x10af7e38, 0xee7e6428, + 0x01262e5d, 0xc92c2e64, 0x82fee966, 0xcea738d3, + 0x867de2b0, 0xe0714818, 0xda6e831f, 0xa7062529 + }; + + Update512(statebuffer, statechainv, Hash); + finalization512(statechainv, Hash); + //Cubehash -/***************************************************/ -// Die Hash-Funktion -__global__ -void x11_luffaCubehash512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); + uint32_t x0 = 0x2AEA2A61 ^ Hash[0]; + uint32_t x1 = 0x50F494D4 ^ Hash[1]; + uint32_t x2 = 0x2D538B8B ^ Hash[2]; + uint32_t x3 = 0x4167D83E ^ Hash[3]; + uint32_t x4 = 0x3FEE2313 ^ Hash[4]; + uint32_t x5 = 0xC701CF8C ^ Hash[5]; + uint32_t x6 = 0xCC39968E ^ Hash[6]; + uint32_t x7 = 0x50AC5695 ^ Hash[7]; + uint32_t x8 = 0x4D42C787, x9 = 0xA647A8B3, xa = 0x97CF0BEF, xb = 0x825B4537; + uint32_t xc = 0xEEF864D2, xd = 0xF22090C4, xe = 0xD0E5CD33, xf = 0xA23911AE; + uint32_t xg = 0xFCD398D9 + x0, xh = 0x148FE485 + x1, xi = 0x1B017BEF + x2, xj = 0xB6444532 + x3; + uint32_t xk = 0x6A536159 + x4, xl = 0x2FF5781C + x5, xm = 0x91FA7934 + x6, xn = 0x0DBADEA9 + x7; + uint32_t xo = 0xD65C8A2B + x8, xp = 0xA5A70E75 + x9, xq = 0xB1C62456 + xa, xr = 0xBC796576 + xb; + uint32_t xs = 0x1921C8F7 + xc, xt = 0xE7989AF1 + xd, xu = 0x7795D246 + xe, xv = 0xD43E3B44 + xf; + + + x0 = ROTL32(x0, 7); + x1 = ROTL32(x1, 7); + x2 = ROTL32(x2, 7); + x3 = ROTL32(x3, 7); + x4 = ROTL32(x4, 7); + x5 = ROTL32(x5, 7); + x6 = ROTL32(x6, 7); + x7 = ROTL32(x7, 7); + x8 = ROTL32(x8, 7); + x9 = ROTL32(x9, 7); + xa = ROTL32(xa, 7); + xb = ROTL32(xb, 7); + xc = ROTL32(xc, 7); + xd = ROTL32(xd, 7); + xe = ROTL32(xe, 7); + xf = ROTL32(xf, 7); + x8 ^= xg; + x9 ^= xh; + xa ^= xi; + xb ^= xj; + xc ^= xk; + xd ^= xl; + xe ^= xm; + xf ^= xn; + x0 ^= xo; + x1 ^= xp; + x2 ^= xq; + x3 ^= xr; + x4 ^= xs; + x5 ^= xt; + x6 ^= xu; + x7 ^= xv; + xi = (x8 + xi); + x8 = ROTL32(x8, 11); + xj = (x9 + xj); + x9 = ROTL32(x9, 11); + xg = (xa + xg); + xa = ROTL32(xa, 11); + xh = (xb + xh); + xb = ROTL32(xb, 11); + xm = (xc + xm); + xc = ROTL32(xc, 11); + xn = (xd + xn); + xd = ROTL32(xd, 11); + xk = (xe + xk); + xe = ROTL32(xe, 11); + xl = (xf + xl); + xf = ROTL32(xf, 11); + xq = (x0 + xq); + x0 = ROTL32(x0, 11); + xr = (x1 + xr); + x1 = ROTL32(x1, 11); + xo = (x2 + xo); + x2 = ROTL32(x2, 11); + xp = (x3 + xp); + x3 = ROTL32(x3, 11); + xu = (x4 + xu); + x4 = ROTL32(x4, 11); + xv = (x5 + 
xv); + x5 = ROTL32(x5, 11); + xs = (x6 + xs); + x6 = ROTL32(x6, 11); + xt = (x7 + xt); + x7 = ROTL32(x7, 11); + xc ^= xi; + xd ^= xj; + xe ^= xg; + xf ^= xh; + x8 ^= xm; + x9 ^= xn; + xa ^= xk; + xb ^= xl; + x4 ^= xq; + x5 ^= xr; + x6 ^= xo; + x7 ^= xp; + x0 ^= xu; + x1 ^= xv; + x2 ^= xs; + x3 ^= xt; + + xj = (xc + xj); + xc = ROTL32(xc, 7); + xi = (xd + xi); + xd = ROTL32(xd, 7); + xh = (xe + xh); + xe = ROTL32(xe, 7); + xg = (xf + xg); + xf = ROTL32(xf, 7); + xn = (x8 + xn); + x8 = ROTL32(x8, 7); + xm = (x9 + xm); + x9 = ROTL32(x9, 7); + xl = (xa + xl); + xa = ROTL32(xa, 7); + xk = (xb + xk); + xb = ROTL32(xb, 7); + xr = (x4 + xr); + x4 = ROTL32(x4, 7); + xq = (x5 + xq); + x5 = ROTL32(x5, 7); + xp = (x6 + xp); + x6 = ROTL32(x6, 7); + xo = (x7 + xo); + x7 = ROTL32(x7, 7); + xv = (x0 + xv); + x0 = ROTL32(x0, 7); + xu = (x1 + xu); + x1 = ROTL32(x1, 7); + xt = (x2 + xt); + x2 = ROTL32(x2, 7); + xs = (x3 + xs); + x3 = ROTL32(x3, 7); + x4 ^= xj; + x5 ^= xi; + x6 ^= xh; + x7 ^= xg; + x0 ^= xn; + x1 ^= xm; + x2 ^= xl; + x3 ^= xk; + xc ^= xr; + xd ^= xq; + xe ^= xp; + xf ^= xo; + x8 ^= xv; + x9 ^= xu; + xa ^= xt; + xb ^= xs; + xh = (x4 + xh); + x4 = ROTL32(x4, 11); + xg = (x5 + xg); + x5 = ROTL32(x5, 11); + xj = (x6 + xj); + x6 = ROTL32(x6, 11); + xi = (x7 + xi); + x7 = ROTL32(x7, 11); + xl = (x0 + xl); + x0 = ROTL32(x0, 11); + xk = (x1 + xk); + x1 = ROTL32(x1, 11); + xn = (x2 + xn); + x2 = ROTL32(x2, 11); + xm = (x3 + xm); + x3 = ROTL32(x3, 11); + xp = (xc + xp); + xc = ROTL32(xc, 11); + xo = (xd + xo); + xd = ROTL32(xd, 11); + xr = (xe + xr); + xe = ROTL32(xe, 11); + xq = (xf + xq); + xf = ROTL32(xf, 11); + xt = (x8 + xt); + x8 = ROTL32(x8, 11); + xs = (x9 + xs); + x9 = ROTL32(x9, 11); + xv = (xa + xv); + xa = ROTL32(xa, 11); + xu = (xb + xu); + xb = ROTL32(xb, 11); + x0 ^= xh; + x1 ^= xg; + x2 ^= xj; + x3 ^= xi; + x4 ^= xl; + x5 ^= xk; + x6 ^= xn; + x7 ^= xm; + x8 ^= xp; + x9 ^= xo; + xa ^= xr; + xb ^= xq; + xc ^= xt; + xd ^= xs; + xe ^= xv; + xf ^= xu; + + for (int j 
= 1; j < 8; j++) + { + ROUND_EVEN; + ROUND_ODD; + } + x0 ^= (Hash[8]); + x1 ^= (Hash[9]); + x2 ^= (Hash[10]); + x3 ^= (Hash[11]); + x4 ^= (Hash[12]); + x5 ^= (Hash[13]); + x6 ^= (Hash[14]); + x7 ^= (Hash[15]); - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; - hashState state; -#pragma unroll 40 - for(int i=0;i<40;i++) state.chainv[i] = c_IV[i]; + for (int j = 0; j < 8; j++) + { + ROUND_EVEN; + ROUND_ODD; + } + x0 ^= 0x80; + + for (int j = 0; j < 8; j++) + { + ROUND_EVEN; + ROUND_ODD; + } + xv ^= 1; - Update512(&state, Hash); - finalization512(&state, Hash); - //Cubehash + for(int i = 3; i < 13; i++) + { + for (int j = 0; j < 8; j++) + { + ROUND_EVEN; + ROUND_ODD; + } + } - uint32_t x[2][2][2][2][2]; - Init(x); - // erste Hälfte des Hashes (32 bytes) - Update32(x, Hash); - // zweite Hälfte des Hashes (32 bytes) - Update32(x, &Hash[8]); - // Padding Block - uint32_t last[8]; - last[0] = 0x80; -#pragma unroll 7 - for (int i = 1; i < 8; i++) last[i] = 0; - Update32(x, last); - Final(x, Hash); + Hash[0] = x0; + Hash[1] = x1; + Hash[2] = x2; + Hash[3] = x3; + Hash[4] = x4; + Hash[5] = x5; + Hash[6] = x6; + Hash[7] = x7; + Hash[8] = x8; + Hash[9] = x9; + Hash[10] = xa; + Hash[11] = xb; + Hash[12] = xc; + Hash[13] = xd; + Hash[14] = xe; + Hash[15] = xf; } } -__host__ void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) { - const uint32_t threadsperblock = 256; + const uint32_t threadsperblock = 256; - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); - x11_luffaCubehash512_gpu_hash_64 
<< > >(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x11_luffaCubehash512_gpu_hash_64 << > >(threads, startNounce, (uint64_t*)d_hash); + CUDA_SAFE_CALL(cudaGetLastError()); } - diff --git a/x11/cuda_x11_shavite512.cu b/x11/cuda_x11_shavite512.cu index d531da9321..12b85d617c 100644 --- a/x11/cuda_x11_shavite512.cu +++ b/x11/cuda_x11_shavite512.cu @@ -1,37 +1,37 @@ #include "cuda_helper.h" #include // memcpy() +#include "cuda_vector.h" -#define TPB 128 + +#define TPB 320 __constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding) -#include "cuda_x11_aes.cu" +// #ifdef NOASM +#include "cuda_x11_aes_noasm.cu" +// #else +// #include "cuda_x11_aes.cu" +// #endif __device__ __forceinline__ -static void AES_ROUND_NOKEY( - const uint32_t* __restrict__ sharedMemory, +void AES_ROUND_NOKEY( + const uint32_t*const __restrict__ sharedMemory, uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3) { - uint32_t y0, y1, y2, y3; aes_round(sharedMemory, - x0, x1, x2, x3, - y0, y1, y2, y3); - - x0 = y0; - x1 = y1; - x2 = y2; - x3 = y3; + x0, x1, x2, x3, + x0, x1, x2, x3); } __device__ __forceinline__ -static void KEY_EXPAND_ELT( - const uint32_t* __restrict__ sharedMemory, +void KEY_EXPAND_ELT( + const uint32_t*const __restrict__ sharedMemory, uint32_t &k0, uint32_t &k1, uint32_t &k2, uint32_t &k3) { uint32_t y0, y1, y2, y3; aes_round(sharedMemory, - k0, k1, k2, k3, - y0, y1, y2, y3); + k0, k1, k2, k3, + y0, y1, y2, y3); k0 = y1; k1 = y2; @@ -40,94 +40,59 @@ static void KEY_EXPAND_ELT( } __device__ __forceinline__ -static void c512(const uint32_t*const __restrict__ sharedMemory, uint32_t *const __restrict__ state, uint32_t *const __restrict__ msg, const uint32_t count) +void shavite_gpu_init(uint32_t *sharedMemory) { - uint32_t p0, p1, p2, p3, p4, p5, p6, p7; - uint32_t p8, p9, pA, pB, pC, pD, pE, pF; - uint32_t x0, x1, x2, x3; - uint32_t rk[32]; - uint32_t i; - const uint32_t counter = count; - - p0 = state[0x0]; - p1 = state[0x1]; - p2 = 
state[0x2]; - p3 = state[0x3]; - p4 = state[0x4]; - p5 = state[0x5]; - p6 = state[0x6]; - p7 = state[0x7]; - p8 = state[0x8]; - p9 = state[0x9]; - pA = state[0xA]; - pB = state[0xB]; - pC = state[0xC]; - pD = state[0xD]; - pE = state[0xE]; - pF = state[0xF]; - - x0 = p4; - x1 = p5; - x2 = p6; - x3 = p7; -#pragma unroll - for (i = 0; i<16; i += 4) - { - rk[i] = msg[i]; - x0 ^= msg[i]; - rk[i + 1] = msg[i + 1]; - x1 ^= msg[i + 1]; - rk[i + 2] = msg[i + 2]; - x2 ^= msg[i + 2]; - rk[i + 3] = msg[i + 3]; - x3 ^= msg[i + 3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + /* each thread startup will fill a uint32 */ + if (threadIdx.x < 256) { + /* each thread startup will fill a uint32 */ + sharedMemory[threadIdx.x] = d_AES0[threadIdx.x]; + sharedMemory[threadIdx.x + 256] = ROL8(sharedMemory[threadIdx.x]); + sharedMemory[threadIdx.x + 512] = ROL16(sharedMemory[threadIdx.x]); + sharedMemory[threadIdx.x + 768] = ROL24(sharedMemory[threadIdx.x]); +// sharedMemory[threadIdx.x + 64 * 2 ] = d_AES0[threadIdx.x + 64 * 2]; +// sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2]; +// sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2]; +// sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2]; } +} +__global__ __launch_bounds__(TPB, 3) +void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t *const __restrict__ g_hash) +{ + __shared__ __align__(128) uint32_t sharedMemory[1024]; - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; - if (count == 512) - { - rk[16] = 0x80U; - x0 = pC ^ 0x80U; - rk[17] = 0; - x1 = pD; - rk[18] = 0; - x2 = pE; - rk[19] = 0; - x3 = pF; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[20] = 0; - rk[21] = 0; - rk[22] = 0; - rk[23] = 0; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[24] = 0; - rk[25] = 0; - rk[26] = 0; - rk[27] = 0x02000000U; - x3 ^= 0x02000000U; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[28] = 0; - rk[29] = 0; - rk[30] = 0; - rk[31] = 0x02000000; - x3 ^= 
0x02000000; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - } - else + shavite_gpu_init(sharedMemory); + __syncthreads(); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if(thread < threads) { - x0 = pC; - x1 = pD; - x2 = pE; - x3 = pF; + uint32_t *Hash = &g_hash[thread * 16]; + + uint32_t rk[32]; + uint32_t msg[16]; - for (i = 16; i<32; i += 4) + uint28 *phash = (uint28*)Hash; + uint28 *outpt = (uint28*)msg; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + + uint32_t state[16] = + { + 0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC, + 0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC, + 0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47, + 0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A + }; + + uint32_t x0 = 0xD1901A06; + uint32_t x1 = 0x430AE307; + uint32_t x2 = 0xB29F5CD1; + uint32_t x3 = 0xDF07FBFC; + + for(int i = 0; i < 16; i += 4) { - rk[i] = msg[i]; - x0 ^= msg[i]; + + rk[i + 0] = msg[i + 0]; + x0 ^= msg[i + 0]; rk[i + 1] = msg[i + 1]; x1 ^= msg[i + 1]; rk[i + 2] = msg[i + 2]; @@ -136,1285 +101,211 @@ static void c512(const uint32_t*const __restrict__ sharedMemory, uint32_t *const x3 ^= msg[i + 3]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); } - } - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; + state[0] ^= x0; + state[1] ^= x1; + state[2] ^= x2; + state[3] ^= x3; - // 1 - KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); + // 1 + KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); - rk[0] ^= rk[28]; - rk[1] ^= rk[29]; - rk[2] ^= rk[30]; - rk[3] ^= rk[31]; - rk[0] ^= counter; - rk[3] ^= 0xFFFFFFFF; - x0 = p0 ^ rk[0]; - x1 = p1 ^ rk[1]; - x2 = p2 ^ rk[2]; - x3 = p3 ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); - rk[4] ^= rk[0]; - rk[5] ^= rk[1]; - rk[6] ^= rk[2]; - rk[7] ^= rk[3]; - x0 ^= rk[4]; - x1 ^= rk[5]; - x2 ^= rk[6]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); - 
rk[8] ^= rk[4]; - rk[9] ^= rk[5]; - rk[10] ^= rk[6]; - rk[11] ^= rk[7]; - x0 ^= rk[8]; - x1 ^= rk[9]; - x2 ^= rk[10]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); - rk[12] ^= rk[8]; - rk[13] ^= rk[9]; - rk[14] ^= rk[10]; - rk[15] ^= rk[11]; - x0 ^= rk[12]; - x1 ^= rk[13]; - x2 ^= rk[14]; - x3 ^= rk[15]; + rk[3] ^= (0x02000000UL ^ 0xFFFFFFFFUL); //rk[31]; + rk[0] ^= 512; + // rk[3] ^= 0xFFFFFFFF; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; + x0 = state[0] ^ rk[0]; + x1 = state[1] ^ rk[1]; + x2 = state[2] ^ rk[2]; + x3 = state[3] ^ rk[3]; - KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); - rk[16] ^= rk[12]; - rk[17] ^= rk[13]; - rk[18] ^= rk[14]; - rk[19] ^= rk[15]; - x0 = p8 ^ rk[16]; - x1 = p9 ^ rk[17]; - x2 = pA ^ rk[18]; - x3 = pB ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); - rk[20] ^= rk[16]; - rk[21] ^= rk[17]; - rk[22] ^= rk[18]; - rk[23] ^= rk[19]; - x0 ^= rk[20]; - x1 ^= rk[21]; - x2 ^= rk[22]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); - rk[24] ^= rk[20]; - rk[25] ^= rk[21]; - rk[26] ^= rk[22]; - rk[27] ^= rk[23]; - x0 ^= rk[24]; - x1 ^= rk[25]; - x2 ^= rk[26]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); - rk[28] ^= rk[24]; - rk[29] ^= rk[25]; - rk[30] ^= rk[26]; - rk[31] ^= rk[27]; - x0 ^= rk[28]; - x1 ^= rk[29]; - x2 ^= rk[30]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); + rk[4] ^= rk[0]; + rk[5] ^= rk[1]; + rk[6] ^= rk[2]; + rk[7] ^= rk[3]; + x0 ^= rk[4]; + x1 ^= rk[5]; + x2 ^= rk[6]; + 
x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); + rk[8] ^= rk[4]; + rk[9] ^= rk[5]; + rk[10] ^= rk[6]; + rk[11] ^= rk[7]; + x0 ^= rk[8]; + x1 ^= rk[9]; + x2 ^= rk[10]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); + rk[12] ^= rk[8]; + rk[13] ^= rk[9]; + rk[14] ^= rk[10]; + rk[15] ^= rk[11]; + x0 ^= rk[12]; + x1 ^= rk[13]; + x2 ^= rk[14]; + x3 ^= rk[15]; - rk[0] ^= rk[25]; - x0 = pC ^ rk[0]; - rk[1] ^= rk[26]; - x1 = pD ^ rk[1]; - rk[2] ^= rk[27]; - x2 = pE ^ rk[2]; - rk[3] ^= rk[28]; - x3 = pF ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[4] ^= rk[29]; - x0 ^= rk[4]; - rk[5] ^= rk[30]; - x1 ^= rk[5]; - rk[6] ^= rk[31]; - x2 ^= rk[6]; - rk[7] ^= rk[0]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[8] ^= rk[1]; - x0 ^= rk[8]; - rk[9] ^= rk[2]; - x1 ^= rk[9]; - rk[10] ^= rk[3]; - x2 ^= rk[10]; - rk[11] ^= rk[4]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[12] ^= rk[5]; - x0 ^= rk[12]; - rk[13] ^= rk[6]; - x1 ^= rk[13]; - rk[14] ^= rk[7]; - x2 ^= rk[14]; - rk[15] ^= rk[8]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; - rk[16] ^= rk[9]; - x0 = p4 ^ rk[16]; - rk[17] ^= rk[10]; - x1 = p5 ^ rk[17]; - rk[18] ^= rk[11]; - x2 = p6 ^ rk[18]; - rk[19] ^= rk[12]; - x3 = p7 ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[20] ^= rk[13]; - x0 ^= rk[20]; - rk[21] ^= rk[14]; - x1 ^= rk[21]; - rk[22] ^= rk[15]; - x2 ^= rk[22]; - rk[23] ^= rk[16]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[24] ^= rk[17]; - x0 ^= rk[24]; - rk[25] ^= rk[18]; - x1 ^= rk[25]; - rk[26] ^= rk[19]; - x2 ^= rk[26]; - rk[27] ^= rk[20]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[28] ^= rk[21]; - x0 ^= rk[28]; - rk[29] ^= rk[22]; - x1 ^= rk[29]; - rk[30] ^= 
rk[23]; - x2 ^= rk[30]; - rk[31] ^= rk[24]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; - /* round 3, 7, 11 */ - KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); - rk[0] ^= rk[28]; - rk[1] ^= rk[29]; - rk[2] ^= rk[30]; - rk[3] ^= rk[31]; - x0 = p8 ^ rk[0]; - x1 = p9 ^ rk[1]; - x2 = pA ^ rk[2]; - x3 = pB ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); - rk[4] ^= rk[0]; - rk[5] ^= rk[1]; - rk[6] ^= rk[2]; - rk[7] ^= rk[3]; - x0 ^= rk[4]; - x1 ^= rk[5]; - x2 ^= rk[6]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); - rk[8] ^= rk[4]; - rk[9] ^= rk[5]; - rk[10] ^= rk[6]; - rk[11] ^= rk[7]; - x0 ^= rk[8]; - x1 ^= rk[9]; - x2 ^= rk[10]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); - rk[12] ^= rk[8]; - rk[13] ^= rk[9]; - rk[14] ^= rk[10]; - rk[15] ^= rk[11]; - x0 ^= rk[12]; - x1 ^= rk[13]; - x2 ^= rk[14]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; - KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); - rk[16] ^= rk[12]; - rk[17] ^= rk[13]; - rk[18] ^= rk[14]; - rk[19] ^= rk[15]; - x0 = p0 ^ rk[16]; - x1 = p1 ^ rk[17]; - x2 = p2 ^ rk[18]; - x3 = p3 ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); - rk[20] ^= rk[16]; - rk[21] ^= rk[17]; - rk[22] ^= rk[18]; - rk[23] ^= rk[19]; - x0 ^= rk[20]; - x1 ^= rk[21]; - x2 ^= rk[22]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); - rk[24] ^= rk[20]; - rk[25] ^= rk[21]; - rk[26] ^= rk[22]; - rk[27] ^= rk[23]; - x0 ^= rk[24]; - x1 ^= rk[25]; - x2 ^= rk[26]; - x3 ^= rk[27]; - 
AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); - rk[28] ^= rk[24]; - rk[29] ^= rk[25]; - rk[30] ^= rk[26]; - rk[31] ^= rk[27]; - x0 ^= rk[28]; - x1 ^= rk[29]; - x2 ^= rk[30]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; - /* round 4, 8, 12 */ - rk[0] ^= rk[25]; - x0 = p4 ^ rk[0]; - rk[1] ^= rk[26]; - x1 = p5 ^ rk[1]; - rk[2] ^= rk[27]; - x2 = p6 ^ rk[2]; - rk[3] ^= rk[28]; - x3 = p7 ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[4] ^= rk[29]; - x0 ^= rk[4]; - rk[5] ^= rk[30]; - x1 ^= rk[5]; - rk[6] ^= rk[31]; - x2 ^= rk[6]; - rk[7] ^= rk[0]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[8] ^= rk[1]; - x0 ^= rk[8]; - rk[9] ^= rk[2]; - x1 ^= rk[9]; - rk[10] ^= rk[3]; - x2 ^= rk[10]; - rk[11] ^= rk[4]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[12] ^= rk[5]; - x0 ^= rk[12]; - rk[13] ^= rk[6]; - x1 ^= rk[13]; - rk[14] ^= rk[7]; - x2 ^= rk[14]; - rk[15] ^= rk[8]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; - rk[16] ^= rk[9]; - x0 = pC ^ rk[16]; - rk[17] ^= rk[10]; - x1 = pD ^ rk[17]; - rk[18] ^= rk[11]; - x2 = pE ^ rk[18]; - rk[19] ^= rk[12]; - x3 = pF ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[20] ^= rk[13]; - x0 ^= rk[20]; - rk[21] ^= rk[14]; - x1 ^= rk[21]; - rk[22] ^= rk[15]; - x2 ^= rk[22]; - rk[23] ^= rk[16]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[24] ^= rk[17]; - x0 ^= rk[24]; - rk[25] ^= rk[18]; - x1 ^= rk[25]; - rk[26] ^= rk[19]; - x2 ^= rk[26]; - rk[27] ^= rk[20]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[28] ^= rk[21]; - x0 ^= rk[28]; - rk[29] ^= rk[22]; - x1 ^= rk[29]; - rk[30] ^= rk[23]; - x2 ^= rk[30]; - rk[31] ^= rk[24]; - x3 ^= rk[31]; - 
AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; + state[8] ^= 0x32be246fUL; + state[9] ^= 0xe33ad1e5UL; + state[10] ^= 0xd659b13eUL; + state[11] ^= 0xb6a1a92cUL; - // 2 - KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); - rk[0] ^= rk[28]; - rk[1] ^= rk[29]; - rk[2] ^= rk[30]; - rk[3] ^= rk[31]; - x0 = p0 ^ rk[0]; - x1 = p1 ^ rk[1]; - x2 = p2 ^ rk[2]; - x3 = p3 ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); - rk[4] ^= rk[0]; - rk[5] ^= rk[1]; - rk[6] ^= rk[2]; - rk[7] ^= rk[3]; - rk[7] ^= ~counter; - x0 ^= rk[4]; - x1 ^= rk[5]; - x2 ^= rk[6]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); - rk[8] ^= rk[4]; - rk[9] ^= rk[5]; - rk[10] ^= rk[6]; - rk[11] ^= rk[7]; - x0 ^= rk[8]; - x1 ^= rk[9]; - x2 ^= rk[10]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); - rk[12] ^= rk[8]; - rk[13] ^= rk[9]; - rk[14] ^= rk[10]; - rk[15] ^= rk[11]; - x0 ^= rk[12]; - x1 ^= rk[13]; - x2 ^= rk[14]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; - KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); - rk[16] ^= rk[12]; - rk[17] ^= rk[13]; - rk[18] ^= rk[14]; - rk[19] ^= rk[15]; - x0 = p8 ^ rk[16]; - x1 = p9 ^ rk[17]; - x2 = pA ^ rk[18]; - x3 = pB ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); - rk[20] ^= rk[16]; - rk[21] ^= rk[17]; - rk[22] ^= rk[18]; - rk[23] ^= rk[19]; - x0 ^= rk[20]; - x1 ^= rk[21]; - x2 ^= rk[22]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); - rk[24] ^= rk[20]; - rk[25] ^= rk[21]; - rk[26] ^= rk[22]; - rk[27] ^= rk[23]; - x0 ^= rk[24]; - x1 ^= rk[25]; - x2 
^= rk[26]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); - rk[28] ^= rk[24]; - rk[29] ^= rk[25]; - rk[30] ^= rk[26]; - rk[31] ^= rk[27]; - x0 ^= rk[28]; - x1 ^= rk[29]; - x2 ^= rk[30]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; + state[12] ^= x0; + state[13] ^= x1; + state[14] ^= x2; + state[15] ^= x3; - rk[0] ^= rk[25]; - x0 = pC ^ rk[0]; - rk[1] ^= rk[26]; - x1 = pD ^ rk[1]; - rk[2] ^= rk[27]; - x2 = pE ^ rk[2]; - rk[3] ^= rk[28]; - x3 = pF ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[4] ^= rk[29]; - x0 ^= rk[4]; - rk[5] ^= rk[30]; - x1 ^= rk[5]; - rk[6] ^= rk[31]; - x2 ^= rk[6]; - rk[7] ^= rk[0]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[8] ^= rk[1]; - x0 ^= rk[8]; - rk[9] ^= rk[2]; - x1 ^= rk[9]; - rk[10] ^= rk[3]; - x2 ^= rk[10]; - rk[11] ^= rk[4]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[12] ^= rk[5]; - x0 ^= rk[12]; - rk[13] ^= rk[6]; - x1 ^= rk[13]; - rk[14] ^= rk[7]; - x2 ^= rk[14]; - rk[15] ^= rk[8]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; - rk[16] ^= rk[9]; - x0 = p4 ^ rk[16]; - rk[17] ^= rk[10]; - x1 = p5 ^ rk[17]; - rk[18] ^= rk[11]; - x2 = p6 ^ rk[18]; - rk[19] ^= rk[12]; - x3 = p7 ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[20] ^= rk[13]; - x0 ^= rk[20]; - rk[21] ^= rk[14]; - x1 ^= rk[21]; - rk[22] ^= rk[15]; - x2 ^= rk[22]; - rk[23] ^= rk[16]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[24] ^= rk[17]; - x0 ^= rk[24]; - rk[25] ^= rk[18]; - x1 ^= rk[25]; - rk[26] ^= rk[19]; - x2 ^= rk[26]; - rk[27] ^= rk[20]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[28] ^= rk[21]; - x0 ^= rk[28]; - rk[29] ^= rk[22]; - x1 ^= rk[29]; - rk[30] ^= rk[23]; - x2 ^= rk[30]; - rk[31] ^= rk[24]; - x3 ^= 
rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; - /* round 3, 7, 11 */ - KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); - rk[0] ^= rk[28]; - rk[1] ^= rk[29]; - rk[2] ^= rk[30]; - rk[3] ^= rk[31]; - x0 = p8 ^ rk[0]; - x1 = p9 ^ rk[1]; - x2 = pA ^ rk[2]; - x3 = pB ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); - rk[4] ^= rk[0]; - rk[5] ^= rk[1]; - rk[6] ^= rk[2]; - rk[7] ^= rk[3]; - x0 ^= rk[4]; - x1 ^= rk[5]; - x2 ^= rk[6]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); - rk[8] ^= rk[4]; - rk[9] ^= rk[5]; - rk[10] ^= rk[6]; - rk[11] ^= rk[7]; - x0 ^= rk[8]; - x1 ^= rk[9]; - x2 ^= rk[10]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); - rk[12] ^= rk[8]; - rk[13] ^= rk[9]; - rk[14] ^= rk[10]; - rk[15] ^= rk[11]; - x0 ^= rk[12]; - x1 ^= rk[13]; - x2 ^= rk[14]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; - KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); - rk[16] ^= rk[12]; - rk[17] ^= rk[13]; - rk[18] ^= rk[14]; - rk[19] ^= rk[15]; - x0 = p0 ^ rk[16]; - x1 = p1 ^ rk[17]; - x2 = p2 ^ rk[18]; - x3 = p3 ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); - rk[20] ^= rk[16]; - rk[21] ^= rk[17]; - rk[22] ^= rk[18]; - rk[23] ^= rk[19]; - x0 ^= rk[20]; - x1 ^= rk[21]; - x2 ^= rk[22]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); - rk[24] ^= rk[20]; - rk[25] ^= rk[21]; - rk[26] ^= rk[22]; - rk[27] ^= rk[23]; - x0 ^= rk[24]; - x1 ^= rk[25]; - x2 ^= rk[26]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, 
rk[28], rk[29], rk[30], rk[31]); - rk[28] ^= rk[24]; - rk[29] ^= rk[25]; - rk[30] ^= rk[26]; - rk[31] ^= rk[27]; - x0 ^= rk[28]; - x1 ^= rk[29]; - x2 ^= rk[30]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; - /* round 4, 8, 12 */ - rk[0] ^= rk[25]; - x0 = p4 ^ rk[0]; - rk[1] ^= rk[26]; - x1 = p5 ^ rk[1]; - rk[2] ^= rk[27]; - x2 = p6 ^ rk[2]; - rk[3] ^= rk[28]; - x3 = p7 ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[4] ^= rk[29]; - x0 ^= rk[4]; - rk[5] ^= rk[30]; - x1 ^= rk[5]; - rk[6] ^= rk[31]; - x2 ^= rk[6]; - rk[7] ^= rk[0]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[8] ^= rk[1]; - x0 ^= rk[8]; - rk[9] ^= rk[2]; - x1 ^= rk[9]; - rk[10] ^= rk[3]; - x2 ^= rk[10]; - rk[11] ^= rk[4]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[12] ^= rk[5]; - x0 ^= rk[12]; - rk[13] ^= rk[6]; - x1 ^= rk[13]; - rk[14] ^= rk[7]; - x2 ^= rk[14]; - rk[15] ^= rk[8]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; - rk[16] ^= rk[9]; - x0 = pC ^ rk[16]; - rk[17] ^= rk[10]; - x1 = pD ^ rk[17]; - rk[18] ^= rk[11]; - x2 = pE ^ rk[18]; - rk[19] ^= rk[12]; - x3 = pF ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[20] ^= rk[13]; - x0 ^= rk[20]; - rk[21] ^= rk[14]; - x1 ^= rk[21]; - rk[22] ^= rk[15]; - x2 ^= rk[22]; - rk[23] ^= rk[16]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[24] ^= rk[17]; - x0 ^= rk[24]; - rk[25] ^= rk[18]; - x1 ^= rk[25]; - rk[26] ^= rk[19]; - x2 ^= rk[26]; - rk[27] ^= rk[20]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[28] ^= rk[21]; - x0 ^= rk[28]; - rk[29] ^= rk[22]; - x1 ^= rk[29]; - rk[30] ^= rk[23]; - x2 ^= rk[30]; - rk[31] ^= rk[24]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; + rk[16] = rk[12] ^ 0x63636363UL; + rk[17] = rk[13] ^ 
0x63636363UL; + rk[18] = rk[14] ^ 0x63636363UL; + rk[19] = rk[15] ^ 0x8acdcd24UL; + x0 = state[8] ^ rk[16]; + x1 = state[9] ^ rk[17]; + x2 = state[10] ^ rk[18]; + x3 = state[11] ^ rk[19]; + rk[20] = 0x63636363UL ^ rk[16]; + rk[21] = 0x63636363UL ^ rk[17]; + rk[22] = 0x63636363UL ^ rk[18]; + rk[23] = 0x63636363UL ^ rk[19]; + rk[24] = 0x63636363UL ^ rk[20]; + rk[25] = 0x63636363UL ^ rk[21]; + rk[26] = 0x63636363UL ^ rk[22]; + rk[27] = 0x4b5f7777UL ^ rk[23]; - // 3 - KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); - rk[0] ^= rk[28]; - rk[1] ^= rk[29]; - rk[2] ^= rk[30]; - rk[3] ^= rk[31]; - x0 = p0 ^ rk[0]; - x1 = p1 ^ rk[1]; - x2 = p2 ^ rk[2]; - x3 = p3 ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); - rk[4] ^= rk[0]; - rk[5] ^= rk[1]; - rk[6] ^= rk[2]; - rk[7] ^= rk[3]; - x0 ^= rk[4]; - x1 ^= rk[5]; - x2 ^= rk[6]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); - rk[8] ^= rk[4]; - rk[9] ^= rk[5]; - rk[10] ^= rk[6]; - rk[11] ^= rk[7]; - x0 ^= rk[8]; - x1 ^= rk[9]; - x2 ^= rk[10]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); - rk[12] ^= rk[8]; - rk[13] ^= rk[9]; - rk[14] ^= rk[10]; - rk[15] ^= rk[11]; - x0 ^= rk[12]; - x1 ^= rk[13]; - x2 ^= rk[14]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; - KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); - rk[16] ^= rk[12]; - rk[17] ^= rk[13]; - rk[18] ^= rk[14]; - rk[19] ^= rk[15]; - x0 = p8 ^ rk[16]; - x1 = p9 ^ rk[17]; - x2 = pA ^ rk[18]; - x3 = pB ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); - rk[20] ^= rk[16]; - rk[21] ^= rk[17]; - rk[22] ^= rk[18]; - rk[23] ^= rk[19]; - x0 ^= rk[20]; - x1 ^= rk[21]; - x2 ^= rk[22]; - x3 ^= rk[23]; 
- AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); - rk[24] ^= rk[20]; - rk[25] ^= rk[21]; - rk[26] ^= rk[22]; - rk[27] ^= rk[23]; - x0 ^= rk[24]; - x1 ^= rk[25]; - x2 ^= rk[26]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); - rk[28] ^= rk[24]; - rk[29] ^= rk[25]; - rk[30] ^= rk[26]; - rk[31] ^= rk[27]; - rk[30] ^= counter; - rk[31] ^= 0xFFFFFFFF; - x0 ^= rk[28]; - x1 ^= rk[29]; - x2 ^= rk[30]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; + rk[28] = 0x63636363UL ^ rk[24]; + rk[29] = 0x63636363UL ^ rk[25]; + rk[30] = 0x63636363UL ^ rk[26]; + rk[31] = 0x4b5f7777UL ^ rk[27]; - rk[0] ^= rk[25]; - x0 = pC ^ rk[0]; - rk[1] ^= rk[26]; - x1 = pD ^ rk[1]; - rk[2] ^= rk[27]; - x2 = pE ^ rk[2]; - rk[3] ^= rk[28]; - x3 = pF ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[4] ^= rk[29]; - x0 ^= rk[4]; - rk[5] ^= rk[30]; - x1 ^= rk[5]; - rk[6] ^= rk[31]; - x2 ^= rk[6]; - rk[7] ^= rk[0]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[8] ^= rk[1]; - x0 ^= rk[8]; - rk[9] ^= rk[2]; - x1 ^= rk[9]; - rk[10] ^= rk[3]; - x2 ^= rk[10]; - rk[11] ^= rk[4]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[12] ^= rk[5]; - x0 ^= rk[12]; - rk[13] ^= rk[6]; - x1 ^= rk[13]; - rk[14] ^= rk[7]; - x2 ^= rk[14]; - rk[15] ^= rk[8]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; - rk[16] ^= rk[9]; - x0 = p4 ^ rk[16]; - rk[17] ^= rk[10]; - x1 = p5 ^ rk[17]; - rk[18] ^= rk[11]; - x2 = p6 ^ rk[18]; - rk[19] ^= rk[12]; - x3 = p7 ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[20] ^= rk[13]; - x0 ^= rk[20]; - rk[21] ^= rk[14]; - x1 ^= rk[21]; - rk[22] ^= rk[15]; - x2 ^= rk[22]; - rk[23] ^= rk[16]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); 
- rk[24] ^= rk[17]; - x0 ^= rk[24]; - rk[25] ^= rk[18]; - x1 ^= rk[25]; - rk[26] ^= rk[19]; - x2 ^= rk[26]; - rk[27] ^= rk[20]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[28] ^= rk[21]; - x0 ^= rk[28]; - rk[29] ^= rk[22]; - x1 ^= rk[29]; - rk[30] ^= rk[23]; - x2 ^= rk[30]; - rk[31] ^= rk[24]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; - /* round 3, 7, 11 */ - KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); - rk[0] ^= rk[28]; - rk[1] ^= rk[29]; - rk[2] ^= rk[30]; - rk[3] ^= rk[31]; - x0 = p8 ^ rk[0]; - x1 = p9 ^ rk[1]; - x2 = pA ^ rk[2]; - x3 = pB ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); - rk[4] ^= rk[0]; - rk[5] ^= rk[1]; - rk[6] ^= rk[2]; - rk[7] ^= rk[3]; - x0 ^= rk[4]; - x1 ^= rk[5]; - x2 ^= rk[6]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); - rk[8] ^= rk[4]; - rk[9] ^= rk[5]; - rk[10] ^= rk[6]; - rk[11] ^= rk[7]; - x0 ^= rk[8]; - x1 ^= rk[9]; - x2 ^= rk[10]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); - rk[12] ^= rk[8]; - rk[13] ^= rk[9]; - rk[14] ^= rk[10]; - rk[15] ^= rk[11]; - x0 ^= rk[12]; - x1 ^= rk[13]; - x2 ^= rk[14]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; - KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); - rk[16] ^= rk[12]; - rk[17] ^= rk[13]; - rk[18] ^= rk[14]; - rk[19] ^= rk[15]; - x0 = p0 ^ rk[16]; - x1 = p1 ^ rk[17]; - x2 = p2 ^ rk[18]; - x3 = p3 ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); - rk[20] ^= rk[16]; - rk[21] ^= rk[17]; - rk[22] ^= rk[18]; - rk[23] ^= rk[19]; - x0 ^= rk[20]; - x1 ^= rk[21]; - x2 ^= rk[22]; - x3 ^= rk[23]; - 
AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); - rk[24] ^= rk[20]; - rk[25] ^= rk[21]; - rk[26] ^= rk[22]; - rk[27] ^= rk[23]; - x0 ^= rk[24]; - x1 ^= rk[25]; - x2 ^= rk[26]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); - rk[28] ^= rk[24]; - rk[29] ^= rk[25]; - rk[30] ^= rk[26]; - rk[31] ^= rk[27]; - x0 ^= rk[28]; - x1 ^= rk[29]; - x2 ^= rk[30]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; - /* round 4, 8, 12 */ - rk[0] ^= rk[25]; - x0 = p4 ^ rk[0]; - rk[1] ^= rk[26]; - x1 = p5 ^ rk[1]; - rk[2] ^= rk[27]; - x2 = p6 ^ rk[2]; - rk[3] ^= rk[28]; - x3 = p7 ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[4] ^= rk[29]; - x0 ^= rk[4]; - rk[5] ^= rk[30]; - x1 ^= rk[5]; - rk[6] ^= rk[31]; - x2 ^= rk[6]; - rk[7] ^= rk[0]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[8] ^= rk[1]; - x0 ^= rk[8]; - rk[9] ^= rk[2]; - x1 ^= rk[9]; - rk[10] ^= rk[3]; - x2 ^= rk[10]; - rk[11] ^= rk[4]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[12] ^= rk[5]; - x0 ^= rk[12]; - rk[13] ^= rk[6]; - x1 ^= rk[13]; - rk[14] ^= rk[7]; - x2 ^= rk[14]; - rk[15] ^= rk[8]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; - rk[16] ^= rk[9]; - x0 = pC ^ rk[16]; - rk[17] ^= rk[10]; - x1 = pD ^ rk[17]; - rk[18] ^= rk[11]; - x2 = pE ^ rk[18]; - rk[19] ^= rk[12]; - x3 = pF ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[20] ^= rk[13]; - x0 ^= rk[20]; - rk[21] ^= rk[14]; - x1 ^= rk[21]; - rk[22] ^= rk[15]; - x2 ^= rk[22]; - rk[23] ^= rk[16]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[24] ^= rk[17]; - x0 ^= rk[24]; - rk[25] ^= rk[18]; - x1 ^= rk[25]; - rk[26] ^= rk[19]; - x2 ^= rk[26]; - rk[27] ^= rk[20]; - x3 ^= rk[27]; - 
AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[28] ^= rk[21]; - x0 ^= rk[28]; - rk[29] ^= rk[22]; - x1 ^= rk[29]; - rk[30] ^= rk[23]; - x2 ^= rk[30]; - rk[31] ^= rk[24]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; - - /* round 13 */ - KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); - rk[0] ^= rk[28]; - rk[1] ^= rk[29]; - rk[2] ^= rk[30]; - rk[3] ^= rk[31]; - x0 = p0 ^ rk[0]; - x1 = p1 ^ rk[1]; - x2 = p2 ^ rk[2]; - x3 = p3 ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); - rk[4] ^= rk[0]; - rk[5] ^= rk[1]; - rk[6] ^= rk[2]; - rk[7] ^= rk[3]; - x0 ^= rk[4]; - x1 ^= rk[5]; - x2 ^= rk[6]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); - rk[8] ^= rk[4]; - rk[9] ^= rk[5]; - rk[10] ^= rk[6]; - rk[11] ^= rk[7]; - x0 ^= rk[8]; - x1 ^= rk[9]; - x2 ^= rk[10]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); - rk[12] ^= rk[8]; - rk[13] ^= rk[9]; - rk[14] ^= rk[10]; - rk[15] ^= rk[11]; - x0 ^= rk[12]; - x1 ^= rk[13]; - x2 ^= rk[14]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; - KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); - rk[16] ^= rk[12]; - rk[17] ^= rk[13]; - rk[18] ^= rk[14]; - rk[19] ^= rk[15]; - x0 = p8 ^ rk[16]; - x1 = p9 ^ rk[17]; - x2 = pA ^ rk[18]; - x3 = pB ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); - rk[20] ^= rk[16]; - rk[21] ^= rk[17]; - rk[22] ^= rk[18]; - rk[23] ^= rk[19]; - x0 ^= rk[20]; - x1 ^= rk[21]; - x2 ^= rk[22]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); - rk[24] ^= rk[20]; - rk[25] ^= rk[21] ^ 
counter; - rk[26] ^= rk[22]; - rk[27] ^= rk[23] ^ 0xFFFFFFFF; - x0 ^= rk[24]; - x1 ^= rk[25]; - x2 ^= rk[26]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); - rk[28] ^= rk[24]; - rk[29] ^= rk[25]; - rk[30] ^= rk[26]; - rk[31] ^= rk[27]; - x0 ^= rk[28]; - x1 ^= rk[29]; - x2 ^= rk[30]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; - state[0x0] ^= p8; - state[0x1] ^= p9; - state[0x2] ^= pA; - state[0x3] ^= pB; - state[0x4] ^= pC; - state[0x5] ^= pD; - state[0x6] ^= pE; - state[0x7] ^= pF; - state[0x8] ^= p0; - state[0x9] ^= p1; - state[0xA] ^= p2; - state[0xB] ^= p3; - state[0xC] ^= p4; - state[0xD] ^= p5; - state[0xE] ^= p6; - state[0xF] ^= p7; -} - -__device__ __forceinline__ -void shavite_gpu_init(uint32_t *sharedMemory) -{ - /* each thread startup will fill a uint32 */ - if (threadIdx.x < 128) { - sharedMemory[threadIdx.x ] = d_AES0[threadIdx.x]; - sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x]; - sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x]; - sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x]; - - sharedMemory[threadIdx.x + 64 * 2 ] = d_AES0[threadIdx.x + 64 * 2]; - sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2]; - sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2]; - sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2]; - } -} -__global__ __launch_bounds__(TPB, 8) -void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *const __restrict__ g_hash, uint32_t *const __restrict__ g_nonceVector) -{ - __shared__ uint32_t sharedMemory[1024]; - - shavite_gpu_init(sharedMemory); - - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[hashPosition*8]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - // kopiere init-state + x0 ^= rk[20]; + x1 ^= rk[21]; + x2 ^= rk[22]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - uint32_t p0, p1, p2, p3, p4, p5, p6, p7; - uint32_t p8, p9, pA, pB, pC, pD, pE, pF; - uint32_t x0, x1, x2, x3; - uint32_t rk[32]; - const uint32_t msg[16] = - { - Hash[0], Hash[1], Hash[2], Hash[3], Hash[4], Hash[5], Hash[6], Hash[7], Hash[8], Hash[9], Hash[10], Hash[11], Hash[12], Hash[13], Hash[14], Hash[15] - }; - const uint32_t state[16] = - { - SPH_C32(0x72FCCDD8), SPH_C32(0x79CA4727), SPH_C32(0x128A077B), SPH_C32(0x40D55AEC), - SPH_C32(0xD1901A06), SPH_C32(0x430AE307), SPH_C32(0xB29F5CD1), SPH_C32(0xDF07FBFC), - SPH_C32(0x8E45D73D), SPH_C32(0x681AB538), SPH_C32(0xBDE86578), SPH_C32(0xDD577E47), - SPH_C32(0xE275EADE), SPH_C32(0x502D9FCD), SPH_C32(0xB9357178), SPH_C32(0x022A4B9A) - }; - p0 = state[0x0]; - p1 = state[0x1]; - p2 = state[0x2]; - p3 = state[0x3]; - p4 = state[0x4]; - p5 = state[0x5]; - p6 = state[0x6]; - p7 = state[0x7]; - p8 = state[0x8]; - p9 = state[0x9]; - pA = state[0xA]; - pB = state[0xB]; - pC = state[0xC]; - pD = state[0xD]; - pE = state[0xE]; - pF = state[0xF]; - - x0 = p4; - x1 = p5; - x2 = p6; - x3 = p7; - - - rk[0] = msg[0]; - x0 ^= msg[0]; - rk[1] = msg[1]; - x1 ^= msg[1]; - rk[2] = msg[2]; - x2 ^= msg[2]; - rk[3] = msg[3]; - x3 ^= msg[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - - rk[4] = msg[4]; - x0 ^= msg[4]; - rk[5] = msg[5]; - x1 ^= msg[5]; - rk[6] = msg[6]; - x2 ^= msg[6]; - rk[7] = msg[7]; - x3 ^= msg[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - - rk[8] = msg[8]; - x0 ^= msg[8]; - rk[9] = msg[9]; - x1 ^= msg[9]; - rk[10] = msg[10]; - x2 ^= msg[10]; - rk[11] = msg[11]; - x3 ^= msg[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[12] = msg[12]; - x0 ^= msg[12]; - rk[13] 
= msg[13]; - x1 ^= msg[13]; - rk[14] = msg[14]; - x2 ^= msg[14]; - rk[15] = msg[15]; - x3 ^= msg[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; + x0 ^= rk[24]; + x1 ^= rk[25]; + x2 ^= rk[26]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - // 1 - KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); + x0 ^= rk[28]; + x1 ^= rk[29]; + x2 ^= rk[30]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[3] ^= (0x02000000UL ^ 0xFFFFFFFFUL); //rk[31]; - rk[0] ^= 512; - // rk[3] ^= 0xFFFFFFFF; + state[4] ^= x0; + state[5] ^= x1; + state[6] ^= x2; + state[7] ^= x3; - x0 = p0 ^ rk[0]; - x1 = p1 ^ rk[1]; - x2 = p2 ^ rk[2]; - x3 = p3 ^ rk[3]; + rk[0] ^= rk[25]; + x0 = state[12] ^ rk[0]; + rk[1] ^= rk[26]; + x1 = state[13] ^ rk[1]; + rk[2] ^= rk[27]; + x2 = state[14] ^ rk[2]; + rk[3] ^= rk[28]; + x3 = state[15] ^ rk[3]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); - rk[4] ^= rk[0]; - rk[5] ^= rk[1]; + rk[4] ^= rk[29]; + x0 ^= rk[4]; + rk[5] ^= rk[30]; + x1 ^= rk[5]; + rk[6] ^= rk[31]; + x2 ^= rk[6]; + rk[7] ^= rk[0]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[8] ^= rk[1]; + x0 ^= rk[8]; + rk[9] ^= rk[2]; + x1 ^= rk[9]; + rk[10] ^= rk[3]; + x2 ^= rk[10]; + rk[11] ^= rk[4]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[12] ^= rk[5]; + x0 ^= rk[12]; + rk[13] ^= rk[6]; + x1 ^= rk[13]; + rk[14] ^= rk[7]; + x2 ^= rk[14]; + rk[15] ^= rk[8]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + state[8] ^= x0; + state[9] ^= x1; + state[10] ^= x2; + state[11] ^= x3; + rk[16] ^= rk[9]; + x0 = state[4] ^ rk[16]; + rk[17] ^= rk[10]; + x1 = state[5] ^ rk[17]; + rk[18] ^= rk[11]; + x2 = state[6] ^ rk[18]; + rk[19] ^= rk[12]; + x3 = state[7] ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[20] ^= rk[13]; + x0 ^= rk[20]; + rk[21] ^= rk[14]; + x1 ^= rk[21]; + 
rk[22] ^= rk[15]; + x2 ^= rk[22]; + rk[23] ^= rk[16]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[24] ^= rk[17]; + x0 ^= rk[24]; + rk[25] ^= rk[18]; + x1 ^= rk[25]; + rk[26] ^= rk[19]; + x2 ^= rk[26]; + rk[27] ^= rk[20]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[28] ^= rk[21]; + x0 ^= rk[28]; + rk[29] ^= rk[22]; + x1 ^= rk[29]; + rk[30] ^= rk[23]; + x2 ^= rk[30]; + rk[31] ^= rk[24]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + state[0] ^= x0; + state[1] ^= x1; + state[2] ^= x2; + state[3] ^= x3; + /* round 3, 7, 11 */ + KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); + rk[0] ^= rk[28]; + rk[1] ^= rk[29]; + rk[2] ^= rk[30]; + rk[3] ^= rk[31]; + x0 = state[8] ^ rk[0]; + x1 = state[9] ^ rk[1]; + x2 = state[10] ^ rk[2]; + x3 = state[11] ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); + rk[4] ^= rk[0]; + rk[5] ^= rk[1]; rk[6] ^= rk[2]; rk[7] ^= rk[3]; x0 ^= rk[4]; @@ -1441,33 +332,22 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t x1 ^= rk[13]; x2 ^= rk[14]; x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; - - p8 ^= 0x32be246fUL; - p9 ^= 0xe33ad1e5UL; - pA ^= 0xd659b13eUL; - pB ^= 0xb6a1a92cUL; - - rk[16] = rk[12] ^ 0x63636363UL; - rk[17] = rk[13] ^ 0x63636363UL; - rk[18] = rk[14] ^ 0x63636363UL; - rk[19] = rk[15] ^ 0x8acdcd24UL; - x0 = p8 ^ rk[16]; - x1 = p9 ^ rk[17]; - x2 = pA ^ rk[18]; - x3 = pB ^ rk[19]; + state[4] ^= x0; + state[5] ^= x1; + state[6] ^= x2; + state[7] ^= x3; + KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); + rk[16] ^= rk[12]; + rk[17] ^= rk[13]; + rk[18] ^= rk[14]; + rk[19] ^= rk[15]; + x0 = state[0] ^ rk[16]; + x1 = state[1] ^ rk[17]; + x2 = state[2] ^ rk[18]; + x3 = state[3] ^ rk[19]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - - rk[20] = 0x63636363UL; - rk[21] = 0x63636363UL; - 
rk[22] = 0x63636363UL; - rk[23] = 0x63636363UL; - + KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); rk[20] ^= rk[16]; rk[21] ^= rk[17]; rk[22] ^= rk[18]; @@ -1477,12 +357,7 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t x2 ^= rk[22]; x3 ^= rk[23]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - - rk[24] = 0x63636363UL; - rk[25] = 0x63636363UL; - rk[26] = 0x63636363UL; - rk[27] = 0x4b5f7777UL; - + KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); rk[24] ^= rk[20]; rk[25] ^= rk[21]; rk[26] ^= rk[22]; @@ -1492,12 +367,180 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t x2 ^= rk[26]; x3 ^= rk[27]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); + rk[28] ^= rk[24]; + rk[29] ^= rk[25]; + rk[30] ^= rk[26]; + rk[31] ^= rk[27]; + x0 ^= rk[28]; + x1 ^= rk[29]; + x2 ^= rk[30]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + state[12] ^= x0; + state[13] ^= x1; + state[14] ^= x2; + state[15] ^= x3; + /* round 4, 8, 12 */ + rk[0] ^= rk[25]; + x0 = state[4] ^ rk[0]; + rk[1] ^= rk[26]; + x1 = state[5] ^ rk[1]; + rk[2] ^= rk[27]; + x2 = state[6] ^ rk[2]; + rk[3] ^= rk[28]; + x3 = state[7] ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[4] ^= rk[29]; + x0 ^= rk[4]; + rk[5] ^= rk[30]; + x1 ^= rk[5]; + rk[6] ^= rk[31]; + x2 ^= rk[6]; + rk[7] ^= rk[0]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[8] ^= rk[1]; + x0 ^= rk[8]; + rk[9] ^= rk[2]; + x1 ^= rk[9]; + rk[10] ^= rk[3]; + x2 ^= rk[10]; + rk[11] ^= rk[4]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[12] ^= rk[5]; + x0 ^= rk[12]; + rk[13] ^= rk[6]; + x1 ^= rk[13]; + rk[14] ^= rk[7]; + x2 ^= rk[14]; + rk[15] ^= rk[8]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[28] = 0x63636363UL; - rk[29] = 0x63636363UL; - rk[30] = 0x63636363UL; - rk[31] = 0x4b5f7777UL; + state[0] 
^= x0; + state[1] ^= x1; + state[2] ^= x2; + state[3] ^= x3; + rk[16] ^= rk[9]; + x0 = state[12] ^ rk[16]; + rk[17] ^= rk[10]; + x1 = state[13] ^ rk[17]; + rk[18] ^= rk[11]; + x2 = state[14] ^ rk[18]; + rk[19] ^= rk[12]; + x3 = state[15] ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[20] ^= rk[13]; + x0 ^= rk[20]; + rk[21] ^= rk[14]; + x1 ^= rk[21]; + rk[22] ^= rk[15]; + x2 ^= rk[22]; + rk[23] ^= rk[16]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[24] ^= rk[17]; + x0 ^= rk[24]; + rk[25] ^= rk[18]; + x1 ^= rk[25]; + rk[26] ^= rk[19]; + x2 ^= rk[26]; + rk[27] ^= rk[20]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[28] ^= rk[21]; + x0 ^= rk[28]; + rk[29] ^= rk[22]; + x1 ^= rk[29]; + rk[30] ^= rk[23]; + x2 ^= rk[30]; + rk[31] ^= rk[24]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + state[8] ^= x0; + state[9] ^= x1; + state[10] ^= x2; + state[11] ^= x3; + // 2 + KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); + rk[0] ^= rk[28]; + rk[1] ^= rk[29]; + rk[2] ^= rk[30]; + rk[3] ^= rk[31]; + x0 = state[0] ^ rk[0]; + x1 = state[1] ^ rk[1]; + x2 = state[2] ^ rk[2]; + x3 = state[3] ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); + rk[4] ^= rk[0]; + rk[5] ^= rk[1]; + rk[6] ^= rk[2]; + rk[7] ^= rk[3]; + rk[7] ^= ~512; + x0 ^= rk[4]; + x1 ^= rk[5]; + x2 ^= rk[6]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); + rk[8] ^= rk[4]; + rk[9] ^= rk[5]; + rk[10] ^= rk[6]; + rk[11] ^= rk[7]; + x0 ^= rk[8]; + x1 ^= rk[9]; + x2 ^= rk[10]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); + rk[12] ^= rk[8]; + rk[13] ^= rk[9]; + rk[14] ^= rk[10]; + rk[15] ^= rk[11]; + x0 ^= rk[12]; + x1 ^= rk[13]; + x2 ^= rk[14]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, 
x3); + state[12] ^= x0; + state[13] ^= x1; + state[14] ^= x2; + state[15] ^= x3; + KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); + rk[16] ^= rk[12]; + rk[17] ^= rk[13]; + rk[18] ^= rk[14]; + rk[19] ^= rk[15]; + x0 = state[8] ^ rk[16]; + x1 = state[9] ^ rk[17]; + x2 = state[10] ^ rk[18]; + x3 = state[11] ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); + rk[20] ^= rk[16]; + rk[21] ^= rk[17]; + rk[22] ^= rk[18]; + rk[23] ^= rk[19]; + x0 ^= rk[20]; + x1 ^= rk[21]; + x2 ^= rk[22]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); + rk[24] ^= rk[20]; + rk[25] ^= rk[21]; + rk[26] ^= rk[22]; + rk[27] ^= rk[23]; + x0 ^= rk[24]; + x1 ^= rk[25]; + x2 ^= rk[26]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); rk[28] ^= rk[24]; rk[29] ^= rk[25]; rk[30] ^= rk[26]; @@ -1507,19 +550,19 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t x2 ^= rk[30]; x3 ^= rk[31]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; + state[4] ^= x0; + state[5] ^= x1; + state[6] ^= x2; + state[7] ^= x3; rk[0] ^= rk[25]; - x0 = pC ^ rk[0]; + x0 = state[12] ^ rk[0]; rk[1] ^= rk[26]; - x1 = pD ^ rk[1]; + x1 = state[13] ^ rk[1]; rk[2] ^= rk[27]; - x2 = pE ^ rk[2]; + x2 = state[14] ^ rk[2]; rk[3] ^= rk[28]; - x3 = pF ^ rk[3]; + x3 = state[15] ^ rk[3]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); rk[4] ^= rk[29]; x0 ^= rk[4]; @@ -1548,18 +591,18 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t rk[15] ^= rk[8]; x3 ^= rk[15]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; + state[8] ^= x0; + state[9] ^= x1; + state[10] ^= x2; + state[11] ^= x3; rk[16] ^= rk[9]; - x0 = p4 ^ rk[16]; + x0 = state[4] ^ 
rk[16]; rk[17] ^= rk[10]; - x1 = p5 ^ rk[17]; + x1 = state[5] ^ rk[17]; rk[18] ^= rk[11]; - x2 = p6 ^ rk[18]; + x2 = state[6] ^ rk[18]; rk[19] ^= rk[12]; - x3 = p7 ^ rk[19]; + x3 = state[7] ^ rk[19]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); rk[20] ^= rk[13]; x0 ^= rk[20]; @@ -1588,20 +631,20 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t rk[31] ^= rk[24]; x3 ^= rk[31]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; + state[0] ^= x0; + state[1] ^= x1; + state[2] ^= x2; + state[3] ^= x3; /* round 3, 7, 11 */ KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); rk[0] ^= rk[28]; rk[1] ^= rk[29]; rk[2] ^= rk[30]; rk[3] ^= rk[31]; - x0 = p8 ^ rk[0]; - x1 = p9 ^ rk[1]; - x2 = pA ^ rk[2]; - x3 = pB ^ rk[3]; + x0 = state[8] ^ rk[0]; + x1 = state[9] ^ rk[1]; + x2 = state[10] ^ rk[2]; + x3 = state[11] ^ rk[3]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); rk[4] ^= rk[0]; @@ -1633,19 +676,19 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t x2 ^= rk[14]; x3 ^= rk[15]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; + state[4] ^= x0; + state[5] ^= x1; + state[6] ^= x2; + state[7] ^= x3; KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); rk[16] ^= rk[12]; rk[17] ^= rk[13]; rk[18] ^= rk[14]; rk[19] ^= rk[15]; - x0 = p0 ^ rk[16]; - x1 = p1 ^ rk[17]; - x2 = p2 ^ rk[18]; - x3 = p3 ^ rk[19]; + x0 = state[0] ^ rk[16]; + x1 = state[1] ^ rk[17]; + x2 = state[2] ^ rk[18]; + x3 = state[3] ^ rk[19]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); rk[20] ^= rk[16]; @@ -1677,19 +720,19 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t x2 ^= rk[30]; x3 ^= rk[31]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; 
+ state[12] ^= x0; + state[13] ^= x1; + state[14] ^= x2; + state[15] ^= x3; /* round 4, 8, 12 */ rk[0] ^= rk[25]; - x0 = p4 ^ rk[0]; + x0 = state[4] ^ rk[0]; rk[1] ^= rk[26]; - x1 = p5 ^ rk[1]; + x1 = state[5] ^ rk[1]; rk[2] ^= rk[27]; - x2 = p6 ^ rk[2]; + x2 = state[6] ^ rk[2]; rk[3] ^= rk[28]; - x3 = p7 ^ rk[3]; + x3 = state[7] ^ rk[3]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); rk[4] ^= rk[29]; x0 ^= rk[4]; @@ -1718,19 +761,18 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t rk[15] ^= rk[8]; x3 ^= rk[15]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; + state[0] ^= x0; + state[1] ^= x1; + state[2] ^= x2; + state[3] ^= x3; rk[16] ^= rk[9]; - x0 = pC ^ rk[16]; + x0 = state[12] ^ rk[16]; rk[17] ^= rk[10]; - x1 = pD ^ rk[17]; + x1 = state[13] ^ rk[17]; rk[18] ^= rk[11]; - x2 = pE ^ rk[18]; + x2 = state[14] ^ rk[18]; rk[19] ^= rk[12]; - x3 = pF ^ rk[19]; + x3 = state[15] ^ rk[19]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); rk[20] ^= rk[13]; x0 ^= rk[20]; @@ -1759,28 +801,27 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t rk[31] ^= rk[24]; x3 ^= rk[31]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; + state[8] ^= x0; + state[9] ^= x1; + state[10] ^= x2; + state[11] ^= x3; - // 2 + // 3 KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); rk[0] ^= rk[28]; rk[1] ^= rk[29]; rk[2] ^= rk[30]; rk[3] ^= rk[31]; - x0 = p0 ^ rk[0]; - x1 = p1 ^ rk[1]; - x2 = p2 ^ rk[2]; - x3 = p3 ^ rk[3]; + x0 = state[0] ^ rk[0]; + x1 = state[1] ^ rk[1]; + x2 = state[2] ^ rk[2]; + x3 = state[3] ^ rk[3]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); rk[4] ^= rk[0]; rk[5] ^= rk[1]; rk[6] ^= rk[2]; rk[7] ^= rk[3]; - rk[7] ^= ~512; x0 ^= rk[4]; x1 ^= rk[5]; x2 ^= rk[6]; @@ -1806,19 +847,19 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t 
startNounce, uint64_t x2 ^= rk[14]; x3 ^= rk[15]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; + state[12] ^= x0; + state[13] ^= x1; + state[14] ^= x2; + state[15] ^= x3; KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); rk[16] ^= rk[12]; rk[17] ^= rk[13]; rk[18] ^= rk[14]; rk[19] ^= rk[15]; - x0 = p8 ^ rk[16]; - x1 = p9 ^ rk[17]; - x2 = pA ^ rk[18]; - x3 = pB ^ rk[19]; + x0 = state[8] ^ rk[16]; + x1 = state[9] ^ rk[17]; + x2 = state[10] ^ rk[18]; + x3 = state[11] ^ rk[19]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); rk[20] ^= rk[16]; @@ -1844,25 +885,27 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t rk[28] ^= rk[24]; rk[29] ^= rk[25]; rk[30] ^= rk[26]; - rk[31] ^= rk[27]; + rk[31] ^= ~rk[27]; + rk[30] ^= 512; + // rk[31] ^= 0xFFFFFFFF; x0 ^= rk[28]; x1 ^= rk[29]; x2 ^= rk[30]; x3 ^= rk[31]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; + state[4] ^= x0; + state[5] ^= x1; + state[6] ^= x2; + state[7] ^= x3; rk[0] ^= rk[25]; - x0 = pC ^ rk[0]; + x0 = state[12] ^ rk[0]; rk[1] ^= rk[26]; - x1 = pD ^ rk[1]; + x1 = state[13] ^ rk[1]; rk[2] ^= rk[27]; - x2 = pE ^ rk[2]; + x2 = state[14] ^ rk[2]; rk[3] ^= rk[28]; - x3 = pF ^ rk[3]; + x3 = state[15] ^ rk[3]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); rk[4] ^= rk[29]; x0 ^= rk[4]; @@ -1891,18 +934,18 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t rk[15] ^= rk[8]; x3 ^= rk[15]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; + state[8] ^= x0; + state[9] ^= x1; + state[10] ^= x2; + state[11] ^= x3; rk[16] ^= rk[9]; - x0 = p4 ^ rk[16]; + x0 = state[4] ^ rk[16]; rk[17] ^= rk[10]; - x1 = p5 ^ rk[17]; + x1 = state[5] ^ rk[17]; rk[18] ^= rk[11]; - x2 = p6 ^ rk[18]; + x2 = state[6] ^ rk[18]; rk[19] ^= rk[12]; - x3 = p7 ^ rk[19]; + x3 = 
state[7] ^ rk[19]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); rk[20] ^= rk[13]; x0 ^= rk[20]; @@ -1931,20 +974,21 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t rk[31] ^= rk[24]; x3 ^= rk[31]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; + state[0] ^= x0; + state[1] ^= x1; + state[2] ^= x2; + state[3] ^= x3; + /* round 3, 7, 11 */ KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); rk[0] ^= rk[28]; rk[1] ^= rk[29]; rk[2] ^= rk[30]; rk[3] ^= rk[31]; - x0 = p8 ^ rk[0]; - x1 = p9 ^ rk[1]; - x2 = pA ^ rk[2]; - x3 = pB ^ rk[3]; + x0 = state[8] ^ rk[0]; + x1 = state[9] ^ rk[1]; + x2 = state[10] ^ rk[2]; + x3 = state[11] ^ rk[3]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); rk[4] ^= rk[0]; @@ -1976,19 +1020,19 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t x2 ^= rk[14]; x3 ^= rk[15]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; + state[4] ^= x0; + state[5] ^= x1; + state[6] ^= x2; + state[7] ^= x3; KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); rk[16] ^= rk[12]; rk[17] ^= rk[13]; rk[18] ^= rk[14]; rk[19] ^= rk[15]; - x0 = p0 ^ rk[16]; - x1 = p1 ^ rk[17]; - x2 = p2 ^ rk[18]; - x3 = p3 ^ rk[19]; + x0 = state[0] ^ rk[16]; + x1 = state[1] ^ rk[17]; + x2 = state[2] ^ rk[18]; + x3 = state[3] ^ rk[19]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); rk[20] ^= rk[16]; @@ -2020,19 +1064,19 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t x2 ^= rk[30]; x3 ^= rk[31]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; + state[12] ^= x0; + state[13] ^= x1; + state[14] ^= x2; + state[15] ^= x3; /* round 4, 8, 12 */ rk[0] ^= rk[25]; - x0 = p4 ^ rk[0]; + x0 = state[4] ^ rk[0]; rk[1] ^= rk[26]; - 
x1 = p5 ^ rk[1]; + x1 = state[5] ^ rk[1]; rk[2] ^= rk[27]; - x2 = p6 ^ rk[2]; + x2 = state[6] ^ rk[2]; rk[3] ^= rk[28]; - x3 = p7 ^ rk[3]; + x3 = state[7] ^ rk[3]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); rk[4] ^= rk[29]; x0 ^= rk[4]; @@ -2061,18 +1105,18 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t rk[15] ^= rk[8]; x3 ^= rk[15]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; + state[0] ^= x0; + state[1] ^= x1; + state[2] ^= x2; + state[3] ^= x3; rk[16] ^= rk[9]; - x0 = pC ^ rk[16]; + x0 = state[12] ^ rk[16]; rk[17] ^= rk[10]; - x1 = pD ^ rk[17]; + x1 = state[13] ^ rk[17]; rk[18] ^= rk[11]; - x2 = pE ^ rk[18]; + x2 = state[14] ^ rk[18]; rk[19] ^= rk[12]; - x3 = pF ^ rk[19]; + x3 = state[15] ^ rk[19]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); rk[20] ^= rk[13]; x0 ^= rk[20]; @@ -2101,21 +1145,21 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t rk[31] ^= rk[24]; x3 ^= rk[31]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; + state[8] ^= x0; + state[9] ^= x1; + state[10] ^= x2; + state[11] ^= x3; - // 3 + /* round 13 */ KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); rk[0] ^= rk[28]; rk[1] ^= rk[29]; rk[2] ^= rk[30]; rk[3] ^= rk[31]; - x0 = p0 ^ rk[0]; - x1 = p1 ^ rk[1]; - x2 = p2 ^ rk[2]; - x3 = p3 ^ rk[3]; + x0 = state[0] ^ rk[0]; + x1 = state[1] ^ rk[1]; + x2 = state[2] ^ rk[2]; + x3 = state[3] ^ rk[3]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); rk[4] ^= rk[0]; @@ -2147,19 +1191,19 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t x2 ^= rk[14]; x3 ^= rk[15]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; + state[12] ^= x0; + state[13] ^= x1; + state[14] ^= x2; + state[15] ^= x3; KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], 
rk[19]); rk[16] ^= rk[12]; rk[17] ^= rk[13]; rk[18] ^= rk[14]; rk[19] ^= rk[15]; - x0 = p8 ^ rk[16]; - x1 = p9 ^ rk[17]; - x2 = pA ^ rk[18]; - x3 = pB ^ rk[19]; + x0 = state[8] ^ rk[16]; + x1 = state[9] ^ rk[17]; + x2 = state[10] ^ rk[18]; + x3 = state[11] ^ rk[19]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); rk[20] ^= rk[16]; @@ -2173,9 +1217,9 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); rk[24] ^= rk[20]; - rk[25] ^= rk[21]; + rk[25] ^= rk[21] ^ 512; rk[26] ^= rk[22]; - rk[27] ^= rk[23]; + rk[27] ^= ~rk[23]; //^ 0xFFFFFFFF; x0 ^= rk[24]; x1 ^= rk[25]; x2 ^= rk[26]; @@ -2186,413 +1230,1279 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t rk[29] ^= rk[25]; rk[30] ^= rk[26]; rk[31] ^= rk[27]; - rk[30] ^= 512; - rk[31] ^= 0xFFFFFFFF; x0 ^= rk[28]; x1 ^= rk[29]; x2 ^= rk[30]; x3 ^= rk[31]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; + state[4] ^= x0; + state[5] ^= x1; + state[6] ^= x2; + state[7] ^= x3; + + Hash[0] = 0x72FCCDD8 ^ state[8]; + Hash[1] = 0x79CA4727 ^ state[9]; + Hash[2] = 0x128A077B ^ state[10]; + Hash[3] = 0x40D55AEC ^ state[11]; + Hash[4] = 0xD1901A06 ^ state[12]; + Hash[5] = 0x430AE307 ^ state[13]; + Hash[6] = 0xB29F5CD1 ^ state[14]; + Hash[7] = 0xDF07FBFC ^ state[15]; + Hash[8] = 0x8E45D73D ^ state[0]; + Hash[9] = 0x681AB538 ^ state[1]; + Hash[10] = 0xBDE86578 ^ state[2]; + Hash[11] = 0xDD577E47 ^ state[3]; + Hash[12] = 0xE275EADE ^ state[4]; + Hash[13] = 0x502D9FCD ^ state[5]; + Hash[14] = 0xB9357178 ^ state[6]; + Hash[15] = 0x022A4B9A ^ state[7]; + } +} + +__device__ __forceinline__ +static void c512(const uint32_t*const __restrict__ sharedMemory, uint32_t *const __restrict__ state, uint32_t *const __restrict__ msg) +{ + uint32_t p0, p1, p2, p3, p4, p5, 
p6, p7; + uint32_t p8, p9, pA, pB, pC, pD, pE, pF; + uint32_t x0, x1, x2, x3; + uint32_t rk[32]; + uint32_t i; + const uint32_t counter = 640; + + p0 = state[0x0]; + p1 = state[0x1]; + p2 = state[0x2]; + p3 = state[0x3]; + p4 = state[0x4]; + p5 = state[0x5]; + p6 = state[0x6]; + p7 = state[0x7]; + p8 = state[0x8]; + p9 = state[0x9]; + pA = state[0xA]; + pB = state[0xB]; + pC = state[0xC]; + pD = state[0xD]; + pE = state[0xE]; + pF = state[0xF]; + + x0 = p4; + x1 = p5; + x2 = p6; + x3 = p7; +#pragma unroll + for(i = 0; i<16; i += 4) + { + rk[i] = msg[i]; + x0 ^= msg[i]; + rk[i + 1] = msg[i + 1]; + x1 ^= msg[i + 1]; + rk[i + 2] = msg[i + 2]; + x2 ^= msg[i + 2]; + rk[i + 3] = msg[i + 3]; + x3 ^= msg[i + 3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + } + + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + x0 = pC; + x1 = pD; + x2 = pE; + x3 = pF; + +#pragma unroll + for(i = 16; i<32; i += 4) + { + rk[i] = msg[i]; + x0 ^= msg[i]; + rk[i + 1] = msg[i + 1]; + x1 ^= msg[i + 1]; + rk[i + 2] = msg[i + 2]; + x2 ^= msg[i + 2]; + rk[i + 3] = msg[i + 3]; + x3 ^= msg[i + 3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + } + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + + // 1 + KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); + + rk[0] ^= rk[28]; + rk[1] ^= rk[29]; + rk[2] ^= rk[30]; + rk[3] ^= ~rk[31]; + rk[0] ^= counter; + //rk[3] ^= 0xFFFFFFFF; + x0 = p0 ^ rk[0]; + x1 = p1 ^ rk[1]; + x2 = p2 ^ rk[2]; + x3 = p3 ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); + rk[4] ^= rk[0]; + rk[5] ^= rk[1]; + rk[6] ^= rk[2]; + rk[7] ^= rk[3]; + x0 ^= rk[4]; + x1 ^= rk[5]; + x2 ^= rk[6]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); + rk[8] ^= rk[4]; + rk[9] ^= rk[5]; + rk[10] ^= rk[6]; + rk[11] ^= rk[7]; + x0 ^= rk[8]; + x1 ^= rk[9]; + x2 ^= rk[10]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + 
KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); + rk[12] ^= rk[8]; + rk[13] ^= rk[9]; + rk[14] ^= rk[10]; + rk[15] ^= rk[11]; + x0 ^= rk[12]; + x1 ^= rk[13]; + x2 ^= rk[14]; + x3 ^= rk[15]; + + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + + KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); + rk[16] ^= rk[12]; + rk[17] ^= rk[13]; + rk[18] ^= rk[14]; + rk[19] ^= rk[15]; + x0 = p8 ^ rk[16]; + x1 = p9 ^ rk[17]; + x2 = pA ^ rk[18]; + x3 = pB ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); + rk[20] ^= rk[16]; + rk[21] ^= rk[17]; + rk[22] ^= rk[18]; + rk[23] ^= rk[19]; + x0 ^= rk[20]; + x1 ^= rk[21]; + x2 ^= rk[22]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); + rk[24] ^= rk[20]; + rk[25] ^= rk[21]; + rk[26] ^= rk[22]; + rk[27] ^= rk[23]; + x0 ^= rk[24]; + x1 ^= rk[25]; + x2 ^= rk[26]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); + rk[28] ^= rk[24]; + rk[29] ^= rk[25]; + rk[30] ^= rk[26]; + rk[31] ^= rk[27]; + x0 ^= rk[28]; + x1 ^= rk[29]; + x2 ^= rk[30]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + + rk[0] ^= rk[25]; + x0 = pC ^ rk[0]; + rk[1] ^= rk[26]; + x1 = pD ^ rk[1]; + rk[2] ^= rk[27]; + x2 = pE ^ rk[2]; + rk[3] ^= rk[28]; + x3 = pF ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[4] ^= rk[29]; + x0 ^= rk[4]; + rk[5] ^= rk[30]; + x1 ^= rk[5]; + rk[6] ^= rk[31]; + x2 ^= rk[6]; + rk[7] ^= rk[0]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[8] ^= rk[1]; + x0 ^= rk[8]; + rk[9] ^= rk[2]; + x1 ^= rk[9]; + rk[10] ^= rk[3]; + x2 ^= rk[10]; + rk[11] ^= rk[4]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[12] ^= rk[5]; + x0 ^= 
rk[12]; + rk[13] ^= rk[6]; + x1 ^= rk[13]; + rk[14] ^= rk[7]; + x2 ^= rk[14]; + rk[15] ^= rk[8]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + rk[16] ^= rk[9]; + x0 = p4 ^ rk[16]; + rk[17] ^= rk[10]; + x1 = p5 ^ rk[17]; + rk[18] ^= rk[11]; + x2 = p6 ^ rk[18]; + rk[19] ^= rk[12]; + x3 = p7 ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[20] ^= rk[13]; + x0 ^= rk[20]; + rk[21] ^= rk[14]; + x1 ^= rk[21]; + rk[22] ^= rk[15]; + x2 ^= rk[22]; + rk[23] ^= rk[16]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[24] ^= rk[17]; + x0 ^= rk[24]; + rk[25] ^= rk[18]; + x1 ^= rk[25]; + rk[26] ^= rk[19]; + x2 ^= rk[26]; + rk[27] ^= rk[20]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[28] ^= rk[21]; + x0 ^= rk[28]; + rk[29] ^= rk[22]; + x1 ^= rk[29]; + rk[30] ^= rk[23]; + x2 ^= rk[30]; + rk[31] ^= rk[24]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 3, 7, 11 */ + KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); + rk[0] ^= rk[28]; + rk[1] ^= rk[29]; + rk[2] ^= rk[30]; + rk[3] ^= rk[31]; + x0 = p8 ^ rk[0]; + x1 = p9 ^ rk[1]; + x2 = pA ^ rk[2]; + x3 = pB ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); + rk[4] ^= rk[0]; + rk[5] ^= rk[1]; + rk[6] ^= rk[2]; + rk[7] ^= rk[3]; + x0 ^= rk[4]; + x1 ^= rk[5]; + x2 ^= rk[6]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); + rk[8] ^= rk[4]; + rk[9] ^= rk[5]; + rk[10] ^= rk[6]; + rk[11] ^= rk[7]; + x0 ^= rk[8]; + x1 ^= rk[9]; + x2 ^= rk[10]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); + rk[12] ^= rk[8]; + rk[13] ^= rk[9]; + rk[14] ^= rk[10]; + rk[15] ^= rk[11]; + x0 ^= rk[12]; + x1 ^= rk[13]; + x2 ^= 
rk[14]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); + rk[16] ^= rk[12]; + rk[17] ^= rk[13]; + rk[18] ^= rk[14]; + rk[19] ^= rk[15]; + x0 = p0 ^ rk[16]; + x1 = p1 ^ rk[17]; + x2 = p2 ^ rk[18]; + x3 = p3 ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); + rk[20] ^= rk[16]; + rk[21] ^= rk[17]; + rk[22] ^= rk[18]; + rk[23] ^= rk[19]; + x0 ^= rk[20]; + x1 ^= rk[21]; + x2 ^= rk[22]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); + rk[24] ^= rk[20]; + rk[25] ^= rk[21]; + rk[26] ^= rk[22]; + rk[27] ^= rk[23]; + x0 ^= rk[24]; + x1 ^= rk[25]; + x2 ^= rk[26]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); + rk[28] ^= rk[24]; + rk[29] ^= rk[25]; + rk[30] ^= rk[26]; + rk[31] ^= rk[27]; + x0 ^= rk[28]; + x1 ^= rk[29]; + x2 ^= rk[30]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + /* round 4, 8, 12 */ + rk[0] ^= rk[25]; + x0 = p4 ^ rk[0]; + rk[1] ^= rk[26]; + x1 = p5 ^ rk[1]; + rk[2] ^= rk[27]; + x2 = p6 ^ rk[2]; + rk[3] ^= rk[28]; + x3 = p7 ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[4] ^= rk[29]; + x0 ^= rk[4]; + rk[5] ^= rk[30]; + x1 ^= rk[5]; + rk[6] ^= rk[31]; + x2 ^= rk[6]; + rk[7] ^= rk[0]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[8] ^= rk[1]; + x0 ^= rk[8]; + rk[9] ^= rk[2]; + x1 ^= rk[9]; + rk[10] ^= rk[3]; + x2 ^= rk[10]; + rk[11] ^= rk[4]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[12] ^= rk[5]; + x0 ^= rk[12]; + rk[13] ^= rk[6]; + x1 ^= rk[13]; + rk[14] ^= rk[7]; + x2 ^= rk[14]; + rk[15] ^= rk[8]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + + 
p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + rk[16] ^= rk[9]; + x0 = pC ^ rk[16]; + rk[17] ^= rk[10]; + x1 = pD ^ rk[17]; + rk[18] ^= rk[11]; + x2 = pE ^ rk[18]; + rk[19] ^= rk[12]; + x3 = pF ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[20] ^= rk[13]; + x0 ^= rk[20]; + rk[21] ^= rk[14]; + x1 ^= rk[21]; + rk[22] ^= rk[15]; + x2 ^= rk[22]; + rk[23] ^= rk[16]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[24] ^= rk[17]; + x0 ^= rk[24]; + rk[25] ^= rk[18]; + x1 ^= rk[25]; + rk[26] ^= rk[19]; + x2 ^= rk[26]; + rk[27] ^= rk[20]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[28] ^= rk[21]; + x0 ^= rk[28]; + rk[29] ^= rk[22]; + x1 ^= rk[29]; + rk[30] ^= rk[23]; + x2 ^= rk[30]; + rk[31] ^= rk[24]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + + // 2 + KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); + rk[0] ^= rk[28]; + rk[1] ^= rk[29]; + rk[2] ^= rk[30]; + rk[3] ^= rk[31]; + x0 = p0 ^ rk[0]; + x1 = p1 ^ rk[1]; + x2 = p2 ^ rk[2]; + x3 = p3 ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); + rk[4] ^= rk[0]; + rk[5] ^= rk[1]; + rk[6] ^= rk[2]; + rk[7] ^= rk[3]; + rk[7] ^= ~counter; + x0 ^= rk[4]; + x1 ^= rk[5]; + x2 ^= rk[6]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); + rk[8] ^= rk[4]; + rk[9] ^= rk[5]; + rk[10] ^= rk[6]; + rk[11] ^= rk[7]; + x0 ^= rk[8]; + x1 ^= rk[9]; + x2 ^= rk[10]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); + rk[12] ^= rk[8]; + rk[13] ^= rk[9]; + rk[14] ^= rk[10]; + rk[15] ^= rk[11]; + x0 ^= rk[12]; + x1 ^= rk[13]; + x2 ^= rk[14]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + KEY_EXPAND_ELT(sharedMemory, 
rk[16], rk[17], rk[18], rk[19]); + rk[16] ^= rk[12]; + rk[17] ^= rk[13]; + rk[18] ^= rk[14]; + rk[19] ^= rk[15]; + x0 = p8 ^ rk[16]; + x1 = p9 ^ rk[17]; + x2 = pA ^ rk[18]; + x3 = pB ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); + rk[20] ^= rk[16]; + rk[21] ^= rk[17]; + rk[22] ^= rk[18]; + rk[23] ^= rk[19]; + x0 ^= rk[20]; + x1 ^= rk[21]; + x2 ^= rk[22]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); + rk[24] ^= rk[20]; + rk[25] ^= rk[21]; + rk[26] ^= rk[22]; + rk[27] ^= rk[23]; + x0 ^= rk[24]; + x1 ^= rk[25]; + x2 ^= rk[26]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); + rk[28] ^= rk[24]; + rk[29] ^= rk[25]; + rk[30] ^= rk[26]; + rk[31] ^= rk[27]; + x0 ^= rk[28]; + x1 ^= rk[29]; + x2 ^= rk[30]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + + rk[0] ^= rk[25]; + x0 = pC ^ rk[0]; + rk[1] ^= rk[26]; + x1 = pD ^ rk[1]; + rk[2] ^= rk[27]; + x2 = pE ^ rk[2]; + rk[3] ^= rk[28]; + x3 = pF ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[4] ^= rk[29]; + x0 ^= rk[4]; + rk[5] ^= rk[30]; + x1 ^= rk[5]; + rk[6] ^= rk[31]; + x2 ^= rk[6]; + rk[7] ^= rk[0]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[8] ^= rk[1]; + x0 ^= rk[8]; + rk[9] ^= rk[2]; + x1 ^= rk[9]; + rk[10] ^= rk[3]; + x2 ^= rk[10]; + rk[11] ^= rk[4]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[12] ^= rk[5]; + x0 ^= rk[12]; + rk[13] ^= rk[6]; + x1 ^= rk[13]; + rk[14] ^= rk[7]; + x2 ^= rk[14]; + rk[15] ^= rk[8]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + rk[16] ^= rk[9]; + x0 = p4 ^ rk[16]; + rk[17] ^= rk[10]; + x1 = p5 ^ rk[17]; + rk[18] ^= rk[11]; + x2 = p6 ^ rk[18]; + rk[19] 
^= rk[12]; + x3 = p7 ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[20] ^= rk[13]; + x0 ^= rk[20]; + rk[21] ^= rk[14]; + x1 ^= rk[21]; + rk[22] ^= rk[15]; + x2 ^= rk[22]; + rk[23] ^= rk[16]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[24] ^= rk[17]; + x0 ^= rk[24]; + rk[25] ^= rk[18]; + x1 ^= rk[25]; + rk[26] ^= rk[19]; + x2 ^= rk[26]; + rk[27] ^= rk[20]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[28] ^= rk[21]; + x0 ^= rk[28]; + rk[29] ^= rk[22]; + x1 ^= rk[29]; + rk[30] ^= rk[23]; + x2 ^= rk[30]; + rk[31] ^= rk[24]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 3, 7, 11 */ + KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); + rk[0] ^= rk[28]; + rk[1] ^= rk[29]; + rk[2] ^= rk[30]; + rk[3] ^= rk[31]; + x0 = p8 ^ rk[0]; + x1 = p9 ^ rk[1]; + x2 = pA ^ rk[2]; + x3 = pB ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); + rk[4] ^= rk[0]; + rk[5] ^= rk[1]; + rk[6] ^= rk[2]; + rk[7] ^= rk[3]; + x0 ^= rk[4]; + x1 ^= rk[5]; + x2 ^= rk[6]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); + rk[8] ^= rk[4]; + rk[9] ^= rk[5]; + rk[10] ^= rk[6]; + rk[11] ^= rk[7]; + x0 ^= rk[8]; + x1 ^= rk[9]; + x2 ^= rk[10]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); + rk[12] ^= rk[8]; + rk[13] ^= rk[9]; + rk[14] ^= rk[10]; + rk[15] ^= rk[11]; + x0 ^= rk[12]; + x1 ^= rk[13]; + x2 ^= rk[14]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); + rk[16] ^= rk[12]; + rk[17] ^= rk[13]; + rk[18] ^= rk[14]; + rk[19] ^= rk[15]; + x0 = p0 ^ rk[16]; + x1 = p1 ^ rk[17]; + x2 = p2 ^ rk[18]; + x3 = p3 ^ 
rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); + rk[20] ^= rk[16]; + rk[21] ^= rk[17]; + rk[22] ^= rk[18]; + rk[23] ^= rk[19]; + x0 ^= rk[20]; + x1 ^= rk[21]; + x2 ^= rk[22]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); + rk[24] ^= rk[20]; + rk[25] ^= rk[21]; + rk[26] ^= rk[22]; + rk[27] ^= rk[23]; + x0 ^= rk[24]; + x1 ^= rk[25]; + x2 ^= rk[26]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); + rk[28] ^= rk[24]; + rk[29] ^= rk[25]; + rk[30] ^= rk[26]; + rk[31] ^= rk[27]; + x0 ^= rk[28]; + x1 ^= rk[29]; + x2 ^= rk[30]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + /* round 4, 8, 12 */ + rk[0] ^= rk[25]; + x0 = p4 ^ rk[0]; + rk[1] ^= rk[26]; + x1 = p5 ^ rk[1]; + rk[2] ^= rk[27]; + x2 = p6 ^ rk[2]; + rk[3] ^= rk[28]; + x3 = p7 ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[4] ^= rk[29]; + x0 ^= rk[4]; + rk[5] ^= rk[30]; + x1 ^= rk[5]; + rk[6] ^= rk[31]; + x2 ^= rk[6]; + rk[7] ^= rk[0]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[8] ^= rk[1]; + x0 ^= rk[8]; + rk[9] ^= rk[2]; + x1 ^= rk[9]; + rk[10] ^= rk[3]; + x2 ^= rk[10]; + rk[11] ^= rk[4]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[12] ^= rk[5]; + x0 ^= rk[12]; + rk[13] ^= rk[6]; + x1 ^= rk[13]; + rk[14] ^= rk[7]; + x2 ^= rk[14]; + rk[15] ^= rk[8]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + rk[16] ^= rk[9]; + x0 = pC ^ rk[16]; + rk[17] ^= rk[10]; + x1 = pD ^ rk[17]; + rk[18] ^= rk[11]; + x2 = pE ^ rk[18]; + rk[19] ^= rk[12]; + x3 = pF ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[20] ^= rk[13]; + x0 ^= rk[20]; + rk[21] ^= rk[14]; + x1 ^= rk[21]; + rk[22] ^= 
rk[15]; + x2 ^= rk[22]; + rk[23] ^= rk[16]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[24] ^= rk[17]; + x0 ^= rk[24]; + rk[25] ^= rk[18]; + x1 ^= rk[25]; + rk[26] ^= rk[19]; + x2 ^= rk[26]; + rk[27] ^= rk[20]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[28] ^= rk[21]; + x0 ^= rk[28]; + rk[29] ^= rk[22]; + x1 ^= rk[29]; + rk[30] ^= rk[23]; + x2 ^= rk[30]; + rk[31] ^= rk[24]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; - rk[0] ^= rk[25]; - x0 = pC ^ rk[0]; - rk[1] ^= rk[26]; - x1 = pD ^ rk[1]; - rk[2] ^= rk[27]; - x2 = pE ^ rk[2]; - rk[3] ^= rk[28]; - x3 = pF ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[4] ^= rk[29]; - x0 ^= rk[4]; - rk[5] ^= rk[30]; - x1 ^= rk[5]; - rk[6] ^= rk[31]; - x2 ^= rk[6]; - rk[7] ^= rk[0]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[8] ^= rk[1]; - x0 ^= rk[8]; - rk[9] ^= rk[2]; - x1 ^= rk[9]; - rk[10] ^= rk[3]; - x2 ^= rk[10]; - rk[11] ^= rk[4]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[12] ^= rk[5]; - x0 ^= rk[12]; - rk[13] ^= rk[6]; - x1 ^= rk[13]; - rk[14] ^= rk[7]; - x2 ^= rk[14]; - rk[15] ^= rk[8]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; - rk[16] ^= rk[9]; - x0 = p4 ^ rk[16]; - rk[17] ^= rk[10]; - x1 = p5 ^ rk[17]; - rk[18] ^= rk[11]; - x2 = p6 ^ rk[18]; - rk[19] ^= rk[12]; - x3 = p7 ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[20] ^= rk[13]; - x0 ^= rk[20]; - rk[21] ^= rk[14]; - x1 ^= rk[21]; - rk[22] ^= rk[15]; - x2 ^= rk[22]; - rk[23] ^= rk[16]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[24] ^= rk[17]; - x0 ^= rk[24]; - rk[25] ^= rk[18]; - x1 ^= rk[25]; - rk[26] ^= rk[19]; - x2 ^= rk[26]; - rk[27] ^= rk[20]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[28] ^= rk[21]; - x0 ^= rk[28]; - rk[29] ^= rk[22]; 
- x1 ^= rk[29]; - rk[30] ^= rk[23]; - x2 ^= rk[30]; - rk[31] ^= rk[24]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; + // 3 + KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); + rk[0] ^= rk[28]; + rk[1] ^= rk[29]; + rk[2] ^= rk[30]; + rk[3] ^= rk[31]; + x0 = p0 ^ rk[0]; + x1 = p1 ^ rk[1]; + x2 = p2 ^ rk[2]; + x3 = p3 ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); + rk[4] ^= rk[0]; + rk[5] ^= rk[1]; + rk[6] ^= rk[2]; + rk[7] ^= rk[3]; + x0 ^= rk[4]; + x1 ^= rk[5]; + x2 ^= rk[6]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); + rk[8] ^= rk[4]; + rk[9] ^= rk[5]; + rk[10] ^= rk[6]; + rk[11] ^= rk[7]; + x0 ^= rk[8]; + x1 ^= rk[9]; + x2 ^= rk[10]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); + rk[12] ^= rk[8]; + rk[13] ^= rk[9]; + rk[14] ^= rk[10]; + rk[15] ^= rk[11]; + x0 ^= rk[12]; + x1 ^= rk[13]; + x2 ^= rk[14]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); + rk[16] ^= rk[12]; + rk[17] ^= rk[13]; + rk[18] ^= rk[14]; + rk[19] ^= rk[15]; + x0 = p8 ^ rk[16]; + x1 = p9 ^ rk[17]; + x2 = pA ^ rk[18]; + x3 = pB ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); + rk[20] ^= rk[16]; + rk[21] ^= rk[17]; + rk[22] ^= rk[18]; + rk[23] ^= rk[19]; + x0 ^= rk[20]; + x1 ^= rk[21]; + x2 ^= rk[22]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); + rk[24] ^= rk[20]; + rk[25] ^= rk[21]; + rk[26] ^= rk[22]; + rk[27] ^= rk[23]; + x0 ^= rk[24]; + x1 ^= rk[25]; + x2 ^= rk[26]; + x3 ^= rk[27]; + 
AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); + rk[28] ^= rk[24]; + rk[29] ^= rk[25]; + rk[30] ^= rk[26]; + rk[31] ^= ~rk[27]; + rk[30] ^= counter; + //rk[31] ^= 0xFFFFFFFF; + x0 ^= rk[28]; + x1 ^= rk[29]; + x2 ^= rk[30]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + + rk[0] ^= rk[25]; + x0 = pC ^ rk[0]; + rk[1] ^= rk[26]; + x1 = pD ^ rk[1]; + rk[2] ^= rk[27]; + x2 = pE ^ rk[2]; + rk[3] ^= rk[28]; + x3 = pF ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[4] ^= rk[29]; + x0 ^= rk[4]; + rk[5] ^= rk[30]; + x1 ^= rk[5]; + rk[6] ^= rk[31]; + x2 ^= rk[6]; + rk[7] ^= rk[0]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[8] ^= rk[1]; + x0 ^= rk[8]; + rk[9] ^= rk[2]; + x1 ^= rk[9]; + rk[10] ^= rk[3]; + x2 ^= rk[10]; + rk[11] ^= rk[4]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[12] ^= rk[5]; + x0 ^= rk[12]; + rk[13] ^= rk[6]; + x1 ^= rk[13]; + rk[14] ^= rk[7]; + x2 ^= rk[14]; + rk[15] ^= rk[8]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + rk[16] ^= rk[9]; + x0 = p4 ^ rk[16]; + rk[17] ^= rk[10]; + x1 = p5 ^ rk[17]; + rk[18] ^= rk[11]; + x2 = p6 ^ rk[18]; + rk[19] ^= rk[12]; + x3 = p7 ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[20] ^= rk[13]; + x0 ^= rk[20]; + rk[21] ^= rk[14]; + x1 ^= rk[21]; + rk[22] ^= rk[15]; + x2 ^= rk[22]; + rk[23] ^= rk[16]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[24] ^= rk[17]; + x0 ^= rk[24]; + rk[25] ^= rk[18]; + x1 ^= rk[25]; + rk[26] ^= rk[19]; + x2 ^= rk[26]; + rk[27] ^= rk[20]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[28] ^= rk[21]; + x0 ^= rk[28]; + rk[29] ^= rk[22]; + x1 ^= rk[29]; + rk[30] ^= rk[23]; + x2 ^= rk[30]; + rk[31] ^= rk[24]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + 
p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; - /* round 3, 7, 11 */ - KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); - rk[0] ^= rk[28]; - rk[1] ^= rk[29]; - rk[2] ^= rk[30]; - rk[3] ^= rk[31]; - x0 = p8 ^ rk[0]; - x1 = p9 ^ rk[1]; - x2 = pA ^ rk[2]; - x3 = pB ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); - rk[4] ^= rk[0]; - rk[5] ^= rk[1]; - rk[6] ^= rk[2]; - rk[7] ^= rk[3]; - x0 ^= rk[4]; - x1 ^= rk[5]; - x2 ^= rk[6]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); - rk[8] ^= rk[4]; - rk[9] ^= rk[5]; - rk[10] ^= rk[6]; - rk[11] ^= rk[7]; - x0 ^= rk[8]; - x1 ^= rk[9]; - x2 ^= rk[10]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); - rk[12] ^= rk[8]; - rk[13] ^= rk[9]; - rk[14] ^= rk[10]; - rk[15] ^= rk[11]; - x0 ^= rk[12]; - x1 ^= rk[13]; - x2 ^= rk[14]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; - KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); - rk[16] ^= rk[12]; - rk[17] ^= rk[13]; - rk[18] ^= rk[14]; - rk[19] ^= rk[15]; - x0 = p0 ^ rk[16]; - x1 = p1 ^ rk[17]; - x2 = p2 ^ rk[18]; - x3 = p3 ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); - rk[20] ^= rk[16]; - rk[21] ^= rk[17]; - rk[22] ^= rk[18]; - rk[23] ^= rk[19]; - x0 ^= rk[20]; - x1 ^= rk[21]; - x2 ^= rk[22]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); - rk[24] ^= rk[20]; - rk[25] ^= rk[21]; - rk[26] ^= rk[22]; - rk[27] ^= rk[23]; - x0 ^= rk[24]; - x1 ^= rk[25]; - x2 ^= rk[26]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); - rk[28] ^= rk[24]; - 
rk[29] ^= rk[25]; - rk[30] ^= rk[26]; - rk[31] ^= rk[27]; - x0 ^= rk[28]; - x1 ^= rk[29]; - x2 ^= rk[30]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; - /* round 4, 8, 12 */ - rk[0] ^= rk[25]; - x0 = p4 ^ rk[0]; - rk[1] ^= rk[26]; - x1 = p5 ^ rk[1]; - rk[2] ^= rk[27]; - x2 = p6 ^ rk[2]; - rk[3] ^= rk[28]; - x3 = p7 ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[4] ^= rk[29]; - x0 ^= rk[4]; - rk[5] ^= rk[30]; - x1 ^= rk[5]; - rk[6] ^= rk[31]; - x2 ^= rk[6]; - rk[7] ^= rk[0]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[8] ^= rk[1]; - x0 ^= rk[8]; - rk[9] ^= rk[2]; - x1 ^= rk[9]; - rk[10] ^= rk[3]; - x2 ^= rk[10]; - rk[11] ^= rk[4]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[12] ^= rk[5]; - x0 ^= rk[12]; - rk[13] ^= rk[6]; - x1 ^= rk[13]; - rk[14] ^= rk[7]; - x2 ^= rk[14]; - rk[15] ^= rk[8]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; - rk[16] ^= rk[9]; - x0 = pC ^ rk[16]; - rk[17] ^= rk[10]; - x1 = pD ^ rk[17]; - rk[18] ^= rk[11]; - x2 = pE ^ rk[18]; - rk[19] ^= rk[12]; - x3 = pF ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[20] ^= rk[13]; - x0 ^= rk[20]; - rk[21] ^= rk[14]; - x1 ^= rk[21]; - rk[22] ^= rk[15]; - x2 ^= rk[22]; - rk[23] ^= rk[16]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[24] ^= rk[17]; - x0 ^= rk[24]; - rk[25] ^= rk[18]; - x1 ^= rk[25]; - rk[26] ^= rk[19]; - x2 ^= rk[26]; - rk[27] ^= rk[20]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[28] ^= rk[21]; - x0 ^= rk[28]; - rk[29] ^= rk[22]; - x1 ^= rk[29]; - rk[30] ^= rk[23]; - x2 ^= rk[30]; - rk[31] ^= rk[24]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; - /* round 13 */ - KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); - rk[0] ^= rk[28]; - rk[1] ^= rk[29]; 
- rk[2] ^= rk[30]; - rk[3] ^= rk[31]; - x0 = p0 ^ rk[0]; - x1 = p1 ^ rk[1]; - x2 = p2 ^ rk[2]; - x3 = p3 ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); - rk[4] ^= rk[0]; - rk[5] ^= rk[1]; - rk[6] ^= rk[2]; - rk[7] ^= rk[3]; - x0 ^= rk[4]; - x1 ^= rk[5]; - x2 ^= rk[6]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); - rk[8] ^= rk[4]; - rk[9] ^= rk[5]; - rk[10] ^= rk[6]; - rk[11] ^= rk[7]; - x0 ^= rk[8]; - x1 ^= rk[9]; - x2 ^= rk[10]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); - rk[12] ^= rk[8]; - rk[13] ^= rk[9]; - rk[14] ^= rk[10]; - rk[15] ^= rk[11]; - x0 ^= rk[12]; - x1 ^= rk[13]; - x2 ^= rk[14]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; - KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); - rk[16] ^= rk[12]; - rk[17] ^= rk[13]; - rk[18] ^= rk[14]; - rk[19] ^= rk[15]; - x0 = p8 ^ rk[16]; - x1 = p9 ^ rk[17]; - x2 = pA ^ rk[18]; - x3 = pB ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); - rk[20] ^= rk[16]; - rk[21] ^= rk[17]; - rk[22] ^= rk[18]; - rk[23] ^= rk[19]; - x0 ^= rk[20]; - x1 ^= rk[21]; - x2 ^= rk[22]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); - rk[24] ^= rk[20]; - rk[25] ^= rk[21] ^ 512; - rk[26] ^= rk[22]; - rk[27] ^= ~rk[23]; //^ 0xFFFFFFFF; - x0 ^= rk[24]; - x1 ^= rk[25]; - x2 ^= rk[26]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); - rk[28] ^= rk[24]; - rk[29] ^= rk[25]; - rk[30] ^= rk[26]; - rk[31] ^= rk[27]; - x0 ^= rk[28]; - x1 ^= rk[29]; - x2 ^= rk[30]; - x3 ^= rk[31]; - 
AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; - Hash[0]=state[0x0] ^ p8; - Hash[1]=state[0x1] ^ p9; - Hash[2]= state[0x2] ^ pA; - Hash[3] = state[0x3] ^ pB; - Hash[4] = state[0x4] ^ pC; - Hash[5] = state[0x5] ^ pD; - Hash[6] = state[0x6] ^ pE; - Hash[7] = state[0x7] ^ pF; - Hash[8] = state[0x8] ^ p0; - Hash[9] = state[0x9] ^ p1; - Hash[10] = state[0xA] ^ p2; - Hash[11] = state[0xB] ^ p3; - Hash[12] = state[0xC] ^ p4; - Hash[13] = state[0xD] ^ p5; - Hash[14] = state[0xE] ^ p6; - Hash[15] = state[0xF] ^ p7; - } + /* round 3, 7, 11 */ + KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); + rk[0] ^= rk[28]; + rk[1] ^= rk[29]; + rk[2] ^= rk[30]; + rk[3] ^= rk[31]; + x0 = p8 ^ rk[0]; + x1 = p9 ^ rk[1]; + x2 = pA ^ rk[2]; + x3 = pB ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); + rk[4] ^= rk[0]; + rk[5] ^= rk[1]; + rk[6] ^= rk[2]; + rk[7] ^= rk[3]; + x0 ^= rk[4]; + x1 ^= rk[5]; + x2 ^= rk[6]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); + rk[8] ^= rk[4]; + rk[9] ^= rk[5]; + rk[10] ^= rk[6]; + rk[11] ^= rk[7]; + x0 ^= rk[8]; + x1 ^= rk[9]; + x2 ^= rk[10]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); + rk[12] ^= rk[8]; + rk[13] ^= rk[9]; + rk[14] ^= rk[10]; + rk[15] ^= rk[11]; + x0 ^= rk[12]; + x1 ^= rk[13]; + x2 ^= rk[14]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); + rk[16] ^= rk[12]; + rk[17] ^= rk[13]; + rk[18] ^= rk[14]; + rk[19] ^= rk[15]; + x0 = p0 ^ rk[16]; + x1 = p1 ^ rk[17]; + x2 = p2 ^ rk[18]; + x3 = p3 ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); + rk[20] ^= rk[16]; + 
rk[21] ^= rk[17]; + rk[22] ^= rk[18]; + rk[23] ^= rk[19]; + x0 ^= rk[20]; + x1 ^= rk[21]; + x2 ^= rk[22]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); + rk[24] ^= rk[20]; + rk[25] ^= rk[21]; + rk[26] ^= rk[22]; + rk[27] ^= rk[23]; + x0 ^= rk[24]; + x1 ^= rk[25]; + x2 ^= rk[26]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); + rk[28] ^= rk[24]; + rk[29] ^= rk[25]; + rk[30] ^= rk[26]; + rk[31] ^= rk[27]; + x0 ^= rk[28]; + x1 ^= rk[29]; + x2 ^= rk[30]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + /* round 4, 8, 12 */ + rk[0] ^= rk[25]; + x0 = p4 ^ rk[0]; + rk[1] ^= rk[26]; + x1 = p5 ^ rk[1]; + rk[2] ^= rk[27]; + x2 = p6 ^ rk[2]; + rk[3] ^= rk[28]; + x3 = p7 ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[4] ^= rk[29]; + x0 ^= rk[4]; + rk[5] ^= rk[30]; + x1 ^= rk[5]; + rk[6] ^= rk[31]; + x2 ^= rk[6]; + rk[7] ^= rk[0]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[8] ^= rk[1]; + x0 ^= rk[8]; + rk[9] ^= rk[2]; + x1 ^= rk[9]; + rk[10] ^= rk[3]; + x2 ^= rk[10]; + rk[11] ^= rk[4]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[12] ^= rk[5]; + x0 ^= rk[12]; + rk[13] ^= rk[6]; + x1 ^= rk[13]; + rk[14] ^= rk[7]; + x2 ^= rk[14]; + rk[15] ^= rk[8]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + rk[16] ^= rk[9]; + x0 = pC ^ rk[16]; + rk[17] ^= rk[10]; + x1 = pD ^ rk[17]; + rk[18] ^= rk[11]; + x2 = pE ^ rk[18]; + rk[19] ^= rk[12]; + x3 = pF ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[20] ^= rk[13]; + x0 ^= rk[20]; + rk[21] ^= rk[14]; + x1 ^= rk[21]; + rk[22] ^= rk[15]; + x2 ^= rk[22]; + rk[23] ^= rk[16]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[24] ^= rk[17]; + x0 ^= rk[24]; 
+ rk[25] ^= rk[18]; + x1 ^= rk[25]; + rk[26] ^= rk[19]; + x2 ^= rk[26]; + rk[27] ^= rk[20]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[28] ^= rk[21]; + x0 ^= rk[28]; + rk[29] ^= rk[22]; + x1 ^= rk[29]; + rk[30] ^= rk[23]; + x2 ^= rk[30]; + rk[31] ^= rk[24]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + + /* round 13 */ + KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); + rk[0] ^= rk[28]; + rk[1] ^= rk[29]; + rk[2] ^= rk[30]; + rk[3] ^= rk[31]; + x0 = p0 ^ rk[0]; + x1 = p1 ^ rk[1]; + x2 = p2 ^ rk[2]; + x3 = p3 ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); + rk[4] ^= rk[0]; + rk[5] ^= rk[1]; + rk[6] ^= rk[2]; + rk[7] ^= rk[3]; + x0 ^= rk[4]; + x1 ^= rk[5]; + x2 ^= rk[6]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); + rk[8] ^= rk[4]; + rk[9] ^= rk[5]; + rk[10] ^= rk[6]; + rk[11] ^= rk[7]; + x0 ^= rk[8]; + x1 ^= rk[9]; + x2 ^= rk[10]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); + rk[12] ^= rk[8]; + rk[13] ^= rk[9]; + rk[14] ^= rk[10]; + rk[15] ^= rk[11]; + x0 ^= rk[12]; + x1 ^= rk[13]; + x2 ^= rk[14]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); + rk[16] ^= rk[12]; + rk[17] ^= rk[13]; + rk[18] ^= rk[14]; + rk[19] ^= rk[15]; + x0 = p8 ^ rk[16]; + x1 = p9 ^ rk[17]; + x2 = pA ^ rk[18]; + x3 = pB ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); + rk[20] ^= rk[16]; + rk[21] ^= rk[17]; + rk[22] ^= rk[18]; + rk[23] ^= rk[19]; + x0 ^= rk[20]; + x1 ^= rk[21]; + x2 ^= rk[22]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); 
+ KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); + rk[24] ^= rk[20]; + rk[25] ^= rk[21] ^ counter; + rk[26] ^= rk[22]; + rk[27] ^= ~rk[23]; //^ 0xFFFFFFFF; + x0 ^= rk[24]; + x1 ^= rk[25]; + x2 ^= rk[26]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); + rk[28] ^= rk[24]; + rk[29] ^= rk[25]; + rk[30] ^= rk[26]; + rk[31] ^= rk[27]; + x0 ^= rk[28]; + x1 ^= rk[29]; + x2 ^= rk[30]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + state[0x0] ^= p8; + state[0x1] ^= p9; + state[0x2] ^= pA; + state[0x3] ^= pB; + state[0x4] ^= pC; + state[0x5] ^= pD; + state[0x6] ^= pE; + state[0x7] ^= pF; + state[0x8] ^= p0; + state[0x9] ^= p1; + state[0xA] ^= p2; + state[0xB] ^= p3; + state[0xC] ^= p4; + state[0xD] ^= p5; + state[0xE] ^= p6; + state[0xF] ^= p7; } - -__global__ __launch_bounds__(TPB, 8) +__global__ __launch_bounds__(TPB, 3) void x11_shavite512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash) { __shared__ uint32_t sharedMemory[1024]; - if (threadIdx.x < 128) { + if(threadIdx.x < 256) + { sharedMemory[threadIdx.x] = d_AES0[threadIdx.x]; - sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x]; - sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x]; - sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x]; - - sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2]; - sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2]; - sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2]; - sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2]; + sharedMemory[threadIdx.x + 256] = ROTL32(sharedMemory[threadIdx.x], 8); + sharedMemory[threadIdx.x + 512] = ROTL32(sharedMemory[threadIdx.x], 16); + sharedMemory[threadIdx.x + 768] = ROTL32(sharedMemory[threadIdx.x], 24); } - - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < 
threads) + __syncthreads(); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if(thread < threads) { const uint32_t nounce = startNounce + thread; // kopiere init-state uint32_t state[16] = { - SPH_C32(0x72FCCDD8), SPH_C32(0x79CA4727), SPH_C32(0x128A077B), SPH_C32(0x40D55AEC), - SPH_C32(0xD1901A06), SPH_C32(0x430AE307), SPH_C32(0xB29F5CD1), SPH_C32(0xDF07FBFC), - SPH_C32(0x8E45D73D), SPH_C32(0x681AB538), SPH_C32(0xBDE86578), SPH_C32(0xDD577E47), - SPH_C32(0xE275EADE), SPH_C32(0x502D9FCD), SPH_C32(0xB9357178), SPH_C32(0x022A4B9A) + 0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC, + 0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC, + 0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47, + 0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A }; uint32_t msg[32]; - #pragma unroll 32 - for(int i=0;i<32;i++) { +#pragma unroll + for(int i = 0; i<31; i++) + { msg[i] = c_PaddedMessage80[i]; } msg[19] = cuda_swab32(nounce); @@ -2600,37 +2510,39 @@ void x11_shavite512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *ou msg[27] = 0x2800000; msg[31] = 0x2000000; - c512(sharedMemory, state, msg, 640); + c512(sharedMemory, state, msg); uint32_t *outHash = (uint32_t *)outputHash + 16 * thread; - #pragma unroll 16 - for(int i=0;i<16;i++) +#pragma unroll 16 + for(int i = 0; i<16; i++) outHash[i] = state[i]; } //thread < threads } -__host__ void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) { // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + TPB-1)/TPB); dim3 block(TPB); - x11_shavite512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x11_shavite512_gpu_hash_64<<>>(threads, d_hash); + + CUDA_SAFE_CALL(cudaGetLastError()); } -__host__ void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t 
*d_outputHash, int order) +__host__ void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash) { // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + TPB-1)/TPB); dim3 block(TPB); - x11_shavite512_gpu_hash_80<<>>(threads, startNounce, d_outputHash); + x11_shavite512_gpu_hash_80<<>>(threads, startNounce, d_outputHash); } -__host__ void x11_shavite512_setBlock_80(void *pdata) +__host__ void x11_shavite512_setBlock_80(int thr_id, void *pdata) { // Message mit Padding bereitstellen // lediglich die korrekte Nonce ist noch ab Byte 76 einzusetzen. @@ -2638,6 +2550,6 @@ __host__ void x11_shavite512_setBlock_80(void *pdata) memcpy(PaddedMessage, pdata, 80); memset(PaddedMessage+80, 0, 48); - cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 32*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbolAsync(c_PaddedMessage80, PaddedMessage, 32 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id]); } diff --git a/x11/cuda_x11_simd512.cu b/x11/cuda_x11_simd512.cu index 17a24ff616..5b85c3f514 100644 --- a/x11/cuda_x11_simd512.cu +++ b/x11/cuda_x11_simd512.cu @@ -5,17 +5,19 @@ // // STEP8_IF and STEP8_MAJ beinhalten je 2x 8-fach parallel Operations -#define TPB 64 - +#define TPB 256 #include "cuda_helper.h" +#include "cuda_vector.h" #include -uint32_t *d_state[MAX_GPUS]; -uint4 *d_temp4[MAX_GPUS]; +static uint32_t *d_state[MAX_GPUS]; +static uint4 *d_temp4[MAX_GPUS]; +#if __CUDA_ARCH__ < 320 // texture bound to d_temp4[thr_id], for read access in Compaction kernel texture texRef1D_128; +#endif __constant__ uint8_t c_perm0[8] = { 2, 3, 6, 7, 0, 1, 4, 5 }; __constant__ uint8_t c_perm1[8] = { 6, 7, 2, 3, 4, 5, 0, 1 }; @@ -26,14 +28,6 @@ __constant__ uint8_t c_perm5[8] = { 6, 7, 2, 3, 0, 1, 4, 5 }; __constant__ uint8_t c_perm6[8] = { 6, 7, 0, 1, 4, 5, 2, 3 }; __constant__ uint8_t c_perm7[8] = { 4, 5, 2, 3, 6, 7, 0, 1 }; - -__constant__ uint32_t c_IV_512[32] = { - 0x0ba16b95, 0x72f999ad, 
0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558, - 0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f, 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e, - 0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257, - 0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 -}; - __constant__ short c_FFT128_8_16_Twiddle[128] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30, @@ -104,25 +98,21 @@ void FFT_8(int *y, int stripe) { * Unrolled decimation in frequency (DIF) radix-2 NTT. * Output data is in revbin_permuted order. */ - + uint32_t u, v; #define X(i) y[stripe*i] #define DO_REDUCE(i) \ X(i) = REDUCE(X(i)) #define DO_REDUCE_FULL_S(i) \ -do { \ X(i) = REDUCE(X(i)); \ - X(i) = EXTRA_REDUCE_S(X(i)); \ -} while(0) + X(i) = EXTRA_REDUCE_S(X(i)); #define BUTTERFLY(i,j,n) \ -do { \ - int u= X(i); \ - int v= X(j); \ + u= y[stripe*i]; \ + v= y[stripe*j]; \ X(i) = u+v; \ - X(j) = (u-v) << (2*n); \ -} while(0) + X(j) = (u-v) << (2*n); BUTTERFLY(0, 4, 0); BUTTERFLY(1, 5, 1); @@ -167,11 +157,8 @@ __device__ __forceinline__ void FFT_16(int *y) { * Output data is in revbin_permuted order. 
*/ #define DO_REDUCE_FULL_S(i) \ - do { \ y[i] = REDUCE(y[i]); \ - y[i] = EXTRA_REDUCE_S(y[i]); \ - } while(0) - + y[i] = EXTRA_REDUCE_S(y[i]); int u,v; // BUTTERFLY(0, 8, 0); @@ -283,7 +270,7 @@ void FFT_128_full(int *y) for (i=0; i<16; i++) /*if (i & 7)*/ y[i] = REDUCE(y[i]*c_FFT128_8_16_Twiddle[i*8+(threadIdx.x&7)]); -#pragma unroll 8 +//#pragma unroll 8 for (i=0; i<16; i+=2) FFT_16(y+i); // eight sequential FFT16's, each one executed in parallel by 8 threads } @@ -327,14 +314,197 @@ void Expansion(const uint32_t *const __restrict__ data, uint4 *const __restrict_ int expanded[32]; #pragma unroll 4 for (int i=0; i < 4; i++) { - expanded[ i] = __byte_perm(__shfl((int)data[0], 2*i, 8), __shfl((int)data[0], (2*i)+1, 8), threadIdx.x&7)&0xff; - expanded[4+i] = __byte_perm(__shfl((int)data[1], 2*i, 8), __shfl((int)data[1], (2*i)+1, 8), threadIdx.x&7)&0xff; + expanded[i] = __byte_perm(__shfl((int)data[0], 2 * i, 8), __shfl((int)data[0], (2 * i) + 1, 8), threadIdx.x & 7) & 0xff; + expanded[4 + i] = __byte_perm(__shfl((int)data[1], 2 * i, 8), __shfl((int)data[1], (2 * i) + 1, 8), threadIdx.x & 7) & 0xff; } -#pragma unroll 8 - for (int i=8; i < 16; i++) - expanded[i] = 0; - FFT_256_halfzero(expanded); + expanded[9] = 0; + expanded[11] = 0; + expanded[13] = 0; + expanded[15] = 0; + +// FFT_256_halfzero(expanded); + + /* + * FFT_256 using w=41 as 256th root of unity. + * Decimation in frequency (DIF) NTT. + * Output data is in revbin_permuted order. + * In place. 
+ */ +// const int tmp = expanded[15]; + + #pragma unroll 8 + for (int i = 0; i<8; i++) + expanded[16 + i] = REDUCE(expanded[i] * c_FFT256_2_128_Twiddle[8 * i + (threadIdx.x & 7)]); + + +//#pragma unroll 8 +// for (int i = 24; i<32; i++) +// expanded[i] = 0; + expanded[9+16] = 0; + expanded[11 + 16] = 0; + expanded[13 + 16] = 0; + expanded[15 + 16] = 0; + + /* handle X^255 with an additional butterfly */ + if ((threadIdx.x & 7) == 7) + { + expanded[15] = 1; + expanded[31] = 0x0100 * 94; + } + + // FFT_128_full(expanded); + + int i; + uint32_t u, v; + +#define DO_REDUCE(i) \ + expanded[2*i] = REDUCE(expanded[2*i]) + +#define DO_REDUCE_FULL_S(i) \ + expanded[2*i] = REDUCE(expanded[2*i]); \ + expanded[2*i] = EXTRA_REDUCE_S(expanded[2*i]); + +#define BUTTERFLY(i,j,n) \ + u= expanded[2*i]; \ + v= expanded[2*j]; \ + expanded[2*i] = u+v; \ + expanded[2*j] = (u-v) << (2*n); + +// BUTTERFLY(0, 4, 0); //0 8 0 + expanded[2 * 4] = expanded[2 * 0]; + +// BUTTERFLY(1, 5, 1); //2 10 2 + u = expanded[2 * 1]; + expanded[2 * 5] = (u ) << (2 * 1); + +// BUTTERFLY(2, 6, 2); //4 12 4 + u = expanded[2 * 2]; + expanded[2 * 6] = (u) << (2 * 2); + +// BUTTERFLY(3, 7, 3); //6 14 6 + u = expanded[2 * 3]; + expanded[2 * 7] = (u) << (2 * 3); + + expanded[2 * 6] = REDUCE(expanded[2 * 6]); + expanded[2 * 7] = REDUCE(expanded[2 * 7]); + + BUTTERFLY(0, 2, 0); + BUTTERFLY(4, 6, 0); + BUTTERFLY(1, 3, 2); + BUTTERFLY(5, 7, 2); + + DO_REDUCE(7); + + BUTTERFLY(0, 1, 0); + BUTTERFLY(2, 3, 0); + BUTTERFLY(4, 5, 0); + BUTTERFLY(6, 7, 0); + + DO_REDUCE_FULL_S(0); + DO_REDUCE_FULL_S(1); + DO_REDUCE_FULL_S(2); + DO_REDUCE_FULL_S(3); + DO_REDUCE_FULL_S(4); + DO_REDUCE_FULL_S(5); + DO_REDUCE_FULL_S(6); + DO_REDUCE_FULL_S(7); + +#undef X +#undef DO_REDUCE +#undef DO_REDUCE_FULL_S +#undef BUTTERFLY + +// FFT_8(expanded + 0, 2); // eight parallel FFT8's + + FFT_8(expanded + 1, 2); // eight parallel FFT8's + + expanded[0] = REDUCE(expanded[0]); + expanded[1] = REDUCE(expanded[1]); +#pragma unroll + for (i = 2; 
i<16; i++) + expanded[i] = REDUCE(expanded[i] * c_FFT128_8_16_Twiddle[i * 8 + (threadIdx.x & 7)]); + + //#pragma unroll 8 + for (i = 0; i<16; i += 2) + FFT_16(expanded + i); // eight sequential FFT16's, each one executed in parallel by 8 threads + + + +// FFT_128_full(expanded + 16); + +#define DO_REDUCE(i) \ + expanded[2*i+ 16] = REDUCE(expanded[2*i+ 16]) + +#define DO_REDUCE_FULL_S(i) \ + expanded[2*i+ 16] = REDUCE(expanded[2*i+ 16]); \ + expanded[2*i+ 16] = EXTRA_REDUCE_S(expanded[2*i+ 16]); + +#define BUTTERFLY(i,j,n) \ + u= expanded[2*i+ 16]; \ + v= expanded[2*j+ 16]; \ + expanded[2*i+ 16] = u+v; \ + expanded[2*j+ 16] = (u-v) << (2*n); + + // BUTTERFLY(0, 4, 0); //0 8 0 + expanded[2 * 4 + 16] = expanded[2 * 0 + 16]; + + // BUTTERFLY(1, 5, 1); //2 10 2 + u = expanded[2 * 1 + 16]; + expanded[2 * 5 + 16] = (u) << (2 * 1); + + // BUTTERFLY(2, 6, 2); //4 12 4 + u = expanded[2 * 2 + 16]; + expanded[2 * 6 + 16] = (u) << (2 * 2); + + // BUTTERFLY(3, 7, 3); //6 14 6 + u = expanded[2 * 3 + 16]; + expanded[2 * 7 + 16] = (u) << (2 * 3); + + expanded[2 * 6 + 16] = REDUCE(expanded[2 * 6 + 16]); + expanded[2 * 7 + 16] = REDUCE(expanded[2 * 7 + 16]); + + BUTTERFLY(0, 2, 0); + BUTTERFLY(4, 6, 0); + BUTTERFLY(1, 3, 2); + BUTTERFLY(5, 7, 2); + + DO_REDUCE(7); + + BUTTERFLY(0, 1, 0); + BUTTERFLY(2, 3, 0); + BUTTERFLY(4, 5, 0); + BUTTERFLY(6, 7, 0); + + DO_REDUCE_FULL_S(0); + DO_REDUCE_FULL_S(1); + DO_REDUCE_FULL_S(2); + DO_REDUCE_FULL_S(3); + DO_REDUCE_FULL_S(4); + DO_REDUCE_FULL_S(5); + DO_REDUCE_FULL_S(6); + DO_REDUCE_FULL_S(7); + +#undef X +#undef DO_REDUCE +#undef DO_REDUCE_FULL_S +#undef BUTTERFLY + + // FFT_8(expanded + 0, 2); // eight parallel FFT8's + + + FFT_8(expanded + 1 + 16, 2); // eight parallel FFT8's + + expanded[0 + 16] = REDUCE(expanded[0 + 16]); + expanded[1 + 16] = REDUCE(expanded[1 + 16]); +#pragma unroll + for (i = 2; i<16; i++) + expanded[i + 16] = REDUCE(expanded[i + 16] * c_FFT128_8_16_Twiddle[i * 8 + (threadIdx.x & 7)]); + + //#pragma unroll 8 + for (i = 
0; i<16; i += 2) + FFT_16(expanded + i+ 16); // eight sequential FFT16's, each one executed in parallel by 8 threads + // store w matrices in global memory @@ -541,14 +711,14 @@ void Expansion(const uint32_t *const __restrict__ data, uint4 *const __restrict_ /***************************************************/ __global__ void __launch_bounds__(TPB, 4) -x11_simd512_gpu_expand_64(uint32_t threads, uint32_t startNounce, const uint64_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector, uint4 *const __restrict__ g_temp4) +x11_simd512_gpu_expand_64(uint32_t threads, uint32_t startNounce, const uint64_t *const __restrict__ g_hash, uint4 *const __restrict__ g_temp4) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x)/8; - if (thread < threads) +// if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + const uint32_t nounce = (startNounce + thread); - int hashPosition = nounce - startNounce; + const uint32_t hashPosition = nounce - startNounce; uint32_t *inpHash = (uint32_t*)&g_hash[8 * hashPosition]; @@ -566,62 +736,65 @@ x11_simd512_gpu_expand_64(uint32_t threads, uint32_t startNounce, const uint64_t } } -__global__ void __launch_bounds__(TPB, 4) -x11_simd512_gpu_compress1_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state) +__global__ __launch_bounds__(TPB, 2) +void x11_simd512_gpu_compress_64_maxwell(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint4 *g_fft4) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); + const uint32_t nounce = (startNounce + thread); + uint4 g_state[64]; - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; - Compression1(Hash, hashPosition, g_fft4, g_state); + const uint32_t hashPosition = nounce - startNounce; + uint32_t *const Hash = (uint32_t*)&g_hash[8 * hashPosition]; + + Compression1(Hash, hashPosition, g_fft4, (uint32_t *)g_state); + Compression2(hashPosition, g_fft4, (uint32_t *)&g_state); + Final(Hash, hashPosition, g_fft4, (uint32_t *)&g_state); } } + __global__ void __launch_bounds__(TPB, 4) -x11_simd512_gpu_compress2_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state) +x11_simd512_gpu_compress1_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint4 *g_fft4, uint32_t *g_state) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) +// if(thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + uint32_t nounce = startNounce + thread; int hashPosition = nounce - startNounce; + uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; - Compression2(hashPosition, g_fft4, g_state); + Compression1(Hash, hashPosition, g_fft4, g_state); } } - -__global__ void __launch_bounds__(TPB, 4) -x11_simd512_gpu_compress_64_maxwell(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state) +__global__ void __launch_bounds__(TPB, 1) +x11_simd512_gpu_compress2_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint4 *g_fft4, uint32_t *g_state) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); + const uint32_t nounce = (startNounce + thread); - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; + const uint32_t hashPosition = nounce - startNounce; - Compression1(Hash, hashPosition, g_fft4, g_state); Compression2(hashPosition, g_fft4, g_state); } } __global__ void __launch_bounds__(TPB, 4) -x11_simd512_gpu_final_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state) +x11_simd512_gpu_final_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint4 *g_fft4, uint32_t *g_state) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + const uint32_t nounce = (startNounce + thread); - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; + const uint32_t hashPosition = nounce - startNounce; + uint32_t *const Hash = (uint32_t*)&g_hash[8 * hashPosition]; Final(Hash, hashPosition, g_fft4, g_state); } @@ -630,45 +803,42 @@ x11_simd512_gpu_final_64(uint32_t threads, uint32_t startNounce, uint64_t *g_has __host__ int x11_simd512_cpu_init(int thr_id, uint32_t threads) { - CUDA_SAFE_CALL(cudaMalloc(&d_state[thr_id], 32*sizeof(int)*threads)); CUDA_SAFE_CALL(cudaMalloc(&d_temp4[thr_id], 64*sizeof(uint4)*threads)); // Texture for 128-Bit Zugriffe - cudaChannelFormatDesc channelDesc128 = cudaCreateChannelDesc(); - texRef1D_128.normalized = 0; - texRef1D_128.filterMode = cudaFilterModePoint; - texRef1D_128.addressMode[0] = cudaAddressModeClamp; - CUDA_SAFE_CALL(cudaBindTexture(NULL, &texRef1D_128, d_temp4[thr_id], &channelDesc128, 64*sizeof(uint4)*threads)); +// cudaChannelFormatDesc channelDesc128 = cudaCreateChannelDesc(); +// 
texRef1D_128.normalized = 0; +// texRef1D_128.filterMode = cudaFilterModePoint; +// texRef1D_128.addressMode[0] = cudaAddressModeClamp; +// CUDA_SAFE_CALL(cudaBindTexture(NULL, &texRef1D_128, d_temp4[thr_id], &channelDesc128, 64*sizeof(uint4)*threads)); return 0; } void x11_simd512_cpu_free(int thr_id) { - cudaFree(&d_state[thr_id]); cudaFree(&d_temp4[thr_id]); } + __host__ -void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, uint32_t simdthreads) { - dim3 block(TPB); - dim3 grid8(((threads + TPB-1)/TPB)*8); + dim3 grid8(((threads + simdthreads - 1) / simdthreads) * 8); - x11_simd512_gpu_expand_64 <<>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id]); - //MyStreamSynchronize(NULL, order, thr_id); - - dim3 grid((threads + TPB-1)/TPB); if (device_sm[device_map[thr_id]] >= 500) { - x11_simd512_gpu_compress_64_maxwell << < grid, block >> > (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]); - //MyStreamSynchronize(NULL, order, thr_id); + dim3 block(simdthreads); + dim3 grid((threads + simdthreads - 1) / simdthreads); + x11_simd512_gpu_expand_64 << > > (threads, startNounce, (uint64_t*)d_hash, d_temp4[thr_id]); + x11_simd512_gpu_compress_64_maxwell << < grid, block, 0, gpustream[thr_id] >> > (threads, startNounce, (uint64_t*)d_hash, d_temp4[thr_id]); } else { - x11_simd512_gpu_compress1_64 << < grid, block >> > (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]); - x11_simd512_gpu_compress2_64 << < grid, block >> > (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]); - // MyStreamSynchronize(NULL, order, thr_id); + dim3 block(TPB); + dim3 grid((threads + TPB - 1) / TPB); + x11_simd512_gpu_expand_64 << > > (threads, startNounce, (uint64_t*)d_hash, 
d_temp4[thr_id]); + x11_simd512_gpu_compress1_64 << < grid, block, 0, gpustream[thr_id] >> > (threads, startNounce, (uint64_t*)d_hash, d_temp4[thr_id], d_state[thr_id]); + x11_simd512_gpu_compress2_64 << < grid, block, 0, gpustream[thr_id]>>> (threads, startNounce, (uint64_t*)d_hash, d_temp4[thr_id], d_state[thr_id]); + x11_simd512_gpu_final_64 << > > (threads, startNounce, (uint64_t*)d_hash, d_temp4[thr_id], d_state[thr_id]); } - - x11_simd512_gpu_final_64 << > > (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]); -// MyStreamSynchronize(NULL, order, thr_id); + CUDA_SAFE_CALL(cudaGetLastError()); } diff --git a/x11/fresh.cu b/x11/fresh.cu index 29758af357..959adc6e7e 100644 --- a/x11/fresh.cu +++ b/x11/fresh.cu @@ -12,25 +12,23 @@ extern "C" { // to test gpu hash on a null buffer #define NULLTEST 0 -static uint32_t *d_hash[MAX_GPUS]; - -extern void x11_shavite512_setBlock_80(void *pdata); -extern void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); -extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_shavite512_setBlock_80(int thr_id, void *pdata); +extern void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); -extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const uint32_t simdthreads); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); -extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, 
uint32_t *d_nonceVector, uint32_t *d_hash, int order); +//extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); +extern void x11_echo512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *d_hash, uint32_t target, uint32_t *h_found); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads); -extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, - uint32_t *d_noncesTrue, uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse, - int order); +extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *inpHashes, + const uint32_t *d_noncesTrue, uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse); // CPU Hash -extern "C" void fresh_hash(void *state, const void *input) +void fresh_hash(void *state, const void *input) { // shavite-simd-shavite-simd-echo @@ -67,91 +65,118 @@ extern "C" void fresh_hash(void *state, const void *input) memcpy(state, hash, 32); } -static bool init[MAX_GPUS] = { 0 }; - -extern "C" int scanhash_fresh(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_fresh(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { + static THREAD uint32_t *d_hash = nullptr; + static THREAD uint32_t *h_found = nullptr; + const uint32_t first_nonce = pdata[19]; uint32_t endiandata[20]; - uint32_t throughput = device_intensity(thr_id, __func__, 1 << 19); - throughput = min(throughput, (max_nonce - first_nonce)); + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, 1 << 19); + uint32_t throughput = min(throughputmax, (max_nonce - first_nonce)) & 0xfffffc00; + uint32_t simdthreads = (device_sm[device_map[thr_id]] > 500) ? 
256 : 32; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x00ff; + ptarget[7] = 0xf; - if (!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (64 * sizeof(uint4))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif - x11_simd512_cpu_init(thr_id, throughput); - x11_echo512_cpu_init(thr_id, throughput); + x11_simd512_cpu_init(thr_id, throughputmax); + x11_echo512_cpu_init(thr_id, throughputmax); - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput + 4), 0); + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * sizeof(uint32_t) * throughputmax + 4)); + CUDA_SAFE_CALL(cudaMallocHost(&(h_found), 4 * sizeof(uint32_t))); - cuda_check_cpu_init(thr_id, throughput); + cuda_check_cpu_init(thr_id, throughputmax); + mining_has_stopped[thr_id] = false; - init[thr_id] = true; + init = true; } for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); - x11_shavite512_setBlock_80((void*)endiandata); - cuda_check_cpu_setTarget(ptarget); + x11_shavite512_setBlock_80(thr_id, (void*)endiandata); do { - uint32_t Htarg = ptarget[7]; - - uint32_t foundNonce; - int order = 0; // GPU Hash - x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], 
order++); - x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - -#if NULLTEST - uint32_t buf[8]; memset(buf, 0, sizeof buf); - CUDA_SAFE_CALL(cudaMemcpy(buf, d_hash[thr_id], sizeof buf, cudaMemcpyDeviceToHost)); - CUDA_SAFE_CALL(cudaDeviceSynchronize()); - print_hash((unsigned char*)buf); printf("\n"); -#endif - foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); - if (foundNonce != UINT32_MAX) + x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash, simdthreads); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash, simdthreads); + x11_echo512_cpu_hash_64_final(thr_id, throughput, pdata[19], d_hash, ptarget[7], h_found); + cudaStreamSynchronize(gpustream[thr_id]); + + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(h_found[0] != 0xffffffff) { - uint32_t vhash64[8]; - be32enc(&endiandata[19], foundNonce); + const uint32_t Htarg = ptarget[7]; + uint32_t vhash64[8]={0}; + if(opt_verify){ be32enc(&endiandata[19], h_found[0]); fresh_hash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { int res = 1; - uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonce); *hashes_done = pdata[19] - first_nonce + throughput; - if (secNonce != 0) { - pdata[21] = secNonce; - res++; + if (h_found[1] != 0xffffffff) + { + if(opt_verify){ be32enc(&endiandata[19], h_found[1]); + fresh_hash(vhash64, endiandata); + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + + pdata[21] = h_found[1]; + res++; + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d Found second nounce %08x", device_map[thr_id], h_found[1]); + } + else + { + if (vhash64[7] != Htarg) + { + 
applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[1]); + } + } + } - pdata[19] = foundNonce; + pdata[19] = h_found[0]; + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d Found nounce %08x", device_map[thr_id], h_found[0]); return res; } else { - applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce); + if (vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[0]); + } } } - - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/x11/s3.cu b/x11/s3.cu index e4c7749f6b..39abc03e3d 100644 --- a/x11/s3.cu +++ b/x11/s3.cu @@ -11,22 +11,24 @@ extern "C" { #include "miner.h" #include "cuda_helper.h" +#ifdef __cplusplus +#include +#else #include +#endif -static uint32_t *d_hash[MAX_GPUS]; - -extern void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); -extern void x11_shavite512_setBlock_80(void *pdata); +extern void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void x11_shavite512_setBlock_80(int thr_id, void *pdata); extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); -extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const uint32_t simdthreads); extern void quark_skein512_cpu_init(int thr_id); -extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int 
order); -extern void quark_skein512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, uint32_t *h_found, uint32_t target, int order); +extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); +extern void quark_skein512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, uint32_t *h_found, uint32_t target); /* CPU HASH */ -extern "C" void s3hash(void *output, const void *input) +void s3hash(void *output, const void *input) { sph_shavite512_context ctx_shavite; sph_simd512_context ctx_simd; @@ -49,93 +51,118 @@ extern "C" void s3hash(void *output, const void *input) memcpy(output, hash, 32); } -static bool init[MAX_GPUS] = { 0 }; -static uint32_t *h_found[MAX_GPUS]; - /* Main S3 entry point */ -extern "C" int scanhash_s3(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_s3(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { + static THREAD uint32_t *d_hash = nullptr; + static THREAD uint32_t *h_found = nullptr; + const uint32_t first_nonce = pdata[19]; unsigned int intensity = 20; // 256*256*8*2; #ifdef WIN32 // reduce by one the intensity on windows intensity--; #endif - uint32_t throughput = device_intensity(thr_id, __func__, 1 << intensity); - throughput = min(throughput, (max_nonce - first_nonce)); - + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, 1 << intensity); + uint32_t throughput = min(throughputmax, (max_nonce - first_nonce)) & 0xfffffc00; + uint32_t simdthreads = (device_sm[device_map[thr_id]] > 500) ? 
256 : 32; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000000fu; + ptarget[7] = 0x0000000fu; - if (!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + get_cuda_arch(&cuda_arch[thr_id]); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (64 * sizeof(uint4))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif - x11_simd512_cpu_init(thr_id, throughput); + x11_simd512_cpu_init(thr_id, throughputmax); quark_skein512_cpu_init(thr_id); - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0); - CUDA_CALL_OR_RET_X(cudaMallocHost(&(h_found[thr_id]), 2 * sizeof(uint32_t)), 0); + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * sizeof(uint32_t) * throughputmax)); + CUDA_SAFE_CALL(cudaMallocHost(&(h_found), 2 * sizeof(uint32_t))); - cuda_check_cpu_init(thr_id, throughput); + cuda_check_cpu_init(thr_id, throughputmax); + mining_has_stopped[thr_id] = false; - init[thr_id] = true; + init = true; } uint32_t endiandata[20]; for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); - x11_shavite512_setBlock_80((void*)endiandata); - cuda_check_cpu_setTarget(ptarget); + x11_shavite512_setBlock_80(thr_id, (void*)endiandata); + cuda_check_cpu_setTarget(ptarget, thr_id); do { - int order = 0; + x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash, simdthreads); + quark_skein512_cpu_hash_64_final(thr_id, throughput, pdata[19], NULL, d_hash, h_found, ptarget[7]); - 
x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_skein512_cpu_hash_64_final(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], h_found[thr_id], ptarget[7], order++); - - if (h_found[thr_id][0] != 0xffffffff) + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(h_found[0] != 0xffffffff) { const uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; - be32enc(&endiandata[19], h_found[thr_id][0]); + uint32_t vhash64[8]={0}; + if(opt_verify){ be32enc(&endiandata[19], h_found[0]); s3hash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { int res = 1; - // check if there was some other ones... *hashes_done = pdata[19] - first_nonce + throughput; - if (h_found[thr_id][1] != 0xffffffff) + if (h_found[1] != 0xffffffff) { - pdata[21] = h_found[thr_id][1]; - res++; - if (opt_benchmark) - applog(LOG_INFO, "GPU #%d Found second nounce %08x", thr_id, h_found[thr_id][1]); + if(opt_verify){ be32enc(&endiandata[19], h_found[1]); + s3hash(vhash64, endiandata); + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + + pdata[21] = h_found[1]; + res++; + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d Found second nounce %08x", device_map[thr_id], h_found[1]); + } + else + { + if (vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[1]); + } + } + } - pdata[19] = h_found[thr_id][0]; + pdata[19] = h_found[0]; if (opt_benchmark) - applog(LOG_INFO, "GPU #%d Found nounce %08x", thr_id, h_found[thr_id][0]); + applog(LOG_INFO, "GPU #%d Found nounce %08x", device_map[thr_id], h_found[0]); return res; } else { if (vhash64[7] != Htarg) { - applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, 
h_found[thr_id][0]); + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[0]); } } } - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/x11/simd_functions.cu b/x11/simd_functions.cu index fdf00e2615..6a029ab344 100644 --- a/x11/simd_functions.cu +++ b/x11/simd_functions.cu @@ -1134,7 +1134,7 @@ __device__ __forceinline__ void Round8_3_final(uint32_t *A, int r, int s, int t, STEP8_MAJ_31(d_cw3[7], u, r, &A[8], &A[16], &A[24], A); } -#if __CUDA_ARCH__ < 350 +#if __CUDA_ARCH__ < 320 #define expanded_vector(x) tex1Dfetch(texRef1D_128, (x)) #else //#define expanded_vector(x) tex1Dfetch(texRef1D_128, (x)) @@ -1278,10 +1278,18 @@ __device__ __forceinline__ void SIMD_Compress1(uint32_t *const __restrict__ A, c { int i; const int thr_offset = thr_id << 6; // thr_id * 128 (je zwei elemente) + + uint32_t msg[16]; + + uint28 *phash = (uint28*)M; + uint28 *outpt = (uint28*)msg; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + #pragma unroll 8 for(i=0; i<8; i++) { - A[i] ^= M[i]; - (&A[8])[i] ^= M[8+i]; + A[i] ^= msg[i]; + (&A[8])[i] ^= msg[8 + i]; } Round8_0(A, thr_offset, 3, 23, 17, 27, g_fft4); Round8_1(A, thr_offset, 28, 19, 22, 7, g_fft4); @@ -1297,10 +1305,9 @@ __device__ __forceinline__ void Compression1(const uint32_t *const __restrict__ }; SIMD_Compress1(A, texture_id, hashval, g_fft4); - uint32_t *state = (uint32_t*)&g_state[blockIdx.x * (blockDim.x*32)]; #pragma unroll 32 for (int i=0; i < 32; i++) - state[threadIdx.x+blockDim.x*i] = A[i]; + g_state[i] = A[i]; } __device__ __forceinline__ void SIMD_Compress2(uint32_t *const __restrict__ A, const int thr_id, const uint4 *const __restrict__ g_fft4) @@ -1324,12 +1331,11 @@ __device__ __forceinline__ void 
Compression2(const int texture_id, const uint4 * { uint32_t A[32]; int i; - uint32_t *state = &g_state[blockIdx.x * (blockDim.x*32)]; #pragma unroll 32 - for (i=0; i < 32; i++) A[i] = state[threadIdx.x+blockDim.x*i]; + for (i = 0; i < 32; i++) A[i] = g_state[i]; SIMD_Compress2(A, texture_id, g_fft4); #pragma unroll 32 - for (i=0; i < 32; i++) state[threadIdx.x+blockDim.x*i] = A[i]; + for (i=0; i < 32; i++) g_state[i] = A[i]; } __device__ __forceinline__ void SIMD_Compress_Final(uint32_t *const __restrict__ A) @@ -1360,10 +1366,9 @@ __device__ __forceinline__ void Final(uint32_t *const __restrict__ hashval, cons { uint32_t A[32]; int i; - const uint32_t *state = &g_state[blockIdx.x * (blockDim.x*32)]; #pragma unroll 32 for (i=0; i < 32; i++) - A[i] = state[threadIdx.x+blockDim.x*i]; + A[i] = g_state[i]; SIMD_Compress_Final(A); #pragma unroll 16 diff --git a/x11/x11.cu b/x11/x11.cu index 99eee9e980..b5aa115943 100644 --- a/x11/x11.cu +++ b/x11/x11.cu @@ -1,11 +1,5 @@ extern "C" { - -#define FASTECHO 1 //Fast echo can give hardware errors on low difficulty but accepted on most pools. 
- -#ifdef _DEBUG //Visual Leak Detector for Visual C++ -// #include -#endif #include "sph/sph_blake.h" #include "sph/sph_bmw.h" #include "sph/sph_groestl.h" @@ -28,49 +22,47 @@ extern "C" #include #include - -uint32_t *d_hash[MAX_GPUS]; -uint32_t *h_found[MAX_GPUS]; - -extern void quark_blake512_cpu_init(int thr_id, uint32_t threads); -extern void quark_blake512_cpu_setBlock_80(void *pdata); -extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); +extern void quark_blake512_cpu_init(int thr_id); +extern void quark_blake512_cpu_setBlock_80(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_setBlock_80_multi(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void quark_blake512_cpu_hash_80_multi(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads); -extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); -extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); -extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t 
*d_hash, int order); +extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_keccak512_cpu_init(int thr_id, uint32_t threads); -extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void cuda_jh512Keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void cuda_jh512Keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x11_luffaCubehash512_cpu_init(int thr_id, uint32_t threads); -extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x11_shavite512_cpu_init(int thr_id, uint32_t threads); -extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); -extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const uint32_t simdthreads); + extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); -extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); 
-extern void x11_echo512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, uint32_t target, uint32_t *h_found, int order); +extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void x11_echo512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *d_hash, uint32_t target, uint32_t *h_found); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads); -extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, - uint32_t *d_noncesTrue, uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse, int order); +extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *inpHashes, + const uint32_t *d_noncesTrue, uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse); // X11 CPU Hash -extern "C" void x11hash(void *output, const void *input) +void x11hash(void *output, const void *input) { // blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11 @@ -136,123 +128,139 @@ extern "C" void x11hash(void *output, const void *input) memcpy(output, hash, 32); } -static bool init[MAX_GPUS] = { 0 }; - -extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_x11(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { + static THREAD uint32_t *d_hash = nullptr; + static THREAD uint32_t *h_found = nullptr; + const uint32_t first_nonce = pdata[19]; - unsigned int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 
20 : 19; - uint32_t throughput = device_intensity(thr_id, __func__, 1 << intensity); // 19=256*256*8; - throughput = min(throughput, (max_nonce - first_nonce)); + + cudaDeviceProp props; + CUDA_SAFE_CALL(cudaGetDeviceProperties(&props, device_map[thr_id])); + static THREAD uint32_t throughputmax; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0xf; + ptarget[7] = 0x4f; - if (!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); - cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); - cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); - quark_groestl512_cpu_init(thr_id, throughput); - quark_bmw512_cpu_init(thr_id, throughput); - x11_echo512_cpu_init(thr_id, throughput); - if (x11_simd512_cpu_init(thr_id, throughput) != 0) { - return 0; + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + CUDA_SAFE_CALL(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync)); + CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1)); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + get_cuda_arch(&cuda_arch[thr_id]); + + unsigned int intensity; +#if defined WIN32 && !defined _WIN64 + intensity = 256 * 256 * 16; +#else + if(strstr(props.name, "970")) intensity = (256 * 256 * 22); + else if(strstr(props.name, "980")) intensity = (256 * 256 * 22); + else if(strstr(props.name, "1070")) intensity = (256 * 256 * 22); + else if(strstr(props.name, "1080")) intensity = (256 * 256 * 22); + else if(strstr(props.name, "750 Ti")) intensity = (256 * 256 * 20); + else if(strstr(props.name, "750")) intensity = (256 * 256 * 19); + else if(strstr(props.name, "960")) intensity = (256 * 256 * 19); + else intensity = (256 * 256 * 19); +#endif + throughputmax = device_intensity(device_map[thr_id], __func__, intensity); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (64 * sizeof(uint4))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = 
true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); } - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 64 * throughput), 0); // why 64 ? - CUDA_CALL_OR_RET_X(cudaMallocHost(&(h_found[thr_id]), 4 * sizeof(uint32_t)), 0); - cuda_check_cpu_init(thr_id, throughput); - init[thr_id] = true; +#endif + quark_groestl512_cpu_init(thr_id, throughputmax); + quark_bmw512_cpu_init(thr_id, throughputmax); + x11_echo512_cpu_init(thr_id, throughputmax); + x11_simd512_cpu_init(thr_id, throughputmax); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * 4 * throughputmax)); + CUDA_SAFE_CALL(cudaMallocHost(&h_found, 2 * sizeof(uint32_t))); + mining_has_stopped[thr_id] = false; + init = true; } + uint32_t throughput = min(throughputmax, max_nonce - first_nonce) & 0xfffffc00; + uint32_t simdthreads = (device_sm[device_map[thr_id]] > 500) ? 256 : 32; + uint32_t endiandata[20]; for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); - quark_blake512_cpu_setBlock_80((void*)endiandata); - - do { - int order = 0; - - quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - cuda_jh512Keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - #ifdef FASTECHO - x11_echo512_cpu_hash_64_final(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], ptarget[7], h_found[thr_id], order++); - if (h_found[thr_id][0] != 0xffffffff) + be32enc(&endiandata[k], pdata[k]); + + 
quark_blake512_cpu_setBlock_80(thr_id, (uint64_t *)endiandata); + CUDA_SAFE_CALL(cudaGetLastError()); + do + { + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + cuda_jh512Keccak512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash, simdthreads); + x11_echo512_cpu_hash_64_final(thr_id, throughput, pdata[19], d_hash, ptarget[7], h_found); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(h_found[0] != 0xffffffff) { const uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; - be32enc(&endiandata[19], h_found[thr_id][0]); - x11hash(vhash64, endiandata); + uint32_t vhash64[8] = {0}; + if(opt_verify) + { + be32enc(&endiandata[19], h_found[0]); + x11hash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + } + if(vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { int res = 1; - // check if there was some other ones... 
*hashes_done = pdata[19] - first_nonce + throughput; - if (h_found[thr_id][1] != 0xffffffff) + if(h_found[1] != 0xffffffff) { - pdata[21] = h_found[thr_id][1]; - res++; - if (opt_benchmark) - applog(LOG_INFO, "GPU #%d Found second nounce %08x", thr_id, h_found[thr_id][1], vhash64[7], Htarg); - } - pdata[19] = h_found[thr_id][0]; - if (opt_benchmark) - applog(LOG_INFO, "GPU #%d Found nounce %08x", thr_id, h_found[thr_id][0], vhash64[7], Htarg); - return res; - } - else - { - if (vhash64[7] != Htarg) + if(opt_verify) + { + be32enc(&endiandata[19], h_found[1]); + x11hash(vhash64, endiandata); + } if(vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { - applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, h_found[thr_id][0]); + + pdata[21] = h_found[1]; + res++; + if(opt_benchmark) + applog(LOG_INFO, "GPU #%d Found second nonce %08x", device_map[thr_id], h_found[1]); + } + else + { + if(vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[1]); + } } - } - } - #else - x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); - if (foundNonce != UINT32_MAX) - { - const uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; - be32enc(&endiandata[19], foundNonce); - x11hash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) - { - int res = 1; - // check if there was some other ones... 
- uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonce); - *hashes_done = pdata[19] - first_nonce + throughput; - if (secNonce != 0) { - pdata[21] = secNonce; - res++; - if (opt_benchmark) applog(LOG_INFO, "Found second nounce", thr_id, foundNonce, vhash64[7], Htarg); } - pdata[19] = foundNonce; - if (opt_benchmark) applog(LOG_INFO, "Found nounce", thr_id, foundNonce, vhash64[7], Htarg); + pdata[19] = h_found[0]; + if(opt_benchmark) + applog(LOG_INFO, "GPU #%d Found nounce %08x", device_map[thr_id], h_found[0]); return res; } else { - applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce); + if(vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[0]); + } } } - #endif - pdata[19] += throughput; - } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); + } while(!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/x13/cuda_x13_fugue512.cu b/x13/cuda_x13_fugue512.cu index cd851726f8..f66f447574 100644 --- a/x13/cuda_x13_fugue512.cu +++ b/x13/cuda_x13_fugue512.cu @@ -11,10 +11,10 @@ #include "cuda_helper.h" + __constant__ uint32_t pTarget[8]; static uint32_t *d_nonce[MAX_GPUS]; - /* * X13 kernel implementation. 
* @@ -46,17 +46,17 @@ static uint32_t *d_nonce[MAX_GPUS]; * @author phm */ -#define mixtab0(x) (*((uint32_t*)mixtabs + ( (x)))) -#define mixtab1(x) (*((uint32_t*)mixtabs + (256+(x)))) -#define mixtab2(x) (*((uint32_t*)mixtabs + (512+(x)))) -#define mixtab3(x) (*((uint32_t*)mixtabs + (768+(x)))) +#define mixtab0(x) (*(mixtabs + ( (x)))) +#define mixtab1(x) (*(mixtabs + (256+(x)))) +#define mixtab2(x) (*(mixtabs + (512+(x)))) +#define mixtab3(x) (*(mixtabs + (768+(x)))) -texture mixTab0Tex; -texture mixTab1Tex; -texture mixTab2Tex; -texture mixTab3Tex; +//texture mixTab0Tex; +//texture mixTab1Tex; +//texture mixTab2Tex; +//texture mixTab3Tex; -static const uint32_t mixtab0_cpu[] = { +__constant__ uint32_t mixTab0Tex[] = { SPH_C32(0x63633297), SPH_C32(0x7c7c6feb), SPH_C32(0x77775ec7), SPH_C32(0x7b7b7af7), SPH_C32(0xf2f2e8e5), SPH_C32(0x6b6b0ab7), SPH_C32(0x6f6f16a7), SPH_C32(0xc5c56d39), SPH_C32(0x303090c0), @@ -144,8 +144,8 @@ static const uint32_t mixtab0_cpu[] = { SPH_C32(0xb0b03df6), SPH_C32(0x5454b74b), SPH_C32(0xbbbb0cda), SPH_C32(0x16166258) }; - - static const uint32_t mixtab1_cpu[] = { +/* +__constant__ uint32_t mixTab1Tex[] = { SPH_C32(0x97636332), SPH_C32(0xeb7c7c6f), SPH_C32(0xc777775e), SPH_C32(0xf77b7b7a), SPH_C32(0xe5f2f2e8), SPH_C32(0xb76b6b0a), SPH_C32(0xa76f6f16), SPH_C32(0x39c5c56d), SPH_C32(0xc0303090), @@ -234,7 +234,7 @@ static const uint32_t mixtab0_cpu[] = { SPH_C32(0x58161662) }; - static const uint32_t mixtab2_cpu[] = { +__constant__ uint32_t mixTab2Tex[] = { SPH_C32(0x32976363), SPH_C32(0x6feb7c7c), SPH_C32(0x5ec77777), SPH_C32(0x7af77b7b), SPH_C32(0xe8e5f2f2), SPH_C32(0x0ab76b6b), SPH_C32(0x16a76f6f), SPH_C32(0x6d39c5c5), SPH_C32(0x90c03030), @@ -323,7 +323,7 @@ static const uint32_t mixtab0_cpu[] = { SPH_C32(0x62581616) }; - static const uint32_t mixtab3_cpu[] = { +__constant__ uint32_t mixTab3Tex[] = { SPH_C32(0x63329763), SPH_C32(0x7c6feb7c), SPH_C32(0x775ec777), SPH_C32(0x7b7af77b), SPH_C32(0xf2e8e5f2), SPH_C32(0x6b0ab76b), 
SPH_C32(0x6f16a76f), SPH_C32(0xc56d39c5), SPH_C32(0x3090c030), @@ -411,6 +411,7 @@ static const uint32_t mixtab0_cpu[] = { SPH_C32(0xb03df6b0), SPH_C32(0x54b74b54), SPH_C32(0xbb0cdabb), SPH_C32(0x16625816) }; +*/ #define TIX4(q, x00, x01, x04, x07, x08, x22, x24, x27, x30) { \ x22 ^= x00; \ @@ -430,29 +431,90 @@ static const uint32_t mixtab0_cpu[] = { x20 ^= x06; \ } #define SMIX(x0, x1, x2, x3) { \ - uint32_t c0 = 0; \ - uint32_t c1 = 0; \ - uint32_t c2 = 0; \ - uint32_t c3 = 0; \ - uint32_t r0 = 0; \ - uint32_t r1 = 0; \ - uint32_t r2 = 0; \ - uint32_t r3 = 0; \ - uint32_t tmp; \ - tmp = mixtab0(__byte_perm(x0, 0, 0x4443)); \ - c0 ^= tmp; \ + uint32_t tmp = mixtab0(__byte_perm(x0, 0, 0x4443)); \ + uint32_t c0 = tmp; \ tmp = mixtab1(__byte_perm(x0, 0, 0x4442)); \ c0 ^= tmp; \ - r1 ^= tmp; \ + uint32_t r1 = tmp; \ tmp = mixtab2(__byte_perm(x0, 0, 0x4441)); \ c0 ^= tmp; \ - r2 ^= tmp; \ + uint32_t r2= tmp; \ tmp = mixtab3(__byte_perm(x0, 0, 0x4440)); \ c0 ^= tmp; \ - r3 ^= tmp; \ + uint32_t r3= tmp; \ tmp = mixtab0(__byte_perm(x1, 0, 0x4443)); \ + uint32_t c1 = tmp; \ + uint32_t r0 = tmp; \ + tmp = mixtab1(__byte_perm(x1, 0, 0x4442)); \ + c1 ^= tmp; \ + tmp = mixtab2(__byte_perm(x1, 0, 0x4441)); \ c1 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3(__byte_perm(x1, 0, 0x4440)); \ + c1 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0(__byte_perm(x2, 0, 0x4443)); \ + uint32_t c2 = tmp; \ + r0 ^= tmp; \ + tmp = mixtab1(__byte_perm(x2, 0, 0x4442)); \ + c2 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2(__byte_perm(x2, 0, 0x4441)); \ + c2 ^= tmp; \ + tmp = mixtab3(__byte_perm(x2, 0, 0x4440)); \ + c2 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0(__byte_perm(x3, 0, 0x4443)); \ + uint32_t c3 = tmp; \ r0 ^= tmp; \ + tmp = mixtab1(__byte_perm(x3, 0, 0x4442)); \ + c3 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2(__byte_perm(x3, 0, 0x4441)); \ + c3 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3(__byte_perm(x3, 0, 0x4440)); \ + c3 ^= tmp; \ + uint32_t tmp2 = __byte_perm((c0 ^ r0),(c1 ^ r1), 0x3636);\ + tmp= 
__byte_perm((c2 ^ r2),(c3 ^ r3), 0x1414); \ + x0 = __byte_perm(tmp2,tmp, 0x3254);\ + r0 = ROL8(r0); \ + r1 = ROL8(r1); \ + r2 = ROL8(r2); \ + r3 = ROL8(r3); \ + tmp2 = __byte_perm((c1 ^ r0),(c2 ^ r1), 0x3636);\ + tmp= __byte_perm((c3 ^ r2),(c0 ^ r3), 0x1414); \ + x1 = __byte_perm(tmp2,tmp, 0x3254);\ + r0 = ROL8(r0); \ + r1 = ROL8(r1); \ + r2 = ROL8(r2); \ + r3 = ROL8(r3); \ + tmp2 = __byte_perm((c2 ^ r0),(c3 ^ r1), 0x3636);\ + tmp= __byte_perm((c0 ^ r2),(c1 ^ r3), 0x1414); \ + x2 = __byte_perm(tmp2,tmp, 0x3254);\ + r0 = ROL8(r0); \ + r1 = ROL8(r1); \ + r2 = ROL8(r2); \ + r3 = ROL8(r3); \ + tmp2 = __byte_perm((c3 ^ r0),(c0 ^ r1), 0x3636);\ + tmp= __byte_perm((c1 ^ r2),(c2 ^ r3), 0x1414); \ + x3 = __byte_perm(tmp2,tmp, 0x3254);\ + } +#define SMIX0(x0, x1, x2, x3) { \ + uint32_t tmp = mixtab0(__byte_perm(x0, 0, 0x4443)); \ + uint32_t c0 = tmp; \ + tmp = mixtab1(__byte_perm(x0, 0, 0x4442)); \ + c0 ^= tmp; \ + uint32_t r1 = tmp; \ + tmp = mixtab2(__byte_perm(x0, 0, 0x4441)); \ + c0 ^= tmp; \ + uint32_t r2= tmp; \ + tmp = mixtab3(__byte_perm(x0, 0, 0x4440)); \ + c0 ^= tmp; \ + uint32_t r3= tmp; \ + tmp = mixtab0(__byte_perm(x1, 0, 0x4443)); \ + uint32_t c1 = tmp; \ + uint32_t r0 = tmp; \ tmp = mixtab1(__byte_perm(x1, 0, 0x4442)); \ c1 ^= tmp; \ tmp = mixtab2(__byte_perm(x1, 0, 0x4441)); \ @@ -462,7 +524,7 @@ static const uint32_t mixtab0_cpu[] = { c1 ^= tmp; \ r3 ^= tmp; \ tmp = mixtab0(__byte_perm(x2, 0, 0x4443)); \ - c2 ^= tmp; \ + uint32_t c2 = tmp; \ r0 ^= tmp; \ tmp = mixtab1(__byte_perm(x2, 0, 0x4442)); \ c2 ^= tmp; \ @@ -473,7 +535,7 @@ static const uint32_t mixtab0_cpu[] = { c2 ^= tmp; \ r3 ^= tmp; \ tmp = mixtab0(__byte_perm(x3, 0, 0x4443)); \ - c3 ^= tmp; \ + uint32_t c3 = tmp; \ r0 ^= tmp; \ tmp = mixtab1(__byte_perm(x3, 0, 0x4442)); \ c3 ^= tmp; \ @@ -483,23 +545,11 @@ static const uint32_t mixtab0_cpu[] = { r2 ^= tmp; \ tmp = mixtab3(__byte_perm(x3, 0, 0x4440)); \ c3 ^= tmp; \ - x0 = ((c0 ^ r0) & SPH_C32(0xFF000000)) \ - | ((c1 ^ r1) & SPH_C32(0x00FF0000)) \ 
- | ((c2 ^ r2) & SPH_C32(0x0000FF00)) \ - | ((c3 ^ r3) & SPH_C32(0x000000FF)); \ - x1 = ((c1 ^ (r0 << 8)) & SPH_C32(0xFF000000)) \ - | ((c2 ^ (r1 << 8)) & SPH_C32(0x00FF0000)) \ - | ((c3 ^ (r2 << 8)) & SPH_C32(0x0000FF00)) \ - | ((c0 ^ (r3 >> 24)) & SPH_C32(0x000000FF)); \ - x2 = ((c2 ^ (r0 << 16)) & SPH_C32(0xFF000000)) \ - | ((c3 ^ (r1 << 16)) & SPH_C32(0x00FF0000)) \ - | ((c0 ^ (r2 >> 16)) & SPH_C32(0x0000FF00)) \ - | ((c1 ^ (r3 >> 16)) & SPH_C32(0x000000FF)); \ - x3 = ((c3 ^ (r0 << 24)) & SPH_C32(0xFF000000)) \ - | ((c0 ^ (r1 >> 8)) & SPH_C32(0x00FF0000)) \ - | ((c1 ^ (r2 >> 8)) & SPH_C32(0x0000FF00)) \ - | ((c2 ^ (r3 >> 8)) & SPH_C32(0x000000FF)); \ + uint32_t tmp2 = __byte_perm((c0 ^ r0),(c1 ^ r1), 0x3636);\ + tmp= __byte_perm((c2 ^ r2),(c3 ^ r3), 0x1414); \ + x0 = __byte_perm(tmp2,tmp, 0x3254);\ } + #define ROR3 { \ B33 = S33, B34 = S34, B35 = S35; \ S35 = S32; S34 = S31; S33 = S30; S32 = S29; S31 = S28; S30 = S27; S29 = S26; S28 = S25; S27 = S24; \ @@ -508,20 +558,13 @@ static const uint32_t mixtab0_cpu[] = { S08 = S05; S07 = S04; S06 = S03; S05 = S02; S04 = S01; S03 = S00; S02 = B35; S01 = B34; S00 = B33; \ } -#define ROR8 { \ - B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \ - S35 = S27; S34 = S26; S33 = S25; S32 = S24; S31 = S23; S30 = S22; S29 = S21; S28 = S20; S27 = S19; \ - S26 = S18; S25 = S17; S24 = S16; S23 = S15; S22 = S14; S21 = S13; S20 = S12; S19 = S11; S18 = S10; \ - S17 = S09; S16 = S08; S15 = S07; S14 = S06; S13 = S05; S12 = S04; S11 = S03; S10 = S02; S09 = S01; \ - S08 = S00; S07 = B35; S06 = B34; S05 = B33; S04 = B32; S03 = B31; S02 = B30; S01 = B29; S00 = B28; \ - } - -#define ROR9 { \ - B27 = S27, B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \ - S35 = S26; S34 = S25; S33 = S24; S32 = S23; S31 = S22; S30 = S21; S29 = S20; S28 = S19; S27 = S18; \ - S26 = S17; S25 = S16; S24 = S15; S23 = S14; S22 = S13; S21 = S12; S20 = S11; S19 = S10; S18 = S09; \ - S17 = 
S08; S16 = S07; S15 = S06; S14 = S05; S13 = S04; S12 = S03; S11 = S02; S10 = S01; S09 = S00; \ - S08 = B35; S07 = B34; S06 = B33; S05 = B32; S04 = B31; S03 = B30; S02 = B29; S01 = B28; S00 = B27; \ +#define ROL1 { \ + B35 = S00; \ + S00 = S01; S01 = S02; S02 = S03; S03 = S04; S04 = S05; S05 = S06; S06 = S07; S07 = S08; S08 = S09; S09 = S10; \ + S10 = S11; S11 = S12; S12 = S13; S13 = S14; S14 = S15; S15 = S16; S16 = S17; S17 = S18; S18 = S19; S19 = S20; \ + S20 = S21; S21 = S22; S22 = S23; S23 = S24; S24 = S25; S25 = S26; S26 = S27; S27 = S28; S28 = S29; S29 = S30; \ + S30 = S31; S31 = S32; S32 = S33; S33 = S34; S34 = S35; \ + S35 = B35; \ } #define FUGUE512_3(x, y, z) { \ @@ -556,92 +599,229 @@ static const uint32_t mixtab0_cpu[] = { SMIX(S00, S01, S02, S03); \ } -//__launch_bounds__(128, 6) -__global__ __launch_bounds__(128,8) -void x13_fugue512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +__global__ __launch_bounds__(128, 8) +void x13_fugue512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_hash) { __shared__ uint32_t mixtabs[1024]; - - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { - if (threadIdx.x < 128) + if (threadIdx.x < 128) { - *((uint32_t*)mixtabs + (threadIdx.x)) = tex1Dfetch(mixTab0Tex, threadIdx.x); - *((uint32_t*)mixtabs + (128 + threadIdx.x)) = tex1Dfetch(mixTab0Tex, threadIdx.x+128); - *((uint32_t*)mixtabs + (256 + threadIdx.x)) = tex1Dfetch(mixTab1Tex, threadIdx.x); - *((uint32_t*)mixtabs + (384 + threadIdx.x)) = tex1Dfetch(mixTab1Tex , threadIdx.x+128); - *((uint32_t*)mixtabs + (512 + threadIdx.x)) = tex1Dfetch(mixTab2Tex, threadIdx.x); - *((uint32_t*)mixtabs + (640 + threadIdx.x)) = tex1Dfetch(mixTab2Tex, threadIdx.x+128); - *((uint32_t*)mixtabs + (768 + threadIdx.x)) = tex1Dfetch(mixTab3Tex, threadIdx.x); - *((uint32_t*)mixtabs + (896 + 
threadIdx.x)) = tex1Dfetch(mixTab3Tex, threadIdx.x+128); + mixtabs[threadIdx.x] = mixTab0Tex[threadIdx.x]; + mixtabs[threadIdx.x + 128] = mixTab0Tex[threadIdx.x + 128]; + mixtabs[256 + threadIdx.x] = ROTR32(mixtabs[threadIdx.x], 8); + mixtabs[256 + threadIdx.x + 128] = ROTR32(mixtabs[threadIdx.x + 128], 8); + mixtabs[512 + threadIdx.x] = ROTR32(mixtabs[threadIdx.x], 16); + mixtabs[512 + threadIdx.x + 128] = ROTR32(mixtabs[threadIdx.x + 128], 16); + mixtabs[768 + threadIdx.x] = ROTR32(mixtabs[threadIdx.x], 24); + mixtabs[768 + threadIdx.x + 128] = ROTR32(mixtabs[threadIdx.x + 128], 24); } - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3]; + __syncthreads(); + const uint32_t nounce = (startNounce + thread); + + const uint32_t hashPosition = nounce - startNounce; + uint32_t *const Hash = &g_hash[hashPosition*16]; - #pragma unroll 16 +#pragma unroll 16 for (int i = 0; i < 16; i++) Hash[i] = cuda_swab32(Hash[i]); - __syncthreads(); uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09; uint32_t S10, S11, S12, S13, S14, S15, S16, S17, S18, S19; uint32_t S20, S21, S22, S23, S24, S25, S26, S27, S28, S29; uint32_t S30, S31, S32, S33, S34, S35; - uint32_t B27, B28, B29, B30, B31, B32, B33, B34, B35; - uint64_t bc = (uint64_t) 64 << 3; - uint32_t bclo = (uint32_t)(bc & 0xFFFFFFFFULL); - uint32_t bchi = (uint32_t)(bc >> 32); - - S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0; - S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027); - S24 = SPH_C32(0xd915f117); S25 = SPH_C32(0xb6eecc54); S26 = SPH_C32(0x06e8020b); S27 = SPH_C32(0x4a92efd1); - S28 = SPH_C32(0xaac6e2c9); S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f); - S32 = SPH_C32(0x25ea78e7); S33 = SPH_C32(0x951fddd6); S34 
= SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567); - - FUGUE512_3((Hash[0x0]), (Hash[0x1]), (Hash[0x2])); - FUGUE512_3((Hash[0x3]), (Hash[0x4]), (Hash[0x5])); - FUGUE512_3((Hash[0x6]), (Hash[0x7]), (Hash[0x8])); - FUGUE512_3((Hash[0x9]), (Hash[0xA]), (Hash[0xB])); - FUGUE512_3((Hash[0xC]), (Hash[0xD]), (Hash[0xE])); - FUGUE512_3((Hash[0xF]), bchi, bclo); - - //#pragma unroll + uint32_t B33, B34, B35; + + S02 = S03 = S05 = S06 = S09 = S10 = S11 = S12 = S13 = S14 = S16 = S17 = S18 = S19 = 0; + S20 = 0x8807a57eUL; S21 = 0xe616af75UL; S22 = 0xc5d3e4dbUL; S23 = 0xac9ab027UL; + S24 = 0xd915f117UL; S25 = 0xb6eecc54UL; S26 = 0x06e8020bUL; S27 = 0x4a92efd1UL; + S28 = 0xaac6e2c9UL; S29 = 0xddb21398UL; S30 = 0xcae65838UL; S31 = 0x437f203fUL; + S32 = 0x25ea78e7UL; S33 = 0x4c0a2cc1UL; S34 = 0xda6ed11dUL; S35 = 0xe13e3567UL; + + S01 = 0xd915f117UL; + S04 = 0x4a92efd1UL; + S07 = 0xcae65838UL; + S15 = 0xd915f117UL; + S00 = Hash[0]; + S08 = Hash[0]; + + uint32_t c0 = 0x9ae23283UL; + uint32_t c1 = 0x0361b92dUL; + uint32_t c2 = 0x4c92d8edUL; + uint32_t r0, r1, r2; + uint32_t tmp, tmp2, c3; + + tmp = mixtabs[__byte_perm(S00, 0, 17475)]; c3 = tmp; r0 = 0xafaf608aUL ^ tmp; + tmp = mixtabs[256 + __byte_perm(S00, 0, 17474)]; c3 ^= tmp; r1 = 0x79d5d51dUL ^ tmp; + tmp = mixtabs[512 + __byte_perm(S00, 0, 17473)]; c3 ^= tmp; r2 = 0xf6274f4fUL ^ tmp; + tmp = mixtabs[768 + __byte_perm(S00, 0, 17472)]; c3 ^= tmp; + tmp2 = __byte_perm(c0 ^ r0, c1 ^ r1, 13878); + tmp = __byte_perm(c2 ^ r2, c3 ^ 0x59947f59UL, 5140); + S33 = __byte_perm(tmp2, tmp, 12884); + r0 = ROL8(r0); r1 = ROL8(r1); + r2 = ROL8(r2); + tmp2 = __byte_perm(c1 ^ r0, c2 ^ r1, 13878); + tmp = __byte_perm(c3 ^ r2, c0 ^ 0x947f5959UL, 5140); + S34 = __byte_perm(tmp2, tmp, 12884); + r0 = ROL8(r0); r1 = ROL8(r1); + r2 = ROL8(r2); + tmp2 = __byte_perm(c2 ^ r0, c3 ^ r1, 13878); + tmp = __byte_perm(c0 ^ r2, c1 ^ 0x7f595994UL, 5140); + S35 = __byte_perm(tmp2, tmp, 12884); + r0 = ROL8(r0); r1 = ROL8(r1); + r2 = ROL8(r2); + tmp2 = __byte_perm(c3 
^ r0, c0 ^ r1, 13878); + tmp = __byte_perm(c1 ^ r2, c2 ^ 0x5959947fUL, 5140); + S00 = __byte_perm(tmp2, tmp, 12884); + + CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); + SMIX(S30, S31, S32, S33); + CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); + SMIX(S27, S28, S29, S30); + CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); + SMIX(S24, S25, S26, S27); + + TIX4(Hash[1], S24, S25, S28, S31, S32, S10, S12, S15, S18); + CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); + SMIX(S21, S22, S23, S24); + CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); + SMIX(S18, S19, S20, S21); + CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); + SMIX(S15, S16, S17, S18); + CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); + SMIX(S12, S13, S14, S15); + + TIX4(Hash[2], S12, S13, S16, S19, S20, S34, S00, S03, S06); + CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); + SMIX(S09, S10, S11, S12); + CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); + SMIX(S06, S07, S08, S09); + CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); + SMIX(S03, S04, S05, S06); + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); + SMIX(S00, S01, S02, S03); +#pragma unroll + for (int i = 3; i < (5 * 3); i += 3) + { + FUGUE512_3((Hash[i]), (Hash[i + 1]), (Hash[i + 2])); + } + TIX4(Hash[0xF], S00, S01, S04, S07, S08, S22, S24, S27, S30); + CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); + SMIX(S33, S34, S35, S00); + CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); + SMIX(S30, S31, S32, S33); + CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); + SMIX(S27, S28, S29, S30); + CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); + SMIX(S24, S25, S26, S27); + + S10 ^= S24; + S25 ^= S12; S28 ^= S15; S31 ^= S18; + S21 ^= S25; S22 ^= S26; S23 ^= S27; S03 ^= S25; S04 ^= S26; S05 ^= S27; + tmp = (*(mixtabs + ((__byte_perm(S21, 0, 0x4443))))); c0 = tmp; tmp = (*(mixtabs + (256 + (__byte_perm(S21, 0, 0x4442))))); c0 ^= tmp; r1 = tmp; tmp = (*(mixtabs + (512 + (__byte_perm(S21, 0, 
0x4441))))); c0 ^= tmp; r2 = tmp; tmp = (*(mixtabs + (768 + (__byte_perm(S21, 0, 0x4440))))); c0 ^= tmp; uint32_t r3 = tmp; tmp = (*(mixtabs + ((__byte_perm(S22, 0, 0x4443))))); c1 = tmp; r0 = tmp; tmp = (*(mixtabs + (256 + (__byte_perm(S22, 0, 0x4442))))); c1 ^= tmp; tmp = (*(mixtabs + (512 + (__byte_perm(S22, 0, 0x4441))))); c1 ^= tmp; r2 ^= tmp; tmp = (*(mixtabs + (768 + (__byte_perm(S22, 0, 0x4440))))); c1 ^= tmp; r3 ^= tmp; tmp = (*(mixtabs + ((__byte_perm(S23, 0, 0x4443))))); c2 = tmp; r0 ^= tmp; tmp = (*(mixtabs + (256 + (__byte_perm(S23, 0, 0x4442))))); c2 ^= tmp; r1 ^= tmp; tmp = (*(mixtabs + (512 + (__byte_perm(S23, 0, 0x4441))))); c2 ^= tmp; tmp = (*(mixtabs + (768 + (__byte_perm(S23, 0, 0x4440))))); c2 ^= tmp; r3 ^= tmp; + r0 ^= 0x63633297UL; + r1 ^= 0x97636332UL; + r2 ^= 0x32976363UL; + c3 = (0x63633297UL ^ 0x97636332UL ^ 0x32976363UL ^ 0x63329763UL); + tmp2 = __byte_perm((c0 ^ r0), (c1 ^ r1), 0x3636); tmp = __byte_perm((c2 ^ r2), (c3 ^ r3), 0x1414); + S21 = __byte_perm(tmp2, tmp, 0x3254); + r0 = ROTL32((r0), (8)); + r1 = ROTL32((r1), (8)); + r2 = ROTL32((r2), (8)); + r3 = ROTL32((r3), (8)); + tmp2 = __byte_perm((c1 ^ r0), (c2 ^ r1), 0x3636); tmp = __byte_perm((c3 ^ r2), (c0 ^ r3), 0x1414); + S22 = __byte_perm(tmp2, tmp, 0x3254); + r0 = ROTL32((r0), (8)); + r1 = ROTL32((r1), (8)); + r2 = ROTL32((r2), (8)); + r3 = ROTL32((r3), (8)); + tmp2 = __byte_perm((c2 ^ r0), (c3 ^ r1), 0x3636); tmp = __byte_perm((c0 ^ r2), (c1 ^ r3), 0x1414); + S23 = __byte_perm(tmp2, tmp, 0x3254); + r0 = ROTL32((r0), (8)); + r1 = ROTL32((r1), (8)); + r2 = ROTL32((r2), (8)); + r3 = ROTL32((r3), (8)); + tmp2 = __byte_perm((c3 ^ r0), (c0 ^ r1), 0x3636); tmp = __byte_perm((c1 ^ r2), (c2 ^ r3), 0x1414); + S24 = __byte_perm(tmp2, tmp, 0x3254); + + CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); + SMIX(S18, S19, S20, S21); + CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); + SMIX(S15, S16, S17, S18); + CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); + SMIX(S12, S13, S14, 
S15); + + S34 ^= S12; + S12 = (64 << 3); + S20 ^= S12; S13 ^= S00; S16 ^= S03; S19 ^= S06; + S09 ^= S13; S10 ^= S14; S11 ^= S15; S27 ^= S13; S28 ^= S14; S29 ^= S15; + tmp = (*(mixtabs + ((__byte_perm(S09, 0, 0x4443))))); c0 = tmp; tmp = (*(mixtabs + (256 + (__byte_perm(S09, 0, 0x4442))))); c0 ^= tmp; r1 = tmp; tmp = (*(mixtabs + (512 + (__byte_perm(S09, 0, 0x4441))))); c0 ^= tmp; r2 = tmp; tmp = (*(mixtabs + (768 + (__byte_perm(S09, 0, 0x4440))))); c0 ^= tmp; r3 = tmp; tmp = (*(mixtabs + ((__byte_perm(S10, 0, 0x4443))))); c1 = tmp; r0 = tmp; tmp = (*(mixtabs + (256 + (__byte_perm(S10, 0, 0x4442))))); c1 ^= tmp; tmp = (*(mixtabs + (512 + (__byte_perm(S10, 0, 0x4441))))); c1 ^= tmp; r2 ^= tmp; tmp = (*(mixtabs + (768 + (__byte_perm(S10, 0, 0x4440))))); c1 ^= tmp; r3 ^= tmp; tmp = (*(mixtabs + ((__byte_perm(S11, 0, 0x4443))))); c2 = tmp; r0 ^= tmp; tmp = (*(mixtabs + (256 + (__byte_perm(S11, 0, 0x4442))))); c2 ^= tmp; r1 ^= tmp; tmp = (*(mixtabs + (512 + (__byte_perm(S11, 0, 0x4441))))); c2 ^= tmp; tmp = (*(mixtabs + (768 + (__byte_perm(S11, 0, 0x4440))))); c2 ^= tmp; r3 ^= tmp; + r0 ^= 0x63633297UL; + r1 ^= 0x97636332UL; + r2 ^= 0x5ec77777UL; + c3 = (0x63633297UL ^ 0x97636332UL ^ 0x5ec77777UL ^ 0x63329763); + tmp2 = __byte_perm((c0 ^ r0), (c1 ^ r1), 0x3636); tmp = __byte_perm((c2 ^ r2), (c3 ^ r3), 0x1414); + S09 = __byte_perm(tmp2, tmp, 0x3254); + r0 = ROTL32((r0), (8)); + r1 = ROTL32((r1), (8)); + r2 = ROTL32((r2), (8)); + r3 = ROTL32((r3), (8)); + tmp2 = __byte_perm((c1 ^ r0), (c2 ^ r1), 0x3636); tmp = __byte_perm((c3 ^ r2), (c0 ^ r3), 0x1414); + S10 = __byte_perm(tmp2, tmp, 0x3254); + r0 = ROTL32((r0), (8)); + r1 = ROTL32((r1), (8)); + r2 = ROTL32((r2), (8)); + r3 = ROTL32((r3), (8)); + tmp2 = __byte_perm((c2 ^ r0), (c3 ^ r1), 0x3636); tmp = __byte_perm((c0 ^ r2), (c1 ^ r3), 0x1414); + S11 = __byte_perm(tmp2, tmp, 0x3254); + r0 = ROTL32((r0), (8)); + r1 = ROTL32((r1), (8)); + r2 = ROTL32((r2), (8)); + r3 = ROTL32((r3), (8)); + tmp2 = __byte_perm((c3 ^ r0), (c0 ^ 
r1), 0x3636); tmp = __byte_perm((c1 ^ r2), (c2 ^ r3), 0x1414); + S12 = __byte_perm(tmp2, tmp, 0x3254); + + CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); + SMIX(S06, S07, S08, S09); + CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); + SMIX(S03, S04, S05, S06); + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); + SMIX(S00, S01, S02, S03); + + //#pragma unroll for (int i = 0; i < 32; i++) { ROR3; CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); SMIX(S00, S01, S02, S03); } - //#pragma unroll - for (int i = 0; i < 13; i++) { + //#pragma unroll + for (int i = 0; i < 13; i++) + { S04 ^= S00; S09 ^= S00; S18 ^= S00; S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; - S10 ^= S00; - S18 ^= S00; - S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; - S10 ^= S00; - S19 ^= S00; - S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; - S10 ^= S00; - S19 ^= S00; - S28 ^= S00; - ROR8; - SMIX(S00, S01, S02, S03); + SMIX(S27, S28, S29, S30); + S31 ^= S27; + S01 ^= S27; + S09 ^= S27; + S18 ^= S27; + SMIX(S18, S19, S20, S21); + S22 ^= S18; + S28 ^= S18; + S01 ^= S18; + S09 ^= S18; + SMIX(S09, S10, S11, S12); + S13 ^= S09; + S19 ^= S09; + S28 ^= S09; + S01 ^= S09; + SMIX(S01, S02, S03, S04); + ROL1; } S04 ^= S00; S09 ^= S00; @@ -667,29 +847,25 @@ void x13_fugue512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t * } } -__global__ __launch_bounds__(128, 7) -void x13_fugue512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint64_t *const __restrict__ g_hash, uint32_t *const __restrict__ g_nonceVector, uint32_t *const __restrict__ d_nonce) +__global__ __launch_bounds__(256,3) +void x13_fugue512_gpu_hash_64_final(const uint32_t threads, const uint32_t startNounce, const uint32_t *const __restrict__ g_hash, uint32_t *const __restrict__ d_nonce) { __shared__ uint32_t mixtabs[1024]; - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread< 
threads) { - if(threadIdx.x < 128) + if (threadIdx.x < 256) { - *((uint32_t*)mixtabs + (threadIdx.x)) = tex1Dfetch(mixTab0Tex, threadIdx.x); - *((uint32_t*)mixtabs + (128 + threadIdx.x)) = tex1Dfetch(mixTab0Tex, threadIdx.x + 128); - *((uint32_t*)mixtabs + (256 + threadIdx.x)) = tex1Dfetch(mixTab1Tex, threadIdx.x); - *((uint32_t*)mixtabs + (256 + 128 + threadIdx.x)) = tex1Dfetch(mixTab1Tex, threadIdx.x + 128); - *((uint32_t*)mixtabs + (512 + threadIdx.x)) = tex1Dfetch(mixTab2Tex, threadIdx.x); - *((uint32_t*)mixtabs + (512 + 128 + threadIdx.x)) = tex1Dfetch(mixTab2Tex, threadIdx.x + 128); - *((uint32_t*)mixtabs + (768 + threadIdx.x)) = tex1Dfetch(mixTab3Tex, threadIdx.x); - *((uint32_t*)mixtabs + (768 + 128 + threadIdx.x)) = tex1Dfetch(mixTab3Tex, threadIdx.x + 128); + mixtabs[threadIdx.x] = mixTab0Tex[threadIdx.x]; + mixtabs[256 + threadIdx.x] = ROTR32(mixtabs[threadIdx.x], 8); + mixtabs[(512 + threadIdx.x)] = ROTR32(mixtabs[threadIdx.x], 16); + mixtabs[(768 + threadIdx.x)] = ROTR32(mixtabs[threadIdx.x], 24); } - uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint32_t *h = (uint32_t*)&g_hash[hashPosition * 8]; + __syncthreads(); + const uint32_t nounce = (startNounce + thread); + const uint32_t hashPosition = nounce - startNounce; + const uint32_t *h = &g_hash[hashPosition * 16]; uint32_t Hash[16]; #pragma unroll 16 for (int i = 0; i < 16; i++) @@ -700,24 +876,175 @@ void x13_fugue512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint uint32_t S20, S21, S22, S23, S24, S25, S26, S27, S28, S29; uint32_t S30, S31, S32, S33, S34, S35; - uint32_t B27, B28, B29, B30, B31, B32, B33, B34, B35; - uint64_t bc = (uint64_t)64 << 3; - uint32_t bclo = (uint32_t)(bc & 0xFFFFFFFFULL); - uint32_t bchi = (uint32_t)(bc >> 32); + uint32_t B33, B34, B35; - S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0; - S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027); - S24 = SPH_C32(0xd915f117); S25 = SPH_C32(0xb6eecc54); S26 = SPH_C32(0x06e8020b); S27 = SPH_C32(0x4a92efd1); - S28 = SPH_C32(0xaac6e2c9); S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f); - S32 = SPH_C32(0x25ea78e7); S33 = SPH_C32(0x951fddd6); S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567); - __syncthreads(); + S02 = S03 = S05 = S06 = S09 = S10 = S11 = S12 = S13 = S14 = S16 = S17 = S18 = S19 = 0; + S20 = 0x8807a57eUL; S21 = 0xe616af75UL; S22 = 0xc5d3e4dbUL; S23 = 0xac9ab027UL; + S24 = 0xd915f117UL; S25 = 0xb6eecc54UL; S26 = 0x06e8020bUL; S27 = 0x4a92efd1UL; + S28 = 0xaac6e2c9UL; S29 = 0xddb21398UL; S30 = 0xcae65838UL; S31 = 0x437f203fUL; + S32 = 0x25ea78e7UL; S33 = 0x4c0a2cc1UL; S34 = 0xda6ed11dUL; S35 = 0xe13e3567UL; + + S01 = 0xd915f117UL; + S04 = 0x4a92efd1UL; + S07 = 0xcae65838UL; + S15 = 0xd915f117UL; + S00 = Hash[0]; + S08 = Hash[0]; + + uint32_t c0 = 0x9ae23283UL; + uint32_t c1 = 0x0361b92dUL; + uint32_t c2 = 
0x4c92d8edUL; + uint32_t r0, r1, r2; + uint32_t tmp, tmp2, c3; + + tmp = mixtabs[__byte_perm(S00, 0, 17475)]; c3 = tmp; r0 = 0xafaf608aUL ^ tmp; + tmp = mixtabs[256 + __byte_perm(S00, 0, 17474)]; c3 ^= tmp; r1 = 0x79d5d51dUL ^ tmp; + tmp = mixtabs[512 + __byte_perm(S00, 0, 17473)]; c3 ^= tmp; r2 = 0xf6274f4fUL ^ tmp; + tmp = mixtabs[768 + __byte_perm(S00, 0, 17472)]; c3 ^= tmp; + tmp2 = __byte_perm(c0 ^ r0, c1 ^ r1, 13878); + tmp = __byte_perm(c2 ^ r2, c3 ^ 0x59947f59UL, 5140); + S33 = __byte_perm(tmp2, tmp, 12884); + r0 = ROL8(r0); r1 = ROL8(r1); + r2 = ROL8(r2); + tmp2 = __byte_perm(c1 ^ r0, c2 ^ r1, 13878); + tmp = __byte_perm(c3 ^ r2, c0 ^ 0x947f5959UL, 5140); + S34 = __byte_perm(tmp2, tmp, 12884); + r0 = ROL8(r0); r1 = ROL8(r1); + r2 = ROL8(r2); + tmp2 = __byte_perm(c2 ^ r0, c3 ^ r1, 13878); + tmp = __byte_perm(c0 ^ r2, c1 ^ 0x7f595994UL, 5140); + S35 = __byte_perm(tmp2, tmp, 12884); + r0 = ROL8(r0); r1 = ROL8(r1); + r2 = ROL8(r2); + tmp2 = __byte_perm(c3 ^ r0, c0 ^ r1, 13878); + tmp = __byte_perm(c1 ^ r2, c2 ^ 0x5959947fUL, 5140); + S00 = __byte_perm(tmp2, tmp, 12884); + + CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); + SMIX(S30, S31, S32, S33); + CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); + SMIX(S27, S28, S29, S30); + CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); + SMIX(S24, S25, S26, S27); + + TIX4(Hash[1], S24, S25, S28, S31, S32, S10, S12, S15, S18); + CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); + SMIX(S21, S22, S23, S24); + CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); + SMIX(S18, S19, S20, S21); + CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); + SMIX(S15, S16, S17, S18); + CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); + SMIX(S12, S13, S14, S15); + + TIX4(Hash[2], S12, S13, S16, S19, S20, S34, S00, S03, S06); + CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); + SMIX(S09, S10, S11, S12); + CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); + SMIX(S06, S07, S08, S09); + CMIX36(S03, S04, S05, S07, 
S08, S09, S21, S22, S23); + SMIX(S03, S04, S05, S06); + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); + SMIX(S00, S01, S02, S03); + +#pragma unroll + for (int i = 3; i < (5 * 3); i += 3) + { + FUGUE512_3((Hash[i]), (Hash[i + 1]), (Hash[i + 2])); + } + TIX4(Hash[0xF], S00, S01, S04, S07, S08, S22, S24, S27, S30); + CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); + SMIX(S33, S34, S35, S00); + CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); + SMIX(S30, S31, S32, S33); + CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); + SMIX(S27, S28, S29, S30); + CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); + SMIX(S24, S25, S26, S27); + + S10 ^= S24; + S25 ^= S12; S28 ^= S15; S31 ^= S18; + S21 ^= S25; S22 ^= S26; S23 ^= S27; S03 ^= S25; S04 ^= S26; S05 ^= S27; + tmp = (*(mixtabs + ((__byte_perm(S21, 0, 0x4443))))); c0 = tmp; tmp = (*(mixtabs + (256 + (__byte_perm(S21, 0, 0x4442))))); c0 ^= tmp; r1 = tmp; tmp = (*(mixtabs + (512 + (__byte_perm(S21, 0, 0x4441))))); c0 ^= tmp; r2 = tmp; tmp = (*(mixtabs + (768 + (__byte_perm(S21, 0, 0x4440))))); c0 ^= tmp; uint32_t r3 = tmp; tmp = (*(mixtabs + ((__byte_perm(S22, 0, 0x4443))))); c1 = tmp; r0 = tmp; tmp = (*(mixtabs + (256 + (__byte_perm(S22, 0, 0x4442))))); c1 ^= tmp; tmp = (*(mixtabs + (512 + (__byte_perm(S22, 0, 0x4441))))); c1 ^= tmp; r2 ^= tmp; tmp = (*(mixtabs + (768 + (__byte_perm(S22, 0, 0x4440))))); c1 ^= tmp; r3 ^= tmp; tmp = (*(mixtabs + ((__byte_perm(S23, 0, 0x4443))))); c2 = tmp; r0 ^= tmp; tmp = (*(mixtabs + (256 + (__byte_perm(S23, 0, 0x4442))))); c2 ^= tmp; r1 ^= tmp; tmp = (*(mixtabs + (512 + (__byte_perm(S23, 0, 0x4441))))); c2 ^= tmp; tmp = (*(mixtabs + (768 + (__byte_perm(S23, 0, 0x4440))))); c2 ^= tmp; r3 ^= tmp; + r0 ^= 0x63633297UL; + r1 ^= 0x97636332UL; + r2 ^= 0x32976363UL; + c3 = (0x63633297UL^0x97636332UL^0x32976363UL^0x63329763UL); + tmp2 = __byte_perm((c0 ^ r0), (c1 ^ r1), 0x3636); tmp = __byte_perm((c2 ^ r2), (c3 ^ r3), 0x1414); + S21 = __byte_perm(tmp2, tmp, 0x3254); + r0 = 
ROTL32((r0), (8)); + r1 = ROTL32((r1), (8)); + r2 = ROTL32((r2), (8)); + r3 = ROTL32((r3), (8)); + tmp2 = __byte_perm((c1 ^ r0), (c2 ^ r1), 0x3636); tmp = __byte_perm((c3 ^ r2), (c0 ^ r3), 0x1414); + S22 = __byte_perm(tmp2, tmp, 0x3254); + r0 = ROTL32((r0), (8)); + r1 = ROTL32((r1), (8)); + r2 = ROTL32((r2), (8)); + r3 = ROTL32((r3), (8)); + tmp2 = __byte_perm((c2 ^ r0), (c3 ^ r1), 0x3636); tmp = __byte_perm((c0 ^ r2), (c1 ^ r3), 0x1414); + S23 = __byte_perm(tmp2, tmp, 0x3254); + r0 = ROTL32((r0), (8)); + r1 = ROTL32((r1), (8)); + r2 = ROTL32((r2), (8)); + r3 = ROTL32((r3), (8)); + tmp2 = __byte_perm((c3 ^ r0), (c0 ^ r1), 0x3636); tmp = __byte_perm((c1 ^ r2), (c2 ^ r3), 0x1414); + S24 = __byte_perm(tmp2, tmp, 0x3254); + + CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); + SMIX(S18, S19, S20, S21); + CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); + SMIX(S15, S16, S17, S18); + CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); + SMIX(S12, S13, S14, S15); - FUGUE512_3((Hash[0x0]), (Hash[0x1]), (Hash[0x2])); - FUGUE512_3((Hash[0x3]), (Hash[0x4]), (Hash[0x5])); - FUGUE512_3((Hash[0x6]), (Hash[0x7]), (Hash[0x8])); - FUGUE512_3((Hash[0x9]), (Hash[0xA]), (Hash[0xB])); - FUGUE512_3((Hash[0xC]), (Hash[0xD]), (Hash[0xE])); - FUGUE512_3((Hash[0xF]), bchi, bclo); + S34 ^= S12; + S12 = (64 << 3); + S20 ^= S12; S13 ^= S00; S16 ^= S03; S19 ^= S06; + S09 ^= S13; S10 ^= S14; S11 ^= S15; S27 ^= S13; S28 ^= S14; S29 ^= S15; + c0 = (*(mixtabs + ((__byte_perm(S09, 0, 0x4443))))); + tmp = (*(mixtabs + (256 + (__byte_perm(S09, 0, 0x4442))))); c0 ^= tmp; r1 = tmp; + tmp = (*(mixtabs + (512 + (__byte_perm(S09, 0, 0x4441))))); c0 ^= tmp; r2 = tmp; + tmp = (*(mixtabs + (768 + (__byte_perm(S09, 0, 0x4440))))); c0 ^= tmp; r3 = tmp; + tmp = (*(mixtabs + ((__byte_perm(S10, 0, 0x4443))))); c1 = tmp; r0 = tmp; + tmp = (*(mixtabs + (256 + (__byte_perm(S10, 0, 0x4442))))); c1 ^= tmp; + tmp = (*(mixtabs + (512 + (__byte_perm(S10, 0, 0x4441))))); c1 ^= tmp; r2 ^= tmp; + tmp = (*(mixtabs + 
(768 + (__byte_perm(S10, 0, 0x4440))))); c1 ^= tmp; r3 ^= tmp; + tmp = (*(mixtabs + ((__byte_perm(S11, 0, 0x4443))))); c2 = tmp; r0 ^= tmp; + tmp = (*(mixtabs + (256 + (__byte_perm(S11, 0, 0x4442))))); c2 ^= tmp; r1 ^= tmp; + tmp = (*(mixtabs + (512 + (__byte_perm(S11, 0, 0x4441))))); c2 ^= tmp; + tmp = (*(mixtabs + (768 + (__byte_perm(S11, 0, 0x4440))))); c2 ^= tmp; r3 ^= tmp; + r0 ^= 0x63633297UL; + r1 ^= 0x97636332UL; + r2 ^= 0x5ec77777UL; + c3 = (0x63633297UL^0x97636332UL^0x5ec77777UL^0x63329763); + tmp2 = __byte_perm((c0 ^ r0), (c1 ^ r1), 0x3636); tmp = __byte_perm((c2 ^ r2), (c3 ^ r3), 0x1414); + S09 = __byte_perm(tmp2, tmp, 0x3254); + r0 = ROTL32((r0), (8)); + r1 = ROTL32((r1), (8)); + r2 = ROTL32((r2), (8)); + r3 = ROTL32((r3), (8)); + tmp2 = __byte_perm((c1 ^ r0), (c2 ^ r1), 0x3636); tmp = __byte_perm((c3 ^ r2), (c0 ^ r3), 0x1414); + S10 = __byte_perm(tmp2, tmp, 0x3254); + r0 = ROTL32((r0), (8)); + r1 = ROTL32((r1), (8)); + r2 = ROTL32((r2), (8)); + r3 = ROTL32((r3), (8)); + tmp2 = __byte_perm((c2 ^ r0), (c3 ^ r1), 0x3636); tmp = __byte_perm((c0 ^ r2), (c1 ^ r3), 0x1414); + S11 = __byte_perm(tmp2, tmp, 0x3254); + r0 = ROTL32((r0), (8)); + r1 = ROTL32((r1), (8)); + r2 = ROTL32((r2), (8)); + r3 = ROTL32((r3), (8)); + tmp2 = __byte_perm((c3 ^ r0), (c0 ^ r1), 0x3636); tmp = __byte_perm((c1 ^ r2), (c2 ^ r3), 0x1414); + S12 = __byte_perm(tmp2, tmp, 0x3254); + + CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); + SMIX(S06, S07, S08, S09); + CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); + SMIX(S03, S04, S05, S06); + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); + SMIX(S00, S01, S02, S03); //#pragma unroll 32 for (int i = 0; i < 32; i++) { @@ -726,86 +1053,76 @@ void x13_fugue512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint SMIX(S00, S01, S02, S03); } //#pragma unroll 13 - for (int i = 0; i < 12; i++) + for (int i = 0; i < 11; i++) { S04 ^= S00; S09 ^= S00; S18 ^= S00; S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; 
- S10 ^= S00; - S18 ^= S00; - S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; - S10 ^= S00; - S19 ^= S00; - S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; - S10 ^= S00; - S19 ^= S00; - S28 ^= S00; - ROR8; - SMIX(S00, S01, S02, S03); + SMIX(S27, S28, S29, S30); + S31 ^= S27; + S01 ^= S27; + S09 ^= S27; + S18 ^= S27; + SMIX(S18, S19, S20, S21); + S22 ^= S18; + S28 ^= S18; + S01 ^= S18; + S09 ^= S18; + SMIX(S09, S10, S11, S12); + S13 ^= S09; + S19 ^= S09; + S28 ^= S09; + S01 ^= S09; + SMIX(S01, S02, S03, S04); + ROL1; } S04 ^= S00; S09 ^= S00; S18 ^= S00; S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; - S10 ^= S00; - S18 ^= S00; - S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; - S10 ^= S00; - S19 ^= S00; - S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - - - S04 ^= S00; - if (cuda_swab32(S04) <= pTarget[7]) + SMIX(S27, S28, S29, S30); + S31 ^= S27; + S01 ^= S27; + S09 ^= S27; + S18 ^= S27; + SMIX(S18, S19, S20, S21); + S22 ^= S18; + S28 ^= S18; + S01 ^= S18; + S09 ^= S18; + SMIX(S09, S10, S11, S12); + S13 ^= S09; + S19 ^= S09; + S28 ^= S09; + S01 ^= S09; + SMIX0(S01, S02, S03, S04); + S10 ^= S01; + S19 ^= S01; + S28 ^= S01; + SMIX0(S28, S29, S30, S31); + S10 ^= S28; + S19 ^= S28; + SMIX0(S19, S20, S21, S22); + S10 ^= S19; + SMIX0(S10, S11, S12, S13); + S14 ^= S10; + if (cuda_swab32(S14) <= pTarget[7]) { - if (d_nonce[0] != 0xffffffff) - { - if (d_nonce[0] < nounce) d_nonce[0] = nounce; - } - else d_nonce[0] = nounce; + uint32_t tmp = atomicExch(d_nonce, nounce); + if (tmp != 0xffffffff) + d_nonce[1] = tmp; } } } -#define texDef(texname, texmem, texsource, texsize) \ - unsigned int *texmem; \ - cudaMalloc(&texmem, texsize); \ - cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ - texname.normalized = 0; \ - texname.filterMode = cudaFilterModePoint; \ - texname.addressMode[0] = cudaAddressModeClamp; \ - { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ - cudaBindTexture(NULL, 
&texname, texmem, &channelDesc, texsize ); } - __host__ void x13_fugue512_cpu_init(int thr_id, uint32_t threads) { - texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256); - texDef(mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256); - texDef(mixTab2Tex, mixTab2m, mixtab2_cpu, sizeof(uint32_t)*256); - texDef(mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256); - cudaMalloc(&d_nonce[thr_id], sizeof(uint32_t)); + cudaMalloc(&d_nonce[thr_id], 2*sizeof(uint32_t)); } -__host__ void x13_fugue512_cpu_setTarget(const void *ptarget) +__host__ void x13_fugue512_cpu_setTarget(int thr_id, const void *ptarget) { - CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, ptarget, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(pTarget, ptarget, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); } @@ -815,7 +1132,7 @@ __host__ void x13_fugue512_cpu_free(int32_t thr_id) cudaFreeHost(&d_nonce[thr_id]); } -__host__ void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) { const uint32_t threadsperblock = 128; @@ -825,21 +1142,20 @@ __host__ void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - x13_fugue512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x13_fugue512_gpu_hash_64<<>>(threads, startNounce, d_hash); // MyStreamSynchronize(NULL, order, thr_id); } -__host__ uint32_t x13_fugue512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void x13_fugue512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, uint32_t *res) { - const uint32_t 
threadsperblock = 128; + const uint32_t threadsperblock = 256; // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - cudaMemset(d_nonce[thr_id], 0xff, sizeof(uint32_t)); + cudaMemsetAsync(d_nonce[thr_id], 0xff, 2 * sizeof(uint32_t), gpustream[thr_id]); - x13_fugue512_gpu_hash_64_final << > >(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_nonce[thr_id]); - uint32_t res; - cudaMemcpy(&res, d_nonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); - return res; + x13_fugue512_gpu_hash_64_final << >>(threads, startNounce, d_hash, d_nonce[thr_id]); + cudaMemcpyAsync(res, d_nonce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); + cudaStreamSynchronize(gpustream[thr_id]); } diff --git a/x13/cuda_x13_hamsi512.cu b/x13/cuda_x13_hamsi512.cu index 47aab23c70..0070c80a52 100644 --- a/x13/cuda_x13_hamsi512.cu +++ b/x13/cuda_x13_hamsi512.cu @@ -1,463 +1,402 @@ -#include +#ifdef __cplusplus +#include +#include +using namespace std; +#else #include +#include +#endif #include +#include "cuda_helper.h" -typedef unsigned char BitSequence; - -#include "cuda_helper.h" - -#undef SPH_C32 -#define SPH_C32(x) (x) -#undef SPH_T32 -#define SPH_T32(x) (x) - -static __constant__ uint32_t d_alpha_n[32] = { - SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc), - SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), - SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc), - SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0), - SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0), - SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0), - SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00), - SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc), - SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0), - SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0), - SPH_C32(0xff00aaaa), 
SPH_C32(0xccccf0f0) -}; -static __constant__ uint32_t d_alpha_f[32] = { - SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0), - SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), - SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0), - SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c), - SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c), - SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c), - SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9), - SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0), - SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c), - SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c), - SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c) +static __constant__ uint32_t d_T512[4096/4] = { + 0xef0b0270, 0x3afd0000, 0x5dae0000, + 0x69490000, 0x9b0f3c06, 0x4405b5f9, + 0x66140a51, 0x924f5d0a, 0xc96b0030, + 0xe7250000, 0x2f840000, 0x264f0000, + 0x08695bf9, 0x6dfcf137, 0x509f6984, + 0x9e69af68, + 0xc96b0030, 0xe7250000, 0x2f840000, + 0x264f0000, 0x08695bf9, 0x6dfcf137, + 0x509f6984, 0x9e69af68, 0x26600240, + 0xddd80000, 0x722a0000, 0x4f060000, + 0x936667ff, 0x29f944ce, 0x368b63d5, + 0x0c26f262, + 0x145a3c00, 0xb9e90000, 0x61270000, + 0xf1610000, 0xce613d6c, 0xb0493d78, + 0x47a96720, 0xe18e24c5, 0x23671400, + 0xc8b90000, 0xf4c70000, 0xfb750000, + 0x73cd2465, 0xf8a6a549, 0x02c40a3f, + 0xdc24e61f, + 0x23671400, 0xc8b90000, 0xf4c70000, + 0xfb750000, 0x73cd2465, 0xf8a6a549, + 0x02c40a3f, 0xdc24e61f, 0x373d2800, + 0x71500000, 0x95e00000, 0x0a140000, + 0xbdac1909, 0x48ef9831, 0x456d6d1f, + 0x3daac2da, + 0x54285c00, 0xeaed0000, 0xc5d60000, + 0xa1c50000, 0xb3a26770, 0x94a5c4e1, + 0x6bb0419d, 0x551b3782, 0x9cbb1800, + 0xb0d30000, 0x92510000, 0xed930000, + 0x593a4345, 0xe114d5f4, 0x430633da, + 0x78cace29, + 0x9cbb1800, 0xb0d30000, 0x92510000, + 0xed930000, 0x593a4345, 0xe114d5f4, + 0x430633da, 0x78cace29, 0xc8934400, + 0x5a3e0000, 0x57870000, 0x4c560000, + 0xea982435, 
0x75b11115, 0x28b67247, + 0x2dd1f9ab, + 0x29449c00, 0x64e70000, 0xf24b0000, + 0xc2f30000, 0x0ede4e8f, 0x56c23745, + 0xf3e04259, 0x8d0d9ec4, 0x466d0c00, + 0x08620000, 0xdd5d0000, 0xbadd0000, + 0x6a927942, 0x441f2b93, 0x218ace6f, + 0xbf2c0be2, + 0x466d0c00, 0x08620000, 0xdd5d0000, + 0xbadd0000, 0x6a927942, 0x441f2b93, + 0x218ace6f, 0xbf2c0be2, 0x6f299000, + 0x6c850000, 0x2f160000, 0x782e0000, + 0x644c37cd, 0x12dd1cd6, 0xd26a8c36, + 0x32219526, + 0xf6800005, 0x3443c000, 0x24070000, + 0x8f3d0000, 0x21373bfb, 0x0ab8d5ae, + 0xcdc58b19, 0xd795ba31, 0xa67f0001, + 0x71378000, 0x19fc0000, 0x96db0000, + 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, + 0xac8e6c88, + 0xa67f0001, 0x71378000, 0x19fc0000, + 0x96db0000, 0x3a8b6dfd, 0xebcaaef3, + 0x2c6d478f, 0xac8e6c88, 0x50ff0004, + 0x45744000, 0x3dfb0000, 0x19e60000, + 0x1bbc5606, 0xe1727b5d, 0xe1a8cc96, + 0x7b1bd6b9, + 0xf7750009, 0xcf3cc000, 0xc3d60000, + 0x04920000, 0x029519a9, 0xf8e836ba, + 0x7a87f14e, 0x9e16981a, 0xd46a0000, + 0x8dc8c000, 0xa5af0000, 0x4a290000, + 0xfc4e427a, 0xc9b4866c, 0x98369604, + 0xf746c320, + 0xd46a0000, 0x8dc8c000, 0xa5af0000, + 0x4a290000, 0xfc4e427a, 0xc9b4866c, + 0x98369604, 0xf746c320, 0x231f0009, + 0x42f40000, 0x66790000, 0x4ebb0000, + 0xfedb5bd3, 0x315cb0d6, 0xe2b1674a, + 0x69505b3a, + 0x774400f0, 0xf15a0000, 0xf5b20000, + 0x34140000, 0x89377e8c, 0x5a8bec25, + 0x0bc3cd1e, 0xcf3775cb, 0xf46c0050, + 0x96180000, 0x14a50000, 0x031f0000, + 0x42947eb8, 0x66bf7e19, 0x9ca470d2, + 0x8a341574, + 0xf46c0050, 0x96180000, 0x14a50000, + 0x031f0000, 0x42947eb8, 0x66bf7e19, + 0x9ca470d2, 0x8a341574, 0x832800a0, + 0x67420000, 0xe1170000, 0x370b0000, + 0xcba30034, 0x3c34923c, 0x9767bdcc, + 0x450360bf, + 0xe8870170, 0x9d720000, 0x12db0000, + 0xd4220000, 0xf2886b27, 0xa921e543, + 0x4ef8b518, 0x618813b1, 0xb4370060, + 0x0c4c0000, 0x56c20000, 0x5cae0000, + 0x94541f3f, 0x3b3ef825, 0x1b365f3d, + 0xf3d45758, + 0xb4370060, 0x0c4c0000, 0x56c20000, + 0x5cae0000, 0x94541f3f, 0x3b3ef825, + 0x1b365f3d, 0xf3d45758, 0x5cb00110, + 
0x913e0000, 0x44190000, 0x888c0000, + 0x66dc7418, 0x921f1d66, 0x55ceea25, + 0x925c44e9, + 0x0c720000, 0x49e50f00, 0x42790000, + 0x5cea0000, 0x33aa301a, 0x15822514, + 0x95a34b7b, 0xb44b0090, 0xfe220000, + 0xa7580500, 0x25d10000, 0xf7600000, + 0x893178da, 0x1fd4f860, 0x4ed0a315, + 0xa123ff9f, + 0xfe220000, 0xa7580500, 0x25d10000, + 0xf7600000, 0x893178da, 0x1fd4f860, + 0x4ed0a315, 0xa123ff9f, 0xf2500000, + 0xeebd0a00, 0x67a80000, 0xab8a0000, + 0xba9b48c0, 0x0a56dd74, 0xdb73e86e, + 0x1568ff0f, + 0x45180000, 0xa5b51700, 0xf96a0000, + 0x3b480000, 0x1ecc142c, 0x231395d6, + 0x16bca6b0, 0xdf33f4df, 0xb83d0000, + 0x16710600, 0x379a0000, 0xf5b10000, + 0x228161ac, 0xae48f145, 0x66241616, + 0xc5c1eb3e, + 0xb83d0000, 0x16710600, 0x379a0000, + 0xf5b10000, 0x228161ac, 0xae48f145, + 0x66241616, 0xc5c1eb3e, 0xfd250000, + 0xb3c41100, 0xcef00000, 0xcef90000, + 0x3c4d7580, 0x8d5b6493, 0x7098b0a6, + 0x1af21fe1, + 0x75a40000, 0xc28b2700, 0x94a40000, + 0x90f50000, 0xfb7857e0, 0x49ce0bae, + 0x1767c483, 0xaedf667e, 0xd1660000, + 0x1bbc0300, 0x9eec0000, 0xf6940000, + 0x03024527, 0xcf70fcf2, 0xb4431b17, + 0x857f3c2b, + 0xd1660000, 0x1bbc0300, 0x9eec0000, + 0xf6940000, 0x03024527, 0xcf70fcf2, + 0xb4431b17, 0x857f3c2b, 0xa4c20000, + 0xd9372400, 0x0a480000, 0x66610000, + 0xf87a12c7, 0x86bef75c, 0xa324df94, + 0x2ba05a55, + 0x75c90003, 0x0e10c000, 0xd1200000, + 0xbaea0000, 0x8bc42f3e, 0x8758b757, + 0xbb28761d, 0x00b72e2b, 0xeecf0001, + 0x6f564000, 0xf33e0000, 0xa79e0000, + 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, + 0xfeabf254, + 0xeecf0001, 0x6f564000, 0xf33e0000, + 0xa79e0000, 0xbdb57219, 0xb711ebc5, + 0x4a3b40ba, 0xfeabf254, 0x9b060002, + 0x61468000, 0x221e0000, 0x1d740000, + 0x36715d27, 0x30495c92, 0xf11336a7, + 0xfe1cdc7f, + 0x86790000, 0x3f390002, 0xe19ae000, + 0x98560000, 0x9565670e, 0x4e88c8ea, + 0xd3dd4944, 0x161ddab9, 0x30b70000, + 0xe5d00000, 0xf4f46000, 0x42c40000, + 0x63b83d6a, 0x78ba9460, 0x21afa1ea, + 0xb0a51834, + 0x30b70000, 0xe5d00000, 0xf4f46000, + 0x42c40000, 0x63b83d6a, 0x78ba9460, 
+ 0x21afa1ea, 0xb0a51834, 0xb6ce0000, + 0xdae90002, 0x156e8000, 0xda920000, + 0xf6dd5a64, 0x36325c8a, 0xf272e8ae, + 0xa6b8c28d, + 0x14190000, 0x23ca003c, 0x50df0000, + 0x44b60000, 0x1b6c67b0, 0x3cf3ac75, + 0x61e610b0, 0xdbcadb80, 0xe3430000, + 0x3a4e0014, 0xf2c60000, 0xaa4e0000, + 0xdb1e42a6, 0x256bbe15, 0x123db156, + 0x3a4e99d7, + 0xe3430000, 0x3a4e0014, 0xf2c60000, + 0xaa4e0000, 0xdb1e42a6, 0x256bbe15, + 0x123db156, 0x3a4e99d7, 0xf75a0000, + 0x19840028, 0xa2190000, 0xeef80000, + 0xc0722516, 0x19981260, 0x73dba1e6, + 0xe1844257, + 0x54500000, 0x0671005c, 0x25ae0000, + 0x6a1e0000, 0x2ea54edf, 0x664e8512, + 0xbfba18c3, 0x7e715d17, 0xbc8d0000, + 0xfc3b0018, 0x19830000, 0xd10b0000, + 0xae1878c4, 0x42a69856, 0x0012da37, + 0x2c3b504e, + 0xbc8d0000, 0xfc3b0018, 0x19830000, + 0xd10b0000, 0xae1878c4, 0x42a69856, + 0x0012da37, 0x2c3b504e, 0xe8dd0000, + 0xfa4a0044, 0x3c2d0000, 0xbb150000, + 0x80bd361b, 0x24e81d44, 0xbfa8c2f4, + 0x524a0d59, + 0x69510000, 0xd4e1009c, 0xc3230000, + 0xac2f0000, 0xe4950bae, 0xcea415dc, + 0x87ec287c, 0xbce1a3ce, 0xc6730000, + 0xaf8d000c, 0xa4c10000, 0x218d0000, + 0x23111587, 0x7913512f, 0x1d28ac88, + 0x378dd173, + 0xc6730000, 0xaf8d000c, 0xa4c10000, + 0x218d0000, 0x23111587, 0x7913512f, + 0x1d28ac88, 0x378dd173, 0xaf220000, + 0x7b6c0090, 0x67e20000, 0x8da20000, + 0xc7841e29, 0xb7b744f3, 0x9ac484f4, + 0x8b6c72bd, + 0xcc140000, 0xa5630000, 0x5ab90780, + 0x3b500000, 0x4bd013ff, 0x879b3418, + 0x694348c1, 0xca5a87fe, 0x819e0000, + 0xec570000, 0x66320280, 0x95f30000, + 0x5da92802, 0x48f43cbc, 0xe65aa22d, + 0x8e67b7fa, + 0x819e0000, 0xec570000, 0x66320280, + 0x95f30000, 0x5da92802, 0x48f43cbc, + 0xe65aa22d, 0x8e67b7fa, 0x4d8a0000, + 0x49340000, 0x3c8b0500, 0xaea30000, + 0x16793bfd, 0xcf6f08a4, 0x8f19eaec, + 0x443d3004, + 0x78230000, 0x12fc0000, 0xa93a0b80, + 0x90a50000, 0x713e2879, 0x7ee98924, + 0xf08ca062, 0x636f8bab, 0x02af0000, + 0xb7280000, 0xba1c0300, 0x56980000, + 0xba8d45d3, 0x8048c667, 0xa95c149a, + 0xf4f6ea7b, + 0x02af0000, 0xb7280000, 
0xba1c0300, + 0x56980000, 0xba8d45d3, 0x8048c667, + 0xa95c149a, 0xf4f6ea7b, 0x7a8c0000, + 0xa5d40000, 0x13260880, 0xc63d0000, + 0xcbb36daa, 0xfea14f43, 0x59d0b4f8, + 0x979961d0, + 0xac480000, 0x1ba60000, 0x45fb1380, + 0x03430000, 0x5a85316a, 0x1fb250b6, + 0xfe72c7fe, 0x91e478f6, 0x1e4e0000, + 0xdecf0000, 0x6df80180, 0x77240000, + 0xec47079e, 0xf4a0694e, 0xcda31812, + 0x98aa496e, + 0x1e4e0000, 0xdecf0000, 0x6df80180, + 0x77240000, 0xec47079e, 0xf4a0694e, + 0xcda31812, 0x98aa496e, 0xb2060000, + 0xc5690000, 0x28031200, 0x74670000, + 0xb6c236f4, 0xeb1239f8, 0x33d1dfec, + 0x094e3198, + 0xaec30000, 0x9c4f0001, 0x79d1e000, + 0x2c150000, 0x45cc75b3, 0x6650b736, + 0xab92f78f, 0xa312567b, 0xdb250000, + 0x09290000, 0x49aac000, 0x81e10000, + 0xcafe6b59, 0x42793431, 0x43566b76, + 0xe86cba2e, + 0xdb250000, 0x09290000, 0x49aac000, + 0x81e10000, 0xcafe6b59, 0x42793431, + 0x43566b76, 0xe86cba2e, 0x75e60000, + 0x95660001, 0x307b2000, 0xadf40000, + 0x8f321eea, 0x24298307, 0xe8c49cf9, + 0x4b7eec55, + 0x58430000, 0x807e0000, 0x78330001, + 0xc66b3800, 0xe7375cdc, 0x79ad3fdd, + 0xac73fe6f, 0x3a4479b1, 0x1d5a0000, + 0x2b720000, 0x488d0000, 0xaf611800, + 0x25cb2ec5, 0xc879bfd0, 0x81a20429, + 0x1e7536a6, + 0x1d5a0000, 0x2b720000, 0x488d0000, + 0xaf611800, 0x25cb2ec5, 0xc879bfd0, + 0x81a20429, 0x1e7536a6, 0x45190000, + 0xab0c0000, 0x30be0001, 0x690a2000, + 0xc2fc7219, 0xb1d4800d, 0x2dd1fa46, + 0x24314f17, + 0xa53b0000, 0x14260000, 0x4e30001e, + 0x7cae0000, 0x8f9e0dd5, 0x78dfaa3d, + 0xf73168d8, 0x0b1b4946, 0x07ed0000, + 0xb2500000, 0x8774000a, 0x970d0000, + 0x437223ae, 0x48c76ea4, 0xf4786222, + 0x9075b1ce, + 0x07ed0000, 0xb2500000, 0x8774000a, + 0x970d0000, 0x437223ae, 0x48c76ea4, + 0xf4786222, 0x9075b1ce, 0xa2d60000, + 0xa6760000, 0xc9440014, 0xeba30000, + 0xccec2e7b, 0x3018c499, 0x03490afa, + 0x9b6ef888, + 0x88980000, 0x1f940000, 0x7fcf002e, + 0xfb4e0000, 0xf158079a, 0x61ae9167, + 0xa895706c, 0xe6107494, 0x0bc20000, + 0xdb630000, 0x7e88000c, 0x15860000, + 0x91fd48f3, 0x7581bb43, 0xf460449e, 
+ 0xd8b61463, + 0x0bc20000, 0xdb630000, 0x7e88000c, + 0x15860000, 0x91fd48f3, 0x7581bb43, + 0xf460449e, 0xd8b61463, 0x835a0000, + 0xc4f70000, 0x01470022, 0xeec80000, + 0x60a54f69, 0x142f2a24, 0x5cf534f2, + 0x3ea660f7, + 0x52500000, 0x29540000, 0x6a61004e, + 0xf0ff0000, 0x9a317eec, 0x452341ce, + 0xcf568fe5, 0x5303130f, 0x538d0000, + 0xa9fc0000, 0x9ef70006, 0x56ff0000, + 0x0ae4004e, 0x92c5cdf9, 0xa9444018, + 0x7f975691, + 0x538d0000, 0xa9fc0000, 0x9ef70006, + 0x56ff0000, 0x0ae4004e, 0x92c5cdf9, + 0xa9444018, 0x7f975691, 0x01dd0000, + 0x80a80000, 0xf4960048, 0xa6000000, + 0x90d57ea2, 0xd7e68c37, 0x6612cffd, + 0x2c94459e, + 0xe6280000, 0x4c4b0000, 0xa8550000, + 0xd3d002e0, 0xd86130b8, 0x98a7b0da, + 0x289506b4, 0xd75a4897, 0xf0c50000, + 0x59230000, 0x45820000, 0xe18d00c0, + 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, + 0x56a7b19f, + 0xf0c50000, 0x59230000, 0x45820000, + 0xe18d00c0, 0x3b6d0631, 0xc2ed5699, + 0xcbe0fe1c, 0x56a7b19f, 0x16ed0000, + 0x15680000, 0xedd70000, 0x325d0220, + 0xe30c3689, 0x5a4ae643, 0xe375f8a8, + 0x81fdf908, + 0xb4310000, 0x77330000, 0xb15d0000, + 0x7fd004e0, 0x78a26138, 0xd116c35d, + 0xd256d489, 0x4e6f74de, 0xe3060000, + 0xbdc10000, 0x87130000, 0xbff20060, + 0x2eba0a1a, 0x8db53751, 0x73c5ab06, + 0x5bd61539, + 0xe3060000, 0xbdc10000, 0x87130000, + 0xbff20060, 0x2eba0a1a, 0x8db53751, + 0x73c5ab06, 0x5bd61539, 0x57370000, + 0xcaf20000, 0x364e0000, 0xc0220480, + 0x56186b22, 0x5ca3f40c, 0xa1937f8f, + 0x15b961e7, + 0x02f20000, 0xa2810000, 0x873f0000, + 0xe36c7800, 0x1e1d74ef, 0x073d2bd6, + 0xc4c23237, 0x7f32259e, 0xbadd0000, + 0x13ad0000, 0xb7e70000, 0xf7282800, + 0xdf45144d, 0x361ac33a, 0xea5a8d14, + 0x2a2c18f0, + 0xbadd0000, 0x13ad0000, 0xb7e70000, + 0xf7282800, 0xdf45144d, 0x361ac33a, + 0xea5a8d14, 0x2a2c18f0, 0xb82f0000, + 0xb12c0000, 0x30d80000, 0x14445000, + 0xc15860a2, 0x3127e8ec, 0x2e98bf23, + 0x551e3d6e, + 0x1e6c0000, 0xc4420000, 0x8a2e0000, + 0xbcb6b800, 0x2c4413b6, 0x8bfdd3da, + 0x6a0c1bc8, 0xb99dc2eb, 0x92560000, + 0x1eda0000, 0xea510000, 
0xe8b13000, + 0xa93556a5, 0xebfb6199, 0xb15c2254, + 0x33c5244f, + 0x92560000, 0x1eda0000, 0xea510000, + 0xe8b13000, 0xa93556a5, 0xebfb6199, + 0xb15c2254, 0x33c5244f, 0x8c3a0000, + 0xda980000, 0x607f0000, 0x54078800, + 0x85714513, 0x6006b243, 0xdb50399c, + 0x8a58e6a4, + 0x033d0000, 0x08b30000, 0xf33a0000, + 0x3ac20007, 0x51298a50, 0x6b6e661f, + 0x0ea5cfe3, 0xe6da7ffe, 0xa8da0000, + 0x96be0000, 0x5c1d0000, 0x07da0002, + 0x7d669583, 0x1f98708a, 0xbb668808, + 0xda878000, + 0xa8da0000, 0x96be0000, 0x5c1d0000, + 0x07da0002, 0x7d669583, 0x1f98708a, + 0xbb668808, 0xda878000, 0xabe70000, + 0x9e0d0000, 0xaf270000, 0x3d180005, + 0x2c4f1fd3, 0x74f61695, 0xb5c347eb, + 0x3c5dfffe, + 0x01930000, 0xe7820000, 0xedfb0000, + 0xcf0c000b, 0x8dd08d58, 0xbca3b42e, + 0x063661e1, 0x536f9e7b, 0x92280000, + 0xdc850000, 0x57fa0000, 0x56dc0003, + 0xbae92316, 0x5aefa30c, 0x90cef752, + 0x7b1675d7, + 0x92280000, 0xdc850000, 0x57fa0000, + 0x56dc0003, 0xbae92316, 0x5aefa30c, + 0x90cef752, 0x7b1675d7, 0x93bb0000, + 0x3b070000, 0xba010000, 0x99d00008, + 0x3739ae4e, 0xe64c1722, 0x96f896b3, + 0x2879ebac, + 0x5fa80000, 0x56030000, 0x43ae0000, + 0x64f30013, 0x257e86bf, 0x1311944e, + 0x541e95bf, 0x8ea4db69, 0x00440000, + 0x7f480000, 0xda7c0000, 0x2a230001, + 0x3badc9cc, 0xa9b69c87, 0x030a9e60, + 0xbe0a679e, + 0x00440000, 0x7f480000, 0xda7c0000, + 0x2a230001, 0x3badc9cc, 0xa9b69c87, + 0x030a9e60, 0xbe0a679e, 0x5fec0000, + 0x294b0000, 0x99d20000, 0x4ed00012, + 0x1ed34f73, 0xbaa708c9, 0x57140bdf, + 0x30aebcf7, + 0xee930000, 0xd6070000, 0x92c10000, + 0x2b9801e0, 0x9451287c, 0x3b6cfb57, + 0x45312374, 0x201f6a64, 0x7b280000, + 0x57420000, 0xa9e50000, 0x634300a0, + 0x9edb442f, 0x6d9995bb, 0x27f83b03, + 0xc7ff60f0, + 0x7b280000, 0x57420000, 0xa9e50000, + 0x634300a0, 0x9edb442f, 0x6d9995bb, + 0x27f83b03, 0xc7ff60f0, 0x95bb0000, + 0x81450000, 0x3b240000, 0x48db0140, + 0x0a8a6c53, 0x56f56eec, 0x62c91877, + 0xe7e00a94 }; -static __constant__ uint32_t d_T512[64][16] = { - { SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), 
SPH_C32(0x5dae0000), - SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9), - SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030), - SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000), - SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984), - SPH_C32(0x9e69af68) }, - { SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000), - SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), - SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240), - SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000), - SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5), - SPH_C32(0x0c26f262) }, - { SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000), - SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78), - SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400), - SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000), - SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f), - SPH_C32(0xdc24e61f) }, - { SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), - SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), - SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800), - SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000), - SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f), - SPH_C32(0x3daac2da) }, - { SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000), - SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1), - SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800), - SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000), - SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da), - SPH_C32(0x78cace29) }, - { SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000), - SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), - SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400), - SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000), - SPH_C32(0xea982435), 
SPH_C32(0x75b11115), SPH_C32(0x28b67247), - SPH_C32(0x2dd1f9ab) }, - { SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000), - SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745), - SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00), - SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000), - SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f), - SPH_C32(0xbf2c0be2) }, - { SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000), - SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93), - SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000), - SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000), - SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36), - SPH_C32(0x32219526) }, - { SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000), - SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae), - SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001), - SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000), - SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f), - SPH_C32(0xac8e6c88) }, - { SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000), - SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), - SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004), - SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000), - SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96), - SPH_C32(0x7b1bd6b9) }, - { SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000), - SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba), - SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000), - SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000), - SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604), - SPH_C32(0xf746c320) }, - { SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), - SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), - SPH_C32(0x98369604), 
SPH_C32(0xf746c320), SPH_C32(0x231f0009), - SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000), - SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a), - SPH_C32(0x69505b3a) }, - { SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000), - SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25), - SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050), - SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000), - SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2), - SPH_C32(0x8a341574) }, - { SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000), - SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), - SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0), - SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000), - SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc), - SPH_C32(0x450360bf) }, - { SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000), - SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543), - SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060), - SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000), - SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d), - SPH_C32(0xf3d45758) }, - { SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), - SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), - SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110), - SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000), - SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25), - SPH_C32(0x925c44e9) }, - { SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000), - SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514), - SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000), - SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000), - SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315), - SPH_C32(0xa123ff9f) }, - { SPH_C32(0xfe220000), 
SPH_C32(0xa7580500), SPH_C32(0x25d10000), - SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860), - SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000), - SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000), - SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e), - SPH_C32(0x1568ff0f) }, - { SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000), - SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6), - SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000), - SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000), - SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616), - SPH_C32(0xc5c1eb3e) }, - { SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000), - SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145), - SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000), - SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000), - SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6), - SPH_C32(0x1af21fe1) }, - { SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000), - SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae), - SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000), - SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000), - SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17), - SPH_C32(0x857f3c2b) }, - { SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), - SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), - SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000), - SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000), - SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94), - SPH_C32(0x2ba05a55) }, - { SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000), - SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757), - SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001), - SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000), - 
SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba), - SPH_C32(0xfeabf254) }, - { SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000), - SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), - SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002), - SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000), - SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7), - SPH_C32(0xfe1cdc7f) }, - { SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000), - SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea), - SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000), - SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000), - SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea), - SPH_C32(0xb0a51834) }, - { SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), - SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), - SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000), - SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000), - SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae), - SPH_C32(0xa6b8c28d) }, - { SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000), - SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75), - SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000), - SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000), - SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156), - SPH_C32(0x3a4e99d7) }, - { SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), - SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), - SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000), - SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000), - SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6), - SPH_C32(0xe1844257) }, - { SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000), - SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512), - 
SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000), - SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000), - SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37), - SPH_C32(0x2c3b504e) }, - { SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000), - SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856), - SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000), - SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000), - SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4), - SPH_C32(0x524a0d59) }, - { SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000), - SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc), - SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000), - SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000), - SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88), - SPH_C32(0x378dd173) }, - { SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), - SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f), - SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000), - SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000), - SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4), - SPH_C32(0x8b6c72bd) }, - { SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780), - SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418), - SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000), - SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000), - SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d), - SPH_C32(0x8e67b7fa) }, - { SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280), - SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), - SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000), - SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000), - SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec), - SPH_C32(0x443d3004) }, - { 
SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80), - SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924), - SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000), - SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000), - SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a), - SPH_C32(0xf4f6ea7b) }, - { SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300), - SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), - SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000), - SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000), - SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8), - SPH_C32(0x979961d0) }, - { SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380), - SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6), - SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000), - SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000), - SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812), - SPH_C32(0x98aa496e) }, - { SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180), - SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), - SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000), - SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000), - SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec), - SPH_C32(0x094e3198) }, - { SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000), - SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736), - SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000), - SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000), - SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76), - SPH_C32(0xe86cba2e) }, - { SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000), - SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431), - SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000), - SPH_C32(0x95660001), SPH_C32(0x307b2000), 
SPH_C32(0xadf40000), - SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9), - SPH_C32(0x4b7eec55) }, - { SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001), - SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd), - SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000), - SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800), - SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429), - SPH_C32(0x1e7536a6) }, - { SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000), - SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), - SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000), - SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000), - SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46), - SPH_C32(0x24314f17) }, - { SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e), - SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d), - SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000), - SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000), - SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222), - SPH_C32(0x9075b1ce) }, - { SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a), - SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), - SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000), - SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000), - SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa), - SPH_C32(0x9b6ef888) }, - { SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e), - SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167), - SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000), - SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000), - SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e), - SPH_C32(0xd8b61463) }, - { SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c), - SPH_C32(0x15860000), SPH_C32(0x91fd48f3), 
SPH_C32(0x7581bb43), - SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000), - SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000), - SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2), - SPH_C32(0x3ea660f7) }, - { SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e), - SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce), - SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000), - SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000), - SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018), - SPH_C32(0x7f975691) }, - { SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), - SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), - SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000), - SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000), - SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd), - SPH_C32(0x2c94459e) }, - { SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000), - SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da), - SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000), - SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0), - SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c), - SPH_C32(0x56a7b19f) }, - { SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000), - SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), - SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000), - SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220), - SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8), - SPH_C32(0x81fdf908) }, - { SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000), - SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d), - SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000), - SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060), - SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06), - SPH_C32(0x5bd61539) 
}, - { SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000), - SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), - SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000), - SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480), - SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f), - SPH_C32(0x15b961e7) }, - { SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000), - SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6), - SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000), - SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800), - SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14), - SPH_C32(0x2a2c18f0) }, - { SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), - SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), - SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000), - SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000), - SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23), - SPH_C32(0x551e3d6e) }, - { SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000), - SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da), - SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000), - SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000), - SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254), - SPH_C32(0x33c5244f) }, - { SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000), - SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), - SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000), - SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800), - SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c), - SPH_C32(0x8a58e6a4) }, - { SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000), - SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f), - SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000), - SPH_C32(0x96be0000), 
SPH_C32(0x5c1d0000), SPH_C32(0x07da0002), - SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808), - SPH_C32(0xda878000) }, - { SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), - SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a), - SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000), - SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005), - SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb), - SPH_C32(0x3c5dfffe) }, - { SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000), - SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e), - SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000), - SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003), - SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752), - SPH_C32(0x7b1675d7) }, - { SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000), - SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), - SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000), - SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008), - SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3), - SPH_C32(0x2879ebac) }, - { SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000), - SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e), - SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000), - SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001), - SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60), - SPH_C32(0xbe0a679e) }, - { SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000), - SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), - SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000), - SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012), - SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf), - SPH_C32(0x30aebcf7) }, - { SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000), - SPH_C32(0x2b9801e0), 
SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57), - SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000), - SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0), - SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03), - SPH_C32(0xc7ff60f0) }, - { SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000), - SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), - SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000), - SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140), - SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877), - SPH_C32(0xe7e00a94) } -}; - -#define hamsi_s00 m0 -#define hamsi_s01 m1 -#define hamsi_s02 c0 -#define hamsi_s03 c1 -#define hamsi_s04 m2 -#define hamsi_s05 m3 -#define hamsi_s06 c2 -#define hamsi_s07 c3 -#define hamsi_s08 c4 -#define hamsi_s09 c5 -#define hamsi_s0A m4 -#define hamsi_s0B m5 -#define hamsi_s0C c6 -#define hamsi_s0D c7 -#define hamsi_s0E m6 -#define hamsi_s0F m7 -#define hamsi_s10 m8 -#define hamsi_s11 m9 -#define hamsi_s12 c8 -#define hamsi_s13 c9 -#define hamsi_s14 mA -#define hamsi_s15 mB -#define hamsi_s16 cA -#define hamsi_s17 cB -#define hamsi_s18 cC -#define hamsi_s19 cD -#define hamsi_s1A mC -#define hamsi_s1B mD -#define hamsi_s1C cE -#define hamsi_s1D cF -#define hamsi_s1E mE -#define hamsi_s1F mF #define SBOX(a, b, c, d) { \ uint32_t t; \ @@ -496,58 +435,58 @@ static __constant__ uint32_t d_T512[64][16] = { } #define ROUND_BIG(rc, alpha) { \ - hamsi_s00 ^= alpha[0x00]; \ - hamsi_s08 ^= alpha[0x08]; \ - hamsi_s10 ^= alpha[0x10]; \ - hamsi_s18 ^= alpha[0x18]; \ - hamsi_s01 ^= alpha[0x01] ^ (uint32_t)(rc); \ - hamsi_s09 ^= alpha[0x09]; \ - hamsi_s11 ^= alpha[0x11]; \ - hamsi_s19 ^= alpha[0x19]; \ - hamsi_s02 ^= alpha[0x02]; \ - hamsi_s0A ^= alpha[0x0A]; \ - hamsi_s12 ^= alpha[0x12]; \ - hamsi_s1A ^= alpha[0x1A]; \ - hamsi_s03 ^= alpha[0x03]; \ - hamsi_s0B ^= alpha[0x0B]; \ - hamsi_s13 ^= alpha[0x13]; \ - hamsi_s1B ^= alpha[0x1B]; \ - hamsi_s04 ^= alpha[0x04]; \ - 
hamsi_s0C ^= alpha[0x0C]; \ - hamsi_s14 ^= alpha[0x14]; \ - hamsi_s1C ^= alpha[0x1C]; \ - hamsi_s05 ^= alpha[0x05]; \ - hamsi_s0D ^= alpha[0x0D]; \ - hamsi_s15 ^= alpha[0x15]; \ - hamsi_s1D ^= alpha[0x1D]; \ - hamsi_s06 ^= alpha[0x06]; \ - hamsi_s0E ^= alpha[0x0E]; \ - hamsi_s16 ^= alpha[0x16]; \ - hamsi_s1E ^= alpha[0x1E]; \ - hamsi_s07 ^= alpha[0x07]; \ - hamsi_s0F ^= alpha[0x0F]; \ - hamsi_s17 ^= alpha[0x17]; \ - hamsi_s1F ^= alpha[0x1F]; \ - SBOX(hamsi_s00, hamsi_s08, hamsi_s10, hamsi_s18); \ - SBOX(hamsi_s01, hamsi_s09, hamsi_s11, hamsi_s19); \ - SBOX(hamsi_s02, hamsi_s0A, hamsi_s12, hamsi_s1A); \ - SBOX(hamsi_s03, hamsi_s0B, hamsi_s13, hamsi_s1B); \ - SBOX(hamsi_s04, hamsi_s0C, hamsi_s14, hamsi_s1C); \ - SBOX(hamsi_s05, hamsi_s0D, hamsi_s15, hamsi_s1D); \ - SBOX(hamsi_s06, hamsi_s0E, hamsi_s16, hamsi_s1E); \ - SBOX(hamsi_s07, hamsi_s0F, hamsi_s17, hamsi_s1F); \ - HAMSI_L(hamsi_s00, hamsi_s09, hamsi_s12, hamsi_s1B); \ - HAMSI_L(hamsi_s01, hamsi_s0A, hamsi_s13, hamsi_s1C); \ - HAMSI_L(hamsi_s02, hamsi_s0B, hamsi_s14, hamsi_s1D); \ - HAMSI_L(hamsi_s03, hamsi_s0C, hamsi_s15, hamsi_s1E); \ - HAMSI_L(hamsi_s04, hamsi_s0D, hamsi_s16, hamsi_s1F); \ - HAMSI_L(hamsi_s05, hamsi_s0E, hamsi_s17, hamsi_s18); \ - HAMSI_L(hamsi_s06, hamsi_s0F, hamsi_s10, hamsi_s19); \ - HAMSI_L(hamsi_s07, hamsi_s08, hamsi_s11, hamsi_s1A); \ - HAMSI_L(hamsi_s00, hamsi_s02, hamsi_s05, hamsi_s07); \ - HAMSI_L(hamsi_s10, hamsi_s13, hamsi_s15, hamsi_s16); \ - HAMSI_L(hamsi_s09, hamsi_s0B, hamsi_s0C, hamsi_s0E); \ - HAMSI_L(hamsi_s19, hamsi_s1A, hamsi_s1C, hamsi_s1F); \ + m0 ^= alpha[0x00]; \ + c4 ^= alpha[0x08]; \ + m8 ^= alpha[0x10]; \ + cC ^= alpha[0x18]; \ + m1 ^= alpha[0x01] ^ rc; \ + c5 ^= alpha[0x09]; \ + m9 ^= alpha[0x11]; \ + cD ^= alpha[0x19]; \ + c0 ^= alpha[0x02]; \ + m4 ^= alpha[0x0A]; \ + c8 ^= alpha[0x12]; \ + mC ^= alpha[0x1A]; \ + c1 ^= alpha[0x03]; \ + m5 ^= alpha[0x0B]; \ + c9 ^= alpha[0x13]; \ + mD ^= alpha[0x1B]; \ + m2 ^= alpha[0x04]; \ + c6 ^= alpha[0x0C]; \ + mA ^= 
alpha[0x14]; \ + cE ^= alpha[0x1C]; \ + m3 ^= alpha[0x05]; \ + c7 ^= alpha[0x0D]; \ + mB ^= alpha[0x15]; \ + cF ^= alpha[0x1D]; \ + c2 ^= alpha[0x06]; \ + m6 ^= alpha[0x0E]; \ + cA ^= alpha[0x16]; \ + mE ^= alpha[0x1E]; \ + c3 ^= alpha[0x07]; \ + m7 ^= alpha[0x0F]; \ + cB ^= alpha[0x17]; \ + mF ^= alpha[0x1F]; \ + SBOX(m0, c4, m8, cC); \ + SBOX(m1, c5, m9, cD); \ + SBOX(c0, m4, c8, mC); \ + SBOX(c1, m5, c9, mD); \ + SBOX(m2, c6, mA, cE); \ + SBOX(m3, c7, mB, cF); \ + SBOX(c2, m6, cA, mE); \ + SBOX(c3, m7, cB, mF); \ + HAMSI_L(m0, c5, c8, mD); \ + HAMSI_L(m1, m4, c9, cE); \ + HAMSI_L(c0, m5, mA, cF); \ + HAMSI_L(c1, c6, mB, mE); \ + HAMSI_L(m2, c7, cA, mF); \ + HAMSI_L(m3, m6, cB, cC); \ + HAMSI_L(c2, m7, m8, cD); \ + HAMSI_L(c3, c4, m9, mC); \ + HAMSI_L(m0, c0, m3, c3); \ + HAMSI_L(m8, c9, mB, cA); \ + HAMSI_L(c5, m5, c6, m6); \ + HAMSI_L(cD, mC, cE, mF); \ } @@ -563,69 +502,100 @@ static __constant__ uint32_t d_T512[64][16] = { #define T_BIG { \ /* order is important */ \ - cF = (h[0xF] ^= hamsi_s17); \ - cE = (h[0xE] ^= hamsi_s16); \ - cD = (h[0xD] ^= hamsi_s15); \ - cC = (h[0xC] ^= hamsi_s14); \ - cB = (h[0xB] ^= hamsi_s13); \ - cA = (h[0xA] ^= hamsi_s12); \ - c9 = (h[0x9] ^= hamsi_s11); \ - c8 = (h[0x8] ^= hamsi_s10); \ - c7 = (h[0x7] ^= hamsi_s07); \ - c6 = (h[0x6] ^= hamsi_s06); \ - c5 = (h[0x5] ^= hamsi_s05); \ - c4 = (h[0x4] ^= hamsi_s04); \ - c3 = (h[0x3] ^= hamsi_s03); \ - c2 = (h[0x2] ^= hamsi_s02); \ - c1 = (h[0x1] ^= hamsi_s01); \ - c0 = (h[0x0] ^= hamsi_s00); \ + cF = (h[0xF] ^= cB); \ + cE = (h[0xE] ^= cA); \ + cD = (h[0xD] ^= mB); \ + cC = (h[0xC] ^= mA); \ + cB = (h[0xB] ^= c9); \ + cA = (h[0xA] ^= c8); \ + c9 = (h[0x9] ^= m9); \ + c8 = (h[0x8] ^= m8); \ + c7 = (h[0x7] ^= c3); \ + c6 = (h[0x6] ^= c2); \ + c5 = (h[0x5] ^= m3); \ + c4 = (h[0x4] ^= m2); \ + c3 = (h[0x3] ^= c1); \ + c2 = (h[0x2] ^= c0); \ + c1 = (h[0x1] ^= m1); \ + c0 = (h[0x0] ^= m0); \ } -__global__ __launch_bounds__(512,2) -void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t 
startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +__global__ __launch_bounds__(256,4) +void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_hash ) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3]; - unsigned char *h1 = (unsigned char *)Hash; - - uint32_t c0 = SPH_C32(0x73746565), c1 = SPH_C32(0x6c706172), c2 = SPH_C32(0x6b204172), c3 = SPH_C32(0x656e6265); - uint32_t c4 = SPH_C32(0x72672031), c5 = SPH_C32(0x302c2062), c6 = SPH_C32(0x75732032), c7 = SPH_C32(0x3434362c); - uint32_t c8 = SPH_C32(0x20422d33), c9 = SPH_C32(0x30303120), cA = SPH_C32(0x4c657576), cB = SPH_C32(0x656e2d48); - uint32_t cC = SPH_C32(0x65766572), cD = SPH_C32(0x6c65652c), cE = SPH_C32(0x2042656c), cF = SPH_C32(0x6769756d); + const uint32_t d_alpha_n[32] = + { + 0xff00f0f0, 0xccccaaaa, 0xf0f0cccc, + 0xff00aaaa, 0xccccaaaa, 0xf0f0ff00, + 0xaaaacccc, 0xf0f0ff00, 0xf0f0cccc, + 0xaaaaff00, 0xccccff00, 0xaaaaf0f0, + 0xaaaaf0f0, 0xff00cccc, 0xccccf0f0, + 0xff00aaaa, 0xccccaaaa, 0xff00f0f0, + 0xff00aaaa, 0xf0f0cccc, 0xf0f0ff00, + 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, + 0xaaaaff00, 0xf0f0cccc, 0xaaaaf0f0, + 0xccccff00, 0xff00cccc, 0xaaaaf0f0, + 0xff00aaaa, 0xccccf0f0 + }; + + + const uint32_t d_alpha_f[32] = { + 0xcaf9639c, 0x0ff0f9c0, 0x639c0ff0, + 0xcaf9f9c0, 0x0ff0f9c0, 0x639ccaf9, + 0xf9c00ff0, 0x639ccaf9, 0x639c0ff0, + 0xf9c0caf9, 0x0ff0caf9, 0xf9c0639c, + 0xf9c0639c, 0xcaf90ff0, 0x0ff0639c, + 0xcaf9f9c0, 0x0ff0f9c0, 0xcaf9639c, + 0xcaf9f9c0, 0x639c0ff0, 0x639ccaf9, + 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, + 0xf9c0caf9, 0x639c0ff0, 0xf9c0639c, + 0x0ff0caf9, 0xcaf90ff0, 0xf9c0639c, + 0xcaf9f9c0, 0x0ff0639c + }; + + const uint32_t nounce = 
(startNounce + thread); + + const uint32_t hashPosition = nounce - startNounce; + uint32_t *const Hash = &g_hash[hashPosition*16]; + uint8_t *h1 = (uint8_t *)Hash; + + uint32_t c0 = 0x73746565, c1 = 0x6c706172, c2 = 0x6b204172, c3 = 0x656e6265; + uint32_t c4 = 0x72672031, c5 = 0x302c2062, c6 = 0x75732032, c7 = 0x3434362c; + uint32_t c8 = 0x20422d33, c9 = 0x30303120, cA = 0x4c657576, cB = 0x656e2d48; + uint32_t cC = 0x65766572, cD = 0x6c65652c, cE = 0x2042656c, cF = 0x6769756d; uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF; uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF }; uint32_t *tp, db, dm; - for(int i = 0; i < 64; i += 8) { - - m0 = 0; m1 = 0; m2 = 0; m3 = 0; m4 = 0; m5 = 0; m6 = 0; m7 = 0; + #pragma unroll 1 + for(int i = 0; i < 64; i += 8) + { + tp = &d_T512[0]; + m0 = 0; m1 = 0; m2 = 0; m3 = 0; m4 = 0; m5 = 0; m6 = 0; m7 = 0; m8 = 0; m9 = 0; mA = 0; mB = 0; mC = 0; mD = 0; mE = 0; mF = 0; - tp = &d_T512[0][0]; -#pragma unroll 2 - for (int u = 0; u < 8; u ++) { + for (int u = 0; u < 8; u++) + { db = h1[i+u]; -#pragma unroll 2 - for (int v = 0; v < 8; v ++, db >>= 1) { - dm = -(uint32_t)(db & 1); - m0 ^= dm & *(tp+ 0); m1 ^= dm & *(tp+ 1); - m2 ^= dm & *(tp+ 2); m3 ^= dm & *(tp+ 3); - m4 ^= dm & *(tp+ 4); m5 ^= dm & *(tp+ 5); - m6 ^= dm & *(tp+ 6); m7 ^= dm & *(tp+ 7); - m8 ^= dm & *(tp+ 8); m9 ^= dm & *(tp+ 9); - mA ^= dm & *(tp+10); mB ^= dm & *(tp+11); - mC ^= dm & *(tp+12); mD ^= dm & *(tp+13); - mE ^= dm & *(tp+14); mF ^= dm & *(tp+15); - tp += 16; + for (int v = 0; v < 8; v++, db >>= 1, tp += 16) + { + dm = -(db & 1); + m0 ^= dm & tp[0]; m1 ^= dm & tp[1]; + m2 ^= dm & tp[2]; m3 ^= dm & tp[3]; + m4 ^= dm & tp[4]; m5 ^= dm & tp[5]; + m6 ^= dm & tp[6]; m7 ^= dm & tp[7]; + m8 ^= dm & tp[8]; m9 ^= dm & tp[9]; + mA ^= dm & tp[10]; mB ^= dm & tp[11]; + mC ^= dm & tp[12]; mD ^= dm & tp[13]; + mE ^= dm & tp[14]; mF ^= dm & tp[15]; } } - for( int r = 0; r < 6; r += 2 ) { + for (int r = 0; r < 6; r += 2) 
+ { ROUND_BIG(r, d_alpha_n); ROUND_BIG(r+1, d_alpha_n); } @@ -633,38 +603,251 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t * } - tp = &d_T512[0][0] + 112; - - m0 = *(tp+ 0); m1 = *(tp+ 1); - m2 = *(tp+ 2); m3 = *(tp+ 3); - m4 = *(tp+ 4); m5 = *(tp+ 5); - m6 = *(tp+ 6); m7 = *(tp+ 7); - m8 = *(tp+ 8); m9 = *(tp+ 9); - mA = *(tp+10); mB = *(tp+11); - mC = *(tp+12); mD = *(tp+13); - mE = *(tp+14); mF = *(tp+15); - - for( int r = 0; r < 6; r += 2 ) { - ROUND_BIG(r, d_alpha_n); - ROUND_BIG(r+1, d_alpha_n); - } + tp = &d_T512[0] + 112; + + m0 = tp[ 0]; m1 = tp[ 1]; + m2 = tp[ 2]; m3 = tp[ 3]; + m4 = tp[ 4]; m5 = tp[ 5]; + m6 = tp[ 6]; m7 = tp[ 7]; + m8 = tp[ 8]; m9 = tp[ 9]; + mA = tp[10]; mB = tp[11]; + mC = tp[12]; mD = tp[13]; + mE = tp[14]; mF = tp[15]; + + for (int r = 0; r < 6; r += 2) + { + // ROUND_BIG(r, d_alpha_n); + m0 ^= d_alpha_n[0x00]; \ + c4 ^= d_alpha_n[0x08]; \ + m8 ^= d_alpha_n[0x10]; \ + cC ^= d_alpha_n[0x18]; \ + m1 ^= d_alpha_n[0x01] ^ r; \ + c5 ^= d_alpha_n[0x09]; \ + m9 ^= d_alpha_n[0x11]; \ + cD ^= d_alpha_n[0x19]; \ + c0 ^= d_alpha_n[0x02]; \ + m4 ^= d_alpha_n[0x0A]; \ + c8 ^= d_alpha_n[0x12]; \ + mC ^= d_alpha_n[0x1A]; \ + c1 ^= d_alpha_n[0x03]; \ + m5 ^= d_alpha_n[0x0B]; \ + c9 ^= d_alpha_n[0x13]; \ + mD ^= d_alpha_n[0x1B]; \ + m2 ^= d_alpha_n[0x04]; \ + c6 ^= d_alpha_n[0x0C]; \ + mA ^= d_alpha_n[0x14]; \ + cE ^= d_alpha_n[0x1C]; \ + m3 ^= d_alpha_n[0x05]; \ + c7 ^= d_alpha_n[0x0D]; \ + mB ^= d_alpha_n[0x15]; \ + cF ^= d_alpha_n[0x1D]; \ + c2 ^= d_alpha_n[0x06]; \ + m6 ^= d_alpha_n[0x0E]; \ + cA ^= d_alpha_n[0x16]; \ + mE ^= d_alpha_n[0x1E]; \ + c3 ^= d_alpha_n[0x07]; \ + m7 ^= d_alpha_n[0x0F]; \ + cB ^= d_alpha_n[0x17]; \ + mF ^= d_alpha_n[0x1F]; \ + SBOX(m0, c4, m8, cC); \ + SBOX(m1, c5, m9, cD); \ + SBOX(c0, m4, c8, mC); \ + SBOX(c1, m5, c9, mD); \ + SBOX(m2, c6, mA, cE); \ + SBOX(m3, c7, mB, cF); \ + SBOX(c2, m6, cA, mE); \ + SBOX(c3, m7, cB, mF); \ + HAMSI_L(m0, c5, c8, mD); \ + HAMSI_L(m1, m4, c9, cE); 
\ + HAMSI_L(c0, m5, mA, cF); \ + HAMSI_L(c1, c6, mB, mE); \ + HAMSI_L(m2, c7, cA, mF); \ + HAMSI_L(m3, m6, cB, cC); \ + HAMSI_L(c2, m7, m8, cD); \ + HAMSI_L(c3, c4, m9, mC); \ + HAMSI_L(m0, c0, m3, c3); \ + HAMSI_L(m8, c9, mB, cA); \ + HAMSI_L(c5, m5, c6, m6); \ + HAMSI_L(cD, mC, cE, mF); \ + + // ROUND_BIG(r+1, d_alpha_n); + m0 ^= d_alpha_n[0x00]; \ + c4 ^= d_alpha_n[0x08]; \ + m8 ^= d_alpha_n[0x10]; \ + cC ^= d_alpha_n[0x18]; \ + m1 ^= d_alpha_n[0x01] ^ (r+1); \ + c5 ^= d_alpha_n[0x09]; \ + m9 ^= d_alpha_n[0x11]; \ + cD ^= d_alpha_n[0x19]; \ + c0 ^= d_alpha_n[0x02]; \ + m4 ^= d_alpha_n[0x0A]; \ + c8 ^= d_alpha_n[0x12]; \ + mC ^= d_alpha_n[0x1A]; \ + c1 ^= d_alpha_n[0x03]; \ + m5 ^= d_alpha_n[0x0B]; \ + c9 ^= d_alpha_n[0x13]; \ + mD ^= d_alpha_n[0x1B]; \ + m2 ^= d_alpha_n[0x04]; \ + c6 ^= d_alpha_n[0x0C]; \ + mA ^= d_alpha_n[0x14]; \ + cE ^= d_alpha_n[0x1C]; \ + m3 ^= d_alpha_n[0x05]; \ + c7 ^= d_alpha_n[0x0D]; \ + mB ^= d_alpha_n[0x15]; \ + cF ^= d_alpha_n[0x1D]; \ + c2 ^= d_alpha_n[0x06]; \ + m6 ^= d_alpha_n[0x0E]; \ + cA ^= d_alpha_n[0x16]; \ + mE ^= d_alpha_n[0x1E]; \ + c3 ^= d_alpha_n[0x07]; \ + m7 ^= d_alpha_n[0x0F]; \ + cB ^= d_alpha_n[0x17]; \ + mF ^= d_alpha_n[0x1F]; \ + SBOX(m0, c4, m8, cC); \ + SBOX(m1, c5, m9, cD); \ + SBOX(c0, m4, c8, mC); \ + SBOX(c1, m5, c9, mD); \ + SBOX(m2, c6, mA, cE); \ + SBOX(m3, c7, mB, cF); \ + SBOX(c2, m6, cA, mE); \ + SBOX(c3, m7, cB, mF); \ + HAMSI_L(m0, c5, c8, mD); \ + HAMSI_L(m1, m4, c9, cE); \ + HAMSI_L(c0, m5, mA, cF); \ + HAMSI_L(c1, c6, mB, mE); \ + HAMSI_L(m2, c7, cA, mF); \ + HAMSI_L(m3, m6, cB, cC); \ + HAMSI_L(c2, m7, m8, cD); \ + HAMSI_L(c3, c4, m9, mC); \ + HAMSI_L(m0, c0, m3, c3); \ + HAMSI_L(m8, c9, mB, cA); \ + HAMSI_L(c5, m5, c6, m6); \ + HAMSI_L(cD, mC, cE, mF); \ + } T_BIG; - tp = &d_T512[0][0] + 784; - - m0 = *(tp+ 0); m1 = *(tp+ 1); - m2 = *(tp+ 2); m3 = *(tp+ 3); - m4 = *(tp+ 4); m5 = *(tp+ 5); - m6 = *(tp+ 6); m7 = *(tp+ 7); - m8 = *(tp+ 8); m9 = *(tp+ 9); - mA = *(tp+10); mB = *(tp+11); - mC = 
*(tp+12); mD = *(tp+13); - mE = *(tp+14); mF = *(tp+15); - - for( int r = 0; r < 12; r += 2 ) { - ROUND_BIG(r, d_alpha_f); - ROUND_BIG(r+1, d_alpha_f); - } + tp = &d_T512[0] + 784; + + m0 = tp[ 0]; m1 = tp[ 1]; + m2 = tp[ 2]; m3 = tp[ 3]; + m4 = tp[ 4]; m5 = tp[ 5]; + m6 = tp[ 6]; m7 = tp[ 7]; + m8 = tp[ 8]; m9 = tp[ 9]; + mA = tp[10]; mB = tp[11]; + mC = tp[12]; mD = tp[13]; + mE = tp[14]; mF = tp[15]; + +#pragma unroll 1 + for( int r = 0; r < 12; r += 2 ) + { + // ROUND_BIG(r, d_alpha_f); + m0 ^= d_alpha_f[0x00]; \ + c4 ^= d_alpha_f[0x08]; \ + m8 ^= d_alpha_f[0x10]; \ + cC ^= d_alpha_f[0x18]; \ + m1 ^= d_alpha_f[0x01] ^ r; \ + c5 ^= d_alpha_f[0x09]; \ + m9 ^= d_alpha_f[0x11]; \ + cD ^= d_alpha_f[0x19]; \ + c0 ^= d_alpha_f[0x02]; \ + m4 ^= d_alpha_f[0x0A]; \ + c8 ^= d_alpha_f[0x12]; \ + mC ^= d_alpha_f[0x1A]; \ + c1 ^= d_alpha_f[0x03]; \ + m5 ^= d_alpha_f[0x0B]; \ + c9 ^= d_alpha_f[0x13]; \ + mD ^= d_alpha_f[0x1B]; \ + m2 ^= d_alpha_f[0x04]; \ + c6 ^= d_alpha_f[0x0C]; \ + mA ^= d_alpha_f[0x14]; \ + cE ^= d_alpha_f[0x1C]; \ + m3 ^= d_alpha_f[0x05]; \ + c7 ^= d_alpha_f[0x0D]; \ + mB ^= d_alpha_f[0x15]; \ + cF ^= d_alpha_f[0x1D]; \ + c2 ^= d_alpha_f[0x06]; \ + m6 ^= d_alpha_f[0x0E]; \ + cA ^= d_alpha_f[0x16]; \ + mE ^= d_alpha_f[0x1E]; \ + c3 ^= d_alpha_f[0x07]; \ + m7 ^= d_alpha_f[0x0F]; \ + cB ^= d_alpha_f[0x17]; \ + mF ^= d_alpha_f[0x1F]; \ + SBOX(m0, c4, m8, cC); \ + SBOX(m1, c5, m9, cD); \ + SBOX(c0, m4, c8, mC); \ + SBOX(c1, m5, c9, mD); \ + SBOX(m2, c6, mA, cE); \ + SBOX(m3, c7, mB, cF); \ + SBOX(c2, m6, cA, mE); \ + SBOX(c3, m7, cB, mF); \ + HAMSI_L(m0, c5, c8, mD); \ + HAMSI_L(m1, m4, c9, cE); \ + HAMSI_L(c0, m5, mA, cF); \ + HAMSI_L(c1, c6, mB, mE); \ + HAMSI_L(m2, c7, cA, mF); \ + HAMSI_L(m3, m6, cB, cC); \ + HAMSI_L(c2, m7, m8, cD); \ + HAMSI_L(c3, c4, m9, mC); \ + HAMSI_L(m0, c0, m3, c3); \ + HAMSI_L(m8, c9, mB, cA); \ + HAMSI_L(c5, m5, c6, m6); \ + HAMSI_L(cD, mC, cE, mF); \ + + // ROUND_BIG(r+1, d_alpha_n); + m0 ^= d_alpha_f[0x00]; \ + c4 ^= 
d_alpha_f[0x08]; \ + m8 ^= d_alpha_f[0x10]; \ + cC ^= d_alpha_f[0x18]; \ + m1 ^= d_alpha_f[0x01] ^ (r + 1); \ + c5 ^= d_alpha_f[0x09]; \ + m9 ^= d_alpha_f[0x11]; \ + cD ^= d_alpha_f[0x19]; \ + c0 ^= d_alpha_f[0x02]; \ + m4 ^= d_alpha_f[0x0A]; \ + c8 ^= d_alpha_f[0x12]; \ + mC ^= d_alpha_f[0x1A]; \ + c1 ^= d_alpha_f[0x03]; \ + m5 ^= d_alpha_f[0x0B]; \ + c9 ^= d_alpha_f[0x13]; \ + mD ^= d_alpha_f[0x1B]; \ + m2 ^= d_alpha_f[0x04]; \ + c6 ^= d_alpha_f[0x0C]; \ + mA ^= d_alpha_f[0x14]; \ + cE ^= d_alpha_f[0x1C]; \ + m3 ^= d_alpha_f[0x05]; \ + c7 ^= d_alpha_f[0x0D]; \ + mB ^= d_alpha_f[0x15]; \ + cF ^= d_alpha_f[0x1D]; \ + c2 ^= d_alpha_f[0x06]; \ + m6 ^= d_alpha_f[0x0E]; \ + cA ^= d_alpha_f[0x16]; \ + mE ^= d_alpha_f[0x1E]; \ + c3 ^= d_alpha_f[0x07]; \ + m7 ^= d_alpha_f[0x0F]; \ + cB ^= d_alpha_f[0x17]; \ + mF ^= d_alpha_f[0x1F]; \ + SBOX(m0, c4, m8, cC); \ + SBOX(m1, c5, m9, cD); \ + SBOX(c0, m4, c8, mC); \ + SBOX(c1, m5, c9, mD); \ + SBOX(m2, c6, mA, cE); \ + SBOX(m3, c7, mB, cF); \ + SBOX(c2, m6, cA, mE); \ + SBOX(c3, m7, cB, mF); \ + HAMSI_L(m0, c5, c8, mD); \ + HAMSI_L(m1, m4, c9, cE); \ + HAMSI_L(c0, m5, mA, cF); \ + HAMSI_L(c1, c6, mB, mE); \ + HAMSI_L(m2, c7, cA, mF); \ + HAMSI_L(m3, m6, cB, cC); \ + HAMSI_L(c2, m7, m8, cD); \ + HAMSI_L(c3, c4, m9, mC); \ + HAMSI_L(m0, c0, m3, c3); \ + HAMSI_L(m8, c9, mB, cA); \ + HAMSI_L(c5, m5, c6, m6); \ + HAMSI_L(cD, mC, cE, mF); \ + } T_BIG; #pragma unroll 16 @@ -678,12 +861,12 @@ __host__ void x13_hamsi512_cpu_init(int thr_id, uint32_t threads) { } -__host__ void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) { const uint32_t threadsperblock = 128; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x13_hamsi512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + 
x13_hamsi512_gpu_hash_64<<>>(threads, startNounce, d_hash); } \ No newline at end of file diff --git a/x13/x13.cu b/x13/x13.cu index 2953306de4..1dd3ee23b6 100644 --- a/x13/x13.cu +++ b/x13/x13.cu @@ -23,57 +23,57 @@ extern "C" #include "cuda_helper.h" -static uint32_t *d_hash[MAX_GPUS]; - - -extern void quark_blake512_cpu_setBlock_80(void *pdata); -extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); +extern void quark_blake512_cpu_init(int thr_id); +extern void quark_blake512_cpu_setBlock_80(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_setBlock_80_multi(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void quark_blake512_cpu_hash_80_multi(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads); -extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); -extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void cuda_jh512Keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t 
*d_nonceVector, uint32_t *d_hash, int order); -//extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void cuda_jh512Keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +//extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); +//extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); -extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); -extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const uint32_t simdthreads); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); -extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); -extern void 
x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); -extern uint32_t x13_fugue512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -//extern uint32_t cuda_check_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, uint32_t *result); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +//extern uint32_t cuda_check_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash); -extern void x13_fugue512_cpu_setTarget(const void *ptarget); +extern void x13_fugue512_cpu_setTarget(int thr_id, const void *ptarget); extern void x13_fugue512_cpu_free(int32_t thr_id); //extern void cuda_check_cpu_free(int32_t thr_id); extern void x11_simd512_cpu_free(int32_t thr_id); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads); -extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, - uint32_t *d_noncesTrue, uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse, int order); +extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *inpHashes, + const uint32_t *d_noncesTrue, uint32_t *nrmTrue, 
uint32_t *d_noncesFalse, uint32_t *nrmFalse); // X13 Hashfunktion -extern "C" void x13hash(void *output, const void *input) +void x13hash(void *output, const void *input) { // blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11-hamsi12-fugue13 @@ -149,93 +149,127 @@ extern "C" void x13hash(void *output, const void *input) memcpy(output, hash, 32); } -extern "C" int scanhash_x13(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_x13(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { + static THREAD uint32_t *d_hash = nullptr; + static THREAD uint32_t *h_found = nullptr; + const uint32_t first_nonce = pdata[19]; - static bool init[MAX_GPUS] = { 0 }; uint32_t endiandata[20]; - unsigned int intensity = 19; // (device_sm[device_map[thr_id]] > 500 && !is_windows()) ? 20 : 19; - uint32_t throughput = device_intensity(thr_id, __func__, 1 << intensity); // 19=256*256*8; - - throughput = min(throughput, (max_nonce - first_nonce)); + int intensity = (device_sm[device_map[thr_id]] > 500) ? 256 * 256 * 26 : 256 * 256 * 13; + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, intensity); + uint32_t throughput = min(throughputmax, max_nonce - first_nonce) & 0xfffffc00; + uint32_t simdthreads = (device_sm[device_map[thr_id]] > 500) ? 
256 : 32; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0xf; + ptarget[7] = 0xff; - if (!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + get_cuda_arch(&cuda_arch[thr_id]); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (64 * sizeof(uint4))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif - quark_groestl512_cpu_init(thr_id, throughput); - quark_bmw512_cpu_init(thr_id, throughput); - x11_simd512_cpu_init(thr_id, throughput); - x11_echo512_cpu_init(thr_id, throughput); - x13_hamsi512_cpu_init(thr_id, throughput); - x13_fugue512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughputmax); + quark_bmw512_cpu_init(thr_id, throughputmax); + x11_simd512_cpu_init(thr_id, throughputmax); + x11_echo512_cpu_init(thr_id, throughputmax); + x13_hamsi512_cpu_init(thr_id, throughputmax); + x13_fugue512_cpu_init(thr_id, throughputmax); - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0); + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * sizeof(uint32_t) * throughputmax)); + CUDA_SAFE_CALL(cudaMallocHost(&(h_found), 2 * sizeof(uint32_t))); - cuda_check_cpu_init(thr_id, throughput); - init[thr_id] = true; +// cuda_check_cpu_init(thr_id, throughput); + mining_has_stopped[thr_id] = false; + init = true; } for (int k = 0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); - quark_blake512_cpu_setBlock_80((void*)endiandata); - cuda_check_cpu_setTarget(ptarget); -// x13_fugue512_cpu_setTarget(ptarget); + quark_blake512_cpu_setBlock_80(thr_id, 
(uint64_t *)endiandata); + // cuda_check_cpu_setTarget(ptarget, thr_id); + x13_fugue512_cpu_setTarget(thr_id, ptarget); do { - int order = 0; - quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - cuda_jh512Keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - // uint32_t foundNonce = x13_fugue512_cpu_hash_64_final(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); - if (foundNonce != 0xffffffff) + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + cuda_jh512Keccak512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash,simdthreads); + 
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x13_fugue512_cpu_hash_64_final(thr_id, throughput, pdata[19], d_hash, h_found); + + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + // h_found[0] = 0xffffffff; + if (h_found[0] != 0xffffffff) { - uint32_t vhash64[8]; - be32enc(&endiandata[19], foundNonce); + const uint32_t Htarg = ptarget[7]; + uint32_t vhash64[8]={0}; + if(opt_verify){ be32enc(&endiandata[19], h_found[0]); x13hash(vhash64, endiandata); - uint32_t Htarg = ptarget[7]; - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { int res = 1; - uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonce); *hashes_done = pdata[19] - first_nonce + throughput; - if (secNonce != 0) + if (h_found[1] != 0xffffffff) { - if (opt_benchmark) applog(LOG_INFO, "found second nounce", thr_id, foundNonce, vhash64[7], Htarg); - pdata[21] = secNonce; - res++; + if(opt_verify){ be32enc(&endiandata[19], h_found[1]); + x13hash(vhash64, endiandata); + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + + pdata[21] = h_found[1]; + res++; + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d Found second nounce %08x", device_map[thr_id], h_found[1]); + } + else + { + if (vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[1]); + } + } + } - pdata[19] = foundNonce; - if (opt_benchmark) applog(LOG_INFO, "found nounce", thr_id, foundNonce, vhash64[7], Htarg); + pdata[19] = h_found[0]; + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d Found nounce %08x", device_map[thr_id], h_found[0]); return res; } else { - applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce); + if (vhash64[7] != Htarg) + { + applog(LOG_WARNING, 
"GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[0]); + } } } - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/x15/cuda_whirlpoolx.cu b/x15/cuda_whirlpoolx.cu new file mode 100644 index 0000000000..1bfbea40a4 --- /dev/null +++ b/x15/cuda_whirlpoolx.cu @@ -0,0 +1,615 @@ +/* + * Built on cbuchner1's implementation, actual hashing code + * based on sphlib 3.0 + */ +#include +#include +#include "cuda_helper.h" + + + +#if __CUDA_ARCH__ > 500 +#define TPB 1024 +#else +#define TPB 256 +#endif + + +#define NONCES_PER_THREAD 16 + +__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) +__constant__ uint2 c_xtra[8]; +__constant__ uint2 c_tmp[72]; +static uint2 *d_xtra[MAX_GPUS]; +static uint64_t *d_tmp[MAX_GPUS]; +__constant__ uint64_t pTarget[1]; + +static uint32_t *h_wxnounce[MAX_GPUS]; +static uint32_t *d_WXNonce[MAX_GPUS]; + +/** + * Whirlpool CUDA kernel implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2014 djm34 & tpruvot & SP & Provos Alexis + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * @author djm34 + * @author tpruvot + * @author SP + * @author Provos Alexis + */ + +__constant__ __align__(128) uint64_t mixTob0Tox[256]; +__constant__ __align__(128) uint64_t mixTob1Tox[256]; + +const uint64_t hmixTob0Tox[256] = { + 0xD83078C018601818,0x2646AF05238C2323,0xB891F97EC63FC6C6,0xFBCD6F13E887E8E8,0xCB13A14C87268787,0x116D62A9B8DAB8B8,0x0902050801040101,0x0D9E6E424F214F4F,0x9B6CEEAD36D83636, + 0xFF510459A6A2A6A6,0x0CB9BDDED26FD2D2,0x0EF706FBF5F3F5F5,0x96F280EF79F97979,0x30DECE5F6FA16F6F,0x6D3FEFFC917E9191,0xF8A407AA52555252,0x47C0FD27609D6060,0x35657689BCCABCBC, + 0x372BCDAC9B569B9B,0x8A018C048E028E8E,0xD25B1571A3B6A3A3,0x6C183C600C300C0C,0x84F68AFF7BF17B7B,0x806AE1B535D43535,0xF53A69E81D741D1D,0xB3DD4753E0A7E0E0,0x21B3ACF6D77BD7D7, + 0x9C99ED5EC22FC2C2,0x435C966D2EB82E2E,0x29967A624B314B4B,0x5DE121A3FEDFFEFE,0xD5AE168257415757,0xBD2A41A815541515,0xE8EEB69F77C17777,0x926EEBA537DC3737,0x9ED7567BE5B3E5E5, + 0x1323D98C9F469F9F,0x23FD17D3F0E7F0F0,0x20947F6A4A354A4A,0x44A9959EDA4FDADA,0xA2B025FA587D5858,0xCF8FCA06C903C9C9,0x7C528D5529A42929,0x5A1422500A280A0A,0x507F4FE1B1FEB1B1, + 0xC95D1A69A0BAA0A0,0x14D6DA7F6BB16B6B,0xD917AB5C852E8585,0x3C677381BDCEBDBD,0x8FBA34D25D695D5D,0x9020508010401010,0x07F503F3F4F7F4F4,0xDD8BC016CB0BCBCB,0xD37CC6ED3EF83E3E, + 
0x2D0A112805140505,0x78CEE61F67816767,0x97D55373E4B7E4E4,0x024EBB25279C2727,0x7382583241194141,0xA70B9D2C8B168B8B,0xF6530151A7A6A7A7,0xB2FA94CF7DE97D7D,0x4937FBDC956E9595, + 0x56AD9F8ED847D8D8,0x70EB308BFBCBFBFB,0xCDC17123EE9FEEEE,0xBBF891C77CED7C7C,0x71CCE31766856666,0x7BA78EA6DD53DDDD,0xAF2E4BB8175C1717,0x458E460247014747,0x1A21DC849E429E9E, + 0xD489C51ECA0FCACA,0x585A99752DB42D2D,0x2E637991BFC6BFBF,0x3F0E1B38071C0707,0xAC472301AD8EADAD,0xB0B42FEA5A755A5A,0xEF1BB56C83368383,0xB666FF8533CC3333,0x5CC6F23F63916363, + 0x12040A1002080202,0x93493839AA92AAAA,0xDEE2A8AF71D97171,0xC68DCF0EC807C8C8,0xD1327DC819641919,0x3B92707249394949,0x5FAF9A86D943D9D9,0x31F91DC3F2EFF2F2,0xA8DB484BE3ABE3E3, + 0xB9B62AE25B715B5B,0xBC0D9234881A8888,0x3E29C8A49A529A9A,0x0B4CBE2D26982626,0xBF64FA8D32C83232,0x597D4AE9B0FAB0B0,0xF2CF6A1BE983E9E9,0x771E33780F3C0F0F,0x33B7A6E6D573D5D5, + 0xF41DBA74803A8080,0x27617C99BEC2BEBE,0xEB87DE26CD13CDCD,0x8968E4BD34D03434,0x3290757A483D4848,0x54E324ABFFDBFFFF,0x8DF48FF77AF57A7A,0x643DEAF4907A9090,0x9DBE3EC25F615F5F, + 0x3D40A01D20802020,0x0FD0D56768BD6868,0xCA3472D01A681A1A,0xB7412C19AE82AEAE,0x7D755EC9B4EAB4B4,0xCEA8199A544D5454,0x7F3BE5EC93769393,0x2F44AA0D22882222,0x63C8E907648D6464, + 0x2AFF12DBF1E3F1F1,0xCCE6A2BF73D17373,0x82245A9012481212,0x7A805D3A401D4040,0x4810284008200808,0x959BE856C32BC3C3,0xDFC57B33EC97ECEC,0x4DAB9096DB4BDBDB,0xC05F1F61A1BEA1A1, + 0x9107831C8D0E8D8D,0xC87AC9F53DF43D3D,0x5B33F1CC97669797,0x0000000000000000,0xF983D436CF1BCFCF,0x6E5687452BAC2B2B,0xE1ECB39776C57676,0xE619B06482328282,0x28B1A9FED67FD6D6, + 0xC33677D81B6C1B1B,0x74775BC1B5EEB5B5,0xBE432911AF86AFAF,0x1DD4DF776AB56A6A,0xEAA00DBA505D5050,0x578A4C1245094545,0x38FB18CBF3EBF3F3,0xAD60F09D30C03030,0xC4C3742BEF9BEFEF, + 0xDA7EC3E53FFC3F3F,0xC7AA1C9255495555,0xDB591079A2B2A2A2,0xE9C96503EA8FEAEA,0x6ACAEC0F65896565,0x036968B9BAD2BABA,0x4A5E93652FBC2F2F,0x8E9DE74EC027C0C0,0x60A181BEDE5FDEDE, + 
0xFC386CE01C701C1C,0x46E72EBBFDD3FDFD,0x1F9A64524D294D4D,0x7639E0E492729292,0xFAEABC8F75C97575,0x360C1E3006180606,0xAE0998248A128A8A,0x4B7940F9B2F2B2B2,0x85D15963E6BFE6E6, + 0x7E1C36700E380E0E,0xE73E63F81F7C1F1F,0x55C4F73762956262,0x3AB5A3EED477D4D4,0x814D3229A89AA8A8,0x5231F4C496629696,0x62EF3A9BF9C3F9F9,0xA397F666C533C5C5,0x104AB13525942525, + 0xABB220F259795959,0xD015AE54842A8484,0xC5E4A7B772D57272,0xEC72DDD539E43939,0x1698615A4C2D4C4C,0x94BC3BCA5E655E5E,0x9FF085E778FD7878,0xE570D8DD38E03838,0x980586148C0A8C8C, + 0x17BFB2C6D163D1D1,0xE4570B41A5AEA5A5,0xA1D94D43E2AFE2E2,0x4EC2F82F61996161,0x427B45F1B3F6B3B3,0x3442A51521842121,0x0825D6949C4A9C9C,0xEE3C66F01E781E1E,0x6186522243114343, + 0xB193FC76C73BC7C7,0x4FE52BB3FCD7FCFC,0x2408142004100404,0xE3A208B251595151,0x252FC7BC995E9999,0x22DAC44F6DA96D6D,0x651A39680D340D0D,0x79E93583FACFFAFA,0x69A384B6DF5BDFDF, + 0xA9FC9BD77EE57E7E,0x1948B43D24902424,0xFE76D7C53BEC3B3B,0x9A4B3D31AB96ABAB,0xF081D13ECE1FCECE,0x9922558811441111,0x8303890C8F068F8F,0x049C6B4A4E254E4E,0x667351D1B7E6B7B7, + 0xE0CB600BEB8BEBEB,0xC178CCFD3CF03C3C,0xFD1FBF7C813E8181,0x4035FED4946A9494,0x1CF30CEBF7FBF7F7,0x186F67A1B9DEB9B9,0x8B265F98134C1313,0x51589C7D2CB02C2C,0x05BBB8D6D36BD3D3, + 0x8CD35C6BE7BBE7E7,0x39DCCB576EA56E6E,0xAA95F36EC437C4C4,0x1B060F18030C0303,0xDCAC138A56455656,0x5E88491A440D4444,0xA0FE9EDF7FE17F7F,0x884F3721A99EA9A9,0x6754824D2AA82A2A, + 0x0A6B6DB1BBD6BBBB,0x879FE246C123C1C1,0xF1A602A253515353,0x72A58BAEDC57DCDC,0x531627580B2C0B0B,0x0127D39C9D4E9D9D,0x2BD8C1476CAD6C6C,0xA462F59531C43131,0xF3E8B98774CD7474, + 0x15F109E3F6FFF6F6,0x4C8C430A46054646,0xA5452609AC8AACAC,0xB50F973C891E8989,0xB42844A014501414,0xBADF425BE1A3E1E1,0xA62C4EB016581616,0xF774D2CD3AE83A3A,0x06D2D06F69B96969, + 0x41122D4809240909,0xD7E0ADA770DD7070,0x6F7154D9B6E2B6B6,0x1EBDB7CED067D0D0,0xD6C77E3BED93EDED,0xE285DB2ECC17CCCC,0x6884572A42154242,0x2C2DC2B4985A9898,0xED550E49A4AAA4A4, + 0x7550885D28A02828,0x86B831DA5C6D5C5C,0x6BED3F93F8C7F8F8,0xC211A44486228686 +}; + 
+/** + * Round constants. + */ +/* ====================================================================== */ + +__device__ __forceinline__ +static uint64_t ROUND_ELT(const uint64_t*const __restrict__ sharedMemory, const uint64_t*const __restrict__ in, const int i0, const int i1, const int i2, const int i3, const int i4, const int i5, const int i6, const int i7) +{ + const uint32_t* const __restrict__ in32 = (uint32_t*)in; + return + sharedMemory[in32[(i0 << 1)] & 0xff] ^ + sharedMemory[__byte_perm(in32[(i1 << 1)], 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(in32[(i2 << 1)], 0, 0x4442) + 512] ^ + sharedMemory[__byte_perm(in32[(i3 << 1)], 0, 0x4443) + 768] ^ + sharedMemory[(in32[(i4 << 1) + 1]&0xff) + 1024] ^ + sharedMemory[__byte_perm(in32[(i5 << 1) + 1], 0, 0x4441) + 1280] ^ + sharedMemory[__byte_perm(in32[(i6 << 1) + 1], 0, 0x4442) + 1536] ^ + sharedMemory[__byte_perm(in32[(i7 << 1) + 1], 0, 0x4443) + 1792]; +} + +__device__ __forceinline__ +static uint2 ROUND_ELT2(const uint64_t*const __restrict__ sharedMemory, const uint2*const __restrict__ in, const int i0, const int i1, const int i2, const int i3, const int i4, const int i5, const int i6, const int i7) +{ +// const uint32_t* __restrict__ in32 = (uint32_t*)in; + return + vectorize + ( + sharedMemory[in[(i0)].x & 0xff] ^ + sharedMemory[__byte_perm(in[(i1)].x, 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(in[(i2)].x, 0, 0x4442) + 512] ^ + sharedMemory[__byte_perm(in[(i3)].x, 0, 0x4443) + 768] ^ + sharedMemory[(in[(i4)].y & 0xff) + 1024] ^ + sharedMemory[__byte_perm(in[(i5)].y, 0, 0x4441) + 1280] ^ + sharedMemory[__byte_perm(in[(i6)].y, 0, 0x4442) + 1536] ^ + sharedMemory[__byte_perm(in[(i7)].y, 0, 0x4443) + 1792]); +} + + +#define TRANSFER(dst, src) { \ + dst[0] = src ## 0; \ + dst[1] = src ## 1; \ + dst[2] = src ## 2; \ + dst[3] = src ## 3; \ + dst[4] = src ## 4; \ + dst[5] = src ## 5; \ + dst[6] = src ## 6; \ + dst[7] = src ## 7; \ +} + +#define ROUND(table, in, out, c0, c1, c2, c3, c4, c5, c6, c7) { \ + 
out ## 0 = (ROUND_ELT(table, in, 0, 7, 6, 5, 4, 3, 2, 1)^ c0); \ + out ## 1 = (ROUND_ELT(table, in, 1, 0, 7, 6, 5, 4, 3, 2)^ c1); \ + out ## 2 = (ROUND_ELT(table, in, 2, 1, 0, 7, 6, 5, 4, 3)^ c2); \ + out ## 3 = (ROUND_ELT(table, in, 3, 2, 1, 0, 7, 6, 5, 4)^ c3); \ + out ## 4 = (ROUND_ELT(table, in, 4, 3, 2, 1, 0, 7, 6, 5)^ c4); \ + out ## 5 = (ROUND_ELT(table, in, 5, 4, 3, 2, 1, 0, 7, 6)^ c5); \ + out ## 6 = (ROUND_ELT(table, in, 6, 5, 4, 3, 2, 1, 0, 7)^ c6); \ + out ## 7 = (ROUND_ELT(table, in, 7, 6, 5, 4, 3, 2, 1, 0)^ c7); \ +} + +#define ROUND1(table, in, out, c) { \ + out ## 0 = (ROUND_ELT(table, in, 0, 7, 6, 5, 4, 3, 2, 1)^ c); \ + out ## 1 = ROUND_ELT(table, in, 1, 0, 7, 6, 5, 4, 3, 2); \ + out ## 2 = ROUND_ELT(table, in, 2, 1, 0, 7, 6, 5, 4, 3); \ + out ## 3 = ROUND_ELT(table, in, 3, 2, 1, 0, 7, 6, 5, 4); \ + out ## 4 = ROUND_ELT(table, in, 4, 3, 2, 1, 0, 7, 6, 5); \ + out ## 5 = ROUND_ELT(table, in, 5, 4, 3, 2, 1, 0, 7, 6); \ + out ## 6 = ROUND_ELT(table, in, 6, 5, 4, 3, 2, 1, 0, 7); \ + out ## 7 = ROUND_ELT(table, in, 7, 6, 5, 4, 3, 2, 1, 0); \ +} + +#define ROUND_KSCHED(table, in, out, c) \ + ROUND1(table, in, out, c) \ + TRANSFER(in, out) + +#define ROUND_WENC(table, in, key, out) \ + ROUND(table, in, out, key[0], key[1], key[2],key[3], key[4], key[5], key[6], key[7]) \ + TRANSFER(in, out) + +__device__ __forceinline__ +static void getShared(uint64_t* sharedMemory) +{ + if (threadIdx.x < 256) + { + sharedMemory[threadIdx.x] = mixTob0Tox[threadIdx.x]; + sharedMemory[threadIdx.x + 256] = mixTob1Tox[threadIdx.x]; + sharedMemory[threadIdx.x + 512] = ROTL64(mixTob0Tox[threadIdx.x], 16); + sharedMemory[threadIdx.x + 768] = ROTL64(mixTob0Tox[threadIdx.x], 24); + sharedMemory[threadIdx.x + 1024] = SWAPDWORDS(sharedMemory[threadIdx.x]); + sharedMemory[threadIdx.x + 1280] = SWAPDWORDS(sharedMemory[threadIdx.x + 256]); + sharedMemory[threadIdx.x + 1536] = SWAPDWORDS(sharedMemory[threadIdx.x + 512]); + sharedMemory[threadIdx.x + 1792] = 
SWAPDWORDS(sharedMemory[threadIdx.x + 768]); + } +} + + +__global__ __launch_bounds__(256) +void precomputeX(uint32_t threads, uint2*const __restrict__ d_xtra, uint64_t*const __restrict__ d_tmp) +{ + + __shared__ uint64_t sharedMemory[2048]; + const uint64_t InitVector_RC[10] = + { + 0x4F01B887E8C62318, 0x52916F79F5D2A636, 0x357B0CA38E9BBC60, 0x57FE4B2EC2D7E01D, 0xDA4AF09FE5377715, + 0x856BA0B10A29C958, 0x67053ECBF4105DBD, 0xD8957DA78B4127E4, 0x9E4717DD667CEEFB, 0x33835AAD07BF2DCA + }; + + + getShared(sharedMemory); + __syncthreads(); + const unsigned int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + + uint64_t n[8]; + uint64_t h[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + +#pragma unroll 8 + for (int i = 0; i<8; i++) { + n[i] = c_PaddedMessage80[i]; // read data + } + //#pragma unroll 10 + for (unsigned int r = 0; r < 10; r++) { + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[r]); + ROUND_WENC(sharedMemory, n, h, tmp); + } +#pragma unroll 8 + for (int i = 0; i < 8; i++) { + h[i] = xor1(n[i], c_PaddedMessage80[i]); + } + + if (threadIdx.x == 0) + { + d_xtra[0] = vectorize(h[1]); + d_xtra[0].y = cuda_swab32(d_xtra[0].y); + } + uint64_t atLastCalc = xor1(h[3], h[5]); + + ////////////////////////////////// + + n[0] = xor1(c_PaddedMessage80[8], h[0]); + n[1] = c_PaddedMessage80[9]; + n[2] = xor1(0x0000000000000080, h[2]); + n[3] = h[3]; + n[4] = h[4]; + n[5] = h[5]; + n[6] = h[6]; + n[7] = xor1(0x8002000000000000, h[7]); + + uint64_t tmp[8]; + tmp[0] = xor1(ROUND_ELT(sharedMemory, h, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[0]); + tmp[1] = ROUND_ELT(sharedMemory, h, 1, 0, 7, 6, 5, 4, 3, 2); + tmp[2] = ROUND_ELT(sharedMemory, h, 2, 1, 0, 7, 6, 5, 4, 3); + tmp[3] = ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4); + tmp[4] = ROUND_ELT(sharedMemory, h, 4, 3, 2, 1, 0, 7, 6, 5); + tmp[5] = ROUND_ELT(sharedMemory, h, 5, 4, 3, 2, 1, 0, 7, 6); + tmp[6] = ROUND_ELT(sharedMemory, h, 6, 5, 4, 
3, 2, 1, 0, 7); + tmp[7] = ROUND_ELT(sharedMemory, h, 7, 6, 5, 4, 3, 2, 1, 0); + + uint64_t tmp2[8]; + uint32_t* n32 = (uint32_t*)n; + tmp2[0] = xor8(sharedMemory[__byte_perm(n32[0], 0, 0x4440)], sharedMemory[__byte_perm(n32[14], 0, 0x4441) + 256], + sharedMemory[__byte_perm(n32[12], 0, 0x4442) + 512], sharedMemory[__byte_perm(n32[10], 0, 0x4443) + 768], + sharedMemory[__byte_perm(n32[9], 0, 0x4440) + 1024], sharedMemory[__byte_perm(n32[7], 0, 0x4441) + 1280], + sharedMemory[__byte_perm(n32[5], 0, 0x4442) + 1536], tmp[0]); + + tmp2[1] = xor8(tmp[1], sharedMemory[__byte_perm(n32[0], 0, 0x4441) + 256], + sharedMemory[__byte_perm(n32[14], 0, 0x4442) + 512], sharedMemory[__byte_perm(n32[12], 0, 0x4443) + 768], + sharedMemory[__byte_perm(n32[11], 0, 0x4440) + 1024], sharedMemory[__byte_perm(n32[9], 0, 0x4441) + 1280], + sharedMemory[__byte_perm(n32[7], 0, 0x4442) + 1536], sharedMemory[__byte_perm(n32[5], 0, 0x4443) + 1792]); + + tmp2[2] = xor8(sharedMemory[__byte_perm(n32[4], 0, 0x4440)], tmp[2], + sharedMemory[__byte_perm(n32[0], 0, 0x4442) + 512], sharedMemory[__byte_perm(n32[14], 0, 0x4443) + 768], + sharedMemory[__byte_perm(n32[13], 0, 0x4440) + 1024], sharedMemory[__byte_perm(n32[11], 0, 0x4441) + 1280], + sharedMemory[__byte_perm(n32[9], 0, 0x4442) + 1536], sharedMemory[__byte_perm(n32[7], 0, 0x4443) + 1792]); + + tmp2[3] = xor8(sharedMemory[__byte_perm(n32[6], 0, 0x4440)], sharedMemory[__byte_perm(n32[4], 0, 0x4441) + 256], + tmp[3], sharedMemory[__byte_perm(n32[0], 0, 0x4443) + 768], + sharedMemory[__byte_perm(n32[15], 0, 0x4440) + 1024], sharedMemory[__byte_perm(n32[13], 0, 0x4441) + 1280], + sharedMemory[__byte_perm(n32[11], 0, 0x4442) + 1536], sharedMemory[__byte_perm(n32[9], 0, 0x4443) + 1792]); + + tmp2[4] = xor8(sharedMemory[__byte_perm(n32[8], 0, 0x4440)], sharedMemory[__byte_perm(n32[6], 0, 0x4441) + 256], + sharedMemory[__byte_perm(n32[4], 0, 0x4442) + 512], tmp[4], + sharedMemory[__byte_perm(n32[1], 0, 0x4440) + 1024], sharedMemory[__byte_perm(n32[15], 
0, 0x4441) + 1280], + sharedMemory[__byte_perm(n32[13], 0, 0x4442) + 1536], sharedMemory[__byte_perm(n32[11], 0, 0x4443) + 1792]); + + tmp2[5] = xor8(sharedMemory[__byte_perm(n32[10], 0, 0x4440)], sharedMemory[__byte_perm(n32[8], 0, 0x4441) + 256], + sharedMemory[__byte_perm(n32[6], 0, 0x4442) + 512], sharedMemory[__byte_perm(n32[4], 0, 0x4443) + 768], + tmp[5], sharedMemory[__byte_perm(n32[1], 0, 0x4441) + 1280], + sharedMemory[__byte_perm(n32[15], 0, 0x4442) + 1536], sharedMemory[__byte_perm(n32[13], 0, 0x4443) + 1792]); + + tmp2[6] = xor8(sharedMemory[__byte_perm(n32[12], 0, 0x4440)], sharedMemory[__byte_perm(n32[10], 0, 0x4441) + 256], + sharedMemory[__byte_perm(n32[8], 0, 0x4442) + 512], sharedMemory[__byte_perm(n32[6], 0, 0x4443) + 768], + sharedMemory[__byte_perm(n32[5], 0, 0x4440) + 1024], tmp[6], + sharedMemory[__byte_perm(n32[1], 0, 0x4442) + 1536], sharedMemory[__byte_perm(n32[15], 0, 0x4443) + 1792]); + + tmp2[7] = xor8(sharedMemory[__byte_perm(n32[14], 0, 0x4440)], sharedMemory[__byte_perm(n32[12], 0, 0x4441) + 256], + sharedMemory[__byte_perm(n32[10], 0, 0x4442) + 512], sharedMemory[__byte_perm(n32[8], 0, 0x4443) + 768], + sharedMemory[__byte_perm(n32[7], 0, 0x4440) + 1024], sharedMemory[__byte_perm(n32[5], 0, 0x4441) + 1280], + tmp[7], sharedMemory[__byte_perm(n32[1], 0, 0x4443) + 1792]); + + n[1] ^= h[1]; + tmp2[1] ^= sharedMemory[__byte_perm(n32[2], 0, 0x4440)]; + tmp2[2] ^= sharedMemory[__byte_perm(n32[2], 0, 0x4441) + 256]; + tmp2[3] ^= sharedMemory[__byte_perm(n32[2], 0, 0x4442) + 512]; + tmp2[4] ^= sharedMemory[__byte_perm(n32[2], 0, 0x4443) + 768]; + + d_tmp[threadIdx.x] = tmp2[threadIdx.x]; + + uint64_t tmp3[8]; + tmp3[0] = xor1(ROUND_ELT(sharedMemory, tmp, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[1]); + tmp3[1] = ROUND_ELT(sharedMemory, tmp, 1, 0, 7, 6, 5, 4, 3, 2); + tmp3[2] = ROUND_ELT(sharedMemory, tmp, 2, 1, 0, 7, 6, 5, 4, 3); + tmp3[3] = ROUND_ELT(sharedMemory, tmp, 3, 2, 1, 0, 7, 6, 5, 4); + tmp3[4] = ROUND_ELT(sharedMemory, tmp, 4, 3, 
2, 1, 0, 7, 6, 5); + tmp3[5] = ROUND_ELT(sharedMemory, tmp, 5, 4, 3, 2, 1, 0, 7, 6); + tmp3[6] = ROUND_ELT(sharedMemory, tmp, 6, 5, 4, 3, 2, 1, 0, 7); + tmp3[7] = ROUND_ELT(sharedMemory, tmp, 7, 6, 5, 4, 3, 2, 1, 0); + + n32 = (uint32_t*)tmp2; + uint64_t tmp4[8]; + tmp4[0] = (sharedMemory[__byte_perm(n32[9], 0, 0x4440) + 1024] ^ sharedMemory[__byte_perm(n32[7], 0, 0x4441) + 1280] ^ + sharedMemory[__byte_perm(n32[5], 0, 0x4442) + 1536] ^ sharedMemory[__byte_perm(n32[3], 0, 0x4443) + 1792]) ^ tmp3[0]; + + tmp4[1] = (sharedMemory[__byte_perm(n32[2], 0, 0x4440)] ^ sharedMemory[__byte_perm(n32[9], 0, 0x4441) + 1280] ^ + sharedMemory[__byte_perm(n32[7], 0, 0x4442) + 1536] ^ sharedMemory[__byte_perm(n32[5], 0, 0x4443) + 1792]) ^ tmp3[1]; + + tmp4[2] = (sharedMemory[__byte_perm(n32[4], 0, 0x4440)] ^ sharedMemory[__byte_perm(n32[2], 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(n32[9], 0, 0x4442) + 1536] ^ sharedMemory[__byte_perm(n32[7], 0, 0x4443) + 1792]) ^ tmp3[2]; + + tmp4[3] = (sharedMemory[__byte_perm(n32[6], 0, 0x4440)] ^ sharedMemory[__byte_perm(n32[4], 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(n32[2], 0, 0x4442) + 512] ^ sharedMemory[__byte_perm(n32[9], 0, 0x4443) + 1792]) ^ tmp3[3]; + + tmp4[4] = (sharedMemory[__byte_perm(n32[8], 0, 0x4440)] ^ sharedMemory[__byte_perm(n32[6], 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(n32[4], 0, 0x4442) + 512] ^ sharedMemory[__byte_perm(n32[2], 0, 0x4443) + 768]) ^ tmp3[4]; + + tmp4[5] = (sharedMemory[__byte_perm(n32[8], 0, 0x4441) + 256] ^ sharedMemory[__byte_perm(n32[6], 0, 0x4442) + 512] ^ + sharedMemory[__byte_perm(n32[4], 0, 0x4443) + 768] ^ sharedMemory[__byte_perm(n32[3], 0, 0x4440) + 1024]) ^ tmp3[5]; + + tmp4[6] = (sharedMemory[__byte_perm(n32[8], 0, 0x4442) + 512] ^ sharedMemory[__byte_perm(n32[6], 0, 0x4443) + 768] ^ + sharedMemory[__byte_perm(n32[5], 0, 0x4440) + 1024] ^ sharedMemory[__byte_perm(n32[3], 0, 0x4441) + 1280]) ^ tmp3[6]; + + tmp4[7] = (sharedMemory[__byte_perm(n32[8], 0, 0x4443) + 768] ^ 
sharedMemory[__byte_perm(n32[7], 0, 0x4440) + 1024] ^ + sharedMemory[__byte_perm(n32[5], 0, 0x4441) + 1280] ^ sharedMemory[__byte_perm(n32[3], 0, 0x4442) + 1536]) ^ tmp3[7]; + + d_tmp[threadIdx.x + 16] = tmp4[threadIdx.x]; + + uint64_t tmp5[8]; + tmp5[0] = xor1(ROUND_ELT(sharedMemory, tmp3, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[2]); + tmp5[1] = ROUND_ELT(sharedMemory, tmp3, 1, 0, 7, 6, 5, 4, 3, 2); + tmp5[2] = ROUND_ELT(sharedMemory, tmp3, 2, 1, 0, 7, 6, 5, 4, 3); + tmp5[3] = ROUND_ELT(sharedMemory, tmp3, 3, 2, 1, 0, 7, 6, 5, 4); + tmp5[4] = ROUND_ELT(sharedMemory, tmp3, 4, 3, 2, 1, 0, 7, 6, 5); + tmp5[5] = ROUND_ELT(sharedMemory, tmp3, 5, 4, 3, 2, 1, 0, 7, 6); + tmp5[6] = ROUND_ELT(sharedMemory, tmp3, 6, 5, 4, 3, 2, 1, 0, 7); + tmp5[7] = ROUND_ELT(sharedMemory, tmp3, 7, 6, 5, 4, 3, 2, 1, 0); + + d_tmp[threadIdx.x + 8] = tmp5[threadIdx.x]; + + uint64_t tmp6[8]; + tmp6[0] = xor1(ROUND_ELT(sharedMemory, tmp5, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[3]); + tmp6[1] = ROUND_ELT(sharedMemory, tmp5, 1, 0, 7, 6, 5, 4, 3, 2); + tmp6[2] = ROUND_ELT(sharedMemory, tmp5, 2, 1, 0, 7, 6, 5, 4, 3); + tmp6[3] = ROUND_ELT(sharedMemory, tmp5, 3, 2, 1, 0, 7, 6, 5, 4); + tmp6[4] = ROUND_ELT(sharedMemory, tmp5, 4, 3, 2, 1, 0, 7, 6, 5); + tmp6[5] = ROUND_ELT(sharedMemory, tmp5, 5, 4, 3, 2, 1, 0, 7, 6); + tmp6[6] = ROUND_ELT(sharedMemory, tmp5, 6, 5, 4, 3, 2, 1, 0, 7); + tmp6[7] = ROUND_ELT(sharedMemory, tmp5, 7, 6, 5, 4, 3, 2, 1, 0); + + d_tmp[threadIdx.x + 24] = tmp6[threadIdx.x]; + + uint64_t tmp7[8]; + tmp7[0] = xor1(ROUND_ELT(sharedMemory, tmp6, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[4]); + tmp7[1] = ROUND_ELT(sharedMemory, tmp6, 1, 0, 7, 6, 5, 4, 3, 2); + tmp7[2] = ROUND_ELT(sharedMemory, tmp6, 2, 1, 0, 7, 6, 5, 4, 3); + tmp7[3] = ROUND_ELT(sharedMemory, tmp6, 3, 2, 1, 0, 7, 6, 5, 4); + tmp7[4] = ROUND_ELT(sharedMemory, tmp6, 4, 3, 2, 1, 0, 7, 6, 5); + tmp7[5] = ROUND_ELT(sharedMemory, tmp6, 5, 4, 3, 2, 1, 0, 7, 6); + tmp7[6] = ROUND_ELT(sharedMemory, tmp6, 6, 5, 4, 3, 2, 1, 0, 7); 
+ tmp7[7] = ROUND_ELT(sharedMemory, tmp6, 7, 6, 5, 4, 3, 2, 1, 0); + + d_tmp[threadIdx.x + 32] = tmp7[threadIdx.x]; + //------------------- + uint64_t tmp8[8]; + tmp8[0] = xor1(ROUND_ELT(sharedMemory, tmp7, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[5]); + tmp8[1] = ROUND_ELT(sharedMemory, tmp7, 1, 0, 7, 6, 5, 4, 3, 2); + tmp8[2] = ROUND_ELT(sharedMemory, tmp7, 2, 1, 0, 7, 6, 5, 4, 3); + tmp8[3] = ROUND_ELT(sharedMemory, tmp7, 3, 2, 1, 0, 7, 6, 5, 4); + tmp8[4] = ROUND_ELT(sharedMemory, tmp7, 4, 3, 2, 1, 0, 7, 6, 5); + tmp8[5] = ROUND_ELT(sharedMemory, tmp7, 5, 4, 3, 2, 1, 0, 7, 6); + tmp8[6] = ROUND_ELT(sharedMemory, tmp7, 6, 5, 4, 3, 2, 1, 0, 7); + tmp8[7] = ROUND_ELT(sharedMemory, tmp7, 7, 6, 5, 4, 3, 2, 1, 0); + + d_tmp[threadIdx.x + 40] = tmp8[threadIdx.x]; + + uint64_t tmp9[8]; + tmp9[0] = xor1(ROUND_ELT(sharedMemory, tmp8, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[6]); + tmp9[1] = ROUND_ELT(sharedMemory, tmp8, 1, 0, 7, 6, 5, 4, 3, 2); + tmp9[2] = ROUND_ELT(sharedMemory, tmp8, 2, 1, 0, 7, 6, 5, 4, 3); + tmp9[3] = ROUND_ELT(sharedMemory, tmp8, 3, 2, 1, 0, 7, 6, 5, 4); + tmp9[4] = ROUND_ELT(sharedMemory, tmp8, 4, 3, 2, 1, 0, 7, 6, 5); + tmp9[5] = ROUND_ELT(sharedMemory, tmp8, 5, 4, 3, 2, 1, 0, 7, 6); + tmp9[6] = ROUND_ELT(sharedMemory, tmp8, 6, 5, 4, 3, 2, 1, 0, 7); + tmp9[7] = ROUND_ELT(sharedMemory, tmp8, 7, 6, 5, 4, 3, 2, 1, 0); + + d_tmp[threadIdx.x + 48] = tmp9[threadIdx.x]; + + uint64_t tmp10[8]; + tmp10[0] = xor1(ROUND_ELT(sharedMemory, tmp9, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[7]); + tmp10[1] = ROUND_ELT(sharedMemory, tmp9, 1, 0, 7, 6, 5, 4, 3, 2); + tmp10[2] = ROUND_ELT(sharedMemory, tmp9, 2, 1, 0, 7, 6, 5, 4, 3); + tmp10[3] = ROUND_ELT(sharedMemory, tmp9, 3, 2, 1, 0, 7, 6, 5, 4); + tmp10[4] = ROUND_ELT(sharedMemory, tmp9, 4, 3, 2, 1, 0, 7, 6, 5); + tmp10[5] = ROUND_ELT(sharedMemory, tmp9, 5, 4, 3, 2, 1, 0, 7, 6); + tmp10[6] = ROUND_ELT(sharedMemory, tmp9, 6, 5, 4, 3, 2, 1, 0, 7); + tmp10[7] = ROUND_ELT(sharedMemory, tmp9, 7, 6, 5, 4, 3, 2, 1, 0); + + + 
d_tmp[threadIdx.x + 56] = tmp10[threadIdx.x]; + + uint64_t tmp11[8]; + tmp11[0] = xor1(ROUND_ELT(sharedMemory, tmp10, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[8]); + tmp11[1] = ROUND_ELT(sharedMemory, tmp10, 1, 0, 7, 6, 5, 4, 3, 2); + tmp11[2] = ROUND_ELT(sharedMemory, tmp10, 2, 1, 0, 7, 6, 5, 4, 3); + tmp11[3] = ROUND_ELT(sharedMemory, tmp10, 3, 2, 1, 0, 7, 6, 5, 4); + tmp11[4] = ROUND_ELT(sharedMemory, tmp10, 4, 3, 2, 1, 0, 7, 6, 5); + tmp11[5] = ROUND_ELT(sharedMemory, tmp10, 5, 4, 3, 2, 1, 0, 7, 6); + tmp11[6] = ROUND_ELT(sharedMemory, tmp10, 6, 5, 4, 3, 2, 1, 0, 7); + tmp11[7] = ROUND_ELT(sharedMemory, tmp10, 7, 6, 5, 4, 3, 2, 1, 0); + + d_tmp[threadIdx.x + 64] = tmp11[threadIdx.x]; + + if (threadIdx.x == 1){ + tmp[0] = ROUND_ELT(sharedMemory, tmp11, 3, 2, 1, 0, 7, 6, 5, 4); + tmp[1] = ROUND_ELT(sharedMemory, tmp11, 5, 4, 3, 2, 1, 0, 7, 6); + tmp[4] = xor3(tmp[0], tmp[1], atLastCalc); + d_xtra[1] = vectorize(tmp[4]); + } + } +} +__global__ __launch_bounds__(TPB) +void whirlpoolx(uint32_t threads, uint32_t startNounce, uint32_t *resNounce) +{ + + + uint32_t threadindex = (blockDim.x * blockIdx.x + threadIdx.x); + + + if (threadindex < threads) + { + __shared__ uint64_t sharedMemory[2048]; + getShared(sharedMemory); + __syncthreads(); + const uint32_t numberofthreads = blockDim.x*gridDim.x; + const uint32_t maxnonce = startNounce + threadindex + numberofthreads*NONCES_PER_THREAD - 1; + const uint32_t threadindex = blockIdx.x*blockDim.x + threadIdx.x; + const uint64_t backup = pTarget[0]; +// #pragma unroll + for (uint32_t nounce = startNounce + threadindex; nounce <= maxnonce; nounce += numberofthreads) + { + + uint2 n[8]; + uint2 tmp[8]; + //const uint32_t nounce = startNounce + thread; + + n[1].y = nounce ^ c_xtra[0].y; + + n[0] = vectorize(sharedMemory[(n[1].y & 0xff) + 1792]); + n[5] = vectorize(sharedMemory[__byte_perm(n[1].y, 0, 0x4443) + 1024]); + n[6] = vectorize(sharedMemory[__byte_perm(n[1].y, 0, 0x4442) + 1280]); + n[7] = 
vectorize(sharedMemory[__byte_perm(n[1].y, 0, 0x4441) + 1536]); + uint2 b = (c_tmp[0]) ^ n[0]; + n[5] = (c_tmp[5]) ^ n[5]; + n[6] = (c_tmp[6]) ^ n[6]; + n[7] = (c_tmp[7]) ^ n[7]; + + tmp[0] = vectorize(sharedMemory[__byte_perm(n[5].x, 0, 0x4443) + 768] ^ sharedMemory[__byte_perm(n[6].x, 0, 0x4442) + 512] ^ sharedMemory[__byte_perm(n[7].x, 0, 0x4441) + 256]); + tmp[1] = vectorize(sharedMemory[(n[5].y&0xff) + 1024] ^ sharedMemory[__byte_perm(n[6].x, 0, 0x4443) + 768] ^ sharedMemory[__byte_perm(n[7].x, 0, 0x4442) + 512]); + tmp[2] = vectorize(sharedMemory[__byte_perm(n[5].y, 0, 0x4441) + 1280] ^ sharedMemory[(n[6].y & 0xff) + 1024] ^ sharedMemory[__byte_perm(n[7].x, 0, 0x4443) + 768]); + tmp[3] = vectorize(sharedMemory[__byte_perm(n[5].y, 0, 0x4442) + 1536] ^ sharedMemory[__byte_perm(n[6].y, 0, 0x4441) + 1280] ^ sharedMemory[(n[7].y & 0xff) + 1024]); + tmp[4] = vectorize(sharedMemory[__byte_perm(n[5].y, 0, 0x4443) + 1792] ^ sharedMemory[__byte_perm(n[6].y, 0, 0x4442) + 1536] ^ sharedMemory[__byte_perm(n[7].y, 0, 0x4441) + 1280]); + tmp[5] = vectorize(sharedMemory[(n[5].x &0xff)] ^ sharedMemory[__byte_perm(n[6].y, 0, 0x4443) + 1792] ^ sharedMemory[__byte_perm(n[7].y, 0, 0x4442) + 1536]); + tmp[6] = vectorize(sharedMemory[(n[6].x & 0xff)] ^ sharedMemory[__byte_perm(n[5].x, 0, 0x4441) + 256] ^ sharedMemory[__byte_perm(n[7].y, 0, 0x4443) + 1792]); + tmp[7] = vectorize(sharedMemory[(n[7].x & 0xff)] ^ sharedMemory[__byte_perm(n[6].x, 0, 0x4441) + 256] ^ sharedMemory[__byte_perm(n[5].x, 0, 0x4442) + 512]); + + n[0] = vectorize(sharedMemory[__byte_perm(b.x, 0, 0x4440)]) ^ tmp[0] ^ (c_tmp[0 + 16]); + n[1] = vectorize(sharedMemory[__byte_perm(b.x, 0, 0x4441) + 256]) ^ tmp[1] ^ (c_tmp[1 + 16]); + n[2] = vectorize(sharedMemory[__byte_perm(b.x, 0, 0x4442) + 512]) ^ tmp[2] ^ (c_tmp[2 + 16]); + n[3] = vectorize(sharedMemory[__byte_perm(b.x, 0, 0x4443) + 768]) ^ tmp[3] ^ (c_tmp[3 + 16]); + n[4] = vectorize(sharedMemory[__byte_perm(b.y, 0, 0x4440) + 1024]) ^ tmp[4] ^ (c_tmp[4 + 16]); 
+ n[5] = vectorize(sharedMemory[__byte_perm(b.y, 0, 0x4441) + 1280]) ^ tmp[5] ^ (c_tmp[5 + 16]); + n[6] = vectorize(sharedMemory[__byte_perm(b.y, 0, 0x4442) + 1536]) ^ tmp[6] ^ (c_tmp[6 + 16]); + n[7] = vectorize(sharedMemory[__byte_perm(b.y, 0, 0x4443) + 1792]) ^ tmp[7] ^ (c_tmp[7 + 16]); + + tmp[0] = ROUND_ELT2(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1) ^ (c_tmp[0 + 8]); + tmp[1] = ROUND_ELT2(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2) ^ (c_tmp[1 + 8]); + tmp[2] = ROUND_ELT2(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3) ^ (c_tmp[2 + 8]); + tmp[3] = ROUND_ELT2(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4) ^ (c_tmp[3 + 8]); + tmp[4] = ROUND_ELT2(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5) ^ (c_tmp[4 + 8]); + tmp[5] = ROUND_ELT2(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6) ^ (c_tmp[5 + 8]); + tmp[6] = ROUND_ELT2(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7) ^ (c_tmp[6 + 8]); + tmp[7] = ROUND_ELT2(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0) ^ (c_tmp[7 + 8]); + + n[0] = ROUND_ELT2(sharedMemory, tmp, 0, 7, 6, 5, 4, 3, 2, 1) ^ (c_tmp[0 + 24]); + n[1] = ROUND_ELT2(sharedMemory, tmp, 1, 0, 7, 6, 5, 4, 3, 2) ^ (c_tmp[1 + 24]); + n[2] = ROUND_ELT2(sharedMemory, tmp, 2, 1, 0, 7, 6, 5, 4, 3) ^ (c_tmp[2 + 24]); + n[3] = ROUND_ELT2(sharedMemory, tmp, 3, 2, 1, 0, 7, 6, 5, 4) ^ (c_tmp[3 + 24]); + n[4] = ROUND_ELT2(sharedMemory, tmp, 4, 3, 2, 1, 0, 7, 6, 5) ^ (c_tmp[4 + 24]); + n[5] = ROUND_ELT2(sharedMemory, tmp, 5, 4, 3, 2, 1, 0, 7, 6) ^ (c_tmp[5 + 24]); + n[6] = ROUND_ELT2(sharedMemory, tmp, 6, 5, 4, 3, 2, 1, 0, 7) ^ (c_tmp[6 + 24]); + n[7] = ROUND_ELT2(sharedMemory, tmp, 7, 6, 5, 4, 3, 2, 1, 0) ^ (c_tmp[7 + 24]); + + + tmp[0] = ROUND_ELT2(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1) ^ (c_tmp[0 + 32]); + tmp[1] = ROUND_ELT2(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2) ^ (c_tmp[1 + 32]); + tmp[2] = ROUND_ELT2(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3) ^ (c_tmp[2 + 32]); + tmp[3] = ROUND_ELT2(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4) ^ (c_tmp[3 + 32]); + tmp[4] = ROUND_ELT2(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5) ^ 
(c_tmp[4 + 32]); + tmp[5] = ROUND_ELT2(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6) ^ (c_tmp[5 + 32]); + tmp[6] = ROUND_ELT2(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7) ^ (c_tmp[6 + 32]); + tmp[7] = ROUND_ELT2(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0) ^ (c_tmp[7 + 32]); + + n[0] = ROUND_ELT2(sharedMemory, tmp, 0, 7, 6, 5, 4, 3, 2, 1) ^ (c_tmp[0 + 40]); + n[1] = ROUND_ELT2(sharedMemory, tmp, 1, 0, 7, 6, 5, 4, 3, 2) ^ (c_tmp[1 + 40]); + n[2] = ROUND_ELT2(sharedMemory, tmp, 2, 1, 0, 7, 6, 5, 4, 3) ^ (c_tmp[2 + 40]); + n[3] = ROUND_ELT2(sharedMemory, tmp, 3, 2, 1, 0, 7, 6, 5, 4) ^ (c_tmp[3 + 40]); + n[4] = ROUND_ELT2(sharedMemory, tmp, 4, 3, 2, 1, 0, 7, 6, 5) ^ (c_tmp[4 + 40]); + n[5] = ROUND_ELT2(sharedMemory, tmp, 5, 4, 3, 2, 1, 0, 7, 6) ^ (c_tmp[5 + 40]); + n[6] = ROUND_ELT2(sharedMemory, tmp, 6, 5, 4, 3, 2, 1, 0, 7) ^ (c_tmp[6 + 40]); + n[7] = ROUND_ELT2(sharedMemory, tmp, 7, 6, 5, 4, 3, 2, 1, 0) ^ (c_tmp[7 + 40]); + + tmp[0] = ROUND_ELT2(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1) ^ (c_tmp[0 + 48]); + tmp[1] = ROUND_ELT2(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2) ^ (c_tmp[1 + 48]); + tmp[2] = ROUND_ELT2(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3) ^ (c_tmp[2 + 48]); + tmp[3] = ROUND_ELT2(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4) ^ (c_tmp[3 + 48]); + tmp[4] = ROUND_ELT2(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5) ^ (c_tmp[4 + 48]); + tmp[5] = ROUND_ELT2(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6) ^ (c_tmp[5 + 48]); + tmp[6] = ROUND_ELT2(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7) ^ (c_tmp[6 + 48]); + tmp[7] = ROUND_ELT2(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0) ^ (c_tmp[7 + 48]); + + n[0] = ROUND_ELT2(sharedMemory, tmp, 0, 7, 6, 5, 4, 3, 2, 1) ^ (c_tmp[0 + 56]); + n[1] = ROUND_ELT2(sharedMemory, tmp, 1, 0, 7, 6, 5, 4, 3, 2) ^ (c_tmp[1 + 56]); + n[2] = ROUND_ELT2(sharedMemory, tmp, 2, 1, 0, 7, 6, 5, 4, 3) ^ (c_tmp[2 + 56]); + n[3] = ROUND_ELT2(sharedMemory, tmp, 3, 2, 1, 0, 7, 6, 5, 4) ^ (c_tmp[3 + 56]); + n[4] = ROUND_ELT2(sharedMemory, tmp, 4, 3, 2, 1, 0, 7, 6, 5) ^ (c_tmp[4 + 56]); + n[5] = 
ROUND_ELT2(sharedMemory, tmp, 5, 4, 3, 2, 1, 0, 7, 6) ^ (c_tmp[5 + 56]); + n[6] = ROUND_ELT2(sharedMemory, tmp, 6, 5, 4, 3, 2, 1, 0, 7) ^ (c_tmp[6 + 56]); + n[7] = ROUND_ELT2(sharedMemory, tmp, 7, 6, 5, 4, 3, 2, 1, 0) ^ (c_tmp[7 + 56]); + + tmp[0] = ROUND_ELT2(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1) ^ (c_tmp[0 + 64]); + tmp[1] = ROUND_ELT2(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2) ^ (c_tmp[1 + 64]); + tmp[2] = ROUND_ELT2(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3) ^ (c_tmp[2 + 64]); + tmp[3] = ROUND_ELT2(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4) ^ (c_tmp[3 + 64]); + tmp[4] = ROUND_ELT2(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5) ^ (c_tmp[4 + 64]); + tmp[5] = ROUND_ELT2(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6) ^ (c_tmp[5 + 64]); + tmp[6] = ROUND_ELT2(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7) ^ (c_tmp[6 + 64]); + tmp[7] = ROUND_ELT2(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0) ^ (c_tmp[7 + 64]); + + if ((devectorize(c_xtra[1] ^ ROUND_ELT2(sharedMemory, tmp, 3, 2, 1, 0, 7, 6, 5, 4) ^ ROUND_ELT2(sharedMemory, tmp, 5, 4, 3, 2, 1, 0, 7, 6))) <= backup) + { + uint32_t tmp = atomicExch(resNounce, nounce); + if (tmp != 0xffffffff) + resNounce[1] = tmp; + } + } // thread < threads + } +} + +__host__ extern void whirlpoolx_cpu_init(int thr_id, uint32_t threads) +{ + uint64_t t1[256]; + cudaMemcpyToSymbolAsync(mixTob0Tox, hmixTob0Tox, sizeof(hmixTob0Tox), 0, cudaMemcpyHostToDevice, gpustream[thr_id]); + + for (int i = 0; i < 256; i++) + { + t1[i] = ROTL64(hmixTob0Tox[i], 8); + } + cudaMemcpyToSymbolAsync(mixTob1Tox, t1, sizeof(hmixTob0Tox), 0, cudaMemcpyHostToDevice, gpustream[thr_id]); + cudaMalloc(&d_WXNonce[thr_id], 2 * sizeof(uint32_t)); + cudaMallocHost(&h_wxnounce[thr_id], 2 * sizeof(uint32_t)); + cudaMalloc((void **)&d_xtra[thr_id], 2 * sizeof(uint64_t)); + cudaMalloc((void **)&d_tmp[thr_id], 8 * 9 * sizeof(uint64_t)); +} + +__host__ void whirlpoolx_setBlock_80(int thr_id, void *pdata, const void *ptarget) +{ + uint64_t PaddedMessage[16]; + memcpy(PaddedMessage, pdata, 80); + 
memset((uint8_t*)&PaddedMessage + 80, 0, 48); + *(((uint8_t*)&PaddedMessage) + 80) = 0x80; /* ending */ + cudaMemcpyToSymbolAsync(pTarget, ptarget, 1 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id]); + cudaMemcpyToSymbolAsync(c_PaddedMessage80, PaddedMessage, 16 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id]); +} + +__host__ void whirlpoolx_precompute(int thr_id) +{ + dim3 grid(1); + dim3 block(256); + + precomputeX <<>>(8, d_xtra[thr_id], d_tmp[thr_id]); + cudaMemcpyToSymbolAsync(c_xtra, d_xtra[thr_id], 2 * sizeof(uint64_t), 0, cudaMemcpyDeviceToDevice, gpustream[thr_id]); + cudaMemcpyToSymbolAsync(c_tmp, d_tmp[thr_id], 8 * 9 * sizeof(uint64_t), 0, cudaMemcpyDeviceToDevice, gpustream[thr_id]); +} + +__host__ void cpu_whirlpoolx(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *foundnonce) +{ + dim3 grid((threads + TPB*NONCES_PER_THREAD - 1) / TPB / NONCES_PER_THREAD); + dim3 block(TPB); + + cudaMemsetAsync(d_WXNonce[thr_id], 0xff, 2 * sizeof(uint32_t), gpustream[thr_id]); + whirlpoolx <<>>(threads, startNounce, d_WXNonce[thr_id]); + + cudaMemcpyAsync(h_wxnounce[thr_id], d_WXNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); + cudaStreamSynchronize(gpustream[thr_id]); + foundnonce[0] = h_wxnounce[thr_id][0]; + foundnonce[1] = h_wxnounce[thr_id][1]; +} diff --git a/x15/cuda_x14_shabal512.cu b/x15/cuda_x14_shabal512.cu index b942156939..bb7fabfd37 100644 --- a/x15/cuda_x14_shabal512.cu +++ b/x15/cuda_x14_shabal512.cu @@ -2,6 +2,7 @@ * Shabal-512 for X14/X15 (STUB) */ #include "cuda_helper.h" +#include "cuda_vector.h" /* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */ @@ -43,9 +44,6 @@ #define sM 16 -#define C32 SPH_C32 -#define T32(x) (x) - #define O1 13 #define O2 9 #define O3 6 @@ -57,56 +55,52 @@ /* BEGIN -- automatically generated code. 
*/ -#define INPUT_BLOCK_ADD do { \ - B0 = T32(B0 + M0); \ - B1 = T32(B1 + M1); \ - B2 = T32(B2 + M2); \ - B3 = T32(B3 + M3); \ - B4 = T32(B4 + M4); \ - B5 = T32(B5 + M5); \ - B6 = T32(B6 + M6); \ - B7 = T32(B7 + M7); \ - B8 = T32(B8 + M8); \ - B9 = T32(B9 + M9); \ - BA = T32(BA + MA); \ - BB = T32(BB + MB); \ - BC = T32(BC + MC); \ - BD = T32(BD + MD); \ - BE = T32(BE + ME); \ - BF = T32(BF + MF); \ - } while (0) - -#define INPUT_BLOCK_SUB do { \ - C0 = T32(C0 - M0); \ - C1 = T32(C1 - M1); \ - C2 = T32(C2 - M2); \ - C3 = T32(C3 - M3); \ - C4 = T32(C4 - M4); \ - C5 = T32(C5 - M5); \ - C6 = T32(C6 - M6); \ - C7 = T32(C7 - M7); \ - C8 = T32(C8 - M8); \ - C9 = T32(C9 - M9); \ - CA = T32(CA - MA); \ - CB = T32(CB - MB); \ - CC = T32(CC - MC); \ - CD = T32(CD - MD); \ - CE = T32(CE - ME); \ - CF = T32(CF - MF); \ - } while (0) - -#define XOR_W do { \ +#define INPUT_BLOCK_ADD \ + B0 = B0 + M0; \ + B1 = B1 + M1; \ + B2 = B2 + M2; \ + B3 = B3 + M3; \ + B4 = B4 + M4; \ + B5 = B5 + M5; \ + B6 = B6 + M6; \ + B7 = B7 + M7; \ + B8 = B8 + M8; \ + B9 = B9 + M9; \ + BA = BA + MA; \ + BB = BB + MB; \ + BC = BC + MC; \ + BD = BD + MD; \ + BE = BE + ME; \ + BF = BF + MF; \ + +#define INPUT_BLOCK_SUB \ + C0 = C0 - M0; \ + C1 = C1 - M1; \ + C2 = C2 - M2; \ + C3 = C3 - M3; \ + C4 = C4 - M4; \ + C5 = C5 - M5; \ + C6 = C6 - M6; \ + C7 = C7 - M7; \ + C8 = C8 - M8; \ + C9 = C9 - M9; \ + CA = CA - MA; \ + CB = CB - MB; \ + CC = CC - MC; \ + CD = CD - MD; \ + CE = CE - ME; \ + CF = CF - MF; \ + +#define XOR_W \ A00 ^= Wlow; \ A01 ^= Whigh; \ - } while (0) -#define SWAP(v1, v2) do { \ - uint32_t tmp = (v1); \ - (v1) = (v2); \ - (v2) = tmp; \ - } while (0) +#define SWAP(v1, v2) \ + v1^=v2;\ + v2 ^= v1;\ + v1 ^= v2; -#define SWAP_BC do { \ +#define SWAP_BC \ SWAP(B0, C0); \ SWAP(B1, C1); \ SWAP(B2, C2); \ @@ -123,17 +117,15 @@ SWAP(BD, CD); \ SWAP(BE, CE); \ SWAP(BF, CF); \ - } while (0) -#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) do { \ - xa0 = T32((xa0 \ +#define PERM_ELT(xa0, xa1, 
xb0, xb1, xb2, xb3, xc, xm) \ + xa0 = ((xa0 \ ^ (ROTL32(xa1, 15) * 5U) \ ^ xc) * 3U) \ ^ xb1 ^ (xb2 & ~xb3) ^ xm; \ - xb0 = T32(~(ROTL32(xb0, 1) ^ xa0)); \ - } while (0) + xb0 = (~(ROTL32(xb0, 1) ^ xa0)); \ -#define PERM_STEP_0 do { \ +#define PERM_STEP_0 \ PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \ PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \ PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \ @@ -150,9 +142,8 @@ PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \ PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \ PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \ - } while (0) -#define PERM_STEP_1 do { \ +#define PERM_STEP_1 \ PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \ PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \ PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \ @@ -169,9 +160,8 @@ PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \ PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \ PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \ - } while (0) -#define PERM_STEP_2 do { \ +#define PERM_STEP_2 \ PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \ PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \ PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \ @@ -188,9 +178,8 @@ PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \ PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \ PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \ - } while (0) -#define APPLY_P do { \ +#define APPLY_P \ B0 = ROTL32(B0, 17); \ B1 = ROTL32(B1, 17); \ B2 = ROTL32(B2, 17); \ @@ -210,45 +199,44 @@ PERM_STEP_0; \ PERM_STEP_1; \ PERM_STEP_2; \ - A0B = T32(A0B + C6); \ - A0A = T32(A0A + C5); \ - A09 = T32(A09 + C4); \ - A08 = T32(A08 + C3); \ - A07 = T32(A07 + C2); \ - A06 = T32(A06 + C1); \ - A05 = T32(A05 + C0); \ - A04 = T32(A04 + CF); \ - A03 = T32(A03 + CE); \ - A02 = T32(A02 + CD); \ - A01 = T32(A01 + CC); \ - A00 = T32(A00 + CB); \ - A0B = T32(A0B + CA); \ - A0A = T32(A0A + C9); \ - A09 = T32(A09 + C8); \ - A08 = T32(A08 + C7); \ - A07 = T32(A07 + C6); \ - A06 = T32(A06 + C5); \ - A05 = T32(A05 + C4); \ - A04 = T32(A04 + C3); \ - A03 = T32(A03 + C2); \ - A02 
= T32(A02 + C1); \ - A01 = T32(A01 + C0); \ - A00 = T32(A00 + CF); \ - A0B = T32(A0B + CE); \ - A0A = T32(A0A + CD); \ - A09 = T32(A09 + CC); \ - A08 = T32(A08 + CB); \ - A07 = T32(A07 + CA); \ - A06 = T32(A06 + C9); \ - A05 = T32(A05 + C8); \ - A04 = T32(A04 + C7); \ - A03 = T32(A03 + C6); \ - A02 = T32(A02 + C5); \ - A01 = T32(A01 + C4); \ - A00 = T32(A00 + C3); \ - } while (0) - -#define APPLY_P_FINAL do { \ + A0B = (A0B + C6); \ + A0A = (A0A + C5); \ + A09 = (A09 + C4); \ + A08 = (A08 + C3); \ + A07 = (A07 + C2); \ + A06 = (A06 + C1); \ + A05 = (A05 + C0); \ + A04 = (A04 + CF); \ + A03 = (A03 + CE); \ + A02 = (A02 + CD); \ + A01 = (A01 + CC); \ + A00 = (A00 + CB); \ + A0B = (A0B + CA); \ + A0A = (A0A + C9); \ + A09 = (A09 + C8); \ + A08 = (A08 + C7); \ + A07 = (A07 + C6); \ + A06 = (A06 + C5); \ + A05 = (A05 + C4); \ + A04 = (A04 + C3); \ + A03 = (A03 + C2); \ + A02 = (A02 + C1); \ + A01 = (A01 + C0); \ + A00 = (A00 + CF); \ + A0B = (A0B + CE); \ + A0A = (A0A + CD); \ + A09 = (A09 + CC); \ + A08 = (A08 + CB); \ + A07 = (A07 + CA); \ + A06 = (A06 + C9); \ + A05 = (A05 + C8); \ + A04 = (A04 + C7); \ + A03 = (A03 + C6); \ + A02 = (A02 + C5); \ + A01 = (A01 + C4); \ + A00 = (A00 + C3); \ + +#define APPLY_P_FINAL \ B0 = ROTL32(B0, 17); \ B1 = ROTL32(B1, 17); \ B2 = ROTL32(B2, 17); \ @@ -268,135 +256,128 @@ PERM_STEP_0; \ PERM_STEP_1; \ PERM_STEP_2; \ - } while (0) -#define INCR_W do { \ - if ((Wlow = T32(Wlow + 1)) == 0) \ - Whigh = T32(Whigh + 1); \ - } while (0) +#define INCR_W if ((Wlow = (Wlow + 1)) == 0) \ + Whigh = (Whigh + 1); \ + #if 0 /* other hash sizes init */ static const uint32_t A_init_192[] = { - C32(0xFD749ED4), C32(0xB798E530), C32(0x33904B6F), C32(0x46BDA85E), - C32(0x076934B4), C32(0x454B4058), C32(0x77F74527), C32(0xFB4CF465), - C32(0x62931DA9), C32(0xE778C8DB), C32(0x22B3998E), C32(0xAC15CFB9) + 0xFD749ED4), 0xB798E530), 0x33904B6F), 0x46BDA85E), + 0x076934B4), 0x454B4058), 0x77F74527), 0xFB4CF465), + 0x62931DA9), 0xE778C8DB), 0x22B3998E), 
0xAC15CFB9) }; static const uint32_t B_init_192[] = { - C32(0x58BCBAC4), C32(0xEC47A08E), C32(0xAEE933B2), C32(0xDFCBC824), - C32(0xA7944804), C32(0xBF65BDB0), C32(0x5A9D4502), C32(0x59979AF7), - C32(0xC5CEA54E), C32(0x4B6B8150), C32(0x16E71909), C32(0x7D632319), - C32(0x930573A0), C32(0xF34C63D1), C32(0xCAF914B4), C32(0xFDD6612C) + 0x58BCBAC4), 0xEC47A08E), 0xAEE933B2), 0xDFCBC824), + 0xA7944804), 0xBF65BDB0), 0x5A9D4502), 0x59979AF7), + 0xC5CEA54E), 0x4B6B8150), 0x16E71909), 0x7D632319), + 0x930573A0), 0xF34C63D1), 0xCAF914B4), 0xFDD6612C) }; static const uint32_t C_init_192[] = { - C32(0x61550878), C32(0x89EF2B75), C32(0xA1660C46), C32(0x7EF3855B), - C32(0x7297B58C), C32(0x1BC67793), C32(0x7FB1C723), C32(0xB66FC640), - C32(0x1A48B71C), C32(0xF0976D17), C32(0x088CE80A), C32(0xA454EDF3), - C32(0x1C096BF4), C32(0xAC76224B), C32(0x5215781C), C32(0xCD5D2669) + 0x61550878), 0x89EF2B75), 0xA1660C46), 0x7EF3855B), + 0x7297B58C), 0x1BC67793), 0x7FB1C723), 0xB66FC640), + 0x1A48B71C), 0xF0976D17), 0x088CE80A), 0xA454EDF3), + 0x1C096BF4), 0xAC76224B), 0x5215781C), 0xCD5D2669) }; static const uint32_t A_init_224[] = { - C32(0xA5201467), C32(0xA9B8D94A), C32(0xD4CED997), C32(0x68379D7B), - C32(0xA7FC73BA), C32(0xF1A2546B), C32(0x606782BF), C32(0xE0BCFD0F), - C32(0x2F25374E), C32(0x069A149F), C32(0x5E2DFF25), C32(0xFAECF061) + 0xA5201467), 0xA9B8D94A), 0xD4CED997), 0x68379D7B), + 0xA7FC73BA), 0xF1A2546B), 0x606782BF), 0xE0BCFD0F), + 0x2F25374E), 0x069A149F), 0x5E2DFF25), 0xFAECF061) }; static const uint32_t B_init_224[] = { - C32(0xEC9905D8), C32(0xF21850CF), C32(0xC0A746C8), C32(0x21DAD498), - C32(0x35156EEB), C32(0x088C97F2), C32(0x26303E40), C32(0x8A2D4FB5), - C32(0xFEEE44B6), C32(0x8A1E9573), C32(0x7B81111A), C32(0xCBC139F0), - C32(0xA3513861), C32(0x1D2C362E), C32(0x918C580E), C32(0xB58E1B9C) + 0xEC9905D8), 0xF21850CF), 0xC0A746C8), 0x21DAD498), + 0x35156EEB), 0x088C97F2), 0x26303E40), 0x8A2D4FB5), + 0xFEEE44B6), 0x8A1E9573), 0x7B81111A), 0xCBC139F0), + 0xA3513861), 
0x1D2C362E), 0x918C580E), 0xB58E1B9C) }; static const uint32_t C_init_224[] = { - C32(0xE4B573A1), C32(0x4C1A0880), C32(0x1E907C51), C32(0x04807EFD), - C32(0x3AD8CDE5), C32(0x16B21302), C32(0x02512C53), C32(0x2204CB18), - C32(0x99405F2D), C32(0xE5B648A1), C32(0x70AB1D43), C32(0xA10C25C2), - C32(0x16F1AC05), C32(0x38BBEB56), C32(0x9B01DC60), C32(0xB1096D83) + 0xE4B573A1), 0x4C1A0880), 0x1E907C51), 0x04807EFD), + 0x3AD8CDE5), 0x16B21302), 0x02512C53), 0x2204CB18), + 0x99405F2D), 0xE5B648A1), 0x70AB1D43), 0xA10C25C2), + 0x16F1AC05), 0x38BBEB56), 0x9B01DC60), 0xB1096D83) }; static const uint32_t A_init_256[] = { - C32(0x52F84552), C32(0xE54B7999), C32(0x2D8EE3EC), C32(0xB9645191), - C32(0xE0078B86), C32(0xBB7C44C9), C32(0xD2B5C1CA), C32(0xB0D2EB8C), - C32(0x14CE5A45), C32(0x22AF50DC), C32(0xEFFDBC6B), C32(0xEB21B74A) + 0x52F84552), 0xE54B7999), 0x2D8EE3EC), 0xB9645191), + 0xE0078B86), 0xBB7C44C9), 0xD2B5C1CA), 0xB0D2EB8C), + 0x14CE5A45), 0x22AF50DC), 0xEFFDBC6B), 0xEB21B74A) }; static const uint32_t B_init_256[] = { - C32(0xB555C6EE), C32(0x3E710596), C32(0xA72A652F), C32(0x9301515F), - C32(0xDA28C1FA), C32(0x696FD868), C32(0x9CB6BF72), C32(0x0AFE4002), - C32(0xA6E03615), C32(0x5138C1D4), C32(0xBE216306), C32(0xB38B8890), - C32(0x3EA8B96B), C32(0x3299ACE4), C32(0x30924DD4), C32(0x55CB34A5) + 0xB555C6EE), 0x3E710596), 0xA72A652F), 0x9301515F), + 0xDA28C1FA), 0x696FD868), 0x9CB6BF72), 0x0AFE4002), + 0xA6E03615), 0x5138C1D4), 0xBE216306), 0xB38B8890), + 0x3EA8B96B), 0x3299ACE4), 0x30924DD4), 0x55CB34A5) }; static const uint32_t C_init_256[] = { - C32(0xB405F031), C32(0xC4233EBA), C32(0xB3733979), C32(0xC0DD9D55), - C32(0xC51C28AE), C32(0xA327B8E1), C32(0x56C56167), C32(0xED614433), - C32(0x88B59D60), C32(0x60E2CEBA), C32(0x758B4B8B), C32(0x83E82A7F), - C32(0xBC968828), C32(0xE6E00BF7), C32(0xBA839E55), C32(0x9B491C60) + 0xB405F031), 0xC4233EBA), 0xB3733979), 0xC0DD9D55), + 0xC51C28AE), 0xA327B8E1), 0x56C56167), 0xED614433), + 0x88B59D60), 0x60E2CEBA), 0x758B4B8B), 
0x83E82A7F), + 0xBC968828), 0xE6E00BF7), 0xBA839E55), 0x9B491C60) }; static const uint32_t A_init_384[] = { - C32(0xC8FCA331), C32(0xE55C504E), C32(0x003EBF26), C32(0xBB6B8D83), - C32(0x7B0448C1), C32(0x41B82789), C32(0x0A7C9601), C32(0x8D659CFF), - C32(0xB6E2673E), C32(0xCA54C77B), C32(0x1460FD7E), C32(0x3FCB8F2D) + 0xC8FCA331), 0xE55C504E), 0x003EBF26), 0xBB6B8D83), + 0x7B0448C1), 0x41B82789), 0x0A7C9601), 0x8D659CFF), + 0xB6E2673E), 0xCA54C77B), 0x1460FD7E), 0x3FCB8F2D) }; static const uint32_t B_init_384[] = { - C32(0x527291FC), C32(0x2A16455F), C32(0x78E627E5), C32(0x944F169F), - C32(0x1CA6F016), C32(0xA854EA25), C32(0x8DB98ABE), C32(0xF2C62641), - C32(0x30117DCB), C32(0xCF5C4309), C32(0x93711A25), C32(0xF9F671B8), - C32(0xB01D2116), C32(0x333F4B89), C32(0xB285D165), C32(0x86829B36) + 0x527291FC), 0x2A16455F), 0x78E627E5), 0x944F169F), + 0x1CA6F016), 0xA854EA25), 0x8DB98ABE), 0xF2C62641), + 0x30117DCB), 0xCF5C4309), 0x93711A25), 0xF9F671B8), + 0xB01D2116), 0x333F4B89), 0xB285D165), 0x86829B36) }; static const uint32_t C_init_384[] = { - C32(0xF764B11A), C32(0x76172146), C32(0xCEF6934D), C32(0xC6D28399), - C32(0xFE095F61), C32(0x5E6018B4), C32(0x5048ECF5), C32(0x51353261), - C32(0x6E6E36DC), C32(0x63130DAD), C32(0xA9C69BD6), C32(0x1E90EA0C), - C32(0x7C35073B), C32(0x28D95E6D), C32(0xAA340E0D), C32(0xCB3DEE70) + 0xF764B11A), 0x76172146), 0xCEF6934D), 0xC6D28399), + 0xFE095F61), 0x5E6018B4), 0x5048ECF5), 0x51353261), + 0x6E6E36DC), 0x63130DAD), 0xA9C69BD6), 0x1E90EA0C), + 0x7C35073B), 0x28D95E6D), 0xAA340E0D), 0xCB3DEE70) }; #endif -__device__ __constant__ -static const uint32_t d_A512[] = { - C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632), - C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B), - C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F) -}; - -__device__ __constant__ -static const uint32_t d_B512[] = { - C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640), - C32(0xEB6F56C7), 
C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08), - C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E), - C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B) -}; - -__device__ __constant__ -static const uint32_t d_C512[] = { - C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359), - C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780), - C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A), - C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969) -}; /***************************************************/ // GPU Hash Function __global__ __launch_bounds__(256, 4) -void x14_shabal512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +void x14_shabal512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_hash) { - __syncthreads(); - - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + const uint32_t d_A512[] = { + 0x20728DFD, 0x46C0BD53, 0xE782B699,0x55304632, + 0x71B4EF90, 0x0EA9E82C, 0xDBB930F1, 0xFAD06B8B, + 0xBE0CAE40, 0x8BD14410, 0x76D2ADAC, 0x28ACAB7F + }; + + const uint32_t d_B512[] = { + 0xC1099CB7, 0x07B385F3, 0xE7442C26, 0xCC8AD640, + 0xEB6F56C7, 0x1EA81AA9, 0x73B9D314, 0x1DE85D08, + 0x48910A5A, 0x893B22DB, 0xC5A0DF44, 0xBBC4324E, + 0x72D2F240, 0x75941D99, 0x6D8BDE82, 0xA1A7502B + }; + + const uint32_t d_C512[] = { + 0xD9BF68D1, 0x58BAD750, 0x56028CB2, 0x8134F359, + 0xB5D469D8, 0x941A8CC2, 0x418B2A6E, 0x04052780, + 0x7F07D787, 0x5194358F, 0x3C60D665, 0xBE97D79A, + 0x950C3434, 0xAED9A06D, 0x2537DC8D, 0x7CDB5969 + }; + +// if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3]; // [hashPosition * 8] - + const uint32_t nounce = (startNounce + thread); + const uint32_t hashPosition = nounce - startNounce; + uint32_t *const Hash = &g_hash[hashPosition*16]; // [hashPosition * 8] uint32_t A00 = d_A512[0], A01 = d_A512[1], A02 = d_A512[2], A03 = d_A512[3], A04 = d_A512[4], A05 = d_A512[5], A06 = d_A512[6], A07 = d_A512[7], A08 = d_A512[8], A09 = d_A512[9], A0A = d_A512[10], A0B = d_A512[11]; @@ -410,23 +391,31 @@ void x14_shabal512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t CC = d_C512[12], CD = d_C512[13], CE = d_C512[14], CF = d_C512[15]; uint32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; - M0 = Hash[0]; - M1 = Hash[1]; - M2 = Hash[2]; - M3 = Hash[3]; - M4 = Hash[4]; - M5 = Hash[5]; - M6 = Hash[6]; - M7 = Hash[7]; - - M8 = Hash[8]; - M9 = Hash[9]; - MA = Hash[10]; - MB = Hash[11]; - MC = Hash[12]; - MD = Hash[13]; - ME = Hash[14]; - MF = Hash[15]; + + uint32_t msg[16]; + + uint28 *phash = (uint28*)Hash; + uint28 *outpt = (uint28*)msg; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + + M0 = msg[0]; + M1 = msg[1]; + M2 = msg[2]; + M3 = msg[3]; + M4 = msg[4]; + M5 = msg[5]; + M6 = msg[6]; + M7 = msg[7]; + + M8 = msg[8]; + M9 = msg[9]; + MA = msg[10]; + MB = msg[11]; + MC = msg[12]; + MD = msg[13]; + ME = msg[14]; + MF = msg[15]; INPUT_BLOCK_ADD; A00 ^= 1; @@ -470,18 +459,11 @@ void x14_shabal512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t Hash[13] = BD; Hash[14] = BE; Hash[15] = BF; - - //result = (Hash[3] <= target); - - uint32_t *outpHash = (uint32_t*)&g_hash[hashPosition << 3]; // [8 * hashPosition]; - - for (int i = 0; i < 16; i++) - outpHash[i] = Hash[i]; } } // #include -__host__ void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void 
x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) { const uint32_t threadsperblock = 64; @@ -489,5 +471,5 @@ __host__ void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t s dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x14_shabal512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x14_shabal512_gpu_hash_64<<>>(threads, startNounce, d_hash); } diff --git a/x15/cuda_x14_shabal512.cu.orig b/x15/cuda_x14_shabal512.cu.orig new file mode 100644 index 0000000000..b6a156178a --- /dev/null +++ b/x15/cuda_x14_shabal512.cu.orig @@ -0,0 +1,474 @@ +/* + * Shabal-512 for X14/X15 (STUB) + */ +#include "cuda_helper.h" + + + +/* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */ +/* + * Shabal implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +/* + * Part of this code was automatically generated (the part between + * the "BEGIN" and "END" markers). + */ + +#define sM 16 + +#define O1 13 +#define O2 9 +#define O3 6 + +/* + * We copy the state into local variables, so that the compiler knows + * that it can optimize them at will. + */ + +/* BEGIN -- automatically generated code. */ + +#define INPUT_BLOCK_ADD \ + B0 = B0 + M0; \ + B1 = B1 + M1; \ + B2 = B2 + M2; \ + B3 = B3 + M3; \ + B4 = B4 + M4; \ + B5 = B5 + M5; \ + B6 = B6 + M6; \ + B7 = B7 + M7; \ + B8 = B8 + M8; \ + B9 = B9 + M9; \ + BA = BA + MA; \ + BB = BB + MB; \ + BC = BC + MC; \ + BD = BD + MD; \ + BE = BE + ME; \ + BF = BF + MF; \ + +#define INPUT_BLOCK_SUB \ + C0 = C0 - M0; \ + C1 = C1 - M1; \ + C2 = C2 - M2; \ + C3 = C3 - M3; \ + C4 = C4 - M4; \ + C5 = C5 - M5; \ + C6 = C6 - M6; \ + C7 = C7 - M7; \ + C8 = C8 - M8; \ + C9 = C9 - M9; \ + CA = CA - MA; \ + CB = CB - MB; \ + CC = CC - MC; \ + CD = CD - MD; \ + CE = CE - ME; \ + CF = CF - MF; \ + +#define XOR_W \ + A00 ^= Wlow; \ + A01 ^= Whigh; \ + +#define SWAP(v1, v2) \ + v1^=v2;\ + v2 ^= v1;\ + v1 ^= v2; + +#define SWAP_BC \ + SWAP(B0, C0); \ + SWAP(B1, C1); \ + SWAP(B2, C2); \ + SWAP(B3, C3); \ + SWAP(B4, C4); \ + SWAP(B5, C5); \ + SWAP(B6, C6); \ + SWAP(B7, C7); \ + SWAP(B8, C8); \ + SWAP(B9, C9); \ + SWAP(BA, CA); \ + SWAP(BB, CB); \ + SWAP(BC, CC); \ + SWAP(BD, CD); \ + SWAP(BE, CE); \ + SWAP(BF, CF); \ + +#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ + xa0 = ((xa0 \ + ^ (ROTL32(xa1, 15) * 5U) \ + ^ xc) * 3U) \ + ^ xb1 ^ (xb2 & ~xb3) ^ xm; \ + xb0 = (~(ROTL32(xb0, 1) ^ xa0)); \ + +#define 
PERM_STEP_0 \ + PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \ + +#define PERM_STEP_1 \ + PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \ + +#define PERM_STEP_2 \ + PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A04, 
A03, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \ + +#define APPLY_P \ + B0 = ROTL32(B0, 17); \ + B1 = ROTL32(B1, 17); \ + B2 = ROTL32(B2, 17); \ + B3 = ROTL32(B3, 17); \ + B4 = ROTL32(B4, 17); \ + B5 = ROTL32(B5, 17); \ + B6 = ROTL32(B6, 17); \ + B7 = ROTL32(B7, 17); \ + B8 = ROTL32(B8, 17); \ + B9 = ROTL32(B9, 17); \ + BA = ROTL32(BA, 17); \ + BB = ROTL32(BB, 17); \ + BC = ROTL32(BC, 17); \ + BD = ROTL32(BD, 17); \ + BE = ROTL32(BE, 17); \ + BF = ROTL32(BF, 17); \ + PERM_STEP_0; \ + PERM_STEP_1; \ + PERM_STEP_2; \ + A0B = (A0B + C6); \ + A0A = (A0A + C5); \ + A09 = (A09 + C4); \ + A08 = (A08 + C3); \ + A07 = (A07 + C2); \ + A06 = (A06 + C1); \ + A05 = (A05 + C0); \ + A04 = (A04 + CF); \ + A03 = (A03 + CE); \ + A02 = (A02 + CD); \ + A01 = (A01 + CC); \ + A00 = (A00 + CB); \ + A0B = (A0B + CA); \ + A0A = (A0A + C9); \ + A09 = (A09 + C8); \ + A08 = (A08 + C7); \ + A07 = (A07 + C6); \ + A06 = (A06 + C5); \ + A05 = (A05 + C4); \ + A04 = (A04 + C3); \ + A03 = (A03 + C2); \ + A02 = (A02 + C1); \ + A01 = (A01 + C0); \ + A00 = (A00 + CF); \ + A0B = (A0B + CE); \ + A0A = (A0A + CD); \ + A09 = (A09 + CC); \ + A08 = (A08 + CB); \ + A07 = (A07 + CA); \ + A06 = (A06 + C9); \ + A05 = (A05 + C8); \ + A04 = (A04 + C7); \ + A03 = (A03 + C6); \ + A02 = (A02 + C5); \ + A01 = (A01 + C4); \ + A00 = (A00 + C3); \ + +#define APPLY_P_FINAL \ + B0 = ROTL32(B0, 17); \ + B1 = ROTL32(B1, 17); \ + B2 = ROTL32(B2, 17); \ + B3 = ROTL32(B3, 17); \ + B4 = ROTL32(B4, 17); \ + B5 = ROTL32(B5, 17); \ + B6 = ROTL32(B6, 17); \ + B7 = ROTL32(B7, 17); \ + B8 = ROTL32(B8, 17); \ + B9 = ROTL32(B9, 17); \ + BA = ROTL32(BA, 17); \ + BB = ROTL32(BB, 17); \ + BC = ROTL32(BC, 17); \ + BD = 
ROTL32(BD, 17); \ + BE = ROTL32(BE, 17); \ + BF = ROTL32(BF, 17); \ + PERM_STEP_0; \ + PERM_STEP_1; \ + PERM_STEP_2; \ + +#define INCR_W if ((Wlow = (Wlow + 1)) == 0) \ + Whigh = (Whigh + 1); \ + + + +#if 0 /* other hash sizes init */ + +static const uint32_t A_init_192[] = { + 0xFD749ED4), 0xB798E530), 0x33904B6F), 0x46BDA85E), + 0x076934B4), 0x454B4058), 0x77F74527), 0xFB4CF465), + 0x62931DA9), 0xE778C8DB), 0x22B3998E), 0xAC15CFB9) +}; + +static const uint32_t B_init_192[] = { + 0x58BCBAC4), 0xEC47A08E), 0xAEE933B2), 0xDFCBC824), + 0xA7944804), 0xBF65BDB0), 0x5A9D4502), 0x59979AF7), + 0xC5CEA54E), 0x4B6B8150), 0x16E71909), 0x7D632319), + 0x930573A0), 0xF34C63D1), 0xCAF914B4), 0xFDD6612C) +}; + +static const uint32_t C_init_192[] = { + 0x61550878), 0x89EF2B75), 0xA1660C46), 0x7EF3855B), + 0x7297B58C), 0x1BC67793), 0x7FB1C723), 0xB66FC640), + 0x1A48B71C), 0xF0976D17), 0x088CE80A), 0xA454EDF3), + 0x1C096BF4), 0xAC76224B), 0x5215781C), 0xCD5D2669) +}; + +static const uint32_t A_init_224[] = { + 0xA5201467), 0xA9B8D94A), 0xD4CED997), 0x68379D7B), + 0xA7FC73BA), 0xF1A2546B), 0x606782BF), 0xE0BCFD0F), + 0x2F25374E), 0x069A149F), 0x5E2DFF25), 0xFAECF061) +}; + +static const uint32_t B_init_224[] = { + 0xEC9905D8), 0xF21850CF), 0xC0A746C8), 0x21DAD498), + 0x35156EEB), 0x088C97F2), 0x26303E40), 0x8A2D4FB5), + 0xFEEE44B6), 0x8A1E9573), 0x7B81111A), 0xCBC139F0), + 0xA3513861), 0x1D2C362E), 0x918C580E), 0xB58E1B9C) +}; + +static const uint32_t C_init_224[] = { + 0xE4B573A1), 0x4C1A0880), 0x1E907C51), 0x04807EFD), + 0x3AD8CDE5), 0x16B21302), 0x02512C53), 0x2204CB18), + 0x99405F2D), 0xE5B648A1), 0x70AB1D43), 0xA10C25C2), + 0x16F1AC05), 0x38BBEB56), 0x9B01DC60), 0xB1096D83) +}; + +static const uint32_t A_init_256[] = { + 0x52F84552), 0xE54B7999), 0x2D8EE3EC), 0xB9645191), + 0xE0078B86), 0xBB7C44C9), 0xD2B5C1CA), 0xB0D2EB8C), + 0x14CE5A45), 0x22AF50DC), 0xEFFDBC6B), 0xEB21B74A) +}; + +static const uint32_t B_init_256[] = { + 0xB555C6EE), 0x3E710596), 0xA72A652F), 0x9301515F), + 
0xDA28C1FA), 0x696FD868), 0x9CB6BF72), 0x0AFE4002), + 0xA6E03615), 0x5138C1D4), 0xBE216306), 0xB38B8890), + 0x3EA8B96B), 0x3299ACE4), 0x30924DD4), 0x55CB34A5) +}; + +static const uint32_t C_init_256[] = { + 0xB405F031), 0xC4233EBA), 0xB3733979), 0xC0DD9D55), + 0xC51C28AE), 0xA327B8E1), 0x56C56167), 0xED614433), + 0x88B59D60), 0x60E2CEBA), 0x758B4B8B), 0x83E82A7F), + 0xBC968828), 0xE6E00BF7), 0xBA839E55), 0x9B491C60) +}; + +static const uint32_t A_init_384[] = { + 0xC8FCA331), 0xE55C504E), 0x003EBF26), 0xBB6B8D83), + 0x7B0448C1), 0x41B82789), 0x0A7C9601), 0x8D659CFF), + 0xB6E2673E), 0xCA54C77B), 0x1460FD7E), 0x3FCB8F2D) +}; + +static const uint32_t B_init_384[] = { + 0x527291FC), 0x2A16455F), 0x78E627E5), 0x944F169F), + 0x1CA6F016), 0xA854EA25), 0x8DB98ABE), 0xF2C62641), + 0x30117DCB), 0xCF5C4309), 0x93711A25), 0xF9F671B8), + 0xB01D2116), 0x333F4B89), 0xB285D165), 0x86829B36) +}; + +static const uint32_t C_init_384[] = { + 0xF764B11A), 0x76172146), 0xCEF6934D), 0xC6D28399), + 0xFE095F61), 0x5E6018B4), 0x5048ECF5), 0x51353261), + 0x6E6E36DC), 0x63130DAD), 0xA9C69BD6), 0x1E90EA0C), + 0x7C35073B), 0x28D95E6D), 0xAA340E0D), 0xCB3DEE70) +}; +#endif + + +/***************************************************/ +// GPU Hash Function +__global__ __launch_bounds__(256, 4) +void x14_shabal512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + const uint32_t d_A512[] = { + 0x20728DFD, 0x46C0BD53, 0xE782B699,0x55304632, + 0x71B4EF90, 0x0EA9E82C, 0xDBB930F1, 0xFAD06B8B, + 0xBE0CAE40, 0x8BD14410, 0x76D2ADAC, 0x28ACAB7F + }; + + const uint32_t d_B512[] = { + 0xC1099CB7, 0x07B385F3, 0xE7442C26, 0xCC8AD640, + 0xEB6F56C7, 0x1EA81AA9, 0x73B9D314, 0x1DE85D08, + 0x48910A5A, 0x893B22DB, 0xC5A0DF44, 0xBBC4324E, + 0x72D2F240, 0x75941D99, 0x6D8BDE82, 0xA1A7502B + }; + + const uint32_t d_C512[] = { + 0xD9BF68D1, 0x58BAD750, 0x56028CB2, 0x8134F359, + 0xB5D469D8, 0x941A8CC2, 0x418B2A6E, 0x04052780, + 
0x7F07D787, 0x5194358F, 0x3C60D665, 0xBE97D79A, + 0x950C3434, 0xAED9A06D, 0x2537DC8D, 0x7CDB5969 + }; + + if (thread < threads) + { + const uint32_t nounce = (startNounce + thread); + const uint32_t hashPosition = nounce - startNounce; + uint32_t *const Hash = &g_hash[hashPosition*16]; // [hashPosition * 8] + uint32_t tmp; + uint32_t A00 = d_A512[0], A01 = d_A512[1], A02 = d_A512[2], A03 = d_A512[3], + A04 = d_A512[4], A05 = d_A512[5], A06 = d_A512[6], A07 = d_A512[7], + A08 = d_A512[8], A09 = d_A512[9], A0A = d_A512[10], A0B = d_A512[11]; + uint32_t B0 = d_B512[0], B1 = d_B512[1], B2 = d_B512[2], B3 = d_B512[3], + B4 = d_B512[4], B5 = d_B512[5], B6 = d_B512[6], B7 = d_B512[7], + B8 = d_B512[8], B9 = d_B512[9], BA = d_B512[10], BB = d_B512[11], + BC = d_B512[12], BD = d_B512[13], BE = d_B512[14], BF = d_B512[15]; + uint32_t C0 = d_C512[0], C1 = d_C512[1], C2 = d_C512[2], C3 = d_C512[3], + C4 = d_C512[4], C5 = d_C512[5], C6 = d_C512[6], C7 = d_C512[7], + C8 = d_C512[8], C9 = d_C512[9], CA = d_C512[10], CB = d_C512[11], + CC = d_C512[12], CD = d_C512[13], CE = d_C512[14], CF = d_C512[15]; + uint32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; + + M0 = Hash[0]; + M1 = Hash[1]; + M2 = Hash[2]; + M3 = Hash[3]; + M4 = Hash[4]; + M5 = Hash[5]; + M6 = Hash[6]; + M7 = Hash[7]; + + M8 = Hash[8]; + M9 = Hash[9]; + MA = Hash[10]; + MB = Hash[11]; + MC = Hash[12]; + MD = Hash[13]; + ME = Hash[14]; + MF = Hash[15]; + + INPUT_BLOCK_ADD; + A00 ^= 1; + APPLY_P; + INPUT_BLOCK_SUB; + SWAP_BC; + + M0 = 0x80; + M1 = M2 = M3 = M4 = M5 = M6 = M7 = M8 = M9 = MA = MB = MC = MD = ME = MF = 0; + + INPUT_BLOCK_ADD; + A00 ^= 2; + APPLY_P; + + SWAP_BC; + A00 ^= 2; + APPLY_P; + + SWAP_BC; + A00 ^= 2; + APPLY_P; + + SWAP_BC; + A00 ^= 2; + APPLY_P_FINAL; + 
+ Hash[0] = B0; + Hash[1] = B1; + Hash[2] = B2; + Hash[3] = B3; + Hash[4] = B4; + Hash[5] = B5; + Hash[6] = B6; + Hash[7] = B7; + + Hash[8] = B8; + Hash[9] = B9; + Hash[10] = BA; + Hash[11] = BB; + Hash[12] = BC; + Hash[13] = BD; + Hash[14] = BE; + Hash[15] = BF; + } +} + +// #include +__host__ void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) +{ + const uint32_t threadsperblock = 64; + + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + x14_shabal512_gpu_hash_64<<<grid, block>>>(threads, startNounce, d_hash); +} diff --git a/x15/cuda_x15_whirlpool.cu b/x15/cuda_x15_whirlpool.cu index 4729d9b902..5cb34ec946 100644 --- a/x15/cuda_x15_whirlpool.cu +++ b/x15/cuda_x15_whirlpool.cu @@ -10,25 +10,18 @@ #define USE_SHARED 1 #include "cuda_helper.h" +#include "cuda_vector.h" + + __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) __constant__ uint32_t pTarget[8]; -uint32_t *d_wnounce[MAX_GPUS]; -uint32_t *d_WNonce[MAX_GPUS]; +static uint32_t *h_wnounce[MAX_GPUS]; +static uint32_t *d_WNonce[MAX_GPUS]; #define USE_ALL_TABLES 1 -__constant__ static uint64_t mixTob0Tox[256]; #if USE_ALL_TABLES -__constant__ static uint64_t mixTob1Tox[256]; -__constant__ static uint64_t mixTob2Tox[256]; -__constant__ static uint64_t mixTob3Tox[256]; -__constant__ static uint64_t mixTob4Tox[256]; -__constant__ static uint64_t mixTob5Tox[256]; -__constant__ static uint64_t mixTob6Tox[256]; -__constant__ static uint64_t mixTob7Tox[256]; -#endif /** * Whirlpool CUDA kernel implementation.
@@ -62,1069 +55,7 @@ __constant__ static uint64_t mixTob7Tox[256]; * @author SP */ -static const uint64_t old1_T0[256] = { - SPH_C64(0x78D8C07818281818), SPH_C64(0xAF2605AF23652323), - SPH_C64(0xF9B87EF9C657C6C6), SPH_C64(0x6FFB136FE825E8E8), - SPH_C64(0xA1CB4CA187948787), SPH_C64(0x6211A962B8D5B8B8), - SPH_C64(0x0509080501030101), SPH_C64(0x6E0D426E4FD14F4F), - SPH_C64(0xEE9BADEE365A3636), SPH_C64(0x04FF5904A6F7A6A6), - SPH_C64(0xBD0CDEBDD26BD2D2), SPH_C64(0x060EFB06F502F5F5), - SPH_C64(0x8096EF80798B7979), SPH_C64(0xCE305FCE6FB16F6F), - SPH_C64(0xEF6DFCEF91AE9191), SPH_C64(0x07F8AA0752F65252), - SPH_C64(0xFD4727FD60A06060), SPH_C64(0x76358976BCD9BCBC), - SPH_C64(0xCD37ACCD9BB09B9B), SPH_C64(0x8C8A048C8E8F8E8E), - SPH_C64(0x15D27115A3F8A3A3), SPH_C64(0x3C6C603C0C140C0C), - SPH_C64(0x8A84FF8A7B8D7B7B), SPH_C64(0xE180B5E1355F3535), - SPH_C64(0x69F5E8691D271D1D), SPH_C64(0x47B35347E03DE0E0), - SPH_C64(0xAC21F6ACD764D7D7), SPH_C64(0xED9C5EEDC25BC2C2), - SPH_C64(0x96436D962E722E2E), SPH_C64(0x7A29627A4BDD4B4B), - SPH_C64(0x215DA321FE1FFEFE), SPH_C64(0x16D5821657F95757), - SPH_C64(0x41BDA841153F1515), SPH_C64(0xB6E89FB677997777), - SPH_C64(0xEB92A5EB37593737), SPH_C64(0x569E7B56E532E5E5), - SPH_C64(0xD9138CD99FBC9F9F), SPH_C64(0x1723D317F00DF0F0), - SPH_C64(0x7F206A7F4ADE4A4A), SPH_C64(0x95449E95DA73DADA), - SPH_C64(0x25A2FA2558E85858), SPH_C64(0xCACF06CAC946C9C9), - SPH_C64(0x8D7C558D297B2929), SPH_C64(0x225A50220A1E0A0A), - SPH_C64(0x4F50E14FB1CEB1B1), SPH_C64(0x1AC9691AA0FDA0A0), - SPH_C64(0xDA147FDA6BBD6B6B), SPH_C64(0xABD95CAB85928585), - SPH_C64(0x733C8173BDDABDBD), SPH_C64(0x348FD2345DE75D5D), - SPH_C64(0x5090805010301010), SPH_C64(0x0307F303F401F4F4), - SPH_C64(0xC0DD16C0CB40CBCB), SPH_C64(0xC6D3EDC63E423E3E), - SPH_C64(0x112D2811050F0505), SPH_C64(0xE6781FE667A96767), - SPH_C64(0x53977353E431E4E4), SPH_C64(0xBB0225BB27692727), - SPH_C64(0x5873325841C34141), SPH_C64(0x9DA72C9D8B808B8B), - SPH_C64(0x01F65101A7F4A7A7), SPH_C64(0x94B2CF947D877D7D), - 
SPH_C64(0xFB49DCFB95A29595), SPH_C64(0x9F568E9FD875D8D8), - SPH_C64(0x30708B30FB10FBFB), SPH_C64(0x71CD2371EE2FEEEE), - SPH_C64(0x91BBC7917C847C7C), SPH_C64(0xE37117E366AA6666), - SPH_C64(0x8E7BA68EDD7ADDDD), SPH_C64(0x4BAFB84B17391717), - SPH_C64(0x4645024647C94747), SPH_C64(0xDC1A84DC9EBF9E9E), - SPH_C64(0xC5D41EC5CA43CACA), SPH_C64(0x995875992D772D2D), - SPH_C64(0x792E9179BFDCBFBF), SPH_C64(0x1B3F381B07090707), - SPH_C64(0x23AC0123ADEAADAD), SPH_C64(0x2FB0EA2F5AEE5A5A), - SPH_C64(0xB5EF6CB583988383), SPH_C64(0xFFB685FF33553333), - SPH_C64(0xF25C3FF263A56363), SPH_C64(0x0A12100A02060202), - SPH_C64(0x38933938AAE3AAAA), SPH_C64(0xA8DEAFA871937171), - SPH_C64(0xCFC60ECFC845C8C8), SPH_C64(0x7DD1C87D192B1919), - SPH_C64(0x703B727049DB4949), SPH_C64(0x9A5F869AD976D9D9), - SPH_C64(0x1D31C31DF20BF2F2), SPH_C64(0x48A84B48E338E3E3), - SPH_C64(0x2AB9E22A5BED5B5B), SPH_C64(0x92BC349288858888), - SPH_C64(0xC83EA4C89AB39A9A), SPH_C64(0xBE0B2DBE266A2626), - SPH_C64(0xFABF8DFA32563232), SPH_C64(0x4A59E94AB0CDB0B0), - SPH_C64(0x6AF21B6AE926E9E9), SPH_C64(0x337778330F110F0F), - SPH_C64(0xA633E6A6D562D5D5), SPH_C64(0xBAF474BA809D8080), - SPH_C64(0x7C27997CBEDFBEBE), SPH_C64(0xDEEB26DECD4ACDCD), - SPH_C64(0xE489BDE4345C3434), SPH_C64(0x75327A7548D84848), - SPH_C64(0x2454AB24FF1CFFFF), SPH_C64(0x8F8DF78F7A8E7A7A), - SPH_C64(0xEA64F4EA90AD9090), SPH_C64(0x3E9DC23E5FE15F5F), - SPH_C64(0xA03D1DA020602020), SPH_C64(0xD50F67D568B86868), - SPH_C64(0x72CAD0721A2E1A1A), SPH_C64(0x2CB7192CAEEFAEAE), - SPH_C64(0x5E7DC95EB4C1B4B4), SPH_C64(0x19CE9A1954FC5454), - SPH_C64(0xE57FECE593A89393), SPH_C64(0xAA2F0DAA22662222), - SPH_C64(0xE96307E964AC6464), SPH_C64(0x122ADB12F10EF1F1), - SPH_C64(0xA2CCBFA273957373), SPH_C64(0x5A82905A12361212), - SPH_C64(0x5D7A3A5D40C04040), SPH_C64(0x2848402808180808), - SPH_C64(0xE89556E8C358C3C3), SPH_C64(0x7BDF337BEC29ECEC), - SPH_C64(0x904D9690DB70DBDB), SPH_C64(0x1FC0611FA1FEA1A1), - SPH_C64(0x83911C838D8A8D8D), SPH_C64(0xC9C8F5C93D473D3D), - 
SPH_C64(0xF15BCCF197A49797), SPH_C64(0x0000000000000000), - SPH_C64(0xD4F936D4CF4CCFCF), SPH_C64(0x876E45872B7D2B2B), - SPH_C64(0xB3E197B3769A7676), SPH_C64(0xB0E664B0829B8282), - SPH_C64(0xA928FEA9D667D6D6), SPH_C64(0x77C3D8771B2D1B1B), - SPH_C64(0x5B74C15BB5C2B5B5), SPH_C64(0x29BE1129AFECAFAF), - SPH_C64(0xDF1D77DF6ABE6A6A), SPH_C64(0x0DEABA0D50F05050), - SPH_C64(0x4C57124C45CF4545), SPH_C64(0x1838CB18F308F3F3), - SPH_C64(0xF0AD9DF030503030), SPH_C64(0x74C42B74EF2CEFEF), - SPH_C64(0xC3DAE5C33F413F3F), SPH_C64(0x1CC7921C55FF5555), - SPH_C64(0x10DB7910A2FBA2A2), SPH_C64(0x65E90365EA23EAEA), - SPH_C64(0xEC6A0FEC65AF6565), SPH_C64(0x6803B968BAD3BABA), - SPH_C64(0x934A65932F712F2F), SPH_C64(0xE78E4EE7C05DC0C0), - SPH_C64(0x8160BE81DE7FDEDE), SPH_C64(0x6CFCE06C1C241C1C), - SPH_C64(0x2E46BB2EFD1AFDFD), SPH_C64(0x641F52644DD74D4D), - SPH_C64(0xE076E4E092AB9292), SPH_C64(0xBCFA8FBC759F7575), - SPH_C64(0x1E36301E060A0606), SPH_C64(0x98AE24988A838A8A), - SPH_C64(0x404BF940B2CBB2B2), SPH_C64(0x59856359E637E6E6), - SPH_C64(0x367E70360E120E0E), SPH_C64(0x63E7F8631F211F1F), - SPH_C64(0xF75537F762A66262), SPH_C64(0xA33AEEA3D461D4D4), - SPH_C64(0x32812932A8E5A8A8), SPH_C64(0xF452C4F496A79696), - SPH_C64(0x3A629B3AF916F9F9), SPH_C64(0xF6A366F6C552C5C5), - SPH_C64(0xB11035B1256F2525), SPH_C64(0x20ABF22059EB5959), - SPH_C64(0xAED054AE84918484), SPH_C64(0xA7C5B7A772967272), - SPH_C64(0xDDECD5DD394B3939), SPH_C64(0x61165A614CD44C4C), - SPH_C64(0x3B94CA3B5EE25E5E), SPH_C64(0x859FE78578887878), - SPH_C64(0xD8E5DDD838483838), SPH_C64(0x869814868C898C8C), - SPH_C64(0xB217C6B2D16ED1D1), SPH_C64(0x0BE4410BA5F2A5A5), - SPH_C64(0x4DA1434DE23BE2E2), SPH_C64(0xF84E2FF861A36161), - SPH_C64(0x4542F145B3C8B3B3), SPH_C64(0xA53415A521632121), - SPH_C64(0xD60894D69CB99C9C), SPH_C64(0x66EEF0661E221E1E), - SPH_C64(0x5261225243C54343), SPH_C64(0xFCB176FCC754C7C7), - SPH_C64(0x2B4FB32BFC19FCFC), SPH_C64(0x14242014040C0404), - SPH_C64(0x08E3B20851F35151), SPH_C64(0xC725BCC799B69999), - 
SPH_C64(0xC4224FC46DB76D6D), SPH_C64(0x396568390D170D0D), - SPH_C64(0x35798335FA13FAFA), SPH_C64(0x8469B684DF7CDFDF), - SPH_C64(0x9BA9D79B7E827E7E), SPH_C64(0xB4193DB4246C2424), - SPH_C64(0xD7FEC5D73B4D3B3B), SPH_C64(0x3D9A313DABE0ABAB), - SPH_C64(0xD1F03ED1CE4FCECE), SPH_C64(0x5599885511331111), - SPH_C64(0x89830C898F8C8F8F), SPH_C64(0x6B044A6B4ED24E4E), - SPH_C64(0x5166D151B7C4B7B7), SPH_C64(0x60E00B60EB20EBEB), - SPH_C64(0xCCC1FDCC3C443C3C), SPH_C64(0xBFFD7CBF819E8181), - SPH_C64(0xFE40D4FE94A19494), SPH_C64(0x0C1CEB0CF704F7F7), - SPH_C64(0x6718A167B9D6B9B9), SPH_C64(0x5F8B985F13351313), - SPH_C64(0x9C517D9C2C742C2C), SPH_C64(0xB805D6B8D368D3D3), - SPH_C64(0x5C8C6B5CE734E7E7), SPH_C64(0xCB3957CB6EB26E6E), - SPH_C64(0xF3AA6EF3C451C4C4), SPH_C64(0x0F1B180F03050303), - SPH_C64(0x13DC8A1356FA5656), SPH_C64(0x495E1A4944CC4444), - SPH_C64(0x9EA0DF9E7F817F7F), SPH_C64(0x37882137A9E6A9A9), - SPH_C64(0x82674D822A7E2A2A), SPH_C64(0x6D0AB16DBBD0BBBB), - SPH_C64(0xE28746E2C15EC1C1), SPH_C64(0x02F1A20253F55353), - SPH_C64(0x8B72AE8BDC79DCDC), SPH_C64(0x275358270B1D0B0B), - SPH_C64(0xD3019CD39DBA9D9D), SPH_C64(0xC12B47C16CB46C6C), - SPH_C64(0xF5A495F531533131), SPH_C64(0xB9F387B9749C7474), - SPH_C64(0x0915E309F607F6F6), SPH_C64(0x434C0A4346CA4646), - SPH_C64(0x26A50926ACE9ACAC), SPH_C64(0x97B53C9789868989), - SPH_C64(0x44B4A044143C1414), SPH_C64(0x42BA5B42E13EE1E1), - SPH_C64(0x4EA6B04E163A1616), SPH_C64(0xD2F7CDD23A4E3A3A), - SPH_C64(0xD0066FD069BB6969), SPH_C64(0x2D41482D091B0909), - SPH_C64(0xADD7A7AD70907070), SPH_C64(0x546FD954B6C7B6B6), - SPH_C64(0xB71ECEB7D06DD0D0), SPH_C64(0x7ED63B7EED2AEDED), - SPH_C64(0xDBE22EDBCC49CCCC), SPH_C64(0x57682A5742C64242), - SPH_C64(0xC22CB4C298B59898), SPH_C64(0x0EED490EA4F1A4A4), - SPH_C64(0x88755D8828782828), SPH_C64(0x3186DA315CE45C5C), - SPH_C64(0x3F6B933FF815F8F8), SPH_C64(0xA4C244A486978686) -}; - - -static const uint64_t old1_T1[256] = { - SPH_C64(0xD8C0781828181878), SPH_C64(0x2605AF23652323AF), - SPH_C64(0xB87EF9C657C6C6F9), 
SPH_C64(0xFB136FE825E8E86F), - SPH_C64(0xCB4CA187948787A1), SPH_C64(0x11A962B8D5B8B862), - SPH_C64(0x0908050103010105), SPH_C64(0x0D426E4FD14F4F6E), - SPH_C64(0x9BADEE365A3636EE), SPH_C64(0xFF5904A6F7A6A604), - SPH_C64(0x0CDEBDD26BD2D2BD), SPH_C64(0x0EFB06F502F5F506), - SPH_C64(0x96EF80798B797980), SPH_C64(0x305FCE6FB16F6FCE), - SPH_C64(0x6DFCEF91AE9191EF), SPH_C64(0xF8AA0752F6525207), - SPH_C64(0x4727FD60A06060FD), SPH_C64(0x358976BCD9BCBC76), - SPH_C64(0x37ACCD9BB09B9BCD), SPH_C64(0x8A048C8E8F8E8E8C), - SPH_C64(0xD27115A3F8A3A315), SPH_C64(0x6C603C0C140C0C3C), - SPH_C64(0x84FF8A7B8D7B7B8A), SPH_C64(0x80B5E1355F3535E1), - SPH_C64(0xF5E8691D271D1D69), SPH_C64(0xB35347E03DE0E047), - SPH_C64(0x21F6ACD764D7D7AC), SPH_C64(0x9C5EEDC25BC2C2ED), - SPH_C64(0x436D962E722E2E96), SPH_C64(0x29627A4BDD4B4B7A), - SPH_C64(0x5DA321FE1FFEFE21), SPH_C64(0xD5821657F9575716), - SPH_C64(0xBDA841153F151541), SPH_C64(0xE89FB677997777B6), - SPH_C64(0x92A5EB37593737EB), SPH_C64(0x9E7B56E532E5E556), - SPH_C64(0x138CD99FBC9F9FD9), SPH_C64(0x23D317F00DF0F017), - SPH_C64(0x206A7F4ADE4A4A7F), SPH_C64(0x449E95DA73DADA95), - SPH_C64(0xA2FA2558E8585825), SPH_C64(0xCF06CAC946C9C9CA), - SPH_C64(0x7C558D297B29298D), SPH_C64(0x5A50220A1E0A0A22), - SPH_C64(0x50E14FB1CEB1B14F), SPH_C64(0xC9691AA0FDA0A01A), - SPH_C64(0x147FDA6BBD6B6BDA), SPH_C64(0xD95CAB85928585AB), - SPH_C64(0x3C8173BDDABDBD73), SPH_C64(0x8FD2345DE75D5D34), - SPH_C64(0x9080501030101050), SPH_C64(0x07F303F401F4F403), - SPH_C64(0xDD16C0CB40CBCBC0), SPH_C64(0xD3EDC63E423E3EC6), - SPH_C64(0x2D2811050F050511), SPH_C64(0x781FE667A96767E6), - SPH_C64(0x977353E431E4E453), SPH_C64(0x0225BB27692727BB), - SPH_C64(0x73325841C3414158), SPH_C64(0xA72C9D8B808B8B9D), - SPH_C64(0xF65101A7F4A7A701), SPH_C64(0xB2CF947D877D7D94), - SPH_C64(0x49DCFB95A29595FB), SPH_C64(0x568E9FD875D8D89F), - SPH_C64(0x708B30FB10FBFB30), SPH_C64(0xCD2371EE2FEEEE71), - SPH_C64(0xBBC7917C847C7C91), SPH_C64(0x7117E366AA6666E3), - SPH_C64(0x7BA68EDD7ADDDD8E), 
SPH_C64(0xAFB84B173917174B), - SPH_C64(0x45024647C9474746), SPH_C64(0x1A84DC9EBF9E9EDC), - SPH_C64(0xD41EC5CA43CACAC5), SPH_C64(0x5875992D772D2D99), - SPH_C64(0x2E9179BFDCBFBF79), SPH_C64(0x3F381B070907071B), - SPH_C64(0xAC0123ADEAADAD23), SPH_C64(0xB0EA2F5AEE5A5A2F), - SPH_C64(0xEF6CB583988383B5), SPH_C64(0xB685FF33553333FF), - SPH_C64(0x5C3FF263A56363F2), SPH_C64(0x12100A020602020A), - SPH_C64(0x933938AAE3AAAA38), SPH_C64(0xDEAFA871937171A8), - SPH_C64(0xC60ECFC845C8C8CF), SPH_C64(0xD1C87D192B19197D), - SPH_C64(0x3B727049DB494970), SPH_C64(0x5F869AD976D9D99A), - SPH_C64(0x31C31DF20BF2F21D), SPH_C64(0xA84B48E338E3E348), - SPH_C64(0xB9E22A5BED5B5B2A), SPH_C64(0xBC34928885888892), - SPH_C64(0x3EA4C89AB39A9AC8), SPH_C64(0x0B2DBE266A2626BE), - SPH_C64(0xBF8DFA32563232FA), SPH_C64(0x59E94AB0CDB0B04A), - SPH_C64(0xF21B6AE926E9E96A), SPH_C64(0x7778330F110F0F33), - SPH_C64(0x33E6A6D562D5D5A6), SPH_C64(0xF474BA809D8080BA), - SPH_C64(0x27997CBEDFBEBE7C), SPH_C64(0xEB26DECD4ACDCDDE), - SPH_C64(0x89BDE4345C3434E4), SPH_C64(0x327A7548D8484875), - SPH_C64(0x54AB24FF1CFFFF24), SPH_C64(0x8DF78F7A8E7A7A8F), - SPH_C64(0x64F4EA90AD9090EA), SPH_C64(0x9DC23E5FE15F5F3E), - SPH_C64(0x3D1DA020602020A0), SPH_C64(0x0F67D568B86868D5), - SPH_C64(0xCAD0721A2E1A1A72), SPH_C64(0xB7192CAEEFAEAE2C), - SPH_C64(0x7DC95EB4C1B4B45E), SPH_C64(0xCE9A1954FC545419), - SPH_C64(0x7FECE593A89393E5), SPH_C64(0x2F0DAA22662222AA), - SPH_C64(0x6307E964AC6464E9), SPH_C64(0x2ADB12F10EF1F112), - SPH_C64(0xCCBFA273957373A2), SPH_C64(0x82905A123612125A), - SPH_C64(0x7A3A5D40C040405D), SPH_C64(0x4840280818080828), - SPH_C64(0x9556E8C358C3C3E8), SPH_C64(0xDF337BEC29ECEC7B), - SPH_C64(0x4D9690DB70DBDB90), SPH_C64(0xC0611FA1FEA1A11F), - SPH_C64(0x911C838D8A8D8D83), SPH_C64(0xC8F5C93D473D3DC9), - SPH_C64(0x5BCCF197A49797F1), SPH_C64(0x0000000000000000), - SPH_C64(0xF936D4CF4CCFCFD4), SPH_C64(0x6E45872B7D2B2B87), - SPH_C64(0xE197B3769A7676B3), SPH_C64(0xE664B0829B8282B0), - SPH_C64(0x28FEA9D667D6D6A9), 
SPH_C64(0xC3D8771B2D1B1B77), - SPH_C64(0x74C15BB5C2B5B55B), SPH_C64(0xBE1129AFECAFAF29), - SPH_C64(0x1D77DF6ABE6A6ADF), SPH_C64(0xEABA0D50F050500D), - SPH_C64(0x57124C45CF45454C), SPH_C64(0x38CB18F308F3F318), - SPH_C64(0xAD9DF030503030F0), SPH_C64(0xC42B74EF2CEFEF74), - SPH_C64(0xDAE5C33F413F3FC3), SPH_C64(0xC7921C55FF55551C), - SPH_C64(0xDB7910A2FBA2A210), SPH_C64(0xE90365EA23EAEA65), - SPH_C64(0x6A0FEC65AF6565EC), SPH_C64(0x03B968BAD3BABA68), - SPH_C64(0x4A65932F712F2F93), SPH_C64(0x8E4EE7C05DC0C0E7), - SPH_C64(0x60BE81DE7FDEDE81), SPH_C64(0xFCE06C1C241C1C6C), - SPH_C64(0x46BB2EFD1AFDFD2E), SPH_C64(0x1F52644DD74D4D64), - SPH_C64(0x76E4E092AB9292E0), SPH_C64(0xFA8FBC759F7575BC), - SPH_C64(0x36301E060A06061E), SPH_C64(0xAE24988A838A8A98), - SPH_C64(0x4BF940B2CBB2B240), SPH_C64(0x856359E637E6E659), - SPH_C64(0x7E70360E120E0E36), SPH_C64(0xE7F8631F211F1F63), - SPH_C64(0x5537F762A66262F7), SPH_C64(0x3AEEA3D461D4D4A3), - SPH_C64(0x812932A8E5A8A832), SPH_C64(0x52C4F496A79696F4), - SPH_C64(0x629B3AF916F9F93A), SPH_C64(0xA366F6C552C5C5F6), - SPH_C64(0x1035B1256F2525B1), SPH_C64(0xABF22059EB595920), - SPH_C64(0xD054AE84918484AE), SPH_C64(0xC5B7A772967272A7), - SPH_C64(0xECD5DD394B3939DD), SPH_C64(0x165A614CD44C4C61), - SPH_C64(0x94CA3B5EE25E5E3B), SPH_C64(0x9FE7857888787885), - SPH_C64(0xE5DDD838483838D8), SPH_C64(0x9814868C898C8C86), - SPH_C64(0x17C6B2D16ED1D1B2), SPH_C64(0xE4410BA5F2A5A50B), - SPH_C64(0xA1434DE23BE2E24D), SPH_C64(0x4E2FF861A36161F8), - SPH_C64(0x42F145B3C8B3B345), SPH_C64(0x3415A521632121A5), - SPH_C64(0x0894D69CB99C9CD6), SPH_C64(0xEEF0661E221E1E66), - SPH_C64(0x61225243C5434352), SPH_C64(0xB176FCC754C7C7FC), - SPH_C64(0x4FB32BFC19FCFC2B), SPH_C64(0x242014040C040414), - SPH_C64(0xE3B20851F3515108), SPH_C64(0x25BCC799B69999C7), - SPH_C64(0x224FC46DB76D6DC4), SPH_C64(0x6568390D170D0D39), - SPH_C64(0x798335FA13FAFA35), SPH_C64(0x69B684DF7CDFDF84), - SPH_C64(0xA9D79B7E827E7E9B), SPH_C64(0x193DB4246C2424B4), - SPH_C64(0xFEC5D73B4D3B3BD7), 
SPH_C64(0x9A313DABE0ABAB3D), - SPH_C64(0xF03ED1CE4FCECED1), SPH_C64(0x9988551133111155), - SPH_C64(0x830C898F8C8F8F89), SPH_C64(0x044A6B4ED24E4E6B), - SPH_C64(0x66D151B7C4B7B751), SPH_C64(0xE00B60EB20EBEB60), - SPH_C64(0xC1FDCC3C443C3CCC), SPH_C64(0xFD7CBF819E8181BF), - SPH_C64(0x40D4FE94A19494FE), SPH_C64(0x1CEB0CF704F7F70C), - SPH_C64(0x18A167B9D6B9B967), SPH_C64(0x8B985F133513135F), - SPH_C64(0x517D9C2C742C2C9C), SPH_C64(0x05D6B8D368D3D3B8), - SPH_C64(0x8C6B5CE734E7E75C), SPH_C64(0x3957CB6EB26E6ECB), - SPH_C64(0xAA6EF3C451C4C4F3), SPH_C64(0x1B180F030503030F), - SPH_C64(0xDC8A1356FA565613), SPH_C64(0x5E1A4944CC444449), - SPH_C64(0xA0DF9E7F817F7F9E), SPH_C64(0x882137A9E6A9A937), - SPH_C64(0x674D822A7E2A2A82), SPH_C64(0x0AB16DBBD0BBBB6D), - SPH_C64(0x8746E2C15EC1C1E2), SPH_C64(0xF1A20253F5535302), - SPH_C64(0x72AE8BDC79DCDC8B), SPH_C64(0x5358270B1D0B0B27), - SPH_C64(0x019CD39DBA9D9DD3), SPH_C64(0x2B47C16CB46C6CC1), - SPH_C64(0xA495F531533131F5), SPH_C64(0xF387B9749C7474B9), - SPH_C64(0x15E309F607F6F609), SPH_C64(0x4C0A4346CA464643), - SPH_C64(0xA50926ACE9ACAC26), SPH_C64(0xB53C978986898997), - SPH_C64(0xB4A044143C141444), SPH_C64(0xBA5B42E13EE1E142), - SPH_C64(0xA6B04E163A16164E), SPH_C64(0xF7CDD23A4E3A3AD2), - SPH_C64(0x066FD069BB6969D0), SPH_C64(0x41482D091B09092D), - SPH_C64(0xD7A7AD70907070AD), SPH_C64(0x6FD954B6C7B6B654), - SPH_C64(0x1ECEB7D06DD0D0B7), SPH_C64(0xD63B7EED2AEDED7E), - SPH_C64(0xE22EDBCC49CCCCDB), SPH_C64(0x682A5742C6424257), - SPH_C64(0x2CB4C298B59898C2), SPH_C64(0xED490EA4F1A4A40E), - SPH_C64(0x755D882878282888), SPH_C64(0x86DA315CE45C5C31), - SPH_C64(0x6B933FF815F8F83F), SPH_C64(0xC244A486978686A4) -}; - -static const uint64_t old1_T2[256] = { - SPH_C64(0xC0781828181878D8), SPH_C64(0x05AF23652323AF26), - SPH_C64(0x7EF9C657C6C6F9B8), SPH_C64(0x136FE825E8E86FFB), - SPH_C64(0x4CA187948787A1CB), SPH_C64(0xA962B8D5B8B86211), - SPH_C64(0x0805010301010509), SPH_C64(0x426E4FD14F4F6E0D), - SPH_C64(0xADEE365A3636EE9B), SPH_C64(0x5904A6F7A6A604FF), - 
SPH_C64(0xDEBDD26BD2D2BD0C), SPH_C64(0xFB06F502F5F5060E), - SPH_C64(0xEF80798B79798096), SPH_C64(0x5FCE6FB16F6FCE30), - SPH_C64(0xFCEF91AE9191EF6D), SPH_C64(0xAA0752F6525207F8), - SPH_C64(0x27FD60A06060FD47), SPH_C64(0x8976BCD9BCBC7635), - SPH_C64(0xACCD9BB09B9BCD37), SPH_C64(0x048C8E8F8E8E8C8A), - SPH_C64(0x7115A3F8A3A315D2), SPH_C64(0x603C0C140C0C3C6C), - SPH_C64(0xFF8A7B8D7B7B8A84), SPH_C64(0xB5E1355F3535E180), - SPH_C64(0xE8691D271D1D69F5), SPH_C64(0x5347E03DE0E047B3), - SPH_C64(0xF6ACD764D7D7AC21), SPH_C64(0x5EEDC25BC2C2ED9C), - SPH_C64(0x6D962E722E2E9643), SPH_C64(0x627A4BDD4B4B7A29), - SPH_C64(0xA321FE1FFEFE215D), SPH_C64(0x821657F9575716D5), - SPH_C64(0xA841153F151541BD), SPH_C64(0x9FB677997777B6E8), - SPH_C64(0xA5EB37593737EB92), SPH_C64(0x7B56E532E5E5569E), - SPH_C64(0x8CD99FBC9F9FD913), SPH_C64(0xD317F00DF0F01723), - SPH_C64(0x6A7F4ADE4A4A7F20), SPH_C64(0x9E95DA73DADA9544), - SPH_C64(0xFA2558E8585825A2), SPH_C64(0x06CAC946C9C9CACF), - SPH_C64(0x558D297B29298D7C), SPH_C64(0x50220A1E0A0A225A), - SPH_C64(0xE14FB1CEB1B14F50), SPH_C64(0x691AA0FDA0A01AC9), - SPH_C64(0x7FDA6BBD6B6BDA14), SPH_C64(0x5CAB85928585ABD9), - SPH_C64(0x8173BDDABDBD733C), SPH_C64(0xD2345DE75D5D348F), - SPH_C64(0x8050103010105090), SPH_C64(0xF303F401F4F40307), - SPH_C64(0x16C0CB40CBCBC0DD), SPH_C64(0xEDC63E423E3EC6D3), - SPH_C64(0x2811050F0505112D), SPH_C64(0x1FE667A96767E678), - SPH_C64(0x7353E431E4E45397), SPH_C64(0x25BB27692727BB02), - SPH_C64(0x325841C341415873), SPH_C64(0x2C9D8B808B8B9DA7), - SPH_C64(0x5101A7F4A7A701F6), SPH_C64(0xCF947D877D7D94B2), - SPH_C64(0xDCFB95A29595FB49), SPH_C64(0x8E9FD875D8D89F56), - SPH_C64(0x8B30FB10FBFB3070), SPH_C64(0x2371EE2FEEEE71CD), - SPH_C64(0xC7917C847C7C91BB), SPH_C64(0x17E366AA6666E371), - SPH_C64(0xA68EDD7ADDDD8E7B), SPH_C64(0xB84B173917174BAF), - SPH_C64(0x024647C947474645), SPH_C64(0x84DC9EBF9E9EDC1A), - SPH_C64(0x1EC5CA43CACAC5D4), SPH_C64(0x75992D772D2D9958), - SPH_C64(0x9179BFDCBFBF792E), SPH_C64(0x381B070907071B3F), - 
SPH_C64(0x0123ADEAADAD23AC), SPH_C64(0xEA2F5AEE5A5A2FB0), - SPH_C64(0x6CB583988383B5EF), SPH_C64(0x85FF33553333FFB6), - SPH_C64(0x3FF263A56363F25C), SPH_C64(0x100A020602020A12), - SPH_C64(0x3938AAE3AAAA3893), SPH_C64(0xAFA871937171A8DE), - SPH_C64(0x0ECFC845C8C8CFC6), SPH_C64(0xC87D192B19197DD1), - SPH_C64(0x727049DB4949703B), SPH_C64(0x869AD976D9D99A5F), - SPH_C64(0xC31DF20BF2F21D31), SPH_C64(0x4B48E338E3E348A8), - SPH_C64(0xE22A5BED5B5B2AB9), SPH_C64(0x34928885888892BC), - SPH_C64(0xA4C89AB39A9AC83E), SPH_C64(0x2DBE266A2626BE0B), - SPH_C64(0x8DFA32563232FABF), SPH_C64(0xE94AB0CDB0B04A59), - SPH_C64(0x1B6AE926E9E96AF2), SPH_C64(0x78330F110F0F3377), - SPH_C64(0xE6A6D562D5D5A633), SPH_C64(0x74BA809D8080BAF4), - SPH_C64(0x997CBEDFBEBE7C27), SPH_C64(0x26DECD4ACDCDDEEB), - SPH_C64(0xBDE4345C3434E489), SPH_C64(0x7A7548D848487532), - SPH_C64(0xAB24FF1CFFFF2454), SPH_C64(0xF78F7A8E7A7A8F8D), - SPH_C64(0xF4EA90AD9090EA64), SPH_C64(0xC23E5FE15F5F3E9D), - SPH_C64(0x1DA020602020A03D), SPH_C64(0x67D568B86868D50F), - SPH_C64(0xD0721A2E1A1A72CA), SPH_C64(0x192CAEEFAEAE2CB7), - SPH_C64(0xC95EB4C1B4B45E7D), SPH_C64(0x9A1954FC545419CE), - SPH_C64(0xECE593A89393E57F), SPH_C64(0x0DAA22662222AA2F), - SPH_C64(0x07E964AC6464E963), SPH_C64(0xDB12F10EF1F1122A), - SPH_C64(0xBFA273957373A2CC), SPH_C64(0x905A123612125A82), - SPH_C64(0x3A5D40C040405D7A), SPH_C64(0x4028081808082848), - SPH_C64(0x56E8C358C3C3E895), SPH_C64(0x337BEC29ECEC7BDF), - SPH_C64(0x9690DB70DBDB904D), SPH_C64(0x611FA1FEA1A11FC0), - SPH_C64(0x1C838D8A8D8D8391), SPH_C64(0xF5C93D473D3DC9C8), - SPH_C64(0xCCF197A49797F15B), SPH_C64(0x0000000000000000), - SPH_C64(0x36D4CF4CCFCFD4F9), SPH_C64(0x45872B7D2B2B876E), - SPH_C64(0x97B3769A7676B3E1), SPH_C64(0x64B0829B8282B0E6), - SPH_C64(0xFEA9D667D6D6A928), SPH_C64(0xD8771B2D1B1B77C3), - SPH_C64(0xC15BB5C2B5B55B74), SPH_C64(0x1129AFECAFAF29BE), - SPH_C64(0x77DF6ABE6A6ADF1D), SPH_C64(0xBA0D50F050500DEA), - SPH_C64(0x124C45CF45454C57), SPH_C64(0xCB18F308F3F31838), - 
SPH_C64(0x9DF030503030F0AD), SPH_C64(0x2B74EF2CEFEF74C4), - SPH_C64(0xE5C33F413F3FC3DA), SPH_C64(0x921C55FF55551CC7), - SPH_C64(0x7910A2FBA2A210DB), SPH_C64(0x0365EA23EAEA65E9), - SPH_C64(0x0FEC65AF6565EC6A), SPH_C64(0xB968BAD3BABA6803), - SPH_C64(0x65932F712F2F934A), SPH_C64(0x4EE7C05DC0C0E78E), - SPH_C64(0xBE81DE7FDEDE8160), SPH_C64(0xE06C1C241C1C6CFC), - SPH_C64(0xBB2EFD1AFDFD2E46), SPH_C64(0x52644DD74D4D641F), - SPH_C64(0xE4E092AB9292E076), SPH_C64(0x8FBC759F7575BCFA), - SPH_C64(0x301E060A06061E36), SPH_C64(0x24988A838A8A98AE), - SPH_C64(0xF940B2CBB2B2404B), SPH_C64(0x6359E637E6E65985), - SPH_C64(0x70360E120E0E367E), SPH_C64(0xF8631F211F1F63E7), - SPH_C64(0x37F762A66262F755), SPH_C64(0xEEA3D461D4D4A33A), - SPH_C64(0x2932A8E5A8A83281), SPH_C64(0xC4F496A79696F452), - SPH_C64(0x9B3AF916F9F93A62), SPH_C64(0x66F6C552C5C5F6A3), - SPH_C64(0x35B1256F2525B110), SPH_C64(0xF22059EB595920AB), - SPH_C64(0x54AE84918484AED0), SPH_C64(0xB7A772967272A7C5), - SPH_C64(0xD5DD394B3939DDEC), SPH_C64(0x5A614CD44C4C6116), - SPH_C64(0xCA3B5EE25E5E3B94), SPH_C64(0xE78578887878859F), - SPH_C64(0xDDD838483838D8E5), SPH_C64(0x14868C898C8C8698), - SPH_C64(0xC6B2D16ED1D1B217), SPH_C64(0x410BA5F2A5A50BE4), - SPH_C64(0x434DE23BE2E24DA1), SPH_C64(0x2FF861A36161F84E), - SPH_C64(0xF145B3C8B3B34542), SPH_C64(0x15A521632121A534), - SPH_C64(0x94D69CB99C9CD608), SPH_C64(0xF0661E221E1E66EE), - SPH_C64(0x225243C543435261), SPH_C64(0x76FCC754C7C7FCB1), - SPH_C64(0xB32BFC19FCFC2B4F), SPH_C64(0x2014040C04041424), - SPH_C64(0xB20851F3515108E3), SPH_C64(0xBCC799B69999C725), - SPH_C64(0x4FC46DB76D6DC422), SPH_C64(0x68390D170D0D3965), - SPH_C64(0x8335FA13FAFA3579), SPH_C64(0xB684DF7CDFDF8469), - SPH_C64(0xD79B7E827E7E9BA9), SPH_C64(0x3DB4246C2424B419), - SPH_C64(0xC5D73B4D3B3BD7FE), SPH_C64(0x313DABE0ABAB3D9A), - SPH_C64(0x3ED1CE4FCECED1F0), SPH_C64(0x8855113311115599), - SPH_C64(0x0C898F8C8F8F8983), SPH_C64(0x4A6B4ED24E4E6B04), - SPH_C64(0xD151B7C4B7B75166), SPH_C64(0x0B60EB20EBEB60E0), - 
SPH_C64(0xFDCC3C443C3CCCC1), SPH_C64(0x7CBF819E8181BFFD), - SPH_C64(0xD4FE94A19494FE40), SPH_C64(0xEB0CF704F7F70C1C), - SPH_C64(0xA167B9D6B9B96718), SPH_C64(0x985F133513135F8B), - SPH_C64(0x7D9C2C742C2C9C51), SPH_C64(0xD6B8D368D3D3B805), - SPH_C64(0x6B5CE734E7E75C8C), SPH_C64(0x57CB6EB26E6ECB39), - SPH_C64(0x6EF3C451C4C4F3AA), SPH_C64(0x180F030503030F1B), - SPH_C64(0x8A1356FA565613DC), SPH_C64(0x1A4944CC4444495E), - SPH_C64(0xDF9E7F817F7F9EA0), SPH_C64(0x2137A9E6A9A93788), - SPH_C64(0x4D822A7E2A2A8267), SPH_C64(0xB16DBBD0BBBB6D0A), - SPH_C64(0x46E2C15EC1C1E287), SPH_C64(0xA20253F5535302F1), - SPH_C64(0xAE8BDC79DCDC8B72), SPH_C64(0x58270B1D0B0B2753), - SPH_C64(0x9CD39DBA9D9DD301), SPH_C64(0x47C16CB46C6CC12B), - SPH_C64(0x95F531533131F5A4), SPH_C64(0x87B9749C7474B9F3), - SPH_C64(0xE309F607F6F60915), SPH_C64(0x0A4346CA4646434C), - SPH_C64(0x0926ACE9ACAC26A5), SPH_C64(0x3C978986898997B5), - SPH_C64(0xA044143C141444B4), SPH_C64(0x5B42E13EE1E142BA), - SPH_C64(0xB04E163A16164EA6), SPH_C64(0xCDD23A4E3A3AD2F7), - SPH_C64(0x6FD069BB6969D006), SPH_C64(0x482D091B09092D41), - SPH_C64(0xA7AD70907070ADD7), SPH_C64(0xD954B6C7B6B6546F), - SPH_C64(0xCEB7D06DD0D0B71E), SPH_C64(0x3B7EED2AEDED7ED6), - SPH_C64(0x2EDBCC49CCCCDBE2), SPH_C64(0x2A5742C642425768), - SPH_C64(0xB4C298B59898C22C), SPH_C64(0x490EA4F1A4A40EED), - SPH_C64(0x5D88287828288875), SPH_C64(0xDA315CE45C5C3186), - SPH_C64(0x933FF815F8F83F6B), SPH_C64(0x44A486978686A4C2) -}; - -static const uint64_t old1_T3[256] = { - SPH_C64(0x781828181878D8C0), SPH_C64(0xAF23652323AF2605), - SPH_C64(0xF9C657C6C6F9B87E), SPH_C64(0x6FE825E8E86FFB13), - SPH_C64(0xA187948787A1CB4C), SPH_C64(0x62B8D5B8B86211A9), - SPH_C64(0x0501030101050908), SPH_C64(0x6E4FD14F4F6E0D42), - SPH_C64(0xEE365A3636EE9BAD), SPH_C64(0x04A6F7A6A604FF59), - SPH_C64(0xBDD26BD2D2BD0CDE), SPH_C64(0x06F502F5F5060EFB), - SPH_C64(0x80798B79798096EF), SPH_C64(0xCE6FB16F6FCE305F), - SPH_C64(0xEF91AE9191EF6DFC), SPH_C64(0x0752F6525207F8AA), - SPH_C64(0xFD60A06060FD4727), 
SPH_C64(0x76BCD9BCBC763589), - SPH_C64(0xCD9BB09B9BCD37AC), SPH_C64(0x8C8E8F8E8E8C8A04), - SPH_C64(0x15A3F8A3A315D271), SPH_C64(0x3C0C140C0C3C6C60), - SPH_C64(0x8A7B8D7B7B8A84FF), SPH_C64(0xE1355F3535E180B5), - SPH_C64(0x691D271D1D69F5E8), SPH_C64(0x47E03DE0E047B353), - SPH_C64(0xACD764D7D7AC21F6), SPH_C64(0xEDC25BC2C2ED9C5E), - SPH_C64(0x962E722E2E96436D), SPH_C64(0x7A4BDD4B4B7A2962), - SPH_C64(0x21FE1FFEFE215DA3), SPH_C64(0x1657F9575716D582), - SPH_C64(0x41153F151541BDA8), SPH_C64(0xB677997777B6E89F), - SPH_C64(0xEB37593737EB92A5), SPH_C64(0x56E532E5E5569E7B), - SPH_C64(0xD99FBC9F9FD9138C), SPH_C64(0x17F00DF0F01723D3), - SPH_C64(0x7F4ADE4A4A7F206A), SPH_C64(0x95DA73DADA95449E), - SPH_C64(0x2558E8585825A2FA), SPH_C64(0xCAC946C9C9CACF06), - SPH_C64(0x8D297B29298D7C55), SPH_C64(0x220A1E0A0A225A50), - SPH_C64(0x4FB1CEB1B14F50E1), SPH_C64(0x1AA0FDA0A01AC969), - SPH_C64(0xDA6BBD6B6BDA147F), SPH_C64(0xAB85928585ABD95C), - SPH_C64(0x73BDDABDBD733C81), SPH_C64(0x345DE75D5D348FD2), - SPH_C64(0x5010301010509080), SPH_C64(0x03F401F4F40307F3), - SPH_C64(0xC0CB40CBCBC0DD16), SPH_C64(0xC63E423E3EC6D3ED), - SPH_C64(0x11050F0505112D28), SPH_C64(0xE667A96767E6781F), - SPH_C64(0x53E431E4E4539773), SPH_C64(0xBB27692727BB0225), - SPH_C64(0x5841C34141587332), SPH_C64(0x9D8B808B8B9DA72C), - SPH_C64(0x01A7F4A7A701F651), SPH_C64(0x947D877D7D94B2CF), - SPH_C64(0xFB95A29595FB49DC), SPH_C64(0x9FD875D8D89F568E), - SPH_C64(0x30FB10FBFB30708B), SPH_C64(0x71EE2FEEEE71CD23), - SPH_C64(0x917C847C7C91BBC7), SPH_C64(0xE366AA6666E37117), - SPH_C64(0x8EDD7ADDDD8E7BA6), SPH_C64(0x4B173917174BAFB8), - SPH_C64(0x4647C94747464502), SPH_C64(0xDC9EBF9E9EDC1A84), - SPH_C64(0xC5CA43CACAC5D41E), SPH_C64(0x992D772D2D995875), - SPH_C64(0x79BFDCBFBF792E91), SPH_C64(0x1B070907071B3F38), - SPH_C64(0x23ADEAADAD23AC01), SPH_C64(0x2F5AEE5A5A2FB0EA), - SPH_C64(0xB583988383B5EF6C), SPH_C64(0xFF33553333FFB685), - SPH_C64(0xF263A56363F25C3F), SPH_C64(0x0A020602020A1210), - SPH_C64(0x38AAE3AAAA389339), 
SPH_C64(0xA871937171A8DEAF), - SPH_C64(0xCFC845C8C8CFC60E), SPH_C64(0x7D192B19197DD1C8), - SPH_C64(0x7049DB4949703B72), SPH_C64(0x9AD976D9D99A5F86), - SPH_C64(0x1DF20BF2F21D31C3), SPH_C64(0x48E338E3E348A84B), - SPH_C64(0x2A5BED5B5B2AB9E2), SPH_C64(0x928885888892BC34), - SPH_C64(0xC89AB39A9AC83EA4), SPH_C64(0xBE266A2626BE0B2D), - SPH_C64(0xFA32563232FABF8D), SPH_C64(0x4AB0CDB0B04A59E9), - SPH_C64(0x6AE926E9E96AF21B), SPH_C64(0x330F110F0F337778), - SPH_C64(0xA6D562D5D5A633E6), SPH_C64(0xBA809D8080BAF474), - SPH_C64(0x7CBEDFBEBE7C2799), SPH_C64(0xDECD4ACDCDDEEB26), - SPH_C64(0xE4345C3434E489BD), SPH_C64(0x7548D8484875327A), - SPH_C64(0x24FF1CFFFF2454AB), SPH_C64(0x8F7A8E7A7A8F8DF7), - SPH_C64(0xEA90AD9090EA64F4), SPH_C64(0x3E5FE15F5F3E9DC2), - SPH_C64(0xA020602020A03D1D), SPH_C64(0xD568B86868D50F67), - SPH_C64(0x721A2E1A1A72CAD0), SPH_C64(0x2CAEEFAEAE2CB719), - SPH_C64(0x5EB4C1B4B45E7DC9), SPH_C64(0x1954FC545419CE9A), - SPH_C64(0xE593A89393E57FEC), SPH_C64(0xAA22662222AA2F0D), - SPH_C64(0xE964AC6464E96307), SPH_C64(0x12F10EF1F1122ADB), - SPH_C64(0xA273957373A2CCBF), SPH_C64(0x5A123612125A8290), - SPH_C64(0x5D40C040405D7A3A), SPH_C64(0x2808180808284840), - SPH_C64(0xE8C358C3C3E89556), SPH_C64(0x7BEC29ECEC7BDF33), - SPH_C64(0x90DB70DBDB904D96), SPH_C64(0x1FA1FEA1A11FC061), - SPH_C64(0x838D8A8D8D83911C), SPH_C64(0xC93D473D3DC9C8F5), - SPH_C64(0xF197A49797F15BCC), SPH_C64(0x0000000000000000), - SPH_C64(0xD4CF4CCFCFD4F936), SPH_C64(0x872B7D2B2B876E45), - SPH_C64(0xB3769A7676B3E197), SPH_C64(0xB0829B8282B0E664), - SPH_C64(0xA9D667D6D6A928FE), SPH_C64(0x771B2D1B1B77C3D8), - SPH_C64(0x5BB5C2B5B55B74C1), SPH_C64(0x29AFECAFAF29BE11), - SPH_C64(0xDF6ABE6A6ADF1D77), SPH_C64(0x0D50F050500DEABA), - SPH_C64(0x4C45CF45454C5712), SPH_C64(0x18F308F3F31838CB), - SPH_C64(0xF030503030F0AD9D), SPH_C64(0x74EF2CEFEF74C42B), - SPH_C64(0xC33F413F3FC3DAE5), SPH_C64(0x1C55FF55551CC792), - SPH_C64(0x10A2FBA2A210DB79), SPH_C64(0x65EA23EAEA65E903), - SPH_C64(0xEC65AF6565EC6A0F), 
SPH_C64(0x68BAD3BABA6803B9), - SPH_C64(0x932F712F2F934A65), SPH_C64(0xE7C05DC0C0E78E4E), - SPH_C64(0x81DE7FDEDE8160BE), SPH_C64(0x6C1C241C1C6CFCE0), - SPH_C64(0x2EFD1AFDFD2E46BB), SPH_C64(0x644DD74D4D641F52), - SPH_C64(0xE092AB9292E076E4), SPH_C64(0xBC759F7575BCFA8F), - SPH_C64(0x1E060A06061E3630), SPH_C64(0x988A838A8A98AE24), - SPH_C64(0x40B2CBB2B2404BF9), SPH_C64(0x59E637E6E6598563), - SPH_C64(0x360E120E0E367E70), SPH_C64(0x631F211F1F63E7F8), - SPH_C64(0xF762A66262F75537), SPH_C64(0xA3D461D4D4A33AEE), - SPH_C64(0x32A8E5A8A8328129), SPH_C64(0xF496A79696F452C4), - SPH_C64(0x3AF916F9F93A629B), SPH_C64(0xF6C552C5C5F6A366), - SPH_C64(0xB1256F2525B11035), SPH_C64(0x2059EB595920ABF2), - SPH_C64(0xAE84918484AED054), SPH_C64(0xA772967272A7C5B7), - SPH_C64(0xDD394B3939DDECD5), SPH_C64(0x614CD44C4C61165A), - SPH_C64(0x3B5EE25E5E3B94CA), SPH_C64(0x8578887878859FE7), - SPH_C64(0xD838483838D8E5DD), SPH_C64(0x868C898C8C869814), - SPH_C64(0xB2D16ED1D1B217C6), SPH_C64(0x0BA5F2A5A50BE441), - SPH_C64(0x4DE23BE2E24DA143), SPH_C64(0xF861A36161F84E2F), - SPH_C64(0x45B3C8B3B34542F1), SPH_C64(0xA521632121A53415), - SPH_C64(0xD69CB99C9CD60894), SPH_C64(0x661E221E1E66EEF0), - SPH_C64(0x5243C54343526122), SPH_C64(0xFCC754C7C7FCB176), - SPH_C64(0x2BFC19FCFC2B4FB3), SPH_C64(0x14040C0404142420), - SPH_C64(0x0851F3515108E3B2), SPH_C64(0xC799B69999C725BC), - SPH_C64(0xC46DB76D6DC4224F), SPH_C64(0x390D170D0D396568), - SPH_C64(0x35FA13FAFA357983), SPH_C64(0x84DF7CDFDF8469B6), - SPH_C64(0x9B7E827E7E9BA9D7), SPH_C64(0xB4246C2424B4193D), - SPH_C64(0xD73B4D3B3BD7FEC5), SPH_C64(0x3DABE0ABAB3D9A31), - SPH_C64(0xD1CE4FCECED1F03E), SPH_C64(0x5511331111559988), - SPH_C64(0x898F8C8F8F89830C), SPH_C64(0x6B4ED24E4E6B044A), - SPH_C64(0x51B7C4B7B75166D1), SPH_C64(0x60EB20EBEB60E00B), - SPH_C64(0xCC3C443C3CCCC1FD), SPH_C64(0xBF819E8181BFFD7C), - SPH_C64(0xFE94A19494FE40D4), SPH_C64(0x0CF704F7F70C1CEB), - SPH_C64(0x67B9D6B9B96718A1), SPH_C64(0x5F133513135F8B98), - SPH_C64(0x9C2C742C2C9C517D), 
SPH_C64(0xB8D368D3D3B805D6), - SPH_C64(0x5CE734E7E75C8C6B), SPH_C64(0xCB6EB26E6ECB3957), - SPH_C64(0xF3C451C4C4F3AA6E), SPH_C64(0x0F030503030F1B18), - SPH_C64(0x1356FA565613DC8A), SPH_C64(0x4944CC4444495E1A), - SPH_C64(0x9E7F817F7F9EA0DF), SPH_C64(0x37A9E6A9A9378821), - SPH_C64(0x822A7E2A2A82674D), SPH_C64(0x6DBBD0BBBB6D0AB1), - SPH_C64(0xE2C15EC1C1E28746), SPH_C64(0x0253F5535302F1A2), - SPH_C64(0x8BDC79DCDC8B72AE), SPH_C64(0x270B1D0B0B275358), - SPH_C64(0xD39DBA9D9DD3019C), SPH_C64(0xC16CB46C6CC12B47), - SPH_C64(0xF531533131F5A495), SPH_C64(0xB9749C7474B9F387), - SPH_C64(0x09F607F6F60915E3), SPH_C64(0x4346CA4646434C0A), - SPH_C64(0x26ACE9ACAC26A509), SPH_C64(0x978986898997B53C), - SPH_C64(0x44143C141444B4A0), SPH_C64(0x42E13EE1E142BA5B), - SPH_C64(0x4E163A16164EA6B0), SPH_C64(0xD23A4E3A3AD2F7CD), - SPH_C64(0xD069BB6969D0066F), SPH_C64(0x2D091B09092D4148), - SPH_C64(0xAD70907070ADD7A7), SPH_C64(0x54B6C7B6B6546FD9), - SPH_C64(0xB7D06DD0D0B71ECE), SPH_C64(0x7EED2AEDED7ED63B), - SPH_C64(0xDBCC49CCCCDBE22E), SPH_C64(0x5742C6424257682A), - SPH_C64(0xC298B59898C22CB4), SPH_C64(0x0EA4F1A4A40EED49), - SPH_C64(0x882878282888755D), SPH_C64(0x315CE45C5C3186DA), - SPH_C64(0x3FF815F8F83F6B93), SPH_C64(0xA486978686A4C244) -}; - -static const uint64_t old1_T4[256] = { - SPH_C64(0x1828181878D8C078), SPH_C64(0x23652323AF2605AF), - SPH_C64(0xC657C6C6F9B87EF9), SPH_C64(0xE825E8E86FFB136F), - SPH_C64(0x87948787A1CB4CA1), SPH_C64(0xB8D5B8B86211A962), - SPH_C64(0x0103010105090805), SPH_C64(0x4FD14F4F6E0D426E), - SPH_C64(0x365A3636EE9BADEE), SPH_C64(0xA6F7A6A604FF5904), - SPH_C64(0xD26BD2D2BD0CDEBD), SPH_C64(0xF502F5F5060EFB06), - SPH_C64(0x798B79798096EF80), SPH_C64(0x6FB16F6FCE305FCE), - SPH_C64(0x91AE9191EF6DFCEF), SPH_C64(0x52F6525207F8AA07), - SPH_C64(0x60A06060FD4727FD), SPH_C64(0xBCD9BCBC76358976), - SPH_C64(0x9BB09B9BCD37ACCD), SPH_C64(0x8E8F8E8E8C8A048C), - SPH_C64(0xA3F8A3A315D27115), SPH_C64(0x0C140C0C3C6C603C), - SPH_C64(0x7B8D7B7B8A84FF8A), SPH_C64(0x355F3535E180B5E1), - 
SPH_C64(0x1D271D1D69F5E869), SPH_C64(0xE03DE0E047B35347), - SPH_C64(0xD764D7D7AC21F6AC), SPH_C64(0xC25BC2C2ED9C5EED), - SPH_C64(0x2E722E2E96436D96), SPH_C64(0x4BDD4B4B7A29627A), - SPH_C64(0xFE1FFEFE215DA321), SPH_C64(0x57F9575716D58216), - SPH_C64(0x153F151541BDA841), SPH_C64(0x77997777B6E89FB6), - SPH_C64(0x37593737EB92A5EB), SPH_C64(0xE532E5E5569E7B56), - SPH_C64(0x9FBC9F9FD9138CD9), SPH_C64(0xF00DF0F01723D317), - SPH_C64(0x4ADE4A4A7F206A7F), SPH_C64(0xDA73DADA95449E95), - SPH_C64(0x58E8585825A2FA25), SPH_C64(0xC946C9C9CACF06CA), - SPH_C64(0x297B29298D7C558D), SPH_C64(0x0A1E0A0A225A5022), - SPH_C64(0xB1CEB1B14F50E14F), SPH_C64(0xA0FDA0A01AC9691A), - SPH_C64(0x6BBD6B6BDA147FDA), SPH_C64(0x85928585ABD95CAB), - SPH_C64(0xBDDABDBD733C8173), SPH_C64(0x5DE75D5D348FD234), - SPH_C64(0x1030101050908050), SPH_C64(0xF401F4F40307F303), - SPH_C64(0xCB40CBCBC0DD16C0), SPH_C64(0x3E423E3EC6D3EDC6), - SPH_C64(0x050F0505112D2811), SPH_C64(0x67A96767E6781FE6), - SPH_C64(0xE431E4E453977353), SPH_C64(0x27692727BB0225BB), - SPH_C64(0x41C3414158733258), SPH_C64(0x8B808B8B9DA72C9D), - SPH_C64(0xA7F4A7A701F65101), SPH_C64(0x7D877D7D94B2CF94), - SPH_C64(0x95A29595FB49DCFB), SPH_C64(0xD875D8D89F568E9F), - SPH_C64(0xFB10FBFB30708B30), SPH_C64(0xEE2FEEEE71CD2371), - SPH_C64(0x7C847C7C91BBC791), SPH_C64(0x66AA6666E37117E3), - SPH_C64(0xDD7ADDDD8E7BA68E), SPH_C64(0x173917174BAFB84B), - SPH_C64(0x47C9474746450246), SPH_C64(0x9EBF9E9EDC1A84DC), - SPH_C64(0xCA43CACAC5D41EC5), SPH_C64(0x2D772D2D99587599), - SPH_C64(0xBFDCBFBF792E9179), SPH_C64(0x070907071B3F381B), - SPH_C64(0xADEAADAD23AC0123), SPH_C64(0x5AEE5A5A2FB0EA2F), - SPH_C64(0x83988383B5EF6CB5), SPH_C64(0x33553333FFB685FF), - SPH_C64(0x63A56363F25C3FF2), SPH_C64(0x020602020A12100A), - SPH_C64(0xAAE3AAAA38933938), SPH_C64(0x71937171A8DEAFA8), - SPH_C64(0xC845C8C8CFC60ECF), SPH_C64(0x192B19197DD1C87D), - SPH_C64(0x49DB4949703B7270), SPH_C64(0xD976D9D99A5F869A), - SPH_C64(0xF20BF2F21D31C31D), SPH_C64(0xE338E3E348A84B48), - 
SPH_C64(0x5BED5B5B2AB9E22A), SPH_C64(0x8885888892BC3492), - SPH_C64(0x9AB39A9AC83EA4C8), SPH_C64(0x266A2626BE0B2DBE), - SPH_C64(0x32563232FABF8DFA), SPH_C64(0xB0CDB0B04A59E94A), - SPH_C64(0xE926E9E96AF21B6A), SPH_C64(0x0F110F0F33777833), - SPH_C64(0xD562D5D5A633E6A6), SPH_C64(0x809D8080BAF474BA), - SPH_C64(0xBEDFBEBE7C27997C), SPH_C64(0xCD4ACDCDDEEB26DE), - SPH_C64(0x345C3434E489BDE4), SPH_C64(0x48D8484875327A75), - SPH_C64(0xFF1CFFFF2454AB24), SPH_C64(0x7A8E7A7A8F8DF78F), - SPH_C64(0x90AD9090EA64F4EA), SPH_C64(0x5FE15F5F3E9DC23E), - SPH_C64(0x20602020A03D1DA0), SPH_C64(0x68B86868D50F67D5), - SPH_C64(0x1A2E1A1A72CAD072), SPH_C64(0xAEEFAEAE2CB7192C), - SPH_C64(0xB4C1B4B45E7DC95E), SPH_C64(0x54FC545419CE9A19), - SPH_C64(0x93A89393E57FECE5), SPH_C64(0x22662222AA2F0DAA), - SPH_C64(0x64AC6464E96307E9), SPH_C64(0xF10EF1F1122ADB12), - SPH_C64(0x73957373A2CCBFA2), SPH_C64(0x123612125A82905A), - SPH_C64(0x40C040405D7A3A5D), SPH_C64(0x0818080828484028), - SPH_C64(0xC358C3C3E89556E8), SPH_C64(0xEC29ECEC7BDF337B), - SPH_C64(0xDB70DBDB904D9690), SPH_C64(0xA1FEA1A11FC0611F), - SPH_C64(0x8D8A8D8D83911C83), SPH_C64(0x3D473D3DC9C8F5C9), - SPH_C64(0x97A49797F15BCCF1), SPH_C64(0x0000000000000000), - SPH_C64(0xCF4CCFCFD4F936D4), SPH_C64(0x2B7D2B2B876E4587), - SPH_C64(0x769A7676B3E197B3), SPH_C64(0x829B8282B0E664B0), - SPH_C64(0xD667D6D6A928FEA9), SPH_C64(0x1B2D1B1B77C3D877), - SPH_C64(0xB5C2B5B55B74C15B), SPH_C64(0xAFECAFAF29BE1129), - SPH_C64(0x6ABE6A6ADF1D77DF), SPH_C64(0x50F050500DEABA0D), - SPH_C64(0x45CF45454C57124C), SPH_C64(0xF308F3F31838CB18), - SPH_C64(0x30503030F0AD9DF0), SPH_C64(0xEF2CEFEF74C42B74), - SPH_C64(0x3F413F3FC3DAE5C3), SPH_C64(0x55FF55551CC7921C), - SPH_C64(0xA2FBA2A210DB7910), SPH_C64(0xEA23EAEA65E90365), - SPH_C64(0x65AF6565EC6A0FEC), SPH_C64(0xBAD3BABA6803B968), - SPH_C64(0x2F712F2F934A6593), SPH_C64(0xC05DC0C0E78E4EE7), - SPH_C64(0xDE7FDEDE8160BE81), SPH_C64(0x1C241C1C6CFCE06C), - SPH_C64(0xFD1AFDFD2E46BB2E), SPH_C64(0x4DD74D4D641F5264), - 
SPH_C64(0x92AB9292E076E4E0), SPH_C64(0x759F7575BCFA8FBC), - SPH_C64(0x060A06061E36301E), SPH_C64(0x8A838A8A98AE2498), - SPH_C64(0xB2CBB2B2404BF940), SPH_C64(0xE637E6E659856359), - SPH_C64(0x0E120E0E367E7036), SPH_C64(0x1F211F1F63E7F863), - SPH_C64(0x62A66262F75537F7), SPH_C64(0xD461D4D4A33AEEA3), - SPH_C64(0xA8E5A8A832812932), SPH_C64(0x96A79696F452C4F4), - SPH_C64(0xF916F9F93A629B3A), SPH_C64(0xC552C5C5F6A366F6), - SPH_C64(0x256F2525B11035B1), SPH_C64(0x59EB595920ABF220), - SPH_C64(0x84918484AED054AE), SPH_C64(0x72967272A7C5B7A7), - SPH_C64(0x394B3939DDECD5DD), SPH_C64(0x4CD44C4C61165A61), - SPH_C64(0x5EE25E5E3B94CA3B), SPH_C64(0x78887878859FE785), - SPH_C64(0x38483838D8E5DDD8), SPH_C64(0x8C898C8C86981486), - SPH_C64(0xD16ED1D1B217C6B2), SPH_C64(0xA5F2A5A50BE4410B), - SPH_C64(0xE23BE2E24DA1434D), SPH_C64(0x61A36161F84E2FF8), - SPH_C64(0xB3C8B3B34542F145), SPH_C64(0x21632121A53415A5), - SPH_C64(0x9CB99C9CD60894D6), SPH_C64(0x1E221E1E66EEF066), - SPH_C64(0x43C5434352612252), SPH_C64(0xC754C7C7FCB176FC), - SPH_C64(0xFC19FCFC2B4FB32B), SPH_C64(0x040C040414242014), - SPH_C64(0x51F3515108E3B208), SPH_C64(0x99B69999C725BCC7), - SPH_C64(0x6DB76D6DC4224FC4), SPH_C64(0x0D170D0D39656839), - SPH_C64(0xFA13FAFA35798335), SPH_C64(0xDF7CDFDF8469B684), - SPH_C64(0x7E827E7E9BA9D79B), SPH_C64(0x246C2424B4193DB4), - SPH_C64(0x3B4D3B3BD7FEC5D7), SPH_C64(0xABE0ABAB3D9A313D), - SPH_C64(0xCE4FCECED1F03ED1), SPH_C64(0x1133111155998855), - SPH_C64(0x8F8C8F8F89830C89), SPH_C64(0x4ED24E4E6B044A6B), - SPH_C64(0xB7C4B7B75166D151), SPH_C64(0xEB20EBEB60E00B60), - SPH_C64(0x3C443C3CCCC1FDCC), SPH_C64(0x819E8181BFFD7CBF), - SPH_C64(0x94A19494FE40D4FE), SPH_C64(0xF704F7F70C1CEB0C), - SPH_C64(0xB9D6B9B96718A167), SPH_C64(0x133513135F8B985F), - SPH_C64(0x2C742C2C9C517D9C), SPH_C64(0xD368D3D3B805D6B8), - SPH_C64(0xE734E7E75C8C6B5C), SPH_C64(0x6EB26E6ECB3957CB), - SPH_C64(0xC451C4C4F3AA6EF3), SPH_C64(0x030503030F1B180F), - SPH_C64(0x56FA565613DC8A13), SPH_C64(0x44CC4444495E1A49), - 
SPH_C64(0x7F817F7F9EA0DF9E), SPH_C64(0xA9E6A9A937882137), - SPH_C64(0x2A7E2A2A82674D82), SPH_C64(0xBBD0BBBB6D0AB16D), - SPH_C64(0xC15EC1C1E28746E2), SPH_C64(0x53F5535302F1A202), - SPH_C64(0xDC79DCDC8B72AE8B), SPH_C64(0x0B1D0B0B27535827), - SPH_C64(0x9DBA9D9DD3019CD3), SPH_C64(0x6CB46C6CC12B47C1), - SPH_C64(0x31533131F5A495F5), SPH_C64(0x749C7474B9F387B9), - SPH_C64(0xF607F6F60915E309), SPH_C64(0x46CA4646434C0A43), - SPH_C64(0xACE9ACAC26A50926), SPH_C64(0x8986898997B53C97), - SPH_C64(0x143C141444B4A044), SPH_C64(0xE13EE1E142BA5B42), - SPH_C64(0x163A16164EA6B04E), SPH_C64(0x3A4E3A3AD2F7CDD2), - SPH_C64(0x69BB6969D0066FD0), SPH_C64(0x091B09092D41482D), - SPH_C64(0x70907070ADD7A7AD), SPH_C64(0xB6C7B6B6546FD954), - SPH_C64(0xD06DD0D0B71ECEB7), SPH_C64(0xED2AEDED7ED63B7E), - SPH_C64(0xCC49CCCCDBE22EDB), SPH_C64(0x42C6424257682A57), - SPH_C64(0x98B59898C22CB4C2), SPH_C64(0xA4F1A4A40EED490E), - SPH_C64(0x2878282888755D88), SPH_C64(0x5CE45C5C3186DA31), - SPH_C64(0xF815F8F83F6B933F), SPH_C64(0x86978686A4C244A4) -}; - -static const uint64_t old1_T5[256] = { - SPH_C64(0x28181878D8C07818), SPH_C64(0x652323AF2605AF23), - SPH_C64(0x57C6C6F9B87EF9C6), SPH_C64(0x25E8E86FFB136FE8), - SPH_C64(0x948787A1CB4CA187), SPH_C64(0xD5B8B86211A962B8), - SPH_C64(0x0301010509080501), SPH_C64(0xD14F4F6E0D426E4F), - SPH_C64(0x5A3636EE9BADEE36), SPH_C64(0xF7A6A604FF5904A6), - SPH_C64(0x6BD2D2BD0CDEBDD2), SPH_C64(0x02F5F5060EFB06F5), - SPH_C64(0x8B79798096EF8079), SPH_C64(0xB16F6FCE305FCE6F), - SPH_C64(0xAE9191EF6DFCEF91), SPH_C64(0xF6525207F8AA0752), - SPH_C64(0xA06060FD4727FD60), SPH_C64(0xD9BCBC76358976BC), - SPH_C64(0xB09B9BCD37ACCD9B), SPH_C64(0x8F8E8E8C8A048C8E), - SPH_C64(0xF8A3A315D27115A3), SPH_C64(0x140C0C3C6C603C0C), - SPH_C64(0x8D7B7B8A84FF8A7B), SPH_C64(0x5F3535E180B5E135), - SPH_C64(0x271D1D69F5E8691D), SPH_C64(0x3DE0E047B35347E0), - SPH_C64(0x64D7D7AC21F6ACD7), SPH_C64(0x5BC2C2ED9C5EEDC2), - SPH_C64(0x722E2E96436D962E), SPH_C64(0xDD4B4B7A29627A4B), - SPH_C64(0x1FFEFE215DA321FE), 
SPH_C64(0xF9575716D5821657), - SPH_C64(0x3F151541BDA84115), SPH_C64(0x997777B6E89FB677), - SPH_C64(0x593737EB92A5EB37), SPH_C64(0x32E5E5569E7B56E5), - SPH_C64(0xBC9F9FD9138CD99F), SPH_C64(0x0DF0F01723D317F0), - SPH_C64(0xDE4A4A7F206A7F4A), SPH_C64(0x73DADA95449E95DA), - SPH_C64(0xE8585825A2FA2558), SPH_C64(0x46C9C9CACF06CAC9), - SPH_C64(0x7B29298D7C558D29), SPH_C64(0x1E0A0A225A50220A), - SPH_C64(0xCEB1B14F50E14FB1), SPH_C64(0xFDA0A01AC9691AA0), - SPH_C64(0xBD6B6BDA147FDA6B), SPH_C64(0x928585ABD95CAB85), - SPH_C64(0xDABDBD733C8173BD), SPH_C64(0xE75D5D348FD2345D), - SPH_C64(0x3010105090805010), SPH_C64(0x01F4F40307F303F4), - SPH_C64(0x40CBCBC0DD16C0CB), SPH_C64(0x423E3EC6D3EDC63E), - SPH_C64(0x0F0505112D281105), SPH_C64(0xA96767E6781FE667), - SPH_C64(0x31E4E453977353E4), SPH_C64(0x692727BB0225BB27), - SPH_C64(0xC341415873325841), SPH_C64(0x808B8B9DA72C9D8B), - SPH_C64(0xF4A7A701F65101A7), SPH_C64(0x877D7D94B2CF947D), - SPH_C64(0xA29595FB49DCFB95), SPH_C64(0x75D8D89F568E9FD8), - SPH_C64(0x10FBFB30708B30FB), SPH_C64(0x2FEEEE71CD2371EE), - SPH_C64(0x847C7C91BBC7917C), SPH_C64(0xAA6666E37117E366), - SPH_C64(0x7ADDDD8E7BA68EDD), SPH_C64(0x3917174BAFB84B17), - SPH_C64(0xC947474645024647), SPH_C64(0xBF9E9EDC1A84DC9E), - SPH_C64(0x43CACAC5D41EC5CA), SPH_C64(0x772D2D995875992D), - SPH_C64(0xDCBFBF792E9179BF), SPH_C64(0x0907071B3F381B07), - SPH_C64(0xEAADAD23AC0123AD), SPH_C64(0xEE5A5A2FB0EA2F5A), - SPH_C64(0x988383B5EF6CB583), SPH_C64(0x553333FFB685FF33), - SPH_C64(0xA56363F25C3FF263), SPH_C64(0x0602020A12100A02), - SPH_C64(0xE3AAAA38933938AA), SPH_C64(0x937171A8DEAFA871), - SPH_C64(0x45C8C8CFC60ECFC8), SPH_C64(0x2B19197DD1C87D19), - SPH_C64(0xDB4949703B727049), SPH_C64(0x76D9D99A5F869AD9), - SPH_C64(0x0BF2F21D31C31DF2), SPH_C64(0x38E3E348A84B48E3), - SPH_C64(0xED5B5B2AB9E22A5B), SPH_C64(0x85888892BC349288), - SPH_C64(0xB39A9AC83EA4C89A), SPH_C64(0x6A2626BE0B2DBE26), - SPH_C64(0x563232FABF8DFA32), SPH_C64(0xCDB0B04A59E94AB0), - SPH_C64(0x26E9E96AF21B6AE9), 
SPH_C64(0x110F0F337778330F), - SPH_C64(0x62D5D5A633E6A6D5), SPH_C64(0x9D8080BAF474BA80), - SPH_C64(0xDFBEBE7C27997CBE), SPH_C64(0x4ACDCDDEEB26DECD), - SPH_C64(0x5C3434E489BDE434), SPH_C64(0xD8484875327A7548), - SPH_C64(0x1CFFFF2454AB24FF), SPH_C64(0x8E7A7A8F8DF78F7A), - SPH_C64(0xAD9090EA64F4EA90), SPH_C64(0xE15F5F3E9DC23E5F), - SPH_C64(0x602020A03D1DA020), SPH_C64(0xB86868D50F67D568), - SPH_C64(0x2E1A1A72CAD0721A), SPH_C64(0xEFAEAE2CB7192CAE), - SPH_C64(0xC1B4B45E7DC95EB4), SPH_C64(0xFC545419CE9A1954), - SPH_C64(0xA89393E57FECE593), SPH_C64(0x662222AA2F0DAA22), - SPH_C64(0xAC6464E96307E964), SPH_C64(0x0EF1F1122ADB12F1), - SPH_C64(0x957373A2CCBFA273), SPH_C64(0x3612125A82905A12), - SPH_C64(0xC040405D7A3A5D40), SPH_C64(0x1808082848402808), - SPH_C64(0x58C3C3E89556E8C3), SPH_C64(0x29ECEC7BDF337BEC), - SPH_C64(0x70DBDB904D9690DB), SPH_C64(0xFEA1A11FC0611FA1), - SPH_C64(0x8A8D8D83911C838D), SPH_C64(0x473D3DC9C8F5C93D), - SPH_C64(0xA49797F15BCCF197), SPH_C64(0x0000000000000000), - SPH_C64(0x4CCFCFD4F936D4CF), SPH_C64(0x7D2B2B876E45872B), - SPH_C64(0x9A7676B3E197B376), SPH_C64(0x9B8282B0E664B082), - SPH_C64(0x67D6D6A928FEA9D6), SPH_C64(0x2D1B1B77C3D8771B), - SPH_C64(0xC2B5B55B74C15BB5), SPH_C64(0xECAFAF29BE1129AF), - SPH_C64(0xBE6A6ADF1D77DF6A), SPH_C64(0xF050500DEABA0D50), - SPH_C64(0xCF45454C57124C45), SPH_C64(0x08F3F31838CB18F3), - SPH_C64(0x503030F0AD9DF030), SPH_C64(0x2CEFEF74C42B74EF), - SPH_C64(0x413F3FC3DAE5C33F), SPH_C64(0xFF55551CC7921C55), - SPH_C64(0xFBA2A210DB7910A2), SPH_C64(0x23EAEA65E90365EA), - SPH_C64(0xAF6565EC6A0FEC65), SPH_C64(0xD3BABA6803B968BA), - SPH_C64(0x712F2F934A65932F), SPH_C64(0x5DC0C0E78E4EE7C0), - SPH_C64(0x7FDEDE8160BE81DE), SPH_C64(0x241C1C6CFCE06C1C), - SPH_C64(0x1AFDFD2E46BB2EFD), SPH_C64(0xD74D4D641F52644D), - SPH_C64(0xAB9292E076E4E092), SPH_C64(0x9F7575BCFA8FBC75), - SPH_C64(0x0A06061E36301E06), SPH_C64(0x838A8A98AE24988A), - SPH_C64(0xCBB2B2404BF940B2), SPH_C64(0x37E6E659856359E6), - SPH_C64(0x120E0E367E70360E), 
SPH_C64(0x211F1F63E7F8631F), - SPH_C64(0xA66262F75537F762), SPH_C64(0x61D4D4A33AEEA3D4), - SPH_C64(0xE5A8A832812932A8), SPH_C64(0xA79696F452C4F496), - SPH_C64(0x16F9F93A629B3AF9), SPH_C64(0x52C5C5F6A366F6C5), - SPH_C64(0x6F2525B11035B125), SPH_C64(0xEB595920ABF22059), - SPH_C64(0x918484AED054AE84), SPH_C64(0x967272A7C5B7A772), - SPH_C64(0x4B3939DDECD5DD39), SPH_C64(0xD44C4C61165A614C), - SPH_C64(0xE25E5E3B94CA3B5E), SPH_C64(0x887878859FE78578), - SPH_C64(0x483838D8E5DDD838), SPH_C64(0x898C8C869814868C), - SPH_C64(0x6ED1D1B217C6B2D1), SPH_C64(0xF2A5A50BE4410BA5), - SPH_C64(0x3BE2E24DA1434DE2), SPH_C64(0xA36161F84E2FF861), - SPH_C64(0xC8B3B34542F145B3), SPH_C64(0x632121A53415A521), - SPH_C64(0xB99C9CD60894D69C), SPH_C64(0x221E1E66EEF0661E), - SPH_C64(0xC543435261225243), SPH_C64(0x54C7C7FCB176FCC7), - SPH_C64(0x19FCFC2B4FB32BFC), SPH_C64(0x0C04041424201404), - SPH_C64(0xF3515108E3B20851), SPH_C64(0xB69999C725BCC799), - SPH_C64(0xB76D6DC4224FC46D), SPH_C64(0x170D0D396568390D), - SPH_C64(0x13FAFA35798335FA), SPH_C64(0x7CDFDF8469B684DF), - SPH_C64(0x827E7E9BA9D79B7E), SPH_C64(0x6C2424B4193DB424), - SPH_C64(0x4D3B3BD7FEC5D73B), SPH_C64(0xE0ABAB3D9A313DAB), - SPH_C64(0x4FCECED1F03ED1CE), SPH_C64(0x3311115599885511), - SPH_C64(0x8C8F8F89830C898F), SPH_C64(0xD24E4E6B044A6B4E), - SPH_C64(0xC4B7B75166D151B7), SPH_C64(0x20EBEB60E00B60EB), - SPH_C64(0x443C3CCCC1FDCC3C), SPH_C64(0x9E8181BFFD7CBF81), - SPH_C64(0xA19494FE40D4FE94), SPH_C64(0x04F7F70C1CEB0CF7), - SPH_C64(0xD6B9B96718A167B9), SPH_C64(0x3513135F8B985F13), - SPH_C64(0x742C2C9C517D9C2C), SPH_C64(0x68D3D3B805D6B8D3), - SPH_C64(0x34E7E75C8C6B5CE7), SPH_C64(0xB26E6ECB3957CB6E), - SPH_C64(0x51C4C4F3AA6EF3C4), SPH_C64(0x0503030F1B180F03), - SPH_C64(0xFA565613DC8A1356), SPH_C64(0xCC4444495E1A4944), - SPH_C64(0x817F7F9EA0DF9E7F), SPH_C64(0xE6A9A937882137A9), - SPH_C64(0x7E2A2A82674D822A), SPH_C64(0xD0BBBB6D0AB16DBB), - SPH_C64(0x5EC1C1E28746E2C1), SPH_C64(0xF5535302F1A20253), - SPH_C64(0x79DCDC8B72AE8BDC), 
SPH_C64(0x1D0B0B275358270B), - SPH_C64(0xBA9D9DD3019CD39D), SPH_C64(0xB46C6CC12B47C16C), - SPH_C64(0x533131F5A495F531), SPH_C64(0x9C7474B9F387B974), - SPH_C64(0x07F6F60915E309F6), SPH_C64(0xCA4646434C0A4346), - SPH_C64(0xE9ACAC26A50926AC), SPH_C64(0x86898997B53C9789), - SPH_C64(0x3C141444B4A04414), SPH_C64(0x3EE1E142BA5B42E1), - SPH_C64(0x3A16164EA6B04E16), SPH_C64(0x4E3A3AD2F7CDD23A), - SPH_C64(0xBB6969D0066FD069), SPH_C64(0x1B09092D41482D09), - SPH_C64(0x907070ADD7A7AD70), SPH_C64(0xC7B6B6546FD954B6), - SPH_C64(0x6DD0D0B71ECEB7D0), SPH_C64(0x2AEDED7ED63B7EED), - SPH_C64(0x49CCCCDBE22EDBCC), SPH_C64(0xC6424257682A5742), - SPH_C64(0xB59898C22CB4C298), SPH_C64(0xF1A4A40EED490EA4), - SPH_C64(0x78282888755D8828), SPH_C64(0xE45C5C3186DA315C), - SPH_C64(0x15F8F83F6B933FF8), SPH_C64(0x978686A4C244A486) -}; - -static const uint64_t old1_T6[256] = { - SPH_C64(0x181878D8C0781828), SPH_C64(0x2323AF2605AF2365), - SPH_C64(0xC6C6F9B87EF9C657), SPH_C64(0xE8E86FFB136FE825), - SPH_C64(0x8787A1CB4CA18794), SPH_C64(0xB8B86211A962B8D5), - SPH_C64(0x0101050908050103), SPH_C64(0x4F4F6E0D426E4FD1), - SPH_C64(0x3636EE9BADEE365A), SPH_C64(0xA6A604FF5904A6F7), - SPH_C64(0xD2D2BD0CDEBDD26B), SPH_C64(0xF5F5060EFB06F502), - SPH_C64(0x79798096EF80798B), SPH_C64(0x6F6FCE305FCE6FB1), - SPH_C64(0x9191EF6DFCEF91AE), SPH_C64(0x525207F8AA0752F6), - SPH_C64(0x6060FD4727FD60A0), SPH_C64(0xBCBC76358976BCD9), - SPH_C64(0x9B9BCD37ACCD9BB0), SPH_C64(0x8E8E8C8A048C8E8F), - SPH_C64(0xA3A315D27115A3F8), SPH_C64(0x0C0C3C6C603C0C14), - SPH_C64(0x7B7B8A84FF8A7B8D), SPH_C64(0x3535E180B5E1355F), - SPH_C64(0x1D1D69F5E8691D27), SPH_C64(0xE0E047B35347E03D), - SPH_C64(0xD7D7AC21F6ACD764), SPH_C64(0xC2C2ED9C5EEDC25B), - SPH_C64(0x2E2E96436D962E72), SPH_C64(0x4B4B7A29627A4BDD), - SPH_C64(0xFEFE215DA321FE1F), SPH_C64(0x575716D5821657F9), - SPH_C64(0x151541BDA841153F), SPH_C64(0x7777B6E89FB67799), - SPH_C64(0x3737EB92A5EB3759), SPH_C64(0xE5E5569E7B56E532), - SPH_C64(0x9F9FD9138CD99FBC), SPH_C64(0xF0F01723D317F00D), - 
SPH_C64(0x4A4A7F206A7F4ADE), SPH_C64(0xDADA95449E95DA73), - SPH_C64(0x585825A2FA2558E8), SPH_C64(0xC9C9CACF06CAC946), - SPH_C64(0x29298D7C558D297B), SPH_C64(0x0A0A225A50220A1E), - SPH_C64(0xB1B14F50E14FB1CE), SPH_C64(0xA0A01AC9691AA0FD), - SPH_C64(0x6B6BDA147FDA6BBD), SPH_C64(0x8585ABD95CAB8592), - SPH_C64(0xBDBD733C8173BDDA), SPH_C64(0x5D5D348FD2345DE7), - SPH_C64(0x1010509080501030), SPH_C64(0xF4F40307F303F401), - SPH_C64(0xCBCBC0DD16C0CB40), SPH_C64(0x3E3EC6D3EDC63E42), - SPH_C64(0x0505112D2811050F), SPH_C64(0x6767E6781FE667A9), - SPH_C64(0xE4E453977353E431), SPH_C64(0x2727BB0225BB2769), - SPH_C64(0x41415873325841C3), SPH_C64(0x8B8B9DA72C9D8B80), - SPH_C64(0xA7A701F65101A7F4), SPH_C64(0x7D7D94B2CF947D87), - SPH_C64(0x9595FB49DCFB95A2), SPH_C64(0xD8D89F568E9FD875), - SPH_C64(0xFBFB30708B30FB10), SPH_C64(0xEEEE71CD2371EE2F), - SPH_C64(0x7C7C91BBC7917C84), SPH_C64(0x6666E37117E366AA), - SPH_C64(0xDDDD8E7BA68EDD7A), SPH_C64(0x17174BAFB84B1739), - SPH_C64(0x47474645024647C9), SPH_C64(0x9E9EDC1A84DC9EBF), - SPH_C64(0xCACAC5D41EC5CA43), SPH_C64(0x2D2D995875992D77), - SPH_C64(0xBFBF792E9179BFDC), SPH_C64(0x07071B3F381B0709), - SPH_C64(0xADAD23AC0123ADEA), SPH_C64(0x5A5A2FB0EA2F5AEE), - SPH_C64(0x8383B5EF6CB58398), SPH_C64(0x3333FFB685FF3355), - SPH_C64(0x6363F25C3FF263A5), SPH_C64(0x02020A12100A0206), - SPH_C64(0xAAAA38933938AAE3), SPH_C64(0x7171A8DEAFA87193), - SPH_C64(0xC8C8CFC60ECFC845), SPH_C64(0x19197DD1C87D192B), - SPH_C64(0x4949703B727049DB), SPH_C64(0xD9D99A5F869AD976), - SPH_C64(0xF2F21D31C31DF20B), SPH_C64(0xE3E348A84B48E338), - SPH_C64(0x5B5B2AB9E22A5BED), SPH_C64(0x888892BC34928885), - SPH_C64(0x9A9AC83EA4C89AB3), SPH_C64(0x2626BE0B2DBE266A), - SPH_C64(0x3232FABF8DFA3256), SPH_C64(0xB0B04A59E94AB0CD), - SPH_C64(0xE9E96AF21B6AE926), SPH_C64(0x0F0F337778330F11), - SPH_C64(0xD5D5A633E6A6D562), SPH_C64(0x8080BAF474BA809D), - SPH_C64(0xBEBE7C27997CBEDF), SPH_C64(0xCDCDDEEB26DECD4A), - SPH_C64(0x3434E489BDE4345C), SPH_C64(0x484875327A7548D8), - 
SPH_C64(0xFFFF2454AB24FF1C), SPH_C64(0x7A7A8F8DF78F7A8E), - SPH_C64(0x9090EA64F4EA90AD), SPH_C64(0x5F5F3E9DC23E5FE1), - SPH_C64(0x2020A03D1DA02060), SPH_C64(0x6868D50F67D568B8), - SPH_C64(0x1A1A72CAD0721A2E), SPH_C64(0xAEAE2CB7192CAEEF), - SPH_C64(0xB4B45E7DC95EB4C1), SPH_C64(0x545419CE9A1954FC), - SPH_C64(0x9393E57FECE593A8), SPH_C64(0x2222AA2F0DAA2266), - SPH_C64(0x6464E96307E964AC), SPH_C64(0xF1F1122ADB12F10E), - SPH_C64(0x7373A2CCBFA27395), SPH_C64(0x12125A82905A1236), - SPH_C64(0x40405D7A3A5D40C0), SPH_C64(0x0808284840280818), - SPH_C64(0xC3C3E89556E8C358), SPH_C64(0xECEC7BDF337BEC29), - SPH_C64(0xDBDB904D9690DB70), SPH_C64(0xA1A11FC0611FA1FE), - SPH_C64(0x8D8D83911C838D8A), SPH_C64(0x3D3DC9C8F5C93D47), - SPH_C64(0x9797F15BCCF197A4), SPH_C64(0x0000000000000000), - SPH_C64(0xCFCFD4F936D4CF4C), SPH_C64(0x2B2B876E45872B7D), - SPH_C64(0x7676B3E197B3769A), SPH_C64(0x8282B0E664B0829B), - SPH_C64(0xD6D6A928FEA9D667), SPH_C64(0x1B1B77C3D8771B2D), - SPH_C64(0xB5B55B74C15BB5C2), SPH_C64(0xAFAF29BE1129AFEC), - SPH_C64(0x6A6ADF1D77DF6ABE), SPH_C64(0x50500DEABA0D50F0), - SPH_C64(0x45454C57124C45CF), SPH_C64(0xF3F31838CB18F308), - SPH_C64(0x3030F0AD9DF03050), SPH_C64(0xEFEF74C42B74EF2C), - SPH_C64(0x3F3FC3DAE5C33F41), SPH_C64(0x55551CC7921C55FF), - SPH_C64(0xA2A210DB7910A2FB), SPH_C64(0xEAEA65E90365EA23), - SPH_C64(0x6565EC6A0FEC65AF), SPH_C64(0xBABA6803B968BAD3), - SPH_C64(0x2F2F934A65932F71), SPH_C64(0xC0C0E78E4EE7C05D), - SPH_C64(0xDEDE8160BE81DE7F), SPH_C64(0x1C1C6CFCE06C1C24), - SPH_C64(0xFDFD2E46BB2EFD1A), SPH_C64(0x4D4D641F52644DD7), - SPH_C64(0x9292E076E4E092AB), SPH_C64(0x7575BCFA8FBC759F), - SPH_C64(0x06061E36301E060A), SPH_C64(0x8A8A98AE24988A83), - SPH_C64(0xB2B2404BF940B2CB), SPH_C64(0xE6E659856359E637), - SPH_C64(0x0E0E367E70360E12), SPH_C64(0x1F1F63E7F8631F21), - SPH_C64(0x6262F75537F762A6), SPH_C64(0xD4D4A33AEEA3D461), - SPH_C64(0xA8A832812932A8E5), SPH_C64(0x9696F452C4F496A7), - SPH_C64(0xF9F93A629B3AF916), SPH_C64(0xC5C5F6A366F6C552), - 
SPH_C64(0x2525B11035B1256F), SPH_C64(0x595920ABF22059EB), - SPH_C64(0x8484AED054AE8491), SPH_C64(0x7272A7C5B7A77296), - SPH_C64(0x3939DDECD5DD394B), SPH_C64(0x4C4C61165A614CD4), - SPH_C64(0x5E5E3B94CA3B5EE2), SPH_C64(0x7878859FE7857888), - SPH_C64(0x3838D8E5DDD83848), SPH_C64(0x8C8C869814868C89), - SPH_C64(0xD1D1B217C6B2D16E), SPH_C64(0xA5A50BE4410BA5F2), - SPH_C64(0xE2E24DA1434DE23B), SPH_C64(0x6161F84E2FF861A3), - SPH_C64(0xB3B34542F145B3C8), SPH_C64(0x2121A53415A52163), - SPH_C64(0x9C9CD60894D69CB9), SPH_C64(0x1E1E66EEF0661E22), - SPH_C64(0x43435261225243C5), SPH_C64(0xC7C7FCB176FCC754), - SPH_C64(0xFCFC2B4FB32BFC19), SPH_C64(0x040414242014040C), - SPH_C64(0x515108E3B20851F3), SPH_C64(0x9999C725BCC799B6), - SPH_C64(0x6D6DC4224FC46DB7), SPH_C64(0x0D0D396568390D17), - SPH_C64(0xFAFA35798335FA13), SPH_C64(0xDFDF8469B684DF7C), - SPH_C64(0x7E7E9BA9D79B7E82), SPH_C64(0x2424B4193DB4246C), - SPH_C64(0x3B3BD7FEC5D73B4D), SPH_C64(0xABAB3D9A313DABE0), - SPH_C64(0xCECED1F03ED1CE4F), SPH_C64(0x1111559988551133), - SPH_C64(0x8F8F89830C898F8C), SPH_C64(0x4E4E6B044A6B4ED2), - SPH_C64(0xB7B75166D151B7C4), SPH_C64(0xEBEB60E00B60EB20), - SPH_C64(0x3C3CCCC1FDCC3C44), SPH_C64(0x8181BFFD7CBF819E), - SPH_C64(0x9494FE40D4FE94A1), SPH_C64(0xF7F70C1CEB0CF704), - SPH_C64(0xB9B96718A167B9D6), SPH_C64(0x13135F8B985F1335), - SPH_C64(0x2C2C9C517D9C2C74), SPH_C64(0xD3D3B805D6B8D368), - SPH_C64(0xE7E75C8C6B5CE734), SPH_C64(0x6E6ECB3957CB6EB2), - SPH_C64(0xC4C4F3AA6EF3C451), SPH_C64(0x03030F1B180F0305), - SPH_C64(0x565613DC8A1356FA), SPH_C64(0x4444495E1A4944CC), - SPH_C64(0x7F7F9EA0DF9E7F81), SPH_C64(0xA9A937882137A9E6), - SPH_C64(0x2A2A82674D822A7E), SPH_C64(0xBBBB6D0AB16DBBD0), - SPH_C64(0xC1C1E28746E2C15E), SPH_C64(0x535302F1A20253F5), - SPH_C64(0xDCDC8B72AE8BDC79), SPH_C64(0x0B0B275358270B1D), - SPH_C64(0x9D9DD3019CD39DBA), SPH_C64(0x6C6CC12B47C16CB4), - SPH_C64(0x3131F5A495F53153), SPH_C64(0x7474B9F387B9749C), - SPH_C64(0xF6F60915E309F607), SPH_C64(0x4646434C0A4346CA), - 
SPH_C64(0xACAC26A50926ACE9), SPH_C64(0x898997B53C978986), - SPH_C64(0x141444B4A044143C), SPH_C64(0xE1E142BA5B42E13E), - SPH_C64(0x16164EA6B04E163A), SPH_C64(0x3A3AD2F7CDD23A4E), - SPH_C64(0x6969D0066FD069BB), SPH_C64(0x09092D41482D091B), - SPH_C64(0x7070ADD7A7AD7090), SPH_C64(0xB6B6546FD954B6C7), - SPH_C64(0xD0D0B71ECEB7D06D), SPH_C64(0xEDED7ED63B7EED2A), - SPH_C64(0xCCCCDBE22EDBCC49), SPH_C64(0x424257682A5742C6), - SPH_C64(0x9898C22CB4C298B5), SPH_C64(0xA4A40EED490EA4F1), - SPH_C64(0x282888755D882878), SPH_C64(0x5C5C3186DA315CE4), - SPH_C64(0xF8F83F6B933FF815), SPH_C64(0x8686A4C244A48697) -}; - -static const uint64_t old1_T7[256] = { - SPH_C64(0x1878D8C078182818), SPH_C64(0x23AF2605AF236523), - SPH_C64(0xC6F9B87EF9C657C6), SPH_C64(0xE86FFB136FE825E8), - SPH_C64(0x87A1CB4CA1879487), SPH_C64(0xB86211A962B8D5B8), - SPH_C64(0x0105090805010301), SPH_C64(0x4F6E0D426E4FD14F), - SPH_C64(0x36EE9BADEE365A36), SPH_C64(0xA604FF5904A6F7A6), - SPH_C64(0xD2BD0CDEBDD26BD2), SPH_C64(0xF5060EFB06F502F5), - SPH_C64(0x798096EF80798B79), SPH_C64(0x6FCE305FCE6FB16F), - SPH_C64(0x91EF6DFCEF91AE91), SPH_C64(0x5207F8AA0752F652), - SPH_C64(0x60FD4727FD60A060), SPH_C64(0xBC76358976BCD9BC), - SPH_C64(0x9BCD37ACCD9BB09B), SPH_C64(0x8E8C8A048C8E8F8E), - SPH_C64(0xA315D27115A3F8A3), SPH_C64(0x0C3C6C603C0C140C), - SPH_C64(0x7B8A84FF8A7B8D7B), SPH_C64(0x35E180B5E1355F35), - SPH_C64(0x1D69F5E8691D271D), SPH_C64(0xE047B35347E03DE0), - SPH_C64(0xD7AC21F6ACD764D7), SPH_C64(0xC2ED9C5EEDC25BC2), - SPH_C64(0x2E96436D962E722E), SPH_C64(0x4B7A29627A4BDD4B), - SPH_C64(0xFE215DA321FE1FFE), SPH_C64(0x5716D5821657F957), - SPH_C64(0x1541BDA841153F15), SPH_C64(0x77B6E89FB6779977), - SPH_C64(0x37EB92A5EB375937), SPH_C64(0xE5569E7B56E532E5), - SPH_C64(0x9FD9138CD99FBC9F), SPH_C64(0xF01723D317F00DF0), - SPH_C64(0x4A7F206A7F4ADE4A), SPH_C64(0xDA95449E95DA73DA), - SPH_C64(0x5825A2FA2558E858), SPH_C64(0xC9CACF06CAC946C9), - SPH_C64(0x298D7C558D297B29), SPH_C64(0x0A225A50220A1E0A), - SPH_C64(0xB14F50E14FB1CEB1), 
SPH_C64(0xA01AC9691AA0FDA0), - SPH_C64(0x6BDA147FDA6BBD6B), SPH_C64(0x85ABD95CAB859285), - SPH_C64(0xBD733C8173BDDABD), SPH_C64(0x5D348FD2345DE75D), - SPH_C64(0x1050908050103010), SPH_C64(0xF40307F303F401F4), - SPH_C64(0xCBC0DD16C0CB40CB), SPH_C64(0x3EC6D3EDC63E423E), - SPH_C64(0x05112D2811050F05), SPH_C64(0x67E6781FE667A967), - SPH_C64(0xE453977353E431E4), SPH_C64(0x27BB0225BB276927), - SPH_C64(0x415873325841C341), SPH_C64(0x8B9DA72C9D8B808B), - SPH_C64(0xA701F65101A7F4A7), SPH_C64(0x7D94B2CF947D877D), - SPH_C64(0x95FB49DCFB95A295), SPH_C64(0xD89F568E9FD875D8), - SPH_C64(0xFB30708B30FB10FB), SPH_C64(0xEE71CD2371EE2FEE), - SPH_C64(0x7C91BBC7917C847C), SPH_C64(0x66E37117E366AA66), - SPH_C64(0xDD8E7BA68EDD7ADD), SPH_C64(0x174BAFB84B173917), - SPH_C64(0x474645024647C947), SPH_C64(0x9EDC1A84DC9EBF9E), - SPH_C64(0xCAC5D41EC5CA43CA), SPH_C64(0x2D995875992D772D), - SPH_C64(0xBF792E9179BFDCBF), SPH_C64(0x071B3F381B070907), - SPH_C64(0xAD23AC0123ADEAAD), SPH_C64(0x5A2FB0EA2F5AEE5A), - SPH_C64(0x83B5EF6CB5839883), SPH_C64(0x33FFB685FF335533), - SPH_C64(0x63F25C3FF263A563), SPH_C64(0x020A12100A020602), - SPH_C64(0xAA38933938AAE3AA), SPH_C64(0x71A8DEAFA8719371), - SPH_C64(0xC8CFC60ECFC845C8), SPH_C64(0x197DD1C87D192B19), - SPH_C64(0x49703B727049DB49), SPH_C64(0xD99A5F869AD976D9), - SPH_C64(0xF21D31C31DF20BF2), SPH_C64(0xE348A84B48E338E3), - SPH_C64(0x5B2AB9E22A5BED5B), SPH_C64(0x8892BC3492888588), - SPH_C64(0x9AC83EA4C89AB39A), SPH_C64(0x26BE0B2DBE266A26), - SPH_C64(0x32FABF8DFA325632), SPH_C64(0xB04A59E94AB0CDB0), - SPH_C64(0xE96AF21B6AE926E9), SPH_C64(0x0F337778330F110F), - SPH_C64(0xD5A633E6A6D562D5), SPH_C64(0x80BAF474BA809D80), - SPH_C64(0xBE7C27997CBEDFBE), SPH_C64(0xCDDEEB26DECD4ACD), - SPH_C64(0x34E489BDE4345C34), SPH_C64(0x4875327A7548D848), - SPH_C64(0xFF2454AB24FF1CFF), SPH_C64(0x7A8F8DF78F7A8E7A), - SPH_C64(0x90EA64F4EA90AD90), SPH_C64(0x5F3E9DC23E5FE15F), - SPH_C64(0x20A03D1DA0206020), SPH_C64(0x68D50F67D568B868), - SPH_C64(0x1A72CAD0721A2E1A), 
SPH_C64(0xAE2CB7192CAEEFAE), - SPH_C64(0xB45E7DC95EB4C1B4), SPH_C64(0x5419CE9A1954FC54), - SPH_C64(0x93E57FECE593A893), SPH_C64(0x22AA2F0DAA226622), - SPH_C64(0x64E96307E964AC64), SPH_C64(0xF1122ADB12F10EF1), - SPH_C64(0x73A2CCBFA2739573), SPH_C64(0x125A82905A123612), - SPH_C64(0x405D7A3A5D40C040), SPH_C64(0x0828484028081808), - SPH_C64(0xC3E89556E8C358C3), SPH_C64(0xEC7BDF337BEC29EC), - SPH_C64(0xDB904D9690DB70DB), SPH_C64(0xA11FC0611FA1FEA1), - SPH_C64(0x8D83911C838D8A8D), SPH_C64(0x3DC9C8F5C93D473D), - SPH_C64(0x97F15BCCF197A497), SPH_C64(0x0000000000000000), - SPH_C64(0xCFD4F936D4CF4CCF), SPH_C64(0x2B876E45872B7D2B), - SPH_C64(0x76B3E197B3769A76), SPH_C64(0x82B0E664B0829B82), - SPH_C64(0xD6A928FEA9D667D6), SPH_C64(0x1B77C3D8771B2D1B), - SPH_C64(0xB55B74C15BB5C2B5), SPH_C64(0xAF29BE1129AFECAF), - SPH_C64(0x6ADF1D77DF6ABE6A), SPH_C64(0x500DEABA0D50F050), - SPH_C64(0x454C57124C45CF45), SPH_C64(0xF31838CB18F308F3), - SPH_C64(0x30F0AD9DF0305030), SPH_C64(0xEF74C42B74EF2CEF), - SPH_C64(0x3FC3DAE5C33F413F), SPH_C64(0x551CC7921C55FF55), - SPH_C64(0xA210DB7910A2FBA2), SPH_C64(0xEA65E90365EA23EA), - SPH_C64(0x65EC6A0FEC65AF65), SPH_C64(0xBA6803B968BAD3BA), - SPH_C64(0x2F934A65932F712F), SPH_C64(0xC0E78E4EE7C05DC0), - SPH_C64(0xDE8160BE81DE7FDE), SPH_C64(0x1C6CFCE06C1C241C), - SPH_C64(0xFD2E46BB2EFD1AFD), SPH_C64(0x4D641F52644DD74D), - SPH_C64(0x92E076E4E092AB92), SPH_C64(0x75BCFA8FBC759F75), - SPH_C64(0x061E36301E060A06), SPH_C64(0x8A98AE24988A838A), - SPH_C64(0xB2404BF940B2CBB2), SPH_C64(0xE659856359E637E6), - SPH_C64(0x0E367E70360E120E), SPH_C64(0x1F63E7F8631F211F), - SPH_C64(0x62F75537F762A662), SPH_C64(0xD4A33AEEA3D461D4), - SPH_C64(0xA832812932A8E5A8), SPH_C64(0x96F452C4F496A796), - SPH_C64(0xF93A629B3AF916F9), SPH_C64(0xC5F6A366F6C552C5), - SPH_C64(0x25B11035B1256F25), SPH_C64(0x5920ABF22059EB59), - SPH_C64(0x84AED054AE849184), SPH_C64(0x72A7C5B7A7729672), - SPH_C64(0x39DDECD5DD394B39), SPH_C64(0x4C61165A614CD44C), - SPH_C64(0x5E3B94CA3B5EE25E), 
SPH_C64(0x78859FE785788878), - SPH_C64(0x38D8E5DDD8384838), SPH_C64(0x8C869814868C898C), - SPH_C64(0xD1B217C6B2D16ED1), SPH_C64(0xA50BE4410BA5F2A5), - SPH_C64(0xE24DA1434DE23BE2), SPH_C64(0x61F84E2FF861A361), - SPH_C64(0xB34542F145B3C8B3), SPH_C64(0x21A53415A5216321), - SPH_C64(0x9CD60894D69CB99C), SPH_C64(0x1E66EEF0661E221E), - SPH_C64(0x435261225243C543), SPH_C64(0xC7FCB176FCC754C7), - SPH_C64(0xFC2B4FB32BFC19FC), SPH_C64(0x0414242014040C04), - SPH_C64(0x5108E3B20851F351), SPH_C64(0x99C725BCC799B699), - SPH_C64(0x6DC4224FC46DB76D), SPH_C64(0x0D396568390D170D), - SPH_C64(0xFA35798335FA13FA), SPH_C64(0xDF8469B684DF7CDF), - SPH_C64(0x7E9BA9D79B7E827E), SPH_C64(0x24B4193DB4246C24), - SPH_C64(0x3BD7FEC5D73B4D3B), SPH_C64(0xAB3D9A313DABE0AB), - SPH_C64(0xCED1F03ED1CE4FCE), SPH_C64(0x1155998855113311), - SPH_C64(0x8F89830C898F8C8F), SPH_C64(0x4E6B044A6B4ED24E), - SPH_C64(0xB75166D151B7C4B7), SPH_C64(0xEB60E00B60EB20EB), - SPH_C64(0x3CCCC1FDCC3C443C), SPH_C64(0x81BFFD7CBF819E81), - SPH_C64(0x94FE40D4FE94A194), SPH_C64(0xF70C1CEB0CF704F7), - SPH_C64(0xB96718A167B9D6B9), SPH_C64(0x135F8B985F133513), - SPH_C64(0x2C9C517D9C2C742C), SPH_C64(0xD3B805D6B8D368D3), - SPH_C64(0xE75C8C6B5CE734E7), SPH_C64(0x6ECB3957CB6EB26E), - SPH_C64(0xC4F3AA6EF3C451C4), SPH_C64(0x030F1B180F030503), - SPH_C64(0x5613DC8A1356FA56), SPH_C64(0x44495E1A4944CC44), - SPH_C64(0x7F9EA0DF9E7F817F), SPH_C64(0xA937882137A9E6A9), - SPH_C64(0x2A82674D822A7E2A), SPH_C64(0xBB6D0AB16DBBD0BB), - SPH_C64(0xC1E28746E2C15EC1), SPH_C64(0x5302F1A20253F553), - SPH_C64(0xDC8B72AE8BDC79DC), SPH_C64(0x0B275358270B1D0B), - SPH_C64(0x9DD3019CD39DBA9D), SPH_C64(0x6CC12B47C16CB46C), - SPH_C64(0x31F5A495F5315331), SPH_C64(0x74B9F387B9749C74), - SPH_C64(0xF60915E309F607F6), SPH_C64(0x46434C0A4346CA46), - SPH_C64(0xAC26A50926ACE9AC), SPH_C64(0x8997B53C97898689), - SPH_C64(0x1444B4A044143C14), SPH_C64(0xE142BA5B42E13EE1), - SPH_C64(0x164EA6B04E163A16), SPH_C64(0x3AD2F7CDD23A4E3A), - SPH_C64(0x69D0066FD069BB69), 
SPH_C64(0x092D41482D091B09), - SPH_C64(0x70ADD7A7AD709070), SPH_C64(0xB6546FD954B6C7B6), - SPH_C64(0xD0B71ECEB7D06DD0), SPH_C64(0xED7ED63B7EED2AED), - SPH_C64(0xCCDBE22EDBCC49CC), SPH_C64(0x4257682A5742C642), - SPH_C64(0x98C22CB4C298B598), SPH_C64(0xA40EED490EA4F1A4), - SPH_C64(0x2888755D88287828), SPH_C64(0x5C3186DA315CE45C), - SPH_C64(0xF83F6B933FF815F8), SPH_C64(0x86A4C244A4869786) -}; - -static const uint64_t old1_RC[10] = { - SPH_C64(0x4F01B887E8C62318), - SPH_C64(0x52916F79F5D2A636), - SPH_C64(0x357B0CA38E9BBC60), - SPH_C64(0x57FE4B2EC2D7E01D), - SPH_C64(0xDA4AF09FE5377715), - SPH_C64(0x856BA0B10A29C958), - SPH_C64(0x67053ECBF4105DBD), - SPH_C64(0xD8957DA78B4127E4), - SPH_C64(0x9E4717DD667CEEFB), - SPH_C64(0x33835AAD07BF2DCA) -}; - -static const uint64_t plain_T0[256] = { +__constant__ __align__(64) uint64_t mixTob0Tox[256] = { SPH_C64(0xD83078C018601818), SPH_C64(0x2646AF05238C2323), SPH_C64(0xB891F97EC63FC6C6), SPH_C64(0xFBCD6F13E887E8E8), SPH_C64(0xCB13A14C87268787), SPH_C64(0x116D62A9B8DAB8B8), @@ -1255,7 +186,10 @@ static const uint64_t plain_T0[256] = { SPH_C64(0x6BED3F93F8C7F8F8), SPH_C64(0xC211A44486228686) }; -static const uint64_t plain_T1[256] = { +#if USE_ALL_TABLES + +/* +__constant__ __align__(64) uint64_t mixTob1Tox[256] = { SPH_C64(0x3078C018601818D8), SPH_C64(0x46AF05238C232326), SPH_C64(0x91F97EC63FC6C6B8), SPH_C64(0xCD6F13E887E8E8FB), SPH_C64(0x13A14C87268787CB), SPH_C64(0x6D62A9B8DAB8B811), @@ -2171,466 +1105,703 @@ static const uint64_t plain_T7[256] = { SPH_C64(0x287550885D28A028), SPH_C64(0x5C86B831DA5C6D5C), SPH_C64(0xF86BED3F93F8C7F8), SPH_C64(0x86C211A444862286) }; +*/ /** * Round constants. 
*/ -__constant__ uint64_t InitVector_RC[10]; - -static const uint64_t plain_RC[10] = { - SPH_C64(0x4F01B887E8C62318), - SPH_C64(0x52916F79F5D2A636), - SPH_C64(0x357B0CA38E9BBC60), - SPH_C64(0x57FE4B2EC2D7E01D), - SPH_C64(0xDA4AF09FE5377715), - SPH_C64(0x856BA0B10A29C958), - SPH_C64(0x67053ECBF4105DBD), - SPH_C64(0xD8957DA78B4127E4), - SPH_C64(0x9E4717DD667CEEFB), - SPH_C64(0x33835AAD07BF2DCA) -}; - -/* ====================================================================== */ - - -#define TRANSFER(dst, src) { \ - dst[0] = src ## 0; \ - dst[1] = src ## 1; \ - dst[2] = src ## 2; \ - dst[3] = src ## 3; \ - dst[4] = src ## 4; \ - dst[5] = src ## 5; \ - dst[6] = src ## 6; \ - dst[7] = src ## 7; \ -} +#endif #if !USE_ALL_TABLES -#define BYTE(x, n) ((unsigned)((x) >> (8 * (n))) & 0xFF) - -/* method disabled to reduce code size */ -__device__ __forceinline__ -static uint64_t table_skew(uint64_t val, int num) { - return ROTL64(val, 8 * num); -} __device__ __forceinline__ -static uint64_t ROUND_ELT(const uint64_t* sharedMemory, uint64_t* __restrict__ in, - int i0,int i1,int i2,int i3,int i4,int i5,int i6,int i7) +static uint2 ROUND_ELT(const uint2*const __restrict__ sharedMemory, const uint2*const __restrict__ in, +const int i0, const int i1, const int i2, const int i3, const int i4, const int i5, const int i6, const int i7) { - uint32_t idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7; - idx0 = BYTE(in[i0], 0); - idx1 = BYTE(in[i1], 1); - idx2 = BYTE(in[i2], 2); - idx3 = BYTE(in[i3], 3); - idx4 = BYTE(in[i4], 4); - idx5 = BYTE(in[i5], 5); - idx6 = BYTE(in[i6], 6); - idx7 = BYTE(in[i7], 7); - - return xor8( - sharedMemory[idx0], - table_skew(sharedMemory[idx1], 1), - table_skew(sharedMemory[idx2], 2), - table_skew(sharedMemory[idx3], 3), - table_skew(sharedMemory[idx4], 4), - table_skew(sharedMemory[idx5], 5), - table_skew(sharedMemory[idx6], 6), - table_skew(sharedMemory[idx7], 7) - ); -} + return( + sharedMemory[__byte_perm(in[(i0)].x, 0, 0x4440)] ^ 
ROL2(sharedMemory[__byte_perm(in[(i1)].x, 0, 0x4441)], 8) ^ ROL2(sharedMemory[__byte_perm(in[(i2)].x, 0, 0x4442)], 16) ^ + ROL2(sharedMemory[__byte_perm(in[(i3)].x, 0, 0x4443)], 24) ^ sharedMemory[__byte_perm(in[(i4)].y, 0, 0x4440) + 256] ^ ROL2(sharedMemory[__byte_perm(in[(i5)].y, 0, 0x4441) + 256], 8) ^ + ROL2(sharedMemory[__byte_perm(in[(i6)].y, 0, 0x4442) + 256], 16) ^ ROL2(sharedMemory[__byte_perm(in[(i7)].y, 0, 0x4443) + 256], 24)); -#else -__device__ __forceinline__ -static uint64_t ROUND_ELT(const uint64_t* sharedMemory, uint64_t* __restrict__ in, +} +#else +__device__ uint2 ROUND_ELT(const uint2*const __restrict__ sharedMemory, uint2* const __restrict__ in, const int i0, const int i1, const int i2, const int i3, const int i4, const int i5, const int i6, const int i7) { - uint32_t* in32 = (uint32_t*)in; - return (sharedMemory[__byte_perm(in32[(i0 << 1)], 0, 0x4440)] ^ sharedMemory[__byte_perm(in32[(i1 << 1)], 0, 0x4441) + 256] ^ - sharedMemory[__byte_perm(in32[(i2 << 1)], 0, 0x4442) + 512] ^ sharedMemory[__byte_perm(in32[(i3 << 1)], 0, 0x4443) + 768] ^ - sharedMemory[__byte_perm(in32[(i4 << 1) + 1], 0, 0x4440) + 1024] ^ sharedMemory[__byte_perm(in32[(i5 << 1) + 1], 0, 0x4441) + 1280] ^ - sharedMemory[__byte_perm(in32[(i6 << 1) + 1], 0, 0x4442) + 1536] ^ sharedMemory[__byte_perm(in32[(i7 << 1) + 1], 0, 0x4443) + 1792]); + return (sharedMemory[__byte_perm(in[i0].x, 0, 0x4440)] ^ + sharedMemory[__byte_perm(in[i1].x, 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(in[i2].x, 0, 0x4442) + 512] ^ + sharedMemory[__byte_perm(in[i3].x, 0, 0x4443) + 768] ^ + SWAPDWORDS2(sharedMemory[__byte_perm(in[i4].y, 0, 0x4440)]) ^ + SWAPDWORDS2(sharedMemory[__byte_perm(in[i5].y, 0, 0x4441) + 256]) ^ + SWAPDWORDS2(sharedMemory[__byte_perm(in[i6].y, 0, 0x4442) + 512]) ^ + SWAPDWORDS2(sharedMemory[__byte_perm(in[i7].y, 0, 0x4443) + 768])); } #endif /* USE_ALL_TABLES */ -#define ROUND(table, in, out, c0, c1, c2, c3, c4, c5, c6, c7) { \ - out ## 0 = xor1(ROUND_ELT(table, in, 0, 7, 6, 
5, 4, 3, 2, 1), c0); \ - out ## 1 = xor1(ROUND_ELT(table, in, 1, 0, 7, 6, 5, 4, 3, 2), c1); \ - out ## 2 = xor1(ROUND_ELT(table, in, 2, 1, 0, 7, 6, 5, 4, 3), c2); \ - out ## 3 = xor1(ROUND_ELT(table, in, 3, 2, 1, 0, 7, 6, 5, 4), c3); \ - out ## 4 = xor1(ROUND_ELT(table, in, 4, 3, 2, 1, 0, 7, 6, 5), c4); \ - out ## 5 = xor1(ROUND_ELT(table, in, 5, 4, 3, 2, 1, 0, 7, 6), c5); \ - out ## 6 = xor1(ROUND_ELT(table, in, 6, 5, 4, 3, 2, 1, 0, 7), c6); \ - out ## 7 = xor1(ROUND_ELT(table, in, 7, 6, 5, 4, 3, 2, 1, 0), c7); \ -} - -#define ROUND1(table, in, out,c) { \ - out ## 0 = xor1(ROUND_ELT(table, in, 0, 7, 6, 5, 4, 3, 2, 1),c); \ - out ## 1 = ROUND_ELT(table, in, 1, 0, 7, 6, 5, 4, 3, 2); \ - out ## 2 = ROUND_ELT(table, in, 2, 1, 0, 7, 6, 5, 4, 3); \ - out ## 3 = ROUND_ELT(table, in, 3, 2, 1, 0, 7, 6, 5, 4); \ - out ## 4 = ROUND_ELT(table, in, 4, 3, 2, 1, 0, 7, 6, 5); \ - out ## 5 = ROUND_ELT(table, in, 5, 4, 3, 2, 1, 0, 7, 6); \ - out ## 6 = ROUND_ELT(table, in, 6, 5, 4, 3, 2, 1, 0, 7); \ - out ## 7 = ROUND_ELT(table, in, 7, 6, 5, 4, 3, 2, 1, 0); \ -} - -#define ROUND_KSCHED(table, in, out, c) \ - ROUND1(table, in, out,c) \ - TRANSFER(in, out) - -#define ROUND_WENC(table, in, key, out) \ - ROUND(table, in, out, key[0], key[1], key[2],key[3], key[4], key[5], key[6], key[7]) \ - TRANSFER(in, out) -__global__ +__global__ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash) { - __shared__ uint64_t sharedMemory[2048]; + /* +#if USE_ALL_TABLES + __shared__ uint2 sharedMemory[256*4]; +#else + __shared__ uint2 sharedMemory[256*2]; +#endif if (threadIdx.x < 256) { - sharedMemory[threadIdx.x] = mixTob0Tox[threadIdx.x]; - #if USE_ALL_TABLES - sharedMemory[threadIdx.x+256] = mixTob1Tox[threadIdx.x]; - sharedMemory[threadIdx.x+512] = mixTob2Tox[threadIdx.x]; - sharedMemory[threadIdx.x+768] = mixTob3Tox[threadIdx.x]; - sharedMemory[threadIdx.x+1024] = mixTob4Tox[threadIdx.x]; - sharedMemory[threadIdx.x+1280] = mixTob5Tox[threadIdx.x]; - 
sharedMemory[threadIdx.x+1536] = mixTob6Tox[threadIdx.x]; - sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x]; - #endif +#if USE_ALL_TABLES + sharedMemory[threadIdx.x] = vectorize(mixTob0Tox[threadIdx.x]); + sharedMemory[threadIdx.x + 256] = vectorize(mixTob1Tox[threadIdx.x]); + sharedMemory[threadIdx.x + 512] = ROL2(vectorize(mixTob0Tox[threadIdx.x]), 16); + sharedMemory[threadIdx.x + 768] = ROL2(vectorize(mixTob1Tox[threadIdx.x]), 16); +// sharedMemory[threadIdx.x + 1024] = SWAPDWORDS2(sharedMemory[threadIdx.x]); +// sharedMemory[threadIdx.x + 1280] = SWAPDWORDS2(sharedMemory[threadIdx.x + 256]); +// sharedMemory[threadIdx.x + 1536] = SWAPDWORDS2(sharedMemory[threadIdx.x + 512]); +// sharedMemory[threadIdx.x + 1792] = SWAPDWORDS2(sharedMemory[threadIdx.x + 768]); +#else + sharedMemory[threadIdx.x] = vectorize(mixTob0Tox[threadIdx.x]); + sharedMemory[threadIdx.x + 256] = SWAPDWORDS2(sharedMemory[threadIdx.x]); +#endif } - - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = startNounce + thread; + const uint2 InitVector_RC[10] = + { + { 0xE8C62318UL, 0x4F01B887UL }, + { 0xF5D2A636UL, 0x52916F79UL }, + { 0x8E9BBC60UL, 0x357B0CA3UL }, + { 0xC2D7E01DUL, 0x57FE4B2EUL }, + { 0xE5377715UL, 0xDA4AF09FUL }, + { 0x0A29C958UL, 0x856BA0B1UL }, + { 0xF4105DBDUL, 0x67053ECBUL }, + { 0x8B4127E4UL, 0xD8957DA7UL }, + { 0x667CEEFBUL, 0x9E4717DDUL }, + { 0x07BF2DCAUL, 0x33835AADUL } + }; + + const uint32_t nounce = startNounce + thread; union { uint8_t h1[64]; uint32_t h4[16]; - uint64_t h8[8]; + uint2 h8[8]; } hash; - uint64_t state[8]; - uint64_t n[8]; - uint64_t h[8]; + uint2 state[8]; + uint2 n[8]; + uint2 h[8]; - #pragma unroll 8 - for (int i=0; i<8; i++) { - n[i] = c_PaddedMessage80[i]; // read data - h[i] = 0; // read state +#pragma unroll 8 + for (int i = 0; i<8; i++) { + n[i] = vectorize(c_PaddedMessage80[i]); // read data + h[i] = make_uint2(0,0); 
// read state } - #pragma unroll 10 - for (unsigned r=0; r < 10; r++) { - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[r]); - ROUND_WENC(sharedMemory, n, h, tmp); +//#pragma unroll 10 + for (int i = 0; i < 10; i++) + { + uint2 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + tmp0 = ROUND_ELT(sharedMemory, h, 0, 7, 6, 5, 4, 3, 2, 1)^ InitVector_RC[i]; + tmp1 = ROUND_ELT(sharedMemory, h, 1, 0, 7, 6, 5, 4, 3, 2); + tmp2 = ROUND_ELT(sharedMemory, h, 2, 1, 0, 7, 6, 5, 4, 3); + tmp3 = ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4); + tmp4 = ROUND_ELT(sharedMemory, h, 4, 3, 2, 1, 0, 7, 6, 5); + tmp5 = ROUND_ELT(sharedMemory, h, 5, 4, 3, 2, 1, 0, 7, 6); + tmp6 = ROUND_ELT(sharedMemory, h, 6, 5, 4, 3, 2, 1, 0, 7); + tmp7 = ROUND_ELT(sharedMemory, h, 7, 6, 5, 4, 3, 2, 1, 0); + h[0] = tmp0; + h[1] = tmp1; + h[2] = tmp2; + h[3] = tmp3; + h[4] = tmp4; + h[5] = tmp5; + h[6] = tmp6; + h[7] = tmp7; + tmp0 = ROUND_ELT(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1) ^ h[0]; + tmp1 = ROUND_ELT(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2) ^ h[1]; + tmp2 = ROUND_ELT(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3) ^ h[2]; + tmp3 = ROUND_ELT(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4) ^ h[3]; + tmp4 = ROUND_ELT(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5) ^ h[4]; + tmp5 = ROUND_ELT(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6) ^ h[5]; + tmp6 = ROUND_ELT(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7) ^ h[6]; + tmp7 = ROUND_ELT(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0) ^ h[7]; + n[0] = tmp0; + n[1] = tmp1; + n[2] = tmp2; + n[3] = tmp3; + n[4] = tmp4; + n[5] = tmp5; + n[6] = tmp6; + n[7] = tmp7; } - #pragma unroll 8 - for (int i=0; i < 8; i++) { - state[i] = xor1(n[i],c_PaddedMessage80[i]); +#pragma unroll 8 + for (int i = 0; i < 8; i++) { + state[i] = n[i]^vectorize(c_PaddedMessage80[i]); } /// round 2 /////// ////////////////////////////////// - #pragma unroll 8 - for (int i=0; i<8; i++) { +#pragma unroll 8 + for (int i = 0; i<8; i++) { h[i] = state[i]; 
//read state } - n[0] = xor1(c_PaddedMessage80[8], h[0]); - n[1] = xor1(REPLACE_HIWORD(c_PaddedMessage80[9], cuda_swab32(nounce)), h[1]); - n[2] = xor1(0x0000000000000080, h[2]); + n[0] = vectorize(c_PaddedMessage80[8])^ h[0]; + n[1] = vectorize(REPLACE_HIWORD(c_PaddedMessage80[9], cuda_swab32(nounce)))^ h[1]; + n[2].y = h[2].y; + n[2].x = h[2].x ^ 0x80; n[3] = h[3]; n[4] = h[4]; n[5] = h[5]; n[6] = h[6]; - n[7] = xor1(0x8002000000000000, h[7]); - - #pragma unroll 10 - for (unsigned r=0; r < 10; r++) { - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[r]); - ROUND_WENC(sharedMemory, n, h, tmp); + n[7].x = h[7].x; + n[7].y = h[7].y ^ 0x80020000; + +//#pragma unroll 10 + for (int i = 0; i < 10; i++) + { + uint2 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + tmp0 = ROUND_ELT(sharedMemory, h, 0, 7, 6, 5, 4, 3, 2, 1)^InitVector_RC[i]; + tmp1 = ROUND_ELT(sharedMemory, h, 1, 0, 7, 6, 5, 4, 3, 2); + tmp2 = ROUND_ELT(sharedMemory, h, 2, 1, 0, 7, 6, 5, 4, 3); + tmp3 = ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4); + tmp4 = ROUND_ELT(sharedMemory, h, 4, 3, 2, 1, 0, 7, 6, 5); + tmp5 = ROUND_ELT(sharedMemory, h, 5, 4, 3, 2, 1, 0, 7, 6); + tmp6 = ROUND_ELT(sharedMemory, h, 6, 5, 4, 3, 2, 1, 0, 7); + tmp7 = ROUND_ELT(sharedMemory, h, 7, 6, 5, 4, 3, 2, 1, 0); + h[0] = tmp0; + h[1] = tmp1; + h[2] = tmp2; + h[3] = tmp3; + h[4] = tmp4; + h[5] = tmp5; + h[6] = tmp6; + h[7] = tmp7; + tmp0 = ROUND_ELT(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1)^ h[0]; + tmp1 = ROUND_ELT(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2) ^ h[1]; + tmp2 = ROUND_ELT(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3) ^ h[2]; + tmp3 = ROUND_ELT(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4) ^ h[3]; + tmp4 = ROUND_ELT(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5) ^ h[4]; + tmp5 = ROUND_ELT(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6) ^ h[5]; + tmp6 = ROUND_ELT(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7) ^ h[6]; + tmp7 = ROUND_ELT(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0) ^ h[7]; + 
n[0] = tmp0; + n[1] = tmp1; + n[2] = tmp2; + n[3] = tmp3; + n[4] = tmp4; + n[5] = tmp5; + n[6] = tmp6; + n[7] = tmp7; } - state[0] = xor3(state[0], n[0], c_PaddedMessage80[8]); - state[1] = xor3(state[1], n[1], REPLACE_HIWORD(c_PaddedMessage80[9], cuda_swab32(nounce)) ); - state[2] = xor3(state[2], n[2], 0x0000000000000080); - state[3] = xor1(state[3], n[3]); - state[4] = xor1(state[4], n[4]); - state[5] = xor1(state[5], n[5]); - state[6] = xor1(state[6], n[6]); - state[7] = xor3(state[7], n[7], 0x8002000000000000); - - #pragma unroll 8 + state[0] = state[0] ^ n[0] ^ vectorize(c_PaddedMessage80[8]); + state[1] = state[1] ^ n[1] ^ vectorize(REPLACE_HIWORD(c_PaddedMessage80[9], cuda_swab32(nounce))); + state[2].y = state[2].y ^ n[2].y; + state[2].x = state[2].x ^ n[2].x ^ 0x80; + state[3] = state[3] ^ n[3]; + state[4] = state[4] ^ n[4]; + state[5] = state[5] ^ n[5]; + state[6] = state[6] ^ n[6]; + state[7].x = state[7].x ^ n[7].x; + state[7].y = state[7].y ^ n[7].y ^ 0x80020000; + +#pragma unroll 8 for (unsigned i = 0; i < 8; i++) hash.h8[i] = state[i]; uint32_t *outHash = (uint32_t *)outputHash + 16 * thread; - #pragma unroll 16 - for (int i=0; i<16; i++) +#pragma unroll 16 + for (int i = 0; i<16; i++) outHash[i] = hash.h4[i]; - } // thread < threads + */ } -__global__ -void x15_whirlpool_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) -{ - __shared__ uint64_t sharedMemory[2048]; +__constant__ uint2 precalc[8 * 9] = { + { 0xf889ab3b, 0x24aed1ea }, + { 0x66454544, 0xafcbe945 }, + { 0xa4a4fe70, 0x89b2a4c5 }, + { 0xe1a9fac5, 0xa0e1cce1 }, + { 0x5cc0ac48, 0xfcb8fcfc }, + { 0x260ef78f, 0x698f8f90 }, + { 0x07147996, 0x797985d7 }, + { 0x68f8a8f8, 0xf878c8b8 }, + { 0xdbbf19d3, 0x58704630 }, + { 0xd1235b29, 0xdb37cfaf }, + { 0xc28a2c01, 0x98ac958b }, + { 0xb19e6381, 0xa706b2c0 }, + { 0x7a605e44, 0xdb09b2b0 }, + { 0xcf2c5b73, 0x71bc8cbc }, + { 0x240967dc, 0xd3ddedef }, + { 0xf03b8d7b, 0x197d3bd7 }, + { 0xc1aabe38, 0x866511de }, + 
{ 0xd0f37c68, 0x7f33874a }, + { 0xdbfa37f3, 0x57f0ad98 }, + { 0x5842e2c5, 0xbc8d35ee }, + { 0xe8f00911, 0x7e246e99 }, + { 0xedd6c501, 0x0134b010 }, + { 0xf152c9fb, 0xd3ec287b }, + { 0x0cdc5632, 0x4027f1c7 }, + { 0x20a525af, 0x14cf9b94 }, + { 0xa92636c1, 0x4d53c4e3 }, + { 0x867d0fe6, 0xe1f94077 }, + { 0xbbe65d91, 0x29066ae2 }, + { 0xcc545a96, 0x8d5efe4c }, + { 0xcb31e9be, 0xa63a3262 }, + { 0x18597bb1, 0x476a8496 }, + { 0x36c9f0d4, 0x31af5927 }, + { 0xc0b5f9e2, 0xb00b3725 }, + { 0xa2cb2b39, 0xa5948416 }, + { 0xcef88a60, 0x148c34fa }, + { 0x6437a57a, 0x19928c41 }, + { 0xa146f3b3, 0x893f83fa }, + { 0x483f4997, 0x7ccf0278 }, + { 0xbae8addc, 0x238f001e }, + { 0x494f7792, 0x3d32b0ed }, + { 0x82634175, 0x2fff4d77 }, + { 0xd038faff, 0x00460355 }, + { 0x49027dbf, 0x61f3983e }, + { 0xc260a8f4, 0x0bcee59a }, + { 0x445adfc8, 0x279d5dee }, + { 0x555af423, 0xa4007504 }, + { 0x121016b0, 0x8ce2f902 }, + { 0x29cd30ac, 0x1d333368 }, + { 0x82f16b03, 0x89ad8468 }, + { 0x62c64099, 0x637146d8 }, + { 0x173e434c, 0x10c2194b }, + { 0xd3cf9ce2, 0xc586ff4c }, + { 0xa011ff21, 0x5326df42 }, + { 0xcb008e1b, 0x134be46c }, + { 0xf73b12a6, 0xceb747a3 }, + { 0x0e9018d9, 0xca33283b }, + { 0x7a671cd0, 0xf92c9a0a }, + { 0x532f942a, 0xb2b6634a }, + { 0x46224288, 0xb4a8acfe }, + { 0xc75c4a47, 0x5935583d }, + { 0x5d92a674, 0xa16f5ca5 }, + { 0x8ce61777, 0x395c73c4 }, + { 0x0b3b2a08, 0xc61aec53 }, + { 0xeb58f62a, 0x62e74d81 }, + { 0xb6489548, 0x3abcee01 }, + { 0xc66b0da5, 0x818eed6b }, + { 0xcf3dcee0, 0x755a2688 }, + { 0xdb4a8cc2, 0xe99cf6c0 }, + { 0xd59cb754, 0x1385717f }, + { 0x8a4b4143, 0x7b0b7d97 }, + { 0xbb351963, 0x7a15f6db }, + { 0xf64e7a6a, 0x27820137 } +}; - if (threadIdx.x < 256) { - sharedMemory[threadIdx.x] = mixTob0Tox[threadIdx.x]; - #if USE_ALL_TABLES - sharedMemory[threadIdx.x+256] = mixTob1Tox[threadIdx.x]; - sharedMemory[threadIdx.x+512] = mixTob2Tox[threadIdx.x]; - sharedMemory[threadIdx.x+768] = mixTob3Tox[threadIdx.x]; - sharedMemory[threadIdx.x+1024] = mixTob4Tox[threadIdx.x]; - 
sharedMemory[threadIdx.x+1280] = mixTob5Tox[threadIdx.x]; - sharedMemory[threadIdx.x+1536] = mixTob6Tox[threadIdx.x]; - sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x]; - #endif - } - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) +__global__ __launch_bounds__(threadsperblock, 2) +void x15_whirlpool_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) +{ +#if USE_ALL_TABLES + __shared__ uint2 sharedMemory[256*4]; +#else + __shared__ uint2 sharedMemory[512]; +#endif + if (threadIdx.x < 256) + { +#if USE_ALL_TABLES + sharedMemory[threadIdx.x] = vectorize(mixTob0Tox[threadIdx.x]); + sharedMemory[threadIdx.x + 256] = ROL8(sharedMemory[threadIdx.x]); + sharedMemory[threadIdx.x + 512] = ROL16(sharedMemory[threadIdx.x]); + sharedMemory[threadIdx.x + 768] = ROL16(sharedMemory[threadIdx.x + 256]); +// sharedMemory[threadIdx.x + 1024] = SWAPDWORDS2(sharedMemory[threadIdx.x]); +// sharedMemory[threadIdx.x + 1280] = SWAPDWORDS2(sharedMemory[threadIdx.x + 256]); +// sharedMemory[threadIdx.x + 1536] = SWAPDWORDS2(sharedMemory[threadIdx.x + 512]); +// sharedMemory[threadIdx.x + 1792] = SWAPDWORDS2(sharedMemory[threadIdx.x + 768]); +#else + sharedMemory[threadIdx.x] = vectorize(mixTob0Tox[threadIdx.x]); + sharedMemory[threadIdx.x + 256] = SWAPDWORDS2(sharedMemory[threadIdx.x]); +#endif + } + __syncthreads(); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { - uint32_t nounce = g_nonceVector ? 
g_nonceVector[thread] : (startNounce + thread); + const uint32_t nounce = (startNounce + thread); + const uint2 InitVector_RC[10] = + { + { 0xE8C62318UL, 0x4F01B887UL }, + { 0xF5D2A636UL, 0x52916F79UL }, + { 0x8E9BBC60UL, 0x357B0CA3UL }, + { 0xC2D7E01DUL, 0x57FE4B2EUL }, + { 0xE5377715UL, 0xDA4AF09FUL }, + { 0x0A29C958UL, 0x856BA0B1UL }, + { 0xF4105DBDUL, 0x67053ECBUL }, + { 0x8B4127E4UL, 0xD8957DA7UL }, + { 0x667CEEFBUL, 0x9E4717DDUL }, + { 0x07BF2DCAUL, 0x33835AADUL } + }; + uint32_t hashPosition = (nounce - startNounce) << 3; - uint64_t hash[8], state[8], n[8], h[8] = { 0 }; - uint8_t i; - - #pragma unroll 8 - for (i=0; i<8; i++) - n[i] = hash[i] = g_hash[hashPosition + i]; - - #pragma unroll 10 - for (i=0; i < 10; i++) { - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[i]); - ROUND_WENC(sharedMemory, n, h, tmp); + uint2 hash[8], state[8], n[8], h[8]; + int i; + + uint28 *phash = (uint28*)&g_hash[hashPosition]; + uint28 *outpt = (uint28*)hash; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + for (i = 0; i < 8; i++) + { + n[i] = hash[i]; } - #pragma unroll 8 - for (i=0; i<8; i++) - state[i] = xor1(n[i], hash[i]); - - #pragma unroll 8 - for (i=0; i < 8; i++) { - h[i] = state[i]; +//#pragma unroll 8 +// for (i = 0; i < 8; i++) +// n[i] = hash[i] = vectorize(g_hash[hashPosition + i]); + + uint2 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, pre; + + pre = make_uint2( 0x28282828, 0x28282828); + + tmp0 = ROUND_ELT(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1) ^ make_uint2(3236825904UL, 1730777263UL); + tmp1 = ROUND_ELT(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2) ^ pre; + tmp2 = ROUND_ELT(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3) ^ pre; + tmp3 = ROUND_ELT(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4) ^ pre; + tmp4 = ROUND_ELT(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5) ^ pre; + tmp5 = ROUND_ELT(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6) ^ pre; + tmp6 = ROUND_ELT(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7) ^ pre; + n[7] = 
ROUND_ELT(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0) ^ pre; + + n[0] = tmp0; + n[1] = tmp1; + n[2] = tmp2; + n[3] = tmp3; + n[4] = tmp4; + n[5] = tmp5; + n[6] = tmp6; + + #pragma unroll 1 + for (i = 0; i < 8*9; i+=8) + { + tmp0 = ROUND_ELT(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1) ^ precalc[i]; + tmp1 = ROUND_ELT(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2) ^ precalc[i+1]; + tmp2 = ROUND_ELT(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3) ^ precalc[i+2]; + tmp3 = ROUND_ELT(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4) ^ precalc[i+3]; + tmp4 = ROUND_ELT(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5) ^ precalc[i+4]; + tmp5 = ROUND_ELT(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6) ^ precalc[i+5]; + tmp6 = ROUND_ELT(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7) ^ precalc[i+6]; + n[7] = ROUND_ELT(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0) ^ precalc[i + 7]; + n[0] = tmp0; + n[1] = tmp1; + n[2] = tmp2; + n[3] = tmp3; + n[4] = tmp4; + n[5] = tmp5; + n[6] = tmp6; } - n[0] = xor1(0x80, state[0]); + + +#pragma unroll 8 + for (i = 0; i<8; i++) + h[i] = state[i] = n[i] ^ hash[i]; + + n[0] = state[0]; n[1] = state[1]; n[2] = state[2]; n[3] = state[3]; n[4] = state[4]; n[5] = state[5]; n[6] = state[6]; - n[7] = xor1(0x2000000000000, state[7]); - - #pragma unroll 10 - for (i=0; i < 10; i++) { - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[i]); - ROUND_WENC(sharedMemory, n, h, tmp); + n[7] = state[7]; + n[0].x ^= 0x80; + n[7].y ^= 0x20000; + +#pragma unroll 10 + for (i = 0; i < 10; i++) + { + uint2 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; + tmp0 = ROUND_ELT(sharedMemory, h, 0, 7, 6, 5, 4, 3, 2, 1) ^ InitVector_RC[i]; + tmp1 = ROUND_ELT(sharedMemory, h, 1, 0, 7, 6, 5, 4, 3, 2); + tmp2 = ROUND_ELT(sharedMemory, h, 2, 1, 0, 7, 6, 5, 4, 3); + tmp3 = ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4); + tmp4 = ROUND_ELT(sharedMemory, h, 4, 3, 2, 1, 0, 7, 6, 5); + tmp5 = ROUND_ELT(sharedMemory, h, 5, 4, 3, 2, 1, 0, 7, 6); + tmp6 = ROUND_ELT(sharedMemory, h, 6, 
5, 4, 3, 2, 1, 0, 7); + h[7] = ROUND_ELT(sharedMemory, h, 7, 6, 5, 4, 3, 2, 1, 0); + h[0] = tmp0; + h[1] = tmp1; + h[2] = tmp2; + h[3] = tmp3; + h[4] = tmp4; + h[5] = tmp5; + h[6] = tmp6; + tmp0 = ROUND_ELT(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1) ^ h[0]; + tmp1 = ROUND_ELT(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2) ^ h[1]; + tmp2 = ROUND_ELT(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3) ^ h[2]; + tmp3 = ROUND_ELT(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4) ^ h[3]; + tmp4 = ROUND_ELT(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5) ^ h[4]; + tmp5 = ROUND_ELT(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6) ^ h[5]; + tmp6 = ROUND_ELT(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7) ^ h[6]; + n[7] = ROUND_ELT(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0) ^ h[7]; + n[0] = tmp0; + n[1] = tmp1; + n[2] = tmp2; + n[3] = tmp3; + n[4] = tmp4; + n[5] = tmp5; + n[6] = tmp6; } - state[0] = xor3(state[0], n[0], 0x80); - state[1] = xor1(state[1], n[1]); - state[2] = xor1(state[2], n[2]); - state[3] = xor1(state[3], n[3]); - state[4] = xor1(state[4], n[4]); - state[5] = xor1(state[5], n[5]); - state[6] = xor1(state[6], n[6]); - state[7] = xor3(state[7], n[7], 0x2000000000000); - - #pragma unroll 8 - for (i=0; i < 8; i++) - g_hash[hashPosition + i] = state[i]; + state[0].y = state[0].y ^ n[0].y; + state[0].x = state[0].x ^ n[0].x ^ 0x80; + state[1] = state[1] ^ n[1]; + state[2] = state[2] ^ n[2]; + state[3] = state[3] ^ n[3]; + state[4] = state[4] ^ n[4]; + state[5] = state[5] ^ n[5]; + state[6] = state[6] ^ n[6]; + state[7].x = state[7].x ^ n[7].x; + state[7].y = state[7].y ^ n[7].y ^ 0x20000; + +#pragma unroll 8 + for (i = 0; i < 8; i++) + g_hash[hashPosition + i] = devectorize(state[i]); } } -__global__ -void oldwhirlpool_gpu_finalhash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint32_t *resNounce) +__global__ +void oldwhirlpool_gpu_finalhash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resNounce) { - __shared__ uint64_t sharedMemory[2048]; + /* 
+#if USE_ALL_TABLES + __shared__ uint2 sharedMemory[256*4]; +#else + __shared__ uint2 sharedMemory[256*2]; +#endif if (threadIdx.x < 256) { - sharedMemory[threadIdx.x] = mixTob0Tox[threadIdx.x]; - #if USE_ALL_TABLES - sharedMemory[threadIdx.x+256] = mixTob1Tox[threadIdx.x]; - sharedMemory[threadIdx.x+512] = mixTob2Tox[threadIdx.x]; - sharedMemory[threadIdx.x+768] = mixTob3Tox[threadIdx.x]; - sharedMemory[threadIdx.x+1024] = mixTob4Tox[threadIdx.x]; - sharedMemory[threadIdx.x+1280] = mixTob5Tox[threadIdx.x]; - sharedMemory[threadIdx.x+1536] = mixTob6Tox[threadIdx.x]; - sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x]; - #endif +#if USE_ALL_TABLES + sharedMemory[threadIdx.x] = vectorize(mixTob0Tox[threadIdx.x]); + sharedMemory[threadIdx.x + 256] = vectorize(mixTob1Tox[threadIdx.x]); + sharedMemory[threadIdx.x + 512] = ROL2(vectorize(mixTob0Tox[threadIdx.x]), 16); + sharedMemory[threadIdx.x + 768] = ROL2(vectorize(mixTob1Tox[threadIdx.x]), 16); +// sharedMemory[threadIdx.x + 1024] = SWAPDWORDS2(sharedMemory[threadIdx.x]); +// sharedMemory[threadIdx.x + 1280] = SWAPDWORDS2(sharedMemory[threadIdx.x + 256]); +// sharedMemory[threadIdx.x + 1536] = SWAPDWORDS2(sharedMemory[threadIdx.x + 512]); +// sharedMemory[threadIdx.x + 1792] = SWAPDWORDS2(sharedMemory[threadIdx.x + 768]); +#else + sharedMemory[threadIdx.x] = vectorize(mixTob0Tox[threadIdx.x]); + sharedMemory[threadIdx.x + 256] = SWAPDWORDS2(sharedMemory[threadIdx.x]); +#endif } - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = g_nonceVector ? 
g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint64_t *inpHash = (uint64_t*) &g_hash[8 * hashPosition]; - uint64_t h8[8]; - - #pragma unroll 8 - for (int i=0; i<8; i++) { - h8[i] = inpHash[i]; + const uint64_t target = ((uint64_t*)pTarget)[3]; + + const uint32_t nounce = (startNounce + thread); + + const uint2 InitVector_RC[10] = + { + { 0xE8C62318UL, 0x4F01B887UL }, + { 0xF5D2A636UL, 0x52916F79UL }, + { 0x8E9BBC60UL, 0x357B0CA3UL }, + { 0xC2D7E01DUL, 0x57FE4B2EUL }, + { 0xE5377715UL, 0xDA4AF09FUL }, + { 0x0A29C958UL, 0x856BA0B1UL }, + { 0xF4105DBDUL, 0x67053ECBUL }, + { 0x8B4127E4UL, 0xD8957DA7UL }, + { 0x667CEEFBUL, 0x9E4717DDUL }, + { 0x07BF2DCAUL, 0x33835AADUL } + }; + const uint32_t hashPosition = nounce - startNounce; + uint64_t *inpHash = (uint64_t*)&g_hash[8 * hashPosition]; + uint2 h8[8]; + +#pragma unroll 8 + for (int i = 0; i < 8; i++) { + h8[i] = vectorize(inpHash[i]); } - uint64_t state[8]; - uint64_t n[8]; - uint64_t h[8]; + uint2 state[8]; + uint2 n[8]; + uint2 h[8]; - #pragma unroll 8 - for (int i=0; i<8; i++) { +#pragma unroll 8 + for (int i = 0; i < 8; i++) { n[i] = h8[i]; - h[i] = 0; + h[i] = vectorizelow(0); } - #pragma unroll 10 - for (unsigned r=0; r < 10; r++) { - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[r]); - ROUND_WENC(sharedMemory, n, h, tmp); +#pragma unroll 10 + for (int r = 0; r < 10; r++) { + uint2 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + tmp0 = ROUND_ELT(sharedMemory, h, 0, 7, 6, 5, 4, 3, 2, 1) ^ InitVector_RC[r]; + tmp1 = ROUND_ELT(sharedMemory, h, 1, 0, 7, 6, 5, 4, 3, 2); + tmp2 = ROUND_ELT(sharedMemory, h, 2, 1, 0, 7, 6, 5, 4, 3); + tmp3 = ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4); + tmp4 = ROUND_ELT(sharedMemory, h, 4, 3, 2, 1, 0, 7, 6, 5); + tmp5 = ROUND_ELT(sharedMemory, h, 5, 4, 3, 2, 1, 0, 7, 6); + tmp6 = ROUND_ELT(sharedMemory, h, 6, 5, 4, 3, 2, 1, 0, 7); + tmp7 = ROUND_ELT(sharedMemory, h, 
7, 6, 5, 4, 3, 2, 1, 0); + h[0] = tmp0; + h[1] = tmp1; + h[2] = tmp2; + h[3] = tmp3; + h[4] = tmp4; + h[5] = tmp5; + h[6] = tmp6; + h[7] = tmp7; + tmp0 = ROUND_ELT(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1) ^ h[0]; + tmp1 = ROUND_ELT(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2) ^ h[1]; + tmp2 = ROUND_ELT(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3) ^ h[2]; + tmp3 = ROUND_ELT(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4) ^ h[3]; + tmp4 = ROUND_ELT(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5) ^ h[4]; + tmp5 = ROUND_ELT(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6) ^ h[5]; + tmp6 = ROUND_ELT(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7) ^ h[6]; + tmp7 = ROUND_ELT(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0) ^ h[7]; + n[0] = tmp0; + n[1] = tmp1; + n[2] = tmp2; + n[3] = tmp3; + n[4] = tmp4; + n[5] = tmp5; + n[6] = tmp6; + n[7] = tmp7; + } - #pragma unroll 8 - for (int i=0; i<8; i++) { - state[i] = xor1(n[i], h8[i]); +#pragma unroll 8 + for (int i = 0; i < 8; i++) { + state[i] = n[i] ^ h8[i]; } - #pragma unroll 8 - for (int i=0; i<8; i++) { +#pragma unroll 8 + for (int i = 0; i < 8; i++) { h[i] = state[i]; } - n[0] = xor1(0x80, state[0]); + n[0].y = state[0].y; + n[0].x = state[0].x ^ 0x80; n[1] = state[1]; n[2] = state[2]; n[3] = state[3]; n[4] = state[4]; n[5] = state[5]; n[6] = state[6]; - n[7] = xor1(0x2000000000000, state[7]); - - #pragma unroll 10 - for (unsigned r=0; r < 10; r++) { - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[r]); - ROUND_WENC(sharedMemory, n, h, tmp); + n[7].x = state[7].x; + n[7].y = state[7].y ^ 0x20000; +#pragma unroll 9 + for (int r = 0; r < 9; r++) + { + uint2 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + tmp0 = ROUND_ELT(sharedMemory, h, 0, 7, 6, 5, 4, 3, 2, 1) ^ InitVector_RC[r]; + tmp1 = ROUND_ELT(sharedMemory, h, 1, 0, 7, 6, 5, 4, 3, 2); + tmp2 = ROUND_ELT(sharedMemory, h, 2, 1, 0, 7, 6, 5, 4, 3); + tmp3 = ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4); + tmp4 = ROUND_ELT(sharedMemory, h, 4, 3, 2, 
1, 0, 7, 6, 5); + tmp5 = ROUND_ELT(sharedMemory, h, 5, 4, 3, 2, 1, 0, 7, 6); + tmp6 = ROUND_ELT(sharedMemory, h, 6, 5, 4, 3, 2, 1, 0, 7); + tmp7 = ROUND_ELT(sharedMemory, h, 7, 6, 5, 4, 3, 2, 1, 0); + h[0] = tmp0; + h[1] = tmp1; + h[2] = tmp2; + h[3] = tmp3; + h[4] = tmp4; + h[5] = tmp5; + h[6] = tmp6; + h[7] = tmp7; + tmp0 = ROUND_ELT(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1) ^ h[0]; + tmp1 = ROUND_ELT(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2) ^ h[1]; + tmp2 = ROUND_ELT(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3) ^ h[2]; + tmp3 = ROUND_ELT(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4) ^ h[3]; + tmp4 = ROUND_ELT(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5) ^ h[4]; + tmp5 = ROUND_ELT(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6) ^ h[5]; + tmp6 = ROUND_ELT(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7) ^ h[6]; + tmp7 = ROUND_ELT(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0) ^ h[7]; + n[0] = tmp0; + n[1] = tmp1; + n[2] = tmp2; + n[3] = tmp3; + n[4] = tmp4; + n[5] = tmp5; + n[6] = tmp6; + n[7] = tmp7; } + uint2 tmp3; + tmp3 = ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4); + h[3] = tmp3; + tmp3 = ROUND_ELT(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4) ^ h[3]; + n[3] = tmp3; + + state[3] = state[3] ^ n[3]; + + if(devectorize(state[3]) <= target) + { + uint32_t tmp = atomicExch(resNounce, nounce); + if (tmp != 0xffffffff) + resNounce[1] = tmp; - state[0] = xor3(state[0], n[0], 0x80); - state[1] = xor1(state[1], n[1]); - state[2] = xor1(state[2], n[2]); - state[3] = xor1(state[3], n[3]); - state[4] = xor1(state[4], n[4]); - state[5] = xor1(state[5], n[5]); - state[6] = xor1(state[6], n[6]); - state[7] = xor3(state[7], n[7], 0x2000000000000); - - bool rc = (state[3] <= ((uint64_t*)pTarget)[3]); - if (rc && resNounce[0] > nounce) - resNounce[0] = nounce; + } } + */ } __host__ -extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int mode) +extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, const int mode) { - switch (mode) { - case 0: /* x15 with rotated T1-T7 (based on T0) */ - 
cudaMemcpyToSymbol(InitVector_RC, plain_RC, sizeof(plain_RC), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob0Tox, plain_T0, sizeof(plain_T0), 0, cudaMemcpyHostToDevice); -#if USE_ALL_TABLES - cudaMemcpyToSymbol(mixTob1Tox, plain_T1, (256*8), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob2Tox, plain_T2, (256*8), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob3Tox, plain_T3, (256*8), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob4Tox, plain_T4, (256*8), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob5Tox, plain_T5, (256*8), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob6Tox, plain_T6, (256*8), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob7Tox, plain_T7, (256*8), 0, cudaMemcpyHostToDevice); -#endif - break; - - case 1: /* old whirlpool */ - cudaMemcpyToSymbol(InitVector_RC, old1_RC, sizeof(plain_RC), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob0Tox, old1_T0, sizeof(plain_T0), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob1Tox, old1_T1, (256*8), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob2Tox, old1_T2, (256*8), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob3Tox, old1_T3, (256*8), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob4Tox, old1_T4, (256*8), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob5Tox, old1_T5, (256*8), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob6Tox, old1_T6, (256*8), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob7Tox, old1_T7, (256*8), 0, cudaMemcpyHostToDevice); - cudaMalloc(&d_WNonce[thr_id], sizeof(uint32_t)); - cudaMallocHost(&d_wnounce[thr_id], sizeof(uint32_t)); - break; - } +// cudaMemcpyToSymbolAsync(mixTob0Tox, plain_T0, sizeof(plain_T0), 0, cudaMemcpyHostToDevice); +//#if USE_ALL_TABLES +// cudaMemcpyToSymbolAsync(mixTob1Tox, plain_T1, (256*8), 0, cudaMemcpyHostToDevice); +// cudaMemcpyToSymbolAsync(mixTob2Tox, plain_T2, (256*8), 0, cudaMemcpyHostToDevice); +// cudaMemcpyToSymbolAsync(mixTob3Tox, 
plain_T3, (256*8), 0, cudaMemcpyHostToDevice); +// cudaMemcpyToSymbolAsync(mixTob4Tox, plain_T4, (256*8), 0, cudaMemcpyHostToDevice); +// cudaMemcpyToSymbolAsync(mixTob5Tox, plain_T5, (256*8), 0, cudaMemcpyHostToDevice); +// cudaMemcpyToSymbolAsync(mixTob6Tox, plain_T6, (256*8), 0, cudaMemcpyHostToDevice); +// cudaMemcpyToSymbolAsync(mixTob7Tox, plain_T7, (256*8), 0, cudaMemcpyHostToDevice); +//#endif } __host__ extern void x15_whirlpool_cpu_free(int thr_id) { cudaFree(d_WNonce[thr_id]); - cudaFreeHost(d_wnounce[thr_id]); + cudaFreeHost(h_wnounce[thr_id]); } __host__ -extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) { dim3 grid((threads + threadsperblock-1) / threadsperblock); dim3 block(threadsperblock); - x15_whirlpool_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x15_whirlpool_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash); //MyStreamSynchronize(NULL, order, thr_id); } __host__ -extern uint32_t whirlpool512_cpu_finalhash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +extern uint32_t* whirlpool512_cpu_finalhash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) { - uint32_t result = 0xffffffff; - - dim3 grid((threads + threadsperblock-1) / threadsperblock); + dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - cudaMemset(d_WNonce[thr_id], 0xff, sizeof(uint32_t)); + cudaMemsetAsync(d_WNonce[thr_id], 0xff, 2 * sizeof(uint32_t), gpustream[thr_id]); - oldwhirlpool_gpu_finalhash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector,d_WNonce[thr_id]); + //oldwhirlpool_gpu_finalhash_64 << >>(threads, startNounce, (uint64_t*)d_hash, d_WNonce[thr_id]); //MyStreamSynchronize(NULL, order, thr_id); - 
cudaMemcpy(d_wnounce[thr_id], d_WNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); - result = *d_wnounce[thr_id]; - return result; + CUDA_SAFE_CALL(cudaMemcpyAsync(h_wnounce[thr_id], d_WNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id])); cudaStreamSynchronize(gpustream[thr_id]); + return h_wnounce[thr_id]; } __host__ -void whirlpool512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order) +void whirlpool512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash) { - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1) / threadsperblock); - dim3 block(threadsperblock); - - oldwhirlpool_gpu_hash_80<<>>(threads, startNounce, d_outputHash); } __host__ -void whirlpool512_setBlock_80(void *pdata, const void *ptarget) +void whirlpool512_setBlock_80(int thr_id, void *pdata, const void *ptarget) { - unsigned char PaddedMessage[128]; - memcpy(PaddedMessage, pdata, 80); - memset(PaddedMessage+80, 0, 48); - PaddedMessage[80] = 0x80; /* ending */ - cudaMemcpyToSymbol(pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); } diff --git a/x15/whirlpool.cu b/x15/whirlpool.cu index 4fb891a619..0f891407c3 100644 --- a/x15/whirlpool.cu +++ b/x15/whirlpool.cu @@ -1,26 +1,23 @@ /* - * whirlpool routine (djm) + * whirlpool routine djm&SP */ extern "C" { #include "sph/sph_whirlpool.h" -#include "miner.h" } - +#include "miner.h" #include "cuda_helper.h" -static uint32_t *d_hash[MAX_GPUS]; - extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int mode); -extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); -extern 
void whirlpool512_setBlock_80(void *pdata, const void *ptarget); -extern void whirlpool512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); -extern uint32_t whirlpool512_cpu_finalhash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void whirlpool512_setBlock_80(int thr_id, void *pdata, const void *ptarget); +extern void whirlpool512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern uint32_t* whirlpool512_cpu_finalhash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); // CPU Hash function -extern "C" void wcoinhash(void *state, const void *input) +void wcoinhash(void *state, const void *input) { sph_whirlpool_context ctx_whirlpool; @@ -49,74 +46,101 @@ extern "C" void wcoinhash(void *state, const void *input) memcpy(state, hash, 32); } -static bool init[MAX_GPUS] = { 0 }; - -extern "C" int scanhash_whc(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_whc(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { + static THREAD uint32_t *d_hash = nullptr; + const uint32_t first_nonce = pdata[19]; uint32_t endiandata[20]; - uint32_t throughput = device_intensity(thr_id, __func__, 1U << 19); // 19=256*256*8; - throughput = min(throughput, (max_nonce - first_nonce)); + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, 1U << 20); // 19=256*256*8; + uint32_t throughput = min(throughputmax, (max_nonce - first_nonce)) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; + ptarget[7] = 0x0000ff; + + static THREAD volatile bool init = false; + if(!init) + { + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + 
CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (16 * sizeof(uint32_t))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif - if (!init[thr_id]) { - CUDA_CALL_OR_RET_X(cudaSetDevice(device_map[thr_id]), 0); - // Konstanten kopieren, Speicher belegen - cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput); - x15_whirlpool_cpu_init(thr_id, throughput, 1 /* old whirlpool */); + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * sizeof(uint32_t) * throughputmax)); + x15_whirlpool_cpu_init(thr_id, throughputmax, 1 /* old whirlpool */); + mining_has_stopped[thr_id] = false; - init[thr_id] = true; + init = true; } for (int k=0; k < 20; k++) { - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); } - whirlpool512_setBlock_80((void*)endiandata, ptarget); + whirlpool512_setBlock_80(thr_id, (void*)endiandata, ptarget); do { - uint32_t foundNonce; - int order = 0; + uint32_t* foundNonce; - whirlpool512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + whirlpool512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); - foundNonce = whirlpool512_cpu_finalhash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - if (foundNonce != UINT32_MAX) + foundNonce = whirlpool512_cpu_finalhash_64(thr_id, throughput, pdata[19], d_hash); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(foundNonce[0] != UINT32_MAX) { const uint32_t Htarg 
= ptarget[7]; - uint32_t vhash64[8]; - be32enc(&endiandata[19], foundNonce); + uint32_t vhash64[8]={0}; + if(opt_verify){ be32enc(&endiandata[19], foundNonce[0]); wcoinhash(vhash64, endiandata); - - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { int res = 1; *hashes_done = pdata[19] - first_nonce + throughput; - #if 0 - uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); - if (secNonce != 0) { - pdata[21] = secNonce; - res++; + if (foundNonce[1] != UINT32_MAX) + { + if(opt_verify){ be32enc(&endiandata[19], foundNonce[1]); + wcoinhash(vhash64, endiandata); + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + + if (opt_benchmark) applog(LOG_INFO, "GPU #%d: found second nounce %08x", device_map[thr_id], foundNonce[1]); + pdata[21] = foundNonce[1]; + res++; + } + else + { + if (vhash64[7] != Htarg) + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce[1]); + } } - #endif - pdata[19] = foundNonce; + pdata[19] = foundNonce[0]; + if (opt_benchmark) applog(LOG_INFO, "GPU #%d: found nounce %08x", device_map[thr_id], foundNonce[0]); + return res; } - else if (vhash64[7] > Htarg) { - applog(LOG_INFO, "GPU #%d: result for %08x is not in range: %x > %x", thr_id, foundNonce, vhash64[7], Htarg); - } - else { - applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce); + else + { + if (vhash64[7] != Htarg) + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce[0]); } } - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/x15/whirlpoolx.cu b/x15/whirlpoolx.cu new 
file mode 100644 index 0000000000..be0b18b336 --- /dev/null +++ b/x15/whirlpoolx.cu @@ -0,0 +1,117 @@ +/* + * whirlpool routine (djm) + * whirlpoolx routine (provos alexis) + */ +extern "C" +{ +#include "sph/sph_whirlpool.h" +} +#include "miner.h" + + +#include "cuda_helper.h" + +extern void whirlpoolx_cpu_init(int thr_id, uint32_t threads); +extern void whirlpoolx_setBlock_80(int thr_id, void *pdata, const void *ptarget); +extern void cpu_whirlpoolx(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *foundNonce); +extern void whirlpoolx_precompute(int thr_id); + +// CPU Hash function +extern "C" void whirlxHash(void *state, const void *input) +{ + + sph_whirlpool_context ctx_whirlpool; + + unsigned char hash[64]; + unsigned char hash_xored[32]; + + memset(hash, 0, sizeof(hash)); + + sph_whirlpool_init(&ctx_whirlpool); + sph_whirlpool(&ctx_whirlpool, input, 80); + sph_whirlpool_close(&ctx_whirlpool, hash); + + + for (uint32_t i = 0; i < 32; i++){ + hash_xored[i] = hash[i] ^ hash[i + 16]; + } + memcpy(state, hash_xored, 32); +} + +int scanhash_whirlpoolx(int thr_id, uint32_t *pdata, uint32_t *ptarget, uint32_t max_nonce, uint32_t *hashes_done) +{ + const uint32_t first_nonce = pdata[19]; + uint32_t endiandata[20]; + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, (1 << 27)); + uint32_t throughput = min(throughputmax, max_nonce - first_nonce) & 0xfffffc00; + + if (opt_benchmark) + ptarget[7] = 0x5; + + static THREAD volatile bool init = false; + if(!init) + { + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + whirlpoolx_cpu_init(thr_id, throughputmax); + mining_has_stopped[thr_id] = false; + init = true; + } + + for (int k=0; k < 20; k++) + { + be32enc(&endiandata[k], pdata[k]); + } + + whirlpoolx_setBlock_80(thr_id, (void*)endiandata, &ptarget[6]); + 
whirlpoolx_precompute(thr_id); + do { + uint32_t foundNonce[2]; + cpu_whirlpoolx(thr_id, throughput, pdata[19], foundNonce); + CUDA_SAFE_CALL(cudaGetLastError()); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(foundNonce[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t vhash64[8]={0}; + /* check now with the CPU to confirm */ + if(opt_verify){ be32enc(&endiandata[19], foundNonce[0]); + whirlxHash(vhash64, endiandata); + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + int res = 1; + *hashes_done = pdata[19] - first_nonce + throughput; + if (foundNonce[1] != UINT32_MAX) + { + if(opt_verify){ be32enc(&endiandata[19], foundNonce[1]); + whirlxHash(vhash64, endiandata); + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + pdata[21] = foundNonce[1]; + res++; + if (opt_benchmark) applog(LOG_INFO, "GPU #%d: found nonce %08x", device_map[thr_id], foundNonce[1]); + } + else + { + if (vhash64[7] != Htarg) + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce[1]); + } + } + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d: found nonce %08x", device_map[thr_id], foundNonce[0], vhash64[7]); + pdata[19] = foundNonce[0]; + return res; + } + else + { + if(vhash64[7] != Htarg) + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce[0]); + } + } + pdata[19] += throughput; + } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); + *hashes_done = pdata[19] - first_nonce ; + return 0; +} diff --git a/x15/x14.cu b/x15/x14.cu index 447ecb9f33..bb22ffe6b8 100644 --- a/x15/x14.cu +++ b/x15/x14.cu @@ -26,47 +26,47 @@ extern "C" { #include "cuda_helper.h" -// Memory for the hash functions -static uint32_t *d_hash[MAX_GPUS]; - -extern void quark_blake512_cpu_setBlock_80(void *pdata); -extern void 
quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); +extern void quark_blake512_cpu_init(int thr_id); +extern void quark_blake512_cpu_setBlock_80(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_setBlock_80_multi(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void quark_blake512_cpu_hash_80_multi(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads); -extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); -extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_skein512_cpu_init(int thr_id); -extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void cuda_jh512Keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void cuda_jh512Keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); -extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); 
+extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); -extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); -extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const uint32_t simdthreads); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); -extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); -extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); -extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); -extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads); 
-extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, - uint32_t *d_noncesTrue, uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse, int order); +extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *inpHashes, + const uint32_t *d_noncesTrue, uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse); // X14 CPU Hash function -extern "C" void x14hash(void *output, const void *input) +void x14hash(void *output, const void *input) { unsigned char hash[128]; // uint32_t hashA[16], hashB[16]; #define hashB hash+64 @@ -147,93 +147,118 @@ extern "C" void x14hash(void *output, const void *input) memcpy(output, hash, 32); } -static bool init[MAX_GPUS] = { 0 }; - -extern "C" int scanhash_x14(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_x14(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { + static THREAD uint32_t *d_hash = nullptr; + const uint32_t first_nonce = pdata[19]; uint32_t endiandata[20]; - uint32_t throughput = device_intensity(thr_id, __func__, 1U << 19); // 19=256*256*8; - throughput = min(throughput, max_nonce - first_nonce); + int intensity = (device_sm[device_map[thr_id]] > 500) ? 256 * 256 * 20 : 256 * 256 * 10; + uint32_t simdthreads = (device_sm[device_map[thr_id]] > 500) ? 
256 : 32; + + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, intensity); // 19=256*256*8; + uint32_t throughput = min(throughputmax, max_nonce - first_nonce) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x000f; + ptarget[7] = 0x000f; - if (!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + get_cuda_arch(&cuda_arch[thr_id]); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (64 * sizeof(uint4))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif - quark_groestl512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughputmax); quark_skein512_cpu_init(thr_id); - quark_bmw512_cpu_init(thr_id, throughput); - x11_simd512_cpu_init(thr_id, throughput); - x11_echo512_cpu_init(thr_id, throughput); - x13_hamsi512_cpu_init(thr_id, throughput); - x13_fugue512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughputmax); + x11_simd512_cpu_init(thr_id, throughputmax); + x11_echo512_cpu_init(thr_id, throughputmax); + x13_hamsi512_cpu_init(thr_id, throughputmax); + x13_fugue512_cpu_init(thr_id, throughputmax); - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0); + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * sizeof(uint32_t) * throughputmax)); - cuda_check_cpu_init(thr_id, throughput); - init[thr_id] = true; + cuda_check_cpu_init(thr_id, throughputmax); + mining_has_stopped[thr_id] = false; + init = true; } for (int k = 0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); - 
quark_blake512_cpu_setBlock_80((void*)endiandata); - cuda_check_cpu_setTarget(ptarget); + quark_blake512_cpu_setBlock_80(thr_id, (uint64_t *)endiandata); + cuda_check_cpu_setTarget(ptarget, thr_id); do { - int order = 0; - quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - cuda_jh512Keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - // MyStreamSynchronize(NULL, 1, thr_id); - - uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); - if (foundNonce != UINT32_MAX) + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + cuda_jh512Keccak512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + 
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash,simdthreads); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + + uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(foundNonce != UINT32_MAX) { const uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; + uint32_t vhash64[8]={0}; /* check now with the CPU to confirm */ - be32enc(&endiandata[19], foundNonce); + if(opt_verify){ be32enc(&endiandata[19], foundNonce); x14hash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { int res = 1; - uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonce); + uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash, foundNonce); *hashes_done = pdata[19] - first_nonce + throughput; - if (secNonce != 0) { - pdata[21] = secNonce; - res++; + if (secNonce != 0) + { + if(opt_verify){ be32enc(&endiandata[19], secNonce); + x14hash(vhash64, endiandata); + + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + pdata[21] = secNonce; + res++; + } + else + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], secNonce); + } } pdata[19] = foundNonce; return res; } else { - applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce); + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce); } } - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && 
((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/x15/x15.cu b/x15/x15.cu index 811b7c51c6..61611de101 100644 --- a/x15/x15.cu +++ b/x15/x15.cu @@ -27,53 +27,53 @@ extern "C" { #include "cuda_helper.h" -// Memory for the hash functions -static uint32_t *d_hash[MAX_GPUS]; - -extern void quark_blake512_cpu_setBlock_80(void *pdata); -extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); +extern void quark_blake512_cpu_init(int thr_id); +extern void quark_blake512_cpu_setBlock_80(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_setBlock_80_multi(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void quark_blake512_cpu_hash_80_multi(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads); -extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); -extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_skein512_cpu_init(int thr_id); -extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t 
startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); -extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); -extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const uint32_t simdthreads); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); -extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); -extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t 
*d_hash, int order); +extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); -extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); -extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int mode); -extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x15_whirlpool_cpu_free(int thr_id); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads); -extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, - uint32_t *d_noncesTrue, uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse, int order); +extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *inpHashes, + const uint32_t *d_noncesTrue, uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse); // X15 CPU Hash function -extern "C" void x15hash(void *output, const void *input) +void x15hash(void *output, const void *input) { sph_blake512_context ctx_blake; sph_bmw512_context ctx_bmw; @@ -159,98 +159,126 @@ extern "C" void x15hash(void *output, const void *input) memcpy(output, hash, 32); } -static bool init[MAX_GPUS] = { 0 }; - -extern "C" int scanhash_x15(int thr_id, uint32_t *pdata, - const 
uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_x15(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { + static THREAD uint32_t *d_hash = nullptr; + const uint32_t first_nonce = pdata[19]; - uint32_t endiandata[20]; - uint32_t throughput = device_intensity(thr_id, __func__, 1U << 19); // 19=256*256*8; - throughput = min(throughput, (max_nonce - first_nonce)); + int intensity = 256 * 256 * 13; + if (device_sm[device_map[thr_id]] == 520) intensity = 256 * 256 * 22; + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, intensity); // 19=256*256*8; + uint32_t throughput = min(throughputmax, (max_nonce - first_nonce)) & 0xfffffc00; + uint32_t simdthreads = (device_sm[device_map[thr_id]] > 500) ? 256 : 32; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0fF; + ptarget[7] = 0x0fF; - if (!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + get_cuda_arch(&cuda_arch[thr_id]); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (64 * sizeof(uint4))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif - quark_groestl512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughputmax); quark_skein512_cpu_init(thr_id); - quark_bmw512_cpu_init(thr_id, throughput); - x11_simd512_cpu_init(thr_id, throughput); - x11_echo512_cpu_init(thr_id, throughput); - x13_hamsi512_cpu_init(thr_id, throughput); - x13_fugue512_cpu_init(thr_id, throughput); - x15_whirlpool_cpu_init(thr_id, throughput, 0); - - 
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0); - - cuda_check_cpu_init(thr_id, throughput); - init[thr_id] = true; + quark_bmw512_cpu_init(thr_id, throughputmax); + x11_simd512_cpu_init(thr_id, throughputmax); + x11_echo512_cpu_init(thr_id, throughputmax); + x13_hamsi512_cpu_init(thr_id, throughputmax); + x13_fugue512_cpu_init(thr_id, throughputmax); + x15_whirlpool_cpu_init(thr_id, throughputmax, 0); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * sizeof(uint32_t) * throughputmax)); + + cuda_check_cpu_init(thr_id, throughputmax); + mining_has_stopped[thr_id] = false; + init = true; } - + + uint32_t endiandata[20]; for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); - quark_blake512_cpu_setBlock_80((void*)endiandata); - cuda_check_cpu_setTarget(ptarget); + quark_blake512_cpu_setBlock_80(thr_id, (uint64_t *)endiandata); + cuda_check_cpu_setTarget(ptarget, thr_id); do { - int order = 0; - quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); 
- x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); -// MyStreamSynchronize(NULL, 1, thr_id); - - uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); - if (foundNonce != UINT32_MAX) + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash,simdthreads); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + + uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(foundNonce != UINT32_MAX) { const uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; + uint32_t vhash64[8]={0}; /* check now with the CPU to confirm */ - be32enc(&endiandata[19], foundNonce); + if(opt_verify){ be32enc(&endiandata[19], foundNonce); x15hash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { + } if (vhash64[7] <= 
Htarg && fulltest(vhash64, ptarget)) { int res = 1; - uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonce); + uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash, foundNonce); *hashes_done = pdata[19] - first_nonce + throughput; - if (secNonce != 0) { - pdata[21] = secNonce; - res++; + if (secNonce != 0) + { + if(opt_verify){ be32enc(&endiandata[19], secNonce); + x15hash(vhash64, endiandata); + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + pdata[21] = secNonce; + res++; + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d: found nounce %08x", device_map[thr_id], secNonce); + } + else + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], secNonce); + } } - if (opt_benchmark) applog(LOG_INFO, "found nounce", thr_id, foundNonce, vhash64[7], Htarg); + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d: found nounce %08x", device_map[thr_id], foundNonce); pdata[19] = foundNonce; return res; } else { - applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce); + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce); } } - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/x17/cuda_x17_haval512.cu b/x17/cuda_x17_haval512.cu index a8cf28fcac..3596682e0f 100644 --- a/x17/cuda_x17_haval512.cu +++ b/x17/cuda_x17_haval512.cu @@ -43,211 +43,10 @@ #include "cuda_helper.h" -#define SPH_ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) -#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) - -static __constant__ uint32_t initVector[8]; - -static const uint32_t c_initVector[8] = { - SPH_C32(0x243F6A88), - 
SPH_C32(0x85A308D3), - SPH_C32(0x13198A2E), - SPH_C32(0x03707344), - SPH_C32(0xA4093822), - SPH_C32(0x299F31D0), - SPH_C32(0x082EFA98), - SPH_C32(0xEC4E6C89) -}; - -#define PASS1(n, in) { \ - STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[ 0], SPH_C32(0x00000000)); \ - STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[ 1], SPH_C32(0x00000000)); \ - STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[ 2], SPH_C32(0x00000000)); \ - STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[ 3], SPH_C32(0x00000000)); \ - STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[ 4], SPH_C32(0x00000000)); \ - STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[ 5], SPH_C32(0x00000000)); \ - STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[ 6], SPH_C32(0x00000000)); \ - STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[ 7], SPH_C32(0x00000000)); \ - \ - STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[ 8], SPH_C32(0x00000000)); \ - STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[ 9], SPH_C32(0x00000000)); \ - STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[10], SPH_C32(0x00000000)); \ - STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[11], SPH_C32(0x00000000)); \ - STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[12], SPH_C32(0x00000000)); \ - STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[13], SPH_C32(0x00000000)); \ - STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[14], SPH_C32(0x00000000)); \ - STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[15], SPH_C32(0x00000000)); \ - \ - STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[16], SPH_C32(0x00000000)); \ - STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[17], SPH_C32(0x00000000)); \ - STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[18], SPH_C32(0x00000000)); \ - STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[19], SPH_C32(0x00000000)); \ - STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[20], SPH_C32(0x00000000)); \ - STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[21], SPH_C32(0x00000000)); \ - STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[22], SPH_C32(0x00000000)); \ - STEP(n, 1, s0, s7, s6, s5, 
s4, s3, s2, s1, in[23], SPH_C32(0x00000000)); \ - \ - STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[24], SPH_C32(0x00000000)); \ - STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[25], SPH_C32(0x00000000)); \ - STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[26], SPH_C32(0x00000000)); \ - STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[27], SPH_C32(0x00000000)); \ - STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[28], SPH_C32(0x00000000)); \ - STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[29], SPH_C32(0x00000000)); \ - STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[30], SPH_C32(0x00000000)); \ - STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[31], SPH_C32(0x00000000)); \ -} - -#define PASS2(n, in) { \ - STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[ 5], SPH_C32(0x452821E6)); \ - STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[14], SPH_C32(0x38D01377)); \ - STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[26], SPH_C32(0xBE5466CF)); \ - STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[18], SPH_C32(0x34E90C6C)); \ - STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[11], SPH_C32(0xC0AC29B7)); \ - STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[28], SPH_C32(0xC97C50DD)); \ - STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[ 7], SPH_C32(0x3F84D5B5)); \ - STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[16], SPH_C32(0xB5470917)); \ - \ - STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[ 0], SPH_C32(0x9216D5D9)); \ - STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[23], SPH_C32(0x8979FB1B)); \ - STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[20], SPH_C32(0xD1310BA6)); \ - STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[22], SPH_C32(0x98DFB5AC)); \ - STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[ 1], SPH_C32(0x2FFD72DB)); \ - STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[10], SPH_C32(0xD01ADFB7)); \ - STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[ 4], SPH_C32(0xB8E1AFED)); \ - STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[ 8], SPH_C32(0x6A267E96)); \ - \ - STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[30], 
SPH_C32(0xBA7C9045)); \ - STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[ 3], SPH_C32(0xF12C7F99)); \ - STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[21], SPH_C32(0x24A19947)); \ - STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[ 9], SPH_C32(0xB3916CF7)); \ - STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[17], SPH_C32(0x0801F2E2)); \ - STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[24], SPH_C32(0x858EFC16)); \ - STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[29], SPH_C32(0x636920D8)); \ - STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[ 6], SPH_C32(0x71574E69)); \ - \ - STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[19], SPH_C32(0xA458FEA3)); \ - STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[12], SPH_C32(0xF4933D7E)); \ - STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[15], SPH_C32(0x0D95748F)); \ - STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[13], SPH_C32(0x728EB658)); \ - STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[ 2], SPH_C32(0x718BCD58)); \ - STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[25], SPH_C32(0x82154AEE)); \ - STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[31], SPH_C32(0x7B54A41D)); \ - STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[27], SPH_C32(0xC25A59B5)); \ -} +static uint32_t *d_nonce[MAX_GPUS]; -#define PASS3(n, in) { \ - STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[19], SPH_C32(0x9C30D539)); \ - STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[ 9], SPH_C32(0x2AF26013)); \ - STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[ 4], SPH_C32(0xC5D1B023)); \ - STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[20], SPH_C32(0x286085F0)); \ - STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[28], SPH_C32(0xCA417918)); \ - STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[17], SPH_C32(0xB8DB38EF)); \ - STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[ 8], SPH_C32(0x8E79DCB0)); \ - STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[22], SPH_C32(0x603A180E)); \ - \ - STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[29], SPH_C32(0x6C9E0E8B)); \ - STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[14], 
SPH_C32(0xB01E8A3E)); \ - STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[25], SPH_C32(0xD71577C1)); \ - STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[12], SPH_C32(0xBD314B27)); \ - STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[24], SPH_C32(0x78AF2FDA)); \ - STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[30], SPH_C32(0x55605C60)); \ - STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[16], SPH_C32(0xE65525F3)); \ - STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[26], SPH_C32(0xAA55AB94)); \ - \ - STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[31], SPH_C32(0x57489862)); \ - STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[15], SPH_C32(0x63E81440)); \ - STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[ 7], SPH_C32(0x55CA396A)); \ - STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[ 3], SPH_C32(0x2AAB10B6)); \ - STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[ 1], SPH_C32(0xB4CC5C34)); \ - STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[ 0], SPH_C32(0x1141E8CE)); \ - STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[18], SPH_C32(0xA15486AF)); \ - STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[27], SPH_C32(0x7C72E993)); \ - \ - STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[13], SPH_C32(0xB3EE1411)); \ - STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[ 6], SPH_C32(0x636FBC2A)); \ - STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[21], SPH_C32(0x2BA9C55D)); \ - STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[10], SPH_C32(0x741831F6)); \ - STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[23], SPH_C32(0xCE5C3E16)); \ - STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[11], SPH_C32(0x9B87931E)); \ - STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[ 5], SPH_C32(0xAFD6BA33)); \ - STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[ 2], SPH_C32(0x6C24CF5C)); \ -} - -#define PASS4(n, in) { \ - STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[24], SPH_C32(0x7A325381)); \ - STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[ 4], SPH_C32(0x28958677)); \ - STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[ 0], SPH_C32(0x3B8F4898)); \ - STEP(n, 
4, s4, s3, s2, s1, s0, s7, s6, s5, in[14], SPH_C32(0x6B4BB9AF)); \ - STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[ 2], SPH_C32(0xC4BFE81B)); \ - STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[ 7], SPH_C32(0x66282193)); \ - STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[28], SPH_C32(0x61D809CC)); \ - STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[23], SPH_C32(0xFB21A991)); \ - \ - STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[26], SPH_C32(0x487CAC60)); \ - STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[ 6], SPH_C32(0x5DEC8032)); \ - STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[30], SPH_C32(0xEF845D5D)); \ - STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[20], SPH_C32(0xE98575B1)); \ - STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[18], SPH_C32(0xDC262302)); \ - STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[25], SPH_C32(0xEB651B88)); \ - STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[19], SPH_C32(0x23893E81)); \ - STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[ 3], SPH_C32(0xD396ACC5)); \ - \ - STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[22], SPH_C32(0x0F6D6FF3)); \ - STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[11], SPH_C32(0x83F44239)); \ - STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[31], SPH_C32(0x2E0B4482)); \ - STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[21], SPH_C32(0xA4842004)); \ - STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[ 8], SPH_C32(0x69C8F04A)); \ - STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[27], SPH_C32(0x9E1F9B5E)); \ - STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[12], SPH_C32(0x21C66842)); \ - STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[ 9], SPH_C32(0xF6E96C9A)); \ - \ - STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[ 1], SPH_C32(0x670C9C61)); \ - STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[29], SPH_C32(0xABD388F0)); \ - STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[ 5], SPH_C32(0x6A51A0D2)); \ - STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[15], SPH_C32(0xD8542F68)); \ - STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[17], 
SPH_C32(0x960FA728)); \ - STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[10], SPH_C32(0xAB5133A3)); \ - STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[16], SPH_C32(0x6EEF0B6C)); \ - STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[13], SPH_C32(0x137A3BE4)); \ -} - -#define PASS5(n, in) { \ - STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[27], SPH_C32(0xBA3BF050)); \ - STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[ 3], SPH_C32(0x7EFB2A98)); \ - STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[21], SPH_C32(0xA1F1651D)); \ - STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[26], SPH_C32(0x39AF0176)); \ - STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[17], SPH_C32(0x66CA593E)); \ - STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[11], SPH_C32(0x82430E88)); \ - STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[20], SPH_C32(0x8CEE8619)); \ - STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[29], SPH_C32(0x456F9FB4)); \ - \ - STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[19], SPH_C32(0x7D84A5C3)); \ - STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[ 0], SPH_C32(0x3B8B5EBE)); \ - STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[12], SPH_C32(0xE06F75D8)); \ - STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[ 7], SPH_C32(0x85C12073)); \ - STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[13], SPH_C32(0x401A449F)); \ - STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[ 8], SPH_C32(0x56C16AA6)); \ - STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[31], SPH_C32(0x4ED3AA62)); \ - STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[10], SPH_C32(0x363F7706)); \ - \ - STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[ 5], SPH_C32(0x1BFEDF72)); \ - STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[ 9], SPH_C32(0x429B023D)); \ - STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[14], SPH_C32(0x37D0D724)); \ - STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[30], SPH_C32(0xD00A1248)); \ - STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[18], SPH_C32(0xDB0FEAD3)); \ - STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[ 6], SPH_C32(0x49F1C09B)); \ - STEP(n, 
5, s1, s0, s7, s6, s5, s4, s3, s2, in[28], SPH_C32(0x075372C9)); \ - STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[24], SPH_C32(0x80991B7B)); \ - \ - STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[ 2], SPH_C32(0x25D479D8)); \ - STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[23], SPH_C32(0xF6E8DEF7)); \ - STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[16], SPH_C32(0xE3FE501A)); \ - STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[22], SPH_C32(0xB6794C3B)); \ - STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[ 4], SPH_C32(0x976CE0BD)); \ - STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[ 1], SPH_C32(0x04C006BA)); \ - STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[25], SPH_C32(0xC1A94FB6)); \ - STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[15], SPH_C32(0x409F60C4)); \ -} +#define SPH_ROTL32(x, n) ROTL32(x, n) +#define SPH_ROTR32(x, n) ROTR32(x, n) #define F1(x6, x5, x4, x3, x2, x1, x0) \ (((x1) & ((x0) ^ (x4))) ^ ((x2) & (x5)) ^ ((x3) & (x6)) ^ (x0)) @@ -269,126 +68,278 @@ static const uint32_t c_initVector[8] = { (((x0) & ~(((x1) & (x2) & (x3)) ^ (x5))) \ ^ ((x1) & (x4)) ^ ((x2) & (x5)) ^ ((x3) & (x6))) -#define FP5_1(x6, x5, x4, x3, x2, x1, x0) \ - F1(x3, x4, x1, x0, x5, x2, x6) -#define FP5_2(x6, x5, x4, x3, x2, x1, x0) \ - F2(x6, x2, x1, x0, x3, x4, x5) -#define FP5_3(x6, x5, x4, x3, x2, x1, x0) \ - F3(x2, x6, x0, x4, x3, x1, x5) -#define FP5_4(x6, x5, x4, x3, x2, x1, x0) \ - F4(x1, x5, x3, x2, x0, x4, x6) -#define FP5_5(x6, x5, x4, x3, x2, x1, x0) \ - F5(x2, x5, x0, x6, x4, x3, x1) +#define STEP1(x7, x6, x5, x4, x3, x2, x1, x0, w) { \ + uint32_t t = F1(x3, x4, x1, x0, x5, x2, x6); \ + (x7) =(SPH_ROTR32(t, 7) + SPH_ROTR32((x7), 11) \ + + (w)); \ + } + +#define STEP2(x7, x6, x5, x4, x3, x2, x1, x0, w, c) { \ + uint32_t t = F2(x6, x2, x1, x0, x3, x4, x5); \ + (x7) =(SPH_ROTR32(t, 7) + SPH_ROTR32((x7), 11) \ + + (w) + (c)); \ + } +#define STEP3(x7, x6, x5, x4, x3, x2, x1, x0, w, c) { \ + uint32_t t = F3(x2, x6, x0, x4, x3, x1, x5); \ + (x7) =(SPH_ROTR32(t, 7) + SPH_ROTR32((x7), 
11) \ + + (w) + (c)); \ + } -#define STEP(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) { \ - uint32_t t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \ +#define STEP4(x7, x6, x5, x4, x3, x2, x1, x0, w, c) { \ + uint32_t t = F4(x1, x5, x3, x2, x0, x4, x6); \ (x7) =(SPH_ROTR32(t, 7) + SPH_ROTR32((x7), 11) \ + (w) + (c)); \ } +#define STEP5(x7, x6, x5, x4, x3, x2, x1, x0, w, c) { \ + uint32_t t = F5(x2, x5, x0, x6, x4, x3, x1); \ + (x7) =(SPH_ROTR32(t, 7) + SPH_ROTR32((x7), 11) \ + + (w) + (c)); \ + } __global__ -void x17_haval256_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +void x17_haval256_gpu_hash_64(uint32_t threads, uint32_t startNounce, const uint64_t *const __restrict__ g_hash, uint32_t target, uint32_t *const __restrict__ ret) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { - uint32_t nounce = g_nonceVector ? 
g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; - uint32_t *inpHash = (uint32_t*)&g_hash[8 * hashPosition]; - union { - uint8_t h1[64]; - uint32_t h4[16]; - uint64_t h8[8]; - } hash; - - uint32_t u0, u1, u2, u3, u4, u5, u6, u7; - uint32_t s0,s1,s2,s3,s4,s5,s6,s7; - uint32_t buf[32]; - - s0 = initVector[0]; - s1 = initVector[1]; - s2 = initVector[2]; - s3 = initVector[3]; - s4 = initVector[4]; - s5 = initVector[5]; - s6 = initVector[6]; - s7 = initVector[7]; - - u0 = s0; - u1 = s1; - u2 = s2; - u3 = s3; - u4 = s4; - u5 = s5; - u6 = s6; - u7 = s7; - - #pragma unroll - for (int i=0; i<16; i++) { - hash.h4[i]= inpHash[i]; + uint32_t *inpHash = (uint32_t*)&g_hash[8 * thread]; + uint32_t hash[16]; + + uint32_t buf[32] = {0}; + + uint32_t s0 = 0x243F6A88; + uint32_t s1 = 0x85A308D3; + uint32_t s2 = 0x13198A2E; + uint32_t s3 = 0x03707344; + uint32_t s4 = 0xA4093822; + uint32_t s5 = 0x299F31D0; + uint32_t s6 = 0x082EFA98; + uint32_t s7 = 0xEC4E6C89; + +#pragma unroll + for(int i = 0; i<16; i++) + { + hash[i] = inpHash[i]; } -///////// input big ///////////////////// + ///////// input big ///////////////////// + +#pragma unroll + for(int i = 0; i<16; i++) + { + buf[i] = hash[i]; + } - #pragma unroll - for (int i=0; i<32; i++) { - if (i<16) { - buf[i]=hash.h4[i]; - } else { - buf[i]=0; - } + buf[16] = 0x00000001; + buf[29] = 0x40290000; + buf[30] = 0x00000200; + + STEP1(s7, s6, s5, s4, s3, s2, s1, s0, buf[ 0]); + STEP1(s6, s5, s4, s3, s2, s1, s0, s7, buf[ 1]); + STEP1(s5, s4, s3, s2, s1, s0, s7, s6, buf[ 2]); + STEP1(s4, s3, s2, s1, s0, s7, s6, s5, buf[ 3]); + STEP1(s3, s2, s1, s0, s7, s6, s5, s4, buf[ 4]); + STEP1(s2, s1, s0, s7, s6, s5, s4, s3, buf[ 5]); + STEP1(s1, s0, s7, s6, s5, s4, s3, s2, buf[ 6]); + STEP1(s0, s7, s6, s5, s4, s3, s2, s1, buf[ 7]); + STEP1(s7, s6, s5, s4, s3, s2, s1, s0, buf[ 8]); + STEP1(s6, s5, s4, s3, s2, s1, s0, s7, buf[ 9]); + STEP1(s5, s4, s3, s2, s1, s0, s7, s6, buf[10]); + STEP1(s4, s3, s2, s1, s0, s7, 
s6, s5, buf[11]); + STEP1(s3, s2, s1, s0, s7, s6, s5, s4, buf[12]); + STEP1(s2, s1, s0, s7, s6, s5, s4, s3, buf[13]); + STEP1(s1, s0, s7, s6, s5, s4, s3, s2, buf[14]); + STEP1(s0, s7, s6, s5, s4, s3, s2, s1, buf[15]); + STEP1(s7, s6, s5, s4, s3, s2, s1, s0, buf[16]); + STEP1(s6, s5, s4, s3, s2, s1, s0, s7, buf[17]); + STEP1(s5, s4, s3, s2, s1, s0, s7, s6, buf[18]); + STEP1(s4, s3, s2, s1, s0, s7, s6, s5, buf[19]); + STEP1(s3, s2, s1, s0, s7, s6, s5, s4, buf[20]); + STEP1(s2, s1, s0, s7, s6, s5, s4, s3, buf[21]); + STEP1(s1, s0, s7, s6, s5, s4, s3, s2, buf[22]); + STEP1(s0, s7, s6, s5, s4, s3, s2, s1, buf[23]); + STEP1(s7, s6, s5, s4, s3, s2, s1, s0, buf[24]); + STEP1(s6, s5, s4, s3, s2, s1, s0, s7, buf[25]); + STEP1(s5, s4, s3, s2, s1, s0, s7, s6, buf[26]); + STEP1(s4, s3, s2, s1, s0, s7, s6, s5, buf[27]); + STEP1(s3, s2, s1, s0, s7, s6, s5, s4, buf[28]); + STEP1(s2, s1, s0, s7, s6, s5, s4, s3, buf[29]); + STEP1(s1, s0, s7, s6, s5, s4, s3, s2, buf[30]); + STEP1(s0, s7, s6, s5, s4, s3, s2, s1, buf[31]); + + STEP2(s7, s6, s5, s4, s3, s2, s1, s0, buf[ 5], SPH_C32(0x452821E6)); + STEP2(s6, s5, s4, s3, s2, s1, s0, s7, buf[14], SPH_C32(0x38D01377)); + STEP2(s5, s4, s3, s2, s1, s0, s7, s6, buf[26], SPH_C32(0xBE5466CF)); + STEP2(s4, s3, s2, s1, s0, s7, s6, s5, buf[18], SPH_C32(0x34E90C6C)); + STEP2(s3, s2, s1, s0, s7, s6, s5, s4, buf[11], SPH_C32(0xC0AC29B7)); + STEP2(s2, s1, s0, s7, s6, s5, s4, s3, buf[28], SPH_C32(0xC97C50DD)); + STEP2(s1, s0, s7, s6, s5, s4, s3, s2, buf[ 7], SPH_C32(0x3F84D5B5)); + STEP2(s0, s7, s6, s5, s4, s3, s2, s1, buf[16], SPH_C32(0xB5470917)); + STEP2(s7, s6, s5, s4, s3, s2, s1, s0, buf[ 0], SPH_C32(0x9216D5D9)); + STEP2(s6, s5, s4, s3, s2, s1, s0, s7, buf[23], SPH_C32(0x8979FB1B)); + STEP2(s5, s4, s3, s2, s1, s0, s7, s6, buf[20], SPH_C32(0xD1310BA6)); + STEP2(s4, s3, s2, s1, s0, s7, s6, s5, buf[22], SPH_C32(0x98DFB5AC)); + STEP2(s3, s2, s1, s0, s7, s6, s5, s4, buf[ 1], SPH_C32(0x2FFD72DB)); + STEP2(s2, s1, s0, s7, s6, s5, s4, s3, buf[10], 
SPH_C32(0xD01ADFB7)); + STEP2(s1, s0, s7, s6, s5, s4, s3, s2, buf[ 4], SPH_C32(0xB8E1AFED)); + STEP2(s0, s7, s6, s5, s4, s3, s2, s1, buf[ 8], SPH_C32(0x6A267E96)); + STEP2(s7, s6, s5, s4, s3, s2, s1, s0, buf[30], SPH_C32(0xBA7C9045)); + STEP2(s6, s5, s4, s3, s2, s1, s0, s7, buf[ 3], SPH_C32(0xF12C7F99)); + STEP2(s5, s4, s3, s2, s1, s0, s7, s6, buf[21], SPH_C32(0x24A19947)); + STEP2(s4, s3, s2, s1, s0, s7, s6, s5, buf[ 9], SPH_C32(0xB3916CF7)); + STEP2(s3, s2, s1, s0, s7, s6, s5, s4, buf[17], SPH_C32(0x0801F2E2)); + STEP2(s2, s1, s0, s7, s6, s5, s4, s3, buf[24], SPH_C32(0x858EFC16)); + STEP2(s1, s0, s7, s6, s5, s4, s3, s2, buf[29], SPH_C32(0x636920D8)); + STEP2(s0, s7, s6, s5, s4, s3, s2, s1, buf[ 6], SPH_C32(0x71574E69)); + STEP2(s7, s6, s5, s4, s3, s2, s1, s0, buf[19], SPH_C32(0xA458FEA3)); + STEP2(s6, s5, s4, s3, s2, s1, s0, s7, buf[12], SPH_C32(0xF4933D7E)); + STEP2(s5, s4, s3, s2, s1, s0, s7, s6, buf[15], SPH_C32(0x0D95748F)); + STEP2(s4, s3, s2, s1, s0, s7, s6, s5, buf[13], SPH_C32(0x728EB658)); + STEP2(s3, s2, s1, s0, s7, s6, s5, s4, buf[ 2], SPH_C32(0x718BCD58)); + STEP2(s2, s1, s0, s7, s6, s5, s4, s3, buf[25], SPH_C32(0x82154AEE)); + STEP2(s1, s0, s7, s6, s5, s4, s3, s2, buf[31], SPH_C32(0x7B54A41D)); + STEP2(s0, s7, s6, s5, s4, s3, s2, s1, buf[27], SPH_C32(0xC25A59B5)); + STEP3(s7, s6, s5, s4, s3, s2, s1, s0, buf[19], SPH_C32(0x9C30D539)); + STEP3(s6, s5, s4, s3, s2, s1, s0, s7, buf[ 9], SPH_C32(0x2AF26013)); + STEP3(s5, s4, s3, s2, s1, s0, s7, s6, buf[ 4], SPH_C32(0xC5D1B023)); + STEP3(s4, s3, s2, s1, s0, s7, s6, s5, buf[20], SPH_C32(0x286085F0)); + STEP3(s3, s2, s1, s0, s7, s6, s5, s4, buf[28], SPH_C32(0xCA417918)); + STEP3(s2, s1, s0, s7, s6, s5, s4, s3, buf[17], SPH_C32(0xB8DB38EF)); + STEP3(s1, s0, s7, s6, s5, s4, s3, s2, buf[ 8], SPH_C32(0x8E79DCB0)); + STEP3(s0, s7, s6, s5, s4, s3, s2, s1, buf[22], SPH_C32(0x603A180E)); + STEP3(s7, s6, s5, s4, s3, s2, s1, s0, buf[29], SPH_C32(0x6C9E0E8B)); + STEP3(s6, s5, s4, s3, s2, s1, s0, s7, buf[14], 
SPH_C32(0xB01E8A3E)); + STEP3(s5, s4, s3, s2, s1, s0, s7, s6, buf[25], SPH_C32(0xD71577C1)); + STEP3(s4, s3, s2, s1, s0, s7, s6, s5, buf[12], SPH_C32(0xBD314B27)); + STEP3(s3, s2, s1, s0, s7, s6, s5, s4, buf[24], SPH_C32(0x78AF2FDA)); + STEP3(s2, s1, s0, s7, s6, s5, s4, s3, buf[30], SPH_C32(0x55605C60)); + STEP3(s1, s0, s7, s6, s5, s4, s3, s2, buf[16], SPH_C32(0xE65525F3)); + STEP3(s0, s7, s6, s5, s4, s3, s2, s1, buf[26], SPH_C32(0xAA55AB94)); + STEP3(s7, s6, s5, s4, s3, s2, s1, s0, buf[31], SPH_C32(0x57489862)); + STEP3(s6, s5, s4, s3, s2, s1, s0, s7, buf[15], SPH_C32(0x63E81440)); + STEP3(s5, s4, s3, s2, s1, s0, s7, s6, buf[ 7], SPH_C32(0x55CA396A)); + STEP3(s4, s3, s2, s1, s0, s7, s6, s5, buf[ 3], SPH_C32(0x2AAB10B6)); + STEP3(s3, s2, s1, s0, s7, s6, s5, s4, buf[ 1], SPH_C32(0xB4CC5C34)); + STEP3(s2, s1, s0, s7, s6, s5, s4, s3, buf[ 0], SPH_C32(0x1141E8CE)); + STEP3(s1, s0, s7, s6, s5, s4, s3, s2, buf[18], SPH_C32(0xA15486AF)); + STEP3(s0, s7, s6, s5, s4, s3, s2, s1, buf[27], SPH_C32(0x7C72E993)); + STEP3(s7, s6, s5, s4, s3, s2, s1, s0, buf[13], SPH_C32(0xB3EE1411)); + STEP3(s6, s5, s4, s3, s2, s1, s0, s7, buf[ 6], SPH_C32(0x636FBC2A)); + STEP3(s5, s4, s3, s2, s1, s0, s7, s6, buf[21], SPH_C32(0x2BA9C55D)); + STEP3(s4, s3, s2, s1, s0, s7, s6, s5, buf[10], SPH_C32(0x741831F6)); + STEP3(s3, s2, s1, s0, s7, s6, s5, s4, buf[23], SPH_C32(0xCE5C3E16)); + STEP3(s2, s1, s0, s7, s6, s5, s4, s3, buf[11], SPH_C32(0x9B87931E)); + STEP3(s1, s0, s7, s6, s5, s4, s3, s2, buf[ 5], SPH_C32(0xAFD6BA33)); + STEP3(s0, s7, s6, s5, s4, s3, s2, s1, buf[ 2], SPH_C32(0x6C24CF5C)); + + STEP4(s7, s6, s5, s4, s3, s2, s1, s0, buf[24], SPH_C32(0x7A325381)); + STEP4(s6, s5, s4, s3, s2, s1, s0, s7, buf[ 4], SPH_C32(0x28958677)); + STEP4(s5, s4, s3, s2, s1, s0, s7, s6, buf[ 0], SPH_C32(0x3B8F4898)); + STEP4(s4, s3, s2, s1, s0, s7, s6, s5, buf[14], SPH_C32(0x6B4BB9AF)); + STEP4(s3, s2, s1, s0, s7, s6, s5, s4, buf[ 2], SPH_C32(0xC4BFE81B)); + STEP4(s2, s1, s0, s7, s6, s5, s4, s3, buf[ 7], 
SPH_C32(0x66282193)); + STEP4(s1, s0, s7, s6, s5, s4, s3, s2, buf[28], SPH_C32(0x61D809CC)); + STEP4(s0, s7, s6, s5, s4, s3, s2, s1, buf[23], SPH_C32(0xFB21A991)); + STEP4(s7, s6, s5, s4, s3, s2, s1, s0, buf[26], SPH_C32(0x487CAC60)); + STEP4(s6, s5, s4, s3, s2, s1, s0, s7, buf[ 6], SPH_C32(0x5DEC8032)); + STEP4(s5, s4, s3, s2, s1, s0, s7, s6, buf[30], SPH_C32(0xEF845D5D)); + STEP4(s4, s3, s2, s1, s0, s7, s6, s5, buf[20], SPH_C32(0xE98575B1)); + STEP4(s3, s2, s1, s0, s7, s6, s5, s4, buf[18], SPH_C32(0xDC262302)); + STEP4(s2, s1, s0, s7, s6, s5, s4, s3, buf[25], SPH_C32(0xEB651B88)); + STEP4(s1, s0, s7, s6, s5, s4, s3, s2, buf[19], SPH_C32(0x23893E81)); + STEP4(s0, s7, s6, s5, s4, s3, s2, s1, buf[ 3], SPH_C32(0xD396ACC5)); + STEP4(s7, s6, s5, s4, s3, s2, s1, s0, buf[22], SPH_C32(0x0F6D6FF3)); + STEP4(s6, s5, s4, s3, s2, s1, s0, s7, buf[11], SPH_C32(0x83F44239)); + STEP4(s5, s4, s3, s2, s1, s0, s7, s6, buf[31], SPH_C32(0x2E0B4482)); + STEP4(s4, s3, s2, s1, s0, s7, s6, s5, buf[21], SPH_C32(0xA4842004)); + STEP4(s3, s2, s1, s0, s7, s6, s5, s4, buf[ 8], SPH_C32(0x69C8F04A)); + STEP4(s2, s1, s0, s7, s6, s5, s4, s3, buf[27], SPH_C32(0x9E1F9B5E)); + STEP4(s1, s0, s7, s6, s5, s4, s3, s2, buf[12], SPH_C32(0x21C66842)); + STEP4(s0, s7, s6, s5, s4, s3, s2, s1, buf[ 9], SPH_C32(0xF6E96C9A)); + STEP4(s7, s6, s5, s4, s3, s2, s1, s0, buf[ 1], SPH_C32(0x670C9C61)); + STEP4(s6, s5, s4, s3, s2, s1, s0, s7, buf[29], SPH_C32(0xABD388F0)); + STEP4(s5, s4, s3, s2, s1, s0, s7, s6, buf[ 5], SPH_C32(0x6A51A0D2)); + STEP4(s4, s3, s2, s1, s0, s7, s6, s5, buf[15], SPH_C32(0xD8542F68)); + STEP4(s3, s2, s1, s0, s7, s6, s5, s4, buf[17], SPH_C32(0x960FA728)); + STEP4(s2, s1, s0, s7, s6, s5, s4, s3, buf[10], SPH_C32(0xAB5133A3)); + STEP4(s1, s0, s7, s6, s5, s4, s3, s2, buf[16], SPH_C32(0x6EEF0B6C)); + STEP4(s0, s7, s6, s5, s4, s3, s2, s1, buf[13], SPH_C32(0x137A3BE4)); + + STEP5(s7, s6, s5, s4, s3, s2, s1, s0, buf[27], SPH_C32(0xBA3BF050)); + STEP5(s6, s5, s4, s3, s2, s1, s0, s7, buf[ 3], 
SPH_C32(0x7EFB2A98)); + STEP5(s5, s4, s3, s2, s1, s0, s7, s6, buf[21], SPH_C32(0xA1F1651D)); + STEP5(s4, s3, s2, s1, s0, s7, s6, s5, buf[26], SPH_C32(0x39AF0176)); + STEP5(s3, s2, s1, s0, s7, s6, s5, s4, buf[17], SPH_C32(0x66CA593E)); + STEP5(s2, s1, s0, s7, s6, s5, s4, s3, buf[11], SPH_C32(0x82430E88)); + STEP5(s1, s0, s7, s6, s5, s4, s3, s2, buf[20], SPH_C32(0x8CEE8619)); + STEP5(s0, s7, s6, s5, s4, s3, s2, s1, buf[29], SPH_C32(0x456F9FB4)); + + STEP5(s7, s6, s5, s4, s3, s2, s1, s0, buf[19], SPH_C32(0x7D84A5C3)); + STEP5(s6, s5, s4, s3, s2, s1, s0, s7, buf[ 0], SPH_C32(0x3B8B5EBE)); + STEP5(s5, s4, s3, s2, s1, s0, s7, s6, buf[12], SPH_C32(0xE06F75D8)); + STEP5(s4, s3, s2, s1, s0, s7, s6, s5, buf[ 7], SPH_C32(0x85C12073)); + STEP5(s3, s2, s1, s0, s7, s6, s5, s4, buf[13], SPH_C32(0x401A449F)); + STEP5(s2, s1, s0, s7, s6, s5, s4, s3, buf[ 8], SPH_C32(0x56C16AA6)); + STEP5(s1, s0, s7, s6, s5, s4, s3, s2, buf[31], SPH_C32(0x4ED3AA62)); + STEP5(s0, s7, s6, s5, s4, s3, s2, s1, buf[10], SPH_C32(0x363F7706)); + + STEP5(s7, s6, s5, s4, s3, s2, s1, s0, buf[ 5], SPH_C32(0x1BFEDF72)); + STEP5(s6, s5, s4, s3, s2, s1, s0, s7, buf[ 9], SPH_C32(0x429B023D)); + STEP5(s5, s4, s3, s2, s1, s0, s7, s6, buf[14], SPH_C32(0x37D0D724)); + STEP5(s4, s3, s2, s1, s0, s7, s6, s5, buf[30], SPH_C32(0xD00A1248)); + STEP5(s3, s2, s1, s0, s7, s6, s5, s4, buf[18], SPH_C32(0xDB0FEAD3)); + STEP5(s2, s1, s0, s7, s6, s5, s4, s3, buf[ 6], SPH_C32(0x49F1C09B)); + STEP5(s1, s0, s7, s6, s5, s4, s3, s2, buf[28], SPH_C32(0x075372C9)); + STEP5(s0, s7, s6, s5, s4, s3, s2, s1, buf[24], SPH_C32(0x80991B7B)); + + STEP5(s7, s6, s5, s4, s3, s2, s1, s0, buf[ 2], SPH_C32(0x25D479D8)); + /* + STEP5(s6, s5, s4, s3, s2, s1, s0, s7, buf[23], SPH_C32(0xF6E8DEF7)); + STEP5(s5, s4, s3, s2, s1, s0, s7, s6, buf[16], SPH_C32(0xE3FE501A)); + STEP5(s4, s3, s2, s1, s0, s7, s6, s5, buf[22], SPH_C32(0xB6794C3B)); + STEP5(s3, s2, s1, s0, s7, s6, s5, s4, buf[ 4], SPH_C32(0x976CE0BD)); + STEP5(s2, s1, s0, s7, s6, s5, s4, s3, buf[ 1], 
SPH_C32(0x04C006BA)); + STEP5(s1, s0, s7, s6, s5, s4, s3, s2, buf[25], SPH_C32(0xC1A94FB6)); + STEP5(s0, s7, s6, s5, s4, s3, s2, s1, buf[15], SPH_C32(0x409F60C4)); + + inpHash[0] = s0 + 0x243F6A88; + inpHash[1] = s1 + 0x85A308D3; + inpHash[2] = s2 + 0x13198A2E; + inpHash[3] = s3 + 0x03707344; + inpHash[4] = s4 + 0xA4093822; + inpHash[5] = s5 + 0x299F31D0; + inpHash[6] = s6 + 0x082EFA98; + inpHash[7] = s7 + 0xEC4E6C89; + */ + if(s7 + 0xEC4E6C89 <= target) + { + uint32_t tmp = atomicExch(ret, startNounce + thread); + if(tmp != 0xffffffff) + ret[1] = tmp; } - buf[16]=0x00000001; - buf[29]=0x40290000; - buf[30]=0x00000200; - - PASS1(5, buf); - PASS2(5, buf); - PASS3(5, buf); - PASS4(5, buf); - PASS5(5, buf); - - s0 = (s0 + u0); - s2 = (s2 + u2); - s3 = (s3 + u3); - s4 = (s4 + u4); - s5 = (s5 + u5); - s6 = (s6 + u6); - s7 = (s7 + u7); - - hash.h4[0]=s0; - hash.h4[1]=s1; - hash.h4[2]=s2; - hash.h4[3]=s3; - hash.h4[4]=s4; - hash.h4[5]=s5; - hash.h4[6]=s6; - hash.h4[7]=s7; - - #pragma unroll 16 - for (int u = 0; u < 16; u ++) - inpHash[u] = hash.h4[u]; } // threads } __host__ void x17_haval256_cpu_init(int thr_id, uint32_t threads) { - cudaMemcpyToSymbol(initVector,c_initVector,sizeof(c_initVector),0, cudaMemcpyHostToDevice); + cudaMalloc(&d_nonce[thr_id], 2 * sizeof(uint32_t)); } __host__ -void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, uint32_t target, uint32_t *result) { - const uint32_t threadsperblock = 256; // Alignment mit mixtab Grösse. 
NICHT ÄNDERN + const uint32_t threadsperblock = 512; - // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); + cudaMemsetAsync(d_nonce[thr_id], 0xff, 2 * sizeof(uint32_t), gpustream[thr_id]); - x17_haval256_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x17_haval256_gpu_hash_64 <<>>(threads, startNounce, (uint64_t*)d_hash, target, d_nonce[thr_id]); + cudaMemcpyAsync(result, d_nonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); + CUDA_SAFE_CALL(cudaStreamSynchronize(gpustream[thr_id])); } diff --git a/x17/cuda_x17_sha512.cu b/x17/cuda_x17_sha512.cu index 100a5f96b6..8abdce07f0 100644 --- a/x17/cuda_x17_sha512.cu +++ b/x17/cuda_x17_sha512.cu @@ -40,22 +40,21 @@ #include "cuda_helper.h" -#define SWAP64(u64) cuda_swab64(u64) -#define SPH_ROTL32(x, n) ((x) << (n)) | ((x) >> (32 - (n))) -#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) -static __constant__ uint64_t H_512[8]; +#define SWAP64(u64) cuda_swab64(u64) + +#define SPH_ROTL32(x, n) ROTL32(x, n) +#define SPH_ROTR32(x, n) ROTR32(x, n) -static const uint64_t H512[8] = { +static __constant__ uint64_t H_512[8] = { SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B), - SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1), - SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F), - SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179) + SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1), + SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F), + SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179) }; -static __constant__ uint64_t K_512[80]; -static const uint64_t K512[80] = { +static __constant__ uint64_t K_512[80] = { SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD), SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC), SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019), @@ -98,24 +97,6 @@ static const uint64_t K512[80] = { 
SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817) }; - -#define SHA3_STEP(ord,r,i) { \ - uint64_t T1, T2; \ - int a = 8-ord; \ - T1 = r[(7+a)&7] + BSG5_1(r[(4+a)&7]) + CH(r[(4+a)&7], r[(5+a)&7], r[(6+a)&7]) + K_512[i] + W[i]; \ - T2 = BSG5_0(r[(0+a)&7]) + MAJ(r[(0+a)&7], r[(1+a)&7], r[(2+a)&7]); \ - r[(3+a)&7] = r[(3+a)&7] + T1; \ - r[(7+a)&7] = T1 + T2; \ - } - -#define SHA3_STEP2(truc,ord,r,i) { \ - uint64_t T1, T2; \ - int a = 8-ord; \ - T1 = Tone(truc,r,W,a,i); \ - T2 = BSG5_0(r[(0+a)&7]) + MAJ(r[(0+a)&7], r[(1+a)&7], r[(2+a)&7]); \ - r[(3+a)&7] = r[(3+a)&7] + T1; \ - r[(7+a)&7] = T1 + T2; \ - } //#define BSG5_0(x) (ROTR64(x, 28) ^ ROTR64(x, 34) ^ ROTR64(x, 39)) #define BSG5_0(x) xor3(ROTR64(x, 28),ROTR64(x, 34),ROTR64(x, 39)) @@ -133,83 +114,95 @@ static const uint64_t K512[80] = { //#define MAJ(X, Y, Z) (((X) & (Y)) | (((X) | (Y)) & (Z))) #define MAJ(x, y, z) andor(x,y,z) +#define SHA3_STEP(ord,r,i) { \ + uint64_t T1, T2; \ + int a = 8-ord; \ + T1 = r[(7+a)&7] + BSG5_1(r[(4+a)&7]) + CH(r[(4+a)&7], r[(5+a)&7], r[(6+a)&7]) + K_512[i] + W[i]; \ + T2 = BSG5_0(r[(0+a)&7]) + MAJ(r[(0+a)&7], r[(1+a)&7], r[(2+a)&7]); \ + r[(3+a)&7] = r[(3+a)&7] + T1; \ + r[(7+a)&7] = T1 + T2; \ + } + __device__ __forceinline__ uint64_t Tone(const uint64_t* sharedMemory, uint64_t r[8], uint64_t W[80], uint32_t a, uint32_t i) { - uint64_t h = r[(7 + a) & 7]; uint64_t e = r[(4 + a) & 7]; - uint64_t f = r[(5 + a) & 7]; - uint64_t g = r[(6 + a) & 7]; //uint64_t BSG51 = ROTR64(e, 14) ^ ROTR64(e, 18) ^ ROTR64(e, 41); - uint64_t BSG51 = xor3(ROTR64(e, 14),ROTR64(e, 18),ROTR64(e, 41)); + uint64_t BSG51 = xor3(ROTR64(e, 14), ROTR64(e, 18), ROTR64(e, 41)); + //uint64_t CHl = (((f) ^ (g)) & (e)) ^ (g); - uint64_t CHl = xandx(e,f,g); - uint64_t result = h+BSG51+CHl+sharedMemory[i]+W[i]; + uint64_t CHl = xandx(e, r[(5 + a) & 7], r[(6 + a) & 7]); + uint64_t result = r[(7 + a) & 7] + BSG51 + CHl + sharedMemory[i] + W[i]; return result; } -__global__ __launch_bounds__(256,3) -void 
x17_sha512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +#define SHA3_STEP2(truc,ord,r,i) { \ + uint64_t T1, T2; \ + int a = 8-ord; \ + T1 = Tone(truc,r,W,a,i); \ + T2 = BSG5_0(r[(0+a)&7]) + MAJ(r[(0+a)&7], r[(1+a)&7], r[(2+a)&7]); \ + r[(3+a)&7] = r[(3+a)&7] + T1; \ + r[(7+a)&7] = T1 + T2; \ + } + +#define TPB 128 +__global__ __launch_bounds__(TPB,6) +void x17_sha512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + // if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; - uint32_t *inpHash = (uint32_t*)&g_hash[8 * hashPosition]; - union { - uint8_t h1[64]; - uint32_t h4[16]; - uint64_t h8[8]; - } hash; - - #pragma unroll - for (int i=0;i<16;i++) { - hash.h4[i]= inpHash[i]; - } - uint64_t W[80]; - uint64_t r[8]; + uint64_t *inpHash = &g_hash[8 * thread]; + uint64_t hash[8]; - #pragma unroll 71 - for (int i=9;i<80;i++) { - W[i]=0; +#pragma unroll + for(int i = 0; i<8; i++) + { + hash[i] = inpHash[i]; } + uint64_t W[80] = {0}; + uint64_t r[8]; - #pragma unroll - for (int i = 0; i < 8; i ++) { - W[i] = SWAP64(hash.h8[i]); +#pragma unroll + for(int i = 0; i < 8; i++) + { + W[i] = SWAP64(hash[i]); r[i] = H_512[i]; } W[8] = 0x8000000000000000; - W[15]= 0x0000000000000200; - - #pragma unroll 64 - for (int i = 16; i < 80; i ++) - W[i] = SSG5_1(W[i - 2]) + W[i - 7] - + SSG5_0(W[i - 15]) + W[i - 16]; - - #pragma unroll 10 - for (int i = 0; i < 80; i += 8) { - #pragma unroll 8 - for (int ord=0;ord<8;ord++) { - SHA3_STEP2(K_512,ord,r,i+ord); + W[15] = 0x0000000000000200; + +#pragma unroll 64 + for(int i = 16; i < 80; i++) + W[i] = SSG5_1(W[i - 2]) + W[i - 7] + SSG5_0(W[i - 15]) + W[i - 16]; + +#pragma unroll 10 + for(int i = 0; i < 80; i += 8) + { 
+#pragma unroll 8 + for(int ord = 0; ord<8; ord++) + { + SHA3_STEP2(K_512, ord, r, i + ord); } } - #pragma unroll 8 - for (int i = 0; i < 8; i++) { +#pragma unroll 8 + for(int i = 0; i < 8; i++) + { r[i] = r[i] + H_512[i]; } - #pragma unroll 8 - for(int i=0;i<8;i++) { - hash.h8[i] = SWAP64(r[i]); +#pragma unroll 8 + for(int i = 0; i<8; i++) + { + hash[i] = SWAP64(r[i]); } - #pragma unroll 16 - for (int u = 0; u < 16; u ++) { - inpHash[u] = hash.h4[u]; +#pragma unroll 16 + for(int u = 0; u < 8; u++) + { + inpHash[u] = hash[u]; } } } @@ -217,16 +210,14 @@ void x17_sha512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_ __host__ void x17_sha512_cpu_init(int thr_id, uint32_t threads) { - cudaMemcpyToSymbol(K_512,K512,80*sizeof(uint64_t),0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(H_512,H512,sizeof(H512),0, cudaMemcpyHostToDevice); } __host__ -void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash) { - const uint32_t threadsperblock = 64; + const uint32_t threadsperblock = TPB; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x17_sha512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x17_sha512_gpu_hash_64<<>>(threads, startNounce, d_hash ); } diff --git a/x17/x17.cu b/x17/x17.cu index 97ab886ab6..e9fe13c717 100644 --- a/x17/x17.cu +++ b/x17/x17.cu @@ -32,55 +32,58 @@ extern "C" #include "cuda_helper.h" static uint32_t *d_hash[MAX_GPUS]; +static THREAD uint32_t *h_found = nullptr; -extern void quark_blake512_cpu_setBlock_80(void *pdata); -extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); +extern void quark_blake512_cpu_init(int thr_id); +extern void quark_blake512_cpu_setBlock_80(int thr_id, uint64_t *pdata); +extern void 
quark_blake512_cpu_setBlock_80_multi(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void quark_blake512_cpu_hash_80_multi(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads); -extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); -extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_skein512_cpu_init(int thr_id); -extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void cuda_jh512Keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void cuda_jh512Keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); -extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); -extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, 
int order); +extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); -extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const uint32_t simdthreads); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); -extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); -extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); -extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); -extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int flag); -extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t 
*d_hash); extern void x17_sha512_cpu_init(int thr_id, uint32_t threads); -extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce,uint64_t *d_hash); extern void x17_haval256_cpu_init(int thr_id, uint32_t threads); -extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, uint32_t target, uint32_t *result); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads); -extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, - uint32_t *d_noncesTrue, uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse, - int order); +extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *inpHashes, + const uint32_t *d_noncesTrue, uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse); // X17 Hashfunktion -extern "C" void x17hash(void *output, const void *input) +void x17hash(void *output, const void *input) { // blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11-hamsi12-fugue13-shabal14-whirlpool15-sha512-haval17 @@ -176,102 +179,137 @@ extern "C" void x17hash(void *output, const void *input) memcpy(output, hash, 32); } -static bool init[MAX_GPUS] = { 0 }; +static volatile bool init[MAX_GPUS] = { false }; -extern "C" int scanhash_x17(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_x17(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { const uint32_t first_nonce = pdata[19]; - uint32_t throughput = device_intensity(thr_id, __func__, 1U 
<< 19); // 19=256*256*8; - throughput = min(throughput, (max_nonce - first_nonce)); + int intensity = 256 * 256 * 9; + uint32_t simdthreads = (device_sm[device_map[thr_id]] > 500) ? 256 : 32; + if (device_sm[device_map[thr_id]] == 520) intensity = 256 * 256 * 15; + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, intensity); // 19=256*256*8; + uint32_t throughput = min(throughputmax, (max_nonce - first_nonce)) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x00f; + ptarget[7] = 0x03f; if (!init[thr_id]) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + get_cuda_arch(&cuda_arch[thr_id]); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (64 * sizeof(uint4))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif - quark_groestl512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughputmax); quark_skein512_cpu_init(thr_id); - quark_bmw512_cpu_init(thr_id, throughput); - x11_simd512_cpu_init(thr_id, throughput); - x11_echo512_cpu_init(thr_id, throughput); - x13_hamsi512_cpu_init(thr_id, throughput); - x13_fugue512_cpu_init(thr_id, throughput); - x15_whirlpool_cpu_init(thr_id, throughput, 0); - x17_sha512_cpu_init(thr_id, throughput); - x17_haval256_cpu_init(thr_id, throughput); - - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0); - - cuda_check_cpu_init(thr_id, throughput); - + quark_bmw512_cpu_init(thr_id, throughputmax); + x11_simd512_cpu_init(thr_id, throughputmax); + x11_echo512_cpu_init(thr_id, throughputmax); + x13_hamsi512_cpu_init(thr_id, throughputmax); + x13_fugue512_cpu_init(thr_id, 
throughputmax); + x15_whirlpool_cpu_init(thr_id, throughputmax, 0); + x17_sha512_cpu_init(thr_id, throughputmax); + x17_haval256_cpu_init(thr_id, throughputmax); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughputmax)); + CUDA_SAFE_CALL(cudaMallocHost(&(h_found), 2 * sizeof(uint32_t))); + + mining_has_stopped[thr_id] = false; init[thr_id] = true; } uint32_t endiandata[20]; for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); - quark_blake512_cpu_setBlock_80((void*)endiandata); - cuda_check_cpu_setTarget(ptarget); + quark_blake512_cpu_setBlock_80(thr_id, (uint64_t *)endiandata); + cuda_check_cpu_setTarget(ptarget, thr_id); do { - int order = 0; - // Hash with CUDA - quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - cuda_jh512Keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x17_sha512_cpu_hash_64(thr_id, throughput, 
pdata[19], NULL, d_hash[thr_id], order++); - x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - // MyStreamSynchronize(NULL, 1, thr_id); - - uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); - if (foundNonce != UINT32_MAX) + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id]); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id]); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id]); + cuda_jh512Keccak512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, pdata[19],d_hash[thr_id]); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], simdthreads); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19],d_hash[thr_id]); + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19],d_hash[thr_id]); + x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], (uint64_t*)d_hash[thr_id]); + x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], ptarget[7], h_found); + + if(stop_mining) { mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr); } + if(h_found[0] != 0xffffffff) { const uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; - be32enc(&endiandata[19], foundNonce); - x17hash(vhash64, endiandata); + uint32_t vhash64[8] = {0}; + if(opt_verify) + { + be32enc(&endiandata[19], h_found[0]); + x17hash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { + } if(vhash64[7] <= Htarg && 
fulltest(vhash64, ptarget)) + { int res = 1; - uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonce); *hashes_done = pdata[19] - first_nonce + throughput; - if (secNonce != 0) { - pdata[21] = secNonce; - res++; + if(h_found[1] != 0xffffffff) + { + if(opt_verify) + { + be32enc(&endiandata[19], h_found[1]); + x13hash(vhash64, endiandata); + } if(vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + + pdata[21] = h_found[1]; + res++; + if(opt_benchmark) + applog(LOG_INFO, "GPU #%d Found second nonce %08x", device_map[thr_id], h_found[1]); + } + else + { + if(vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[1]); + } + } + } - if (opt_benchmark) applog(LOG_INFO, "found nounce", thr_id, foundNonce, vhash64[7], Htarg); - pdata[19] = foundNonce; + pdata[19] = h_found[0]; + if(opt_benchmark) + applog(LOG_INFO, "GPU #%d Found nonce %08x", device_map[thr_id], h_found[0]); return res; } else { - applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce); + if(vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[0]); + } } } - - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return 0; }